155 lines
5.4 KiB
Python
155 lines
5.4 KiB
Python
import time
|
||
import os
|
||
import datetime
|
||
from selenium import webdriver
|
||
from pathlib import Path
|
||
from Tools.domain_extract import Extracter
|
||
|
||
|
||
class Peeper:
    """Snapshot a web page with a Selenium-driven Chromium build.

    For one URL, ``peeping`` starts a fresh browser, loads the page, stores
    the page source and a screenshot under ``<parent of this file's
    directory>/result/`` and finally runs ``log_extraction_script`` in the
    page.

    Attributes are plain strings so callers can override the bundled
    driver/browser locations after construction.
    """

    # XPath probes for a login/register control, tried in this order:
    # exact "login/register" text, any "login" text, then <div> elements
    # whose class attribute contains "login" or "register".
    _LOGIN_XPATHS = (
        '//*[contains(text(),"登录/注册")]',
        '//*[contains(text(),"登录")]',
        '//div[contains(@class ,"login")]',
        '//div[contains(@class ,"register")]',
    )

    def __init__(self):
        """Resolve the bundled chromedriver/Chromium paths next to this file."""
        # Anchor on the absolute file location: with a relative ``__file__``
        # (script started from its own directory) ``dirname`` is '' and the
        # concatenation would point at '/chromedriver' in the filesystem root.
        here = os.path.dirname(os.path.abspath(__file__))
        self.driver_path = here + '/chromedriver'
        # tested with ChromeDriver version 2.42
        self.binary_path = here + '/adg-osx/Chromium.app/Contents/MacOS/Chromium'
        # Script executed by extractGraph. Per the original author's note this
        # is the "command for saving graph data" -- presumably the bundled
        # Chromium build reacts to this call; that hook is not visible here.
        self.log_extraction_script = "document.createCDATASection('NOTVERYUNIQUESTRING');"
        self.curl = None

    def driverInit(self, proxy_enable=False, page_load_timeout=60):
        """Create and return a Chrome WebDriver bound to the bundled Chromium.

        Args:
            proxy_enable: when true, route traffic through the HTTP proxy at
                127.0.0.1:1087.
            page_load_timeout: value passed to ``set_page_load_timeout``.

        Returns:
            The configured ``webdriver.Chrome`` instance. Exceptions raised
            while starting the driver propagate to the caller.
        """
        chrome_options = webdriver.ChromeOptions()
        if proxy_enable:
            chrome_options.add_argument("--proxy-server=http://127.0.0.1:1087")
        # chrome_options.add_argument('headless')
        chrome_options.add_argument('--disable-application-cache')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--chrome-binary=' + self.binary_path)
        chrome_options.binary_location = self.binary_path

        driver = webdriver.Chrome(self.driver_path, options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout)
        return driver

    @staticmethod
    def findLoginButton(driver):
        """Return the first element matching a login/register heuristic.

        The XPath expressions in ``_LOGIN_XPATHS`` are tried in order and the
        first element found is returned.

        Args:
            driver: object exposing ``find_element_by_xpath(xpath)``.

        Returns:
            The matched element, or ``None`` when no probe matches.
        """
        for xpath in Peeper._LOGIN_XPATHS:
            try:
                return driver.find_element_by_xpath(xpath)
            except Exception:
                # Best-effort lookup: any failure for this probe (typically
                # "no such element") just means "try the next one".
                continue
        return None

    def extractGraph(self, driver, file_write_time_out=3):
        """Run ``log_extraction_script`` in the page, then wait.

        Args:
            driver: object exposing ``execute_script(script)``.
            file_write_time_out: seconds to sleep after the script ran
                (the original author's name suggests this gives the browser
                time to write its output file -- not verifiable from here).

        Errors are reported on stdout and not re-raised. Only ``Exception``
        is caught so that Ctrl-C / ``SystemExit`` still stop the program.
        """
        try:
            driver.execute_script(self.log_extraction_script)
            time.sleep(file_write_time_out)
        except Exception as e:
            print('[Main Frame] Something went wrong: ' + str(e))

    def peeping(self, url):
        """Load ``url``, save its HTML and a screenshot, then extract the graph.

        Args:
            url: address to open in a freshly started browser.

        Returns:
            ``(rdir, today)`` on success, where ``rdir`` is the first value
            returned by ``Extracter.extract`` for the final URL and ``today``
            is the current date formatted as ``%y-%m-%d``. ``None`` when the
            driver cannot be started or any later step fails (the error is
            printed). The browser is always closed before returning.
        """
        try:
            driver = self.driverInit(proxy_enable=False)
        except Exception as e:
            print("error:", e, url)
            return None

        try:
            driver.get(url)
            driver.maximize_window()
            time.sleep(1)

            # Use the URL after redirects, not the one requested.
            cur_url = driver.current_url
            rdir, _ = Extracter.extract(cur_url)

            today = datetime.datetime.now().strftime('%y-%m-%d')
            # abspath first, so a relative ``__file__`` still resolves to the
            # real parent directory instead of the current working directory.
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            # NOTE(review): rdir and today are joined without a separator;
            # kept as-is because other code may rely on this layout -- confirm.
            save_path = project_root + "/result/" + rdir + today + "/"

            # makedirs also creates a missing "result" parent; plain mkdir
            # raised FileNotFoundError on a fresh checkout.
            os.makedirs(save_path, exist_ok=True)

            # Explicit UTF-8 so pages with non-ASCII text can be written on
            # any locale; the context manager closes the file on errors too.
            with open(save_path + rdir + ".html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot(save_path + rdir + ".png")

            # Login triggering (findLoginButton + click, then extracting from
            # the newly opened window or after navigating back) is currently
            # disabled: according to the original note, enabling it costs
            # extra time. Only the landing page is extracted.
            self.extractGraph(driver)
            return rdir, today
        except Exception as e:
            print('[Main Frame] Something went wrong: ', e)
            return None
        finally:
            driver.quit()
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: crawl a single page and print whatever peeping()
    # returns for it.
    peeper = Peeper()
    outcome = peeper.peeping("http://www.baidu.com")
    print(outcome)
|
||
|