154 lines
5.3 KiB
Python
154 lines
5.3 KiB
Python
|
|
import time
|
|||
|
|
import os
|
|||
|
|
import datetime
|
|||
|
|
from selenium import webdriver
|
|||
|
|
from pathlib import Path
|
|||
|
|
from Tools.domain_extract import Extracter
|
|||
|
|
|
|||
|
|
|
|||
|
|
class Peeper:
    """
    Capture a web page with the bundled Chromium driven through Selenium.

    ``peeping`` is the entry point: it starts a driver, loads a URL, stores the
    page HTML and a screenshot under ``result/``, and then runs the
    log-extraction script in the page.
    """

    def __init__(self):
        """
        Resolve the bundled chromedriver / Chromium paths next to this file.
        """
        # abspath first: a relative __file__ such as "peeper.py" has an empty
        # dirname, which would turn the paths below into "/chromedriver" etc.
        # at the filesystem root.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        self.driver_path = base_dir + '/chromedriver'
        # tested with ChromeDriver version 2.42
        self.binary_path = base_dir + '/adg-osx/Chromium.app/Contents/MacOS/Chromium'
        # command for saving graph data
        # NOTE(review): presumably the bundled Chromium build reacts to this
        # marker string -- confirm against the browser patch.
        self.log_extraction_script = "document.createCDATASection('NOTVERYUNIQUESTRING');"
        # Never assigned anything else in this class.
        self.curl = None

    def driverInit(self, proxy_enable=False, page_load_timeout=60,
                   proxy_server="http://127.0.0.1:1087"):
        """
        Init Chromium Driver, proxy_enable is set to use proxy

        :param proxy_enable: when true, pass ``--proxy-server`` to the browser.
        :param page_load_timeout: value handed to ``set_page_load_timeout``.
        :param proxy_server: proxy URL used when ``proxy_enable`` is true; the
            default is the address that used to be hard-coded here.
        :return: the configured Chrome driver.
        """
        chrome_options = webdriver.ChromeOptions()
        if proxy_enable:
            chrome_options.add_argument("--proxy-server=" + proxy_server)
        browser_flags = (
            '--disable-application-cache',
            '--disable-infobars',
            '--no-sandbox',
            '--chrome-binary=' + self.binary_path,
        )
        for flag in browser_flags:
            chrome_options.add_argument(flag)
        chrome_options.binary_location = self.binary_path

        driver = webdriver.Chrome(self.driver_path, options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout)
        return driver

    @staticmethod
    def findLoginButton(driver):
        """
        Fuzzy matching the login button; the first XPath that matches wins.

        XPaths are tried in this order:
          1. any element whose text contains "登录/注册"
          2. any element whose text contains "登录"
          3. a div whose class attribute contains "login"
          4. a div whose class attribute contains "register"

        :param driver: object exposing ``find_element(by, value)``.
        :return: the first matching element, or None when nothing matches.
        """
        candidate_xpaths = (
            '//*[contains(text(),"登录/注册")]',
            '//*[contains(text(),"登录")]',
            '//div[contains(@class ,"login")]',
            '//div[contains(@class ,"register")]',
        )
        for xpath in candidate_xpaths:
            # noinspection PyBroadException
            try:
                # "xpath" is the value of selenium's By.XPATH.  The generic
                # find_element call exists in both Selenium 3 and 4, whereas
                # find_element_by_xpath was removed in Selenium 4.3 and the
                # broad except below would hide that as "no button found".
                return driver.find_element("xpath", xpath)
            except Exception:
                # Deliberately best-effort: a failed lookup only means the
                # next pattern gets its turn.
                continue
        return None

    def extractGraph(self, driver, file_write_time_out=3):
        """
        extract graph to rendering_stream

        Runs ``self.log_extraction_script`` in the page and then sleeps for
        ``file_write_time_out`` seconds.  Errors are printed, not raised.

        :param driver: object exposing ``execute_script``.
        :param file_write_time_out: seconds to sleep after running the script
            (presumably so the browser can finish writing its output --
            TODO confirm).
        """
        try:
            driver.execute_script(self.log_extraction_script)
            time.sleep(file_write_time_out)
        except Exception as e:
            # Exception rather than BaseException, so Ctrl-C / SystemExit
            # still stop the program instead of being printed and ignored.
            print('[Main Frame] Something went wrong: ' + str(e))

    def peeping(self, url):
        """
        Load ``url``, save its HTML and a screenshot, then extract the graph.

        Output goes to ``<parent of this file's directory>/result/<rdir><today>/``
        as ``<rdir>.html`` and ``<rdir>.png``, where ``rdir`` is the first value
        returned by ``Extracter.extract`` for the browser's current URL and
        ``today`` is the local date formatted ``%y-%m-%d``.

        :param url: address to open.
        :return: ``(rdir, today)`` on success; ``0`` when the driver could not
            be started; ``None`` when anything fails after the driver started
            (the error is printed).  The driver is always quit.
        """
        try:
            driver = self.driverInit(proxy_enable=False)
        except Exception as e:
            print("error:", e, url)
            return 0
        try:
            driver.get(url)
            driver.maximize_window()
            time.sleep(1)

            # Name the output after the URL the browser reports once the page
            # has loaded, not after the URL that was requested.
            rdir, _ = Extracter.extract(driver.current_url)
            today = datetime.datetime.now().strftime('%y-%m-%d')

            # abspath before dirname, otherwise a bare relative __file__
            # resolves to the current directory instead of the parent of this
            # file's directory.
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            # NOTE(review): rdir and today are joined without a separator;
            # kept as-is because callers receive (rdir, today) and may rebuild
            # this path -- confirm before changing the layout.
            save_path = project_root + "/result/" + rdir + today + "/"

            # makedirs also creates a missing "result/" parent and tolerates
            # the directory already existing.
            os.makedirs(save_path, exist_ok=True)

            # Explicit UTF-8 so pages with non-ASCII text do not depend on the
            # locale's default encoding; "with" closes the file on errors too.
            with open(save_path + rdir + ".html", "w", encoding="utf-8") as html_file:
                html_file.write(driver.page_source)
            driver.save_screenshot(save_path + rdir + ".png")

            # The login-button flow (findLoginButton, click, then extractGraph
            # on the resulting window) is disabled here; the original note
            # says enabling it wastes time.
            self.extractGraph(driver)
            return rdir, today
        except Exception as e:
            print('[Main Frame] Something went wrong: ', e)
            return None
        finally:
            driver.quit()
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Manual smoke run: capture a single page and print whatever peeping()
    # handed back.
    peeper = Peeper()
    outcome = peeper.peeping("http://www.baidu.com")
    print(outcome)
|
|||
|
|
|