This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
zhuyujia-webhopper/collector/Peeper.py

154 lines
5.3 KiB
Python
Raw Normal View History

2022-05-05 20:41:28 +08:00
import time
import os
import datetime
from selenium import webdriver
from pathlib import Path
from Tools.domain_extract import Extracter
class Peeper:
    """Drive a local Chromium build to snapshot a page and trigger a graph dump.

    For a single URL, ``peeping`` opens the page, stores its HTML source and a
    screenshot under ``<project>/result/``, and then runs the log-extraction
    script in the page (see ``extractGraph``).
    """

    # XPath probes for a login/register control, tried in this order.
    # The string literals are matched against live pages; do not alter them.
    _LOGIN_XPATHS = (
        '//*[contains(text(),"登录/注册")]',
        '//*[contains(text(),"登录")]',
        '//div[contains(@class ,"login")]',
        '//div[contains(@class ,"register")]',
    )

    def __init__(self):
        """Record the driver/browser locations; no browser is started here."""
        here = os.path.dirname(__file__)
        # tested with ChromeDriver version 2.42
        self.driver_path = here + '/chromedriver'
        self.binary_path = here + '/adg-osx/Chromium.app/Contents/MacOS/Chromium'
        # command for saving graph data
        # NOTE(review): presumably the bundled Chromium build intercepts this
        # call and flushes its graph log -- confirm against the browser build.
        self.log_extraction_script = "document.createCDATASection('NOTVERYUNIQUESTRING');"
        self.curl = None

    def driverInit(self, proxy_enable=False, page_load_timeout=60):
        """Create and return a Chrome WebDriver bound to the bundled Chromium.

        :param proxy_enable: when true, route traffic through the local HTTP
            proxy at ``127.0.0.1:1087``.
        :param page_load_timeout: page-load timeout handed to the driver
            (seconds).
        :return: the configured driver; the caller is responsible for
            calling ``quit()`` on it.
        """
        chrome_options = webdriver.ChromeOptions()
        if proxy_enable:
            chrome_options.add_argument("--proxy-server=http://127.0.0.1:1087")
        for flag in ('--disable-application-cache',
                     '--disable-infobars',
                     '--no-sandbox',
                     '--chrome-binary=' + self.binary_path):
            chrome_options.add_argument(flag)
        chrome_options.binary_location = self.binary_path

        driver = webdriver.Chrome(self.driver_path, options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout)
        return driver

    @staticmethod
    def findLoginButton(driver):
        """Fuzzy-match a login button on the current page.

        The probes in ``_LOGIN_XPATHS`` are tried in order: any element whose
        text contains "登录/注册", then "登录", then a ``div`` whose class
        contains "login", then one whose class contains "register".

        :param driver: object exposing ``find_element_by_xpath``.
        :return: the first element found, or ``None`` when every probe fails.
        """
        for xpath in Peeper._LOGIN_XPATHS:
            # noinspection PyBroadException
            try:
                return driver.find_element_by_xpath(xpath)
            except Exception:
                # Best-effort lookup: a failed probe just means "try the next".
                continue
        return None

    def extractGraph(self, driver, file_write_time_out=3):
        """Run the log-extraction script in the page, then wait for the dump.

        :param driver: object exposing ``execute_script``.
        :param file_write_time_out: seconds to sleep after the script ran so
            the graph data has time to be written.
        :return: ``None``. Ordinary errors are printed and swallowed.
        """
        try:
            driver.execute_script(self.log_extraction_script)
            time.sleep(file_write_time_out)
        except Exception as e:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit must still be able to stop the collector.
            print('[Main Frame] Something went wrong: ' + str(e))

    def peeping(self, url):
        """Visit ``url``, save its HTML and a screenshot, and dump the graph.

        Output goes to ``<project>/result/<rdir><yy-mm-dd>/<rdir>.html`` and
        ``.png``, where ``rdir`` is the first value returned by
        ``Extracter.extract`` for the URL the browser ended up on.
        NOTE(review): ``rdir`` and the date are joined without a separator;
        kept as-is because callers may rebuild the path from the return value.

        :param url: address to load.
        :return: ``(rdir, today)`` on success; ``0`` when the driver could not
            be started; ``None`` when anything failed after the driver started
            (the error is printed).
        """
        try:
            driver = self.driverInit(proxy_enable=False)
        except Exception as e:
            print("error:", e, url)
            return 0

        try:
            driver.get(url)
            driver.maximize_window()
            time.sleep(1)

            # Name the output after the URL actually reached (post-redirect).
            cur_url = driver.current_url
            rdir, _ = Extracter.extract(cur_url)
            today = datetime.datetime.now().strftime('%y-%m-%d')
            project_root = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
            save_path = project_root + "/result/" + rdir + today + "/"
            # makedirs also creates a missing "result" parent and tolerates an
            # already existing directory.
            os.makedirs(save_path, exist_ok=True)

            # Explicit UTF-8 so pages with non-ASCII text can always be
            # written, whatever the platform's default encoding is.
            with open(save_path + rdir + ".html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot(save_path + rdir + ".png")

            # Triggering the login button (findLoginButton + click, then
            # extracting from the popup window or the navigated page) is
            # intentionally disabled: enabling it costs extra time per page.
            self.extractGraph(driver)
            return rdir, today
        except Exception as e:
            print('[Main Frame] Something went wrong: ', e)
            return None
        finally:
            # Always release the browser process, on success and on failure.
            driver.quit()
if __name__ == "__main__":
    # Manual smoke run: capture one well-known page and print whatever
    # peeping() returned for it.
    peeper = Peeper()
    outcome = peeper.peeping("http://www.baidu.com")
    print(outcome)