This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
zhuyujia-webhopper/collector/Peeper.py

155 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import time
import os
import datetime
from selenium import webdriver
from pathlib import Path
from Tools.domain_extract import Extracter
class Peeper:
    """Load a page in a bundled Chromium build and save artifacts for it.

    For each visited URL the collector stores the page HTML and a
    screenshot under ``<project root>/result/`` and then runs the
    log-extraction script inside the page.
    """

    def __init__(self):
        # Resolve against the absolute module location: a bare
        # ``os.path.dirname(__file__)`` is '' when the module is referenced by
        # a relative file name, which would turn the paths below into
        # root-level paths such as '/chromedriver'.
        base_dir = os.path.dirname(os.path.abspath(__file__))
        # self.driver_path = '/Users/mazeyu/Downloads/chromedriver'
        self.driver_path = os.path.join(base_dir, 'chromedriver')
        # tested with ChromeDriver version 2.42
        self.binary_path = os.path.join(
            base_dir, 'adg-osx', 'Chromium.app', 'Contents', 'MacOS', 'Chromium'
        )
        # command for saving graph data
        # NOTE(review): presumably the bundled Chromium build reacts to this
        # marker string by dumping its graph data -- confirm against the
        # browser build.
        self.log_extraction_script = "document.createCDATASection('NOTVERYUNIQUESTRING');"
        # Not used inside this class; kept for external users of the attribute.
        self.curl = None

    def driverInit(self, proxy_enable=False, page_load_timeout=60,
                   proxy_server="http://127.0.0.1:1087"):
        """Create and configure the Chromium WebDriver.

        Args:
            proxy_enable: When true, route browser traffic through
                ``proxy_server``.
            page_load_timeout: Page-load timeout passed to the driver.
            proxy_server: Proxy URL used when ``proxy_enable`` is true.

        Returns:
            The configured ``webdriver.Chrome`` instance.

        Raises:
            Exception: Whatever Selenium raises when the driver cannot be
                started or configured.
        """
        chrome_options = webdriver.ChromeOptions()
        if proxy_enable:
            chrome_options.add_argument("--proxy-server=" + proxy_server)
        # chrome_options.add_argument('headless')
        chrome_options.add_argument('--disable-application-cache')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--chrome-binary=' + self.binary_path)
        chrome_options.binary_location = self.binary_path
        driver = webdriver.Chrome(self.driver_path, options=chrome_options)
        try:
            driver.set_page_load_timeout(page_load_timeout)
        except Exception:
            # Do not leave an orphaned browser process behind if the freshly
            # started driver cannot be configured.
            driver.quit()
            raise
        return driver

    @staticmethod
    def findLoginButton(driver):
        """Fuzzy-match a login button on the current page.

        The XPath candidates are tried in order and the first hit wins:
          1. any element whose text contains "登录/注册" ("log in / register")
          2. any element whose text contains "登录" ("log in")
          3. a ``div`` whose class attribute contains "login"
          4. a ``div`` whose class attribute contains "register"

        Args:
            driver: Object exposing ``find_element_by_xpath``.

        Returns:
            The first matching element, or ``None`` when nothing matches.
        """
        candidate_xpaths = (
            '//*[contains(text(),"登录/注册")]',
            '//*[contains(text(),"登录")]',
            '//div[contains(@class ,"login")]',
            '//div[contains(@class ,"register")]',
        )
        for xpath in candidate_xpaths:
            # noinspection PyBroadException
            try:
                return driver.find_element_by_xpath(xpath)
            except Exception:
                # Best effort: a failed lookup simply means "try the next
                # candidate".
                continue
        return None

    def extractGraph(self, driver, file_write_time_out=3):
        """Run the log-extraction script in the page, then wait.

        Args:
            driver: Object exposing ``execute_script``.
            file_write_time_out: Seconds to sleep after the script ran, giving
                the browser time to finish writing its output.

        Errors are reported on stdout and not re-raised. Only ``Exception`` is
        caught so that ``KeyboardInterrupt`` and ``SystemExit`` still
        propagate.
        """
        try:
            driver.execute_script(self.log_extraction_script)
            time.sleep(file_write_time_out)
        except Exception as e:
            print('[Main Frame] Something went wrong: ' + str(e))

    def peeping(self, url):
        """Visit ``url`` and store its HTML, a screenshot and the graph log.

        Args:
            url: Address to load.

        Returns:
            ``(rdir, today)`` on success, where ``rdir`` is the first value
            returned by ``Extracter.extract`` for the final URL and ``today``
            is the current date formatted as ``%y-%m-%d``. ``None`` when the
            driver could not be started or any later step failed; the error
            is printed.
        """
        try:
            driver = self.driverInit(proxy_enable=False)
        except Exception as e:
            print("error:", e, url)
            return None
        try:
            driver.get(url)
            driver.maximize_window()
            time.sleep(1)
            # Use the URL after redirects, not the one that was requested.
            cur_url = driver.current_url
            rdir, _ = Extracter.extract(cur_url)
            today = datetime.datetime.now().strftime('%y-%m-%d')
            # Two levels up from this module, resolved from its absolute path.
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            # NOTE(review): rdir and today are joined without a separator.
            # Kept unchanged because callers receive (rdir, today) and may
            # rebuild this exact path -- confirm before altering the layout.
            save_path = project_root + "/result/" + rdir + today + "/"
            # Creates missing parents too and tolerates an existing directory.
            os.makedirs(save_path, exist_ok=True)
            # Explicit UTF-8 so pages with non-ASCII text can be written
            # regardless of the platform's default encoding.
            with open(save_path + rdir + ".html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot(save_path + rdir + ".png")
            # Login triggering (findLoginButton + click, then extracting the
            # graph for the new window or after navigating back) is
            # intentionally disabled here: enabling it costs extra time.
            # The graph is extracted for the main window only.
            self.extractGraph(driver)
            return rdir, today
        except Exception as e:
            print('[Main Frame] Something went wrong: ', e)
            return None
        finally:
            # A failing quit must not discard an already computed result.
            try:
                driver.quit()
            except Exception as e:
                print('[Main Frame] Something went wrong: ', e)
if __name__ == "__main__":
    # Manual smoke run: collect one well-known page and print what
    # peeping() returned for it.
    peeper = Peeper()
    outcome = peeper.peeping("http://www.baidu.com")
    print(outcome)