"""Capture a web page with a Selenium-driven Chromium.

Loads a URL in the Chromium binary located next to this file, stores the
page's HTML source and a screenshot under ``result/``, and runs the
graph-extraction script in the page.
"""
import datetime
import os
import time
from pathlib import Path

from selenium import webdriver

from Tools.domain_extract import Extracter


class Peeper:
    """Open pages in Chromium and save their HTML, screenshot and graph data."""

    def __init__(self):
        # chromedriver is expected next to this file
        # (tested with ChromeDriver version 2.42).
        self.driver_path = os.path.dirname(__file__) + '/chromedriver'
        # Chromium binary that the driver launches.
        self.binary_path = os.path.dirname(__file__) + '/adg-osx/Chromium.app/Contents/MacOS/Chromium'
        # command for saving graph data
        self.log_extraction_script = "document.createCDATASection('NOTVERYUNIQUESTRING');"
        # No driver is created here; peeping() creates one per call.
        self.curl = None

    def driverInit(self, proxy_enable=False, page_load_timeout=60):
        """Create and return a configured Chrome/Chromium WebDriver.

        Args:
            proxy_enable: when true, route traffic through the HTTP proxy at
                127.0.0.1:1087.
            page_load_timeout: page-load timeout handed to the driver
                (seconds).

        Returns:
            The new ``webdriver.Chrome`` instance.
        """
        chrome_options = webdriver.ChromeOptions()
        if proxy_enable:
            chrome_options.add_argument("--proxy-server=http://127.0.0.1:1087")
        # Headless mode is not enabled (the 'headless' argument is left out).
        chrome_options.add_argument('--disable-application-cache')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--chrome-binary=' + self.binary_path)
        chrome_options.binary_location = self.binary_path

        driver = webdriver.Chrome(self.driver_path, options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout)
        return driver

    @staticmethod
    def findLoginButton(driver):
        """Fuzzy-match the login button on the current page.

        The candidates are tried in order and the first match wins:
            1. any element whose text contains "登录/注册"
            2. any element whose text contains "登录"
            3. a ``div`` whose class attribute contains "login"
            4. a ``div`` whose class attribute contains "register"

        Args:
            driver: the WebDriver to search with.

        Returns:
            The first matching element, or ``None`` when no candidate matches.
        """
        candidate_xpaths = (
            '//*[contains(text(),"登录/注册")]',
            '//*[contains(text(),"登录")]',
            '//div[contains(@class ,"login")]',
            '//div[contains(@class ,"register")]',
        )
        for xpath in candidate_xpaths:
            # Best-effort lookup: any failure just means "try the next one".
            # noinspection PyBroadException
            try:
                return driver.find_element_by_xpath(xpath)
            except Exception:
                continue
        return None

    def extractGraph(self, driver, file_write_time_out=3):
        """Run the graph-extraction script in the page, then wait.

        Errors are printed rather than raised.

        Args:
            driver: the WebDriver whose current page is processed.
            file_write_time_out: seconds to sleep after running the script
                (presumably to let the browser finish writing its output --
                confirm against the Chromium build in use).
        """
        try:
            driver.execute_script(self.log_extraction_script)
            time.sleep(file_write_time_out)
        except Exception as e:
            # Exception rather than BaseException, so Ctrl-C / SystemExit
            # still propagate to the caller.
            print('[Main Frame] Something went wrong: ' + str(e))

    def peeping(self, url):
        """Load ``url``, save its HTML and screenshot, then extract the graph.

        Args:
            url: address of the page to open.

        Returns:
            ``(rdir, today)`` on success, where ``rdir`` is the first value
            returned by ``Extracter.extract`` for the loaded URL and
            ``today`` is the current date formatted as ``yy-mm-dd``.
            ``None`` when the driver cannot be started or anything fails
            while processing the page (the error is printed).
        """
        try:
            driver = self.driverInit(proxy_enable=False)
        except Exception as e:
            print("error:", e, url)
            return

        try:
            driver.get(url)
            driver.maximize_window()
            time.sleep(1)

            cur_url = driver.current_url
            rdir, _ = Extracter.extract(cur_url)
            today = datetime.datetime.now().strftime('%y-%m-%d')
            # NOTE(review): rdir and today are joined without a separator;
            # confirm this matches the intended directory layout.
            save_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + "/result/" + rdir + \
                today + "/"
            # Create missing parents too, and tolerate an existing directory.
            Path(save_path).mkdir(parents=True, exist_ok=True)

            # Explicit UTF-8 so non-ASCII page content is written regardless
            # of the platform's default encoding.
            with open(save_path + rdir + ".html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot(save_path + rdir + ".png")

            # NOTE: the login-trigger flow (find the login button via
            # findLoginButton, click it, then run extractGraph on the new
            # window or after navigating back) is disabled; the original
            # author noted that enabling it wastes time. Only the main
            # window is processed.
            self.extractGraph(driver)
            return rdir, today
        except Exception as e:
            print('[Main Frame] Something went wrong: ', e)
        finally:
            # Always release the browser, including on the success return.
            driver.quit()


if __name__ == "__main__":
    a = Peeper()
    rdir = a.peeping("http://www.baidu.com")
    print(rdir)