# zhuyujia-webhopper/collector/Peeper.py

import time
import os
import datetime
from selenium import webdriver
from pathlib import Path
from Tools.domain_extract import Extracter


class Peeper:
    """Drive a bundled Chromium build to snapshot a page and dump its graph log.

    For a given URL the collector saves the page HTML and a screenshot under
    the project's ``result`` directory and then runs a marker script in the
    page (see ``log_extraction_script``).
    """

    def __init__(self):
        # Resolve paths against the absolute location of this file so they
        # stay correct when the script is launched via a relative path
        # (``os.path.dirname('Peeper.py')`` is '' which would yield
        # '/chromedriver').
        here = os.path.dirname(os.path.abspath(__file__))
        # self.driver_path = '/Users/mazeyu/Downloads/chromedriver'
        self.driver_path = here + '/chromedriver'
        # tested with ChromeDriver version 2.42
        self.binary_path = here + '/adg-osx/Chromium.app/Contents/MacOS/Chromium'
        # command for saving graph data
        # NOTE(review): presumably the bundled Chromium build reacts to this
        # marker string by flushing its log -- confirm against the browser build.
        self.log_extraction_script = "document.createCDATASection('NOTVERYUNIQUESTRING');"
        # self.driver = self.driverInit(proxy_enable=False)
        self.curl = None

    def driverInit(self, proxy_enable=False, page_load_timeout=60):
        """Create and return a Chrome WebDriver bound to the bundled Chromium.

        :param proxy_enable: when true, route traffic through the local HTTP
            proxy at ``127.0.0.1:1087``.
        :param page_load_timeout: page-load timeout passed to
            ``driver.set_page_load_timeout``.
        :return: the configured driver; the caller is responsible for
            calling ``quit()`` on it.
        """
        chrome_options = webdriver.ChromeOptions()
        if proxy_enable:
            chrome_options.add_argument("--proxy-server=http://127.0.0.1:1087")
        # chrome_options.add_argument('headless')
        chrome_options.add_argument('--disable-application-cache')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--chrome-binary=' + self.binary_path)
        chrome_options.binary_location = self.binary_path

        driver = webdriver.Chrome(self.driver_path, options=chrome_options)
        driver.set_page_load_timeout(page_load_timeout)
        return driver

    @staticmethod
    def findLoginButton(driver):
        """Fuzzy-match a login button on the current page.

        The XPath candidates are tried in order and the first element found
        is returned:

        1. any element whose text contains "登录/注册" ("log in / register")
        2. any element whose text contains "登录" ("log in")
        3. a ``div`` whose ``class`` attribute contains "login"
        4. a ``div`` whose ``class`` attribute contains "register"

        :param driver: object exposing ``find_element(by, value)``.
        :return: the first matching element, or ``None`` if nothing matched.
        """
        candidate_xpaths = (
            '//*[contains(text(),"登录/注册")]',
            '//*[contains(text(),"登录")]',
            '//div[contains(@class ,"login")]',
            '//div[contains(@class ,"register")]',
        )
        for xpath in candidate_xpaths:
            # noinspection PyBroadException
            try:
                # ``find_element("xpath", ...)`` is used instead of
                # ``find_element_by_xpath``: the latter was removed from newer
                # Selenium releases, and the broad except below would have
                # hidden that as a permanent "no button found".
                return driver.find_element("xpath", xpath)
            except Exception:
                # Best-effort lookup: a miss (or any driver error) on one
                # pattern just moves on to the next one.
                continue
        return None

    def extractGraph(self, driver, file_write_time_out=3):
        """Run the log-extraction script in the page, then wait for the write.

        Failures are reported on stdout and otherwise ignored.

        :param driver: object exposing ``execute_script(script)``.
        :param file_write_time_out: seconds to sleep after running the script.
        """
        try:
            driver.execute_script(self.log_extraction_script)
            time.sleep(file_write_time_out)
        except Exception as e:
            # Only ``Exception`` is caught so Ctrl-C / SystemExit during the
            # sleep still terminate the program.
            print('[Main Frame] Something went wrong: ' + str(e))

    def peeping(self, url):
        """Visit ``url``, save its HTML and a screenshot, and extract the graph.

        Output goes to ``<project root>/result/<rdir><today>/`` as
        ``<rdir>.html`` and ``<rdir>.png``, where ``rdir`` is the first value
        returned by ``Extracter.extract`` for the final URL and ``today`` is
        the current date formatted as ``yy-mm-dd``.

        :param url: address to load.
        :return: ``(rdir, today)`` on success; ``None`` if the driver could
            not be started or anything failed while processing the page.
        """
        try:
            driver = self.driverInit(proxy_enable=False)
        except Exception as e:
            print("error:", e, url)
            return None
        try:
            driver.get(url)
            driver.maximize_window()
            time.sleep(1)

            # Use the URL after redirects to derive the output name.
            cur_url = driver.current_url
            rdir, _ = Extracter.extract(cur_url)

            today = datetime.datetime.now().strftime('%y-%m-%d')
            project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            # NOTE(review): there is no separator between rdir and today; the
            # layout is kept as-is because callers receive (rdir, today) and
            # may rebuild this path the same way -- confirm before changing.
            save_path = project_root + "/result/" + rdir + today + "/"

            # Creates missing parents (e.g. "result") and tolerates the
            # directory already existing.
            os.makedirs(save_path, exist_ok=True)
            # Explicit UTF-8 so pages with non-ASCII text can be written
            # regardless of the platform's default encoding.
            with open(save_path + rdir + ".html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            driver.save_screenshot(save_path + rdir + ".png")

            # The login-button click flow (findLoginButton + click, then
            # extracting from the newly opened window or navigated page) is
            # currently disabled; only the landing page is processed.
            self.extractGraph(driver)
            return rdir, today
        except Exception as e:
            print('[Main Frame] Something went wrong: ', e)
            return None
        finally:
            # Always shut the browser down, including on the success return.
            driver.quit()


if __name__ == "__main__":
    # Manual smoke run: collect one page and print whatever peeping() returns.
    peeper = Peeper()
    outcome = peeper.peeping("http://www.baidu.com")
    print(outcome)