# zhuyujia-webhopper/analyzer/get_chain.py

import collections
import pandas as pd
import numpy as np
import tqdm
import time
import threading
from Tools.domain_extract import Extracter
from DBopt.service2db import Service2DB
from analyzer.file_loader import Loader


class NumGen:
    """Endless source of resource keys: ``"r1"``, ``"r2"``, ``"r3"``, ...

    The instance is its own iterator.  Calling ``iter()`` on it restarts the
    numbering at ``"r1"``; ``next()`` hands out the following key.
    """

    def __init__(self):
        # Initialise the counter up front so that next() also works on an
        # instance that was never passed through iter().
        self.i = 0

    def __iter__(self):
        # (Re)starting iteration resets the numbering.
        self.i = 0
        return self

    def __next__(self):
        self.i += 1
        return "r" + str(self.i)


class GetService:

    def __init__(self):
        """Create the database helper, the file loader and the event index.

        ``bitmap`` maps each tracked event type to its slot in the
        15-element vector built by ``calEventsNum``.
        """
        # Slot order matters: the position of a name is its vector index.
        tracked_events = (
            "NodeCreation",
            "NodeInsertion",
            "NodeRemoval",
            "NodeAttachLater",
            "AttrAddition",
            "AttrModification",
            "AttrRemoval",
            "AttrStyleTextAddition",
            "AttrStyleRemoval",
            "NetworkScriptRequest",
            "NetworkImageRequest",
            "NetworkIframeRequest",
            "NetworkXMLHTTPRequest",
            "NetworkLinkRequest",
            "NetworkVideoRequest",
        )
        # Not read anywhere else in the visible code.
        self.chain = None
        # Supplies the table layout (``dat``) and ``writeDB`` used by toSQL.
        self.cursor = Service2DB()
        # Supplies ``readGraph`` used by run.
        self.loader = Loader()
        self.bitmap = {
            name: slot for slot, name in enumerate(tracked_events)
        }

    def toSQL(self, service=None, checker=None):
        """Flatten every node of ``service`` into a table row and store it.

        :param service: mapping of node key -> node dict as built by
                        ``addNewNode``; the node's values are read by
                        position, so the key order of the node matters.
                        Despite the ``None`` default it must be provided.
        :param checker: object exposing ``host`` and ``isThirdParty(url)``;
                        must be provided as well.

        Row layout: service host, the leading node fields (with the parent
        key replaced by the parent's resource url when the parent is a known
        node), the event counters, score, the category placeholder
        ``"unknown"`` and the website host.  The filled frame is passed to
        ``self.cursor.writeDB``.
        """
        frame = pd.DataFrame(self.cursor.dat)

        for record in service.values():
            columns = list(record.values())

            # The fourth field is the parent key; store the parent's url
            # instead whenever the parent is itself a node of this service.
            parent_key = columns[3]
            if parent_key != "0" and parent_key in service:
                columns[3] = service[parent_key]["resource_url"]

            # The last two fields are the score and the event counters.
            *leading, score, events = columns

            website = checker.host
            service_host, _ = checker.isThirdParty(leading[0])

            row = [
                service_host,
                *leading,
                *events.values(),
                score,
                "unknown",
                website,
            ]
            # Append as a new last row.
            frame.loc[frame.shape[0]] = row

        self.cursor.writeDB(frame)

    @staticmethod
    def addNewNode(url=None, actor=None, rtype=None, isthird=None, isEval=0):
        """
        :param url:     the resource url
        :param actor:   the resource's parent, usually they come from a javascript
        :param rtype:   the type of the resource: img(1), css(2), js(3), iframe(4), video(5), xml(6),
        :param isthird: whether the resource is a third-party resource
        :param isEval:  whether the resource is ScriptEval
        :return: node:  presented by a num, if it's rtype is not 3, the events of node is none
        """
        node = {
            "resource_url": url,
            "resource_type": rtype,
            "isThirdParty": isthird,
            "parent": actor,
            "isEvalNode": isEval,
            "score": 0.0,
            "events": {
                "NodeCreation": 0,
                "NodeInsertion": 0,
                "NodeRemoval": 0,
                "NodeAttachLater": 0,
                "AttrAddition": 0,
                "AttrModification": 0,
                "AttrRemoval": 0,
                "AttrStyleTextAddition": 0,
                "AttrStyleRemoval": 0,
                "NetworkScriptRequest": 0,
                "NetworkImageRequest": 0,
                "NetworkIframeRequest": 0,
                "NetworkXMLHTTPRequest": 0,
                "NetworkLinkRequest": 0,
                "NetworkVideoRequest": 0
            }
        }
        return node

    def calEventsNum(self, graph):
        """
        calculate all events number in the graph
        ipt: the graph , data["timeline"]
        opt: the denominator, a 14-dimension vector
        """
        denominator = np.array([1] * 15)
        for node in graph:
            event_type = node.get("event_type")
            i = self.bitmap.get(event_type, -1)
            if i == -1:
                continue
            denominator[i] += 1
        return denominator

    @staticmethod
    def calScore(service, denominator, showProcess):
        """
        calculate the score for each resource:
        for javascript: we use events' 14-dimention vector divide denominator and add them together
        e.g.  [1 2 3 4]/[2 3 4 5], the score is 1/2 + 2/3 + 3/4 + 4/5

        for other resource: just calculate 1/the num of resource
        e.g. for a image node, the total number of image is x, then the score is 1/x

        for the chain: The parent's score is equal to its own score plus all of its children's scores
        e.g. a.com -> b.com -> c.com:   a's score = a's score + b's score + c'score
                                        b's score = b's score + c'score
                                        c's score = c's score
        """
        if showProcess:
            print("计算js评分:")
            print("计算资源渲染行为比重：NodeCreation, NodeInsertion, "
                  "NodeRemoval, NodeAttachLater, AttrAddition, AttrModification, AttrRemoval, "
                  "AttrStyleTextAddition, NetworkScriptRequest, NetworkImageRequest, "
                  "NetworkIframeRequest, NetworkXMLHTTPRequest, NetworkLinkRequest, NetworkVideoRequest")
        if showProcess:
            for key, prop in tqdm.tqdm(service.items()):
                time.sleep(0.05)
                # other resource
                if key.startswith("r"):
                    offset = prop["resource_type"]
                    prop["score"] = 1 / denominator[offset - 7]
                    continue

                # script resource
                event_vec = np.array(list(prop["events"].values()))
                # print(event_vec)
                prop["score"] = sum(np.divide(event_vec, denominator))

                # the chain solution
                parent = prop["parent"]
                if parent not in service:
                    continue
                if parent != "0":
                    service[parent]["score"] += prop["score"]
        else:
            for key, prop in service.items():
                # other resource
                if key.startswith("r"):
                    offset = prop["resource_type"]
                    prop["score"] = 1 / denominator[offset - 7]
                    continue

                # script resource
                event_vec = np.array(list(prop["events"].values()))
                # print(event_vec)
                prop["score"] = sum(np.divide(event_vec, denominator))

                # the chain solution
                parent = prop["parent"]
                if parent not in service:
                    continue
                if parent != "0":
                    service[parent]["score"] += prop["score"]

    @staticmethod
    def event2Num(event_type):
        """
        network request node type transform to num
        :param event_type:
        :return: num
        """
        e2n = {
            "NetworkScriptRequest": 1,
            "NetworkImageRequest": 2,
            "NetworkIframeRequest": 3,
            "NetworkXMLHTTPRequest": 4,
            "NetworkLinkRequest": 5,
            "NetworkVideoRequest": 6
        }
        return e2n.get(event_type, 0)

    def run(self, filename, showProcess=False):
        """Build the scored service dependency chain for one rendering log.

        :param filename:    path handed to ``self.loader.readGraph``; the
                            loaded graph must provide ``"url"`` and
                            ``"timeline"``
        :param showProcess: forwarded to ``calScore`` (progress output);
                            defaults to False so ``run(path)`` works
        :return: dict mapping node key -> node dict (see ``addNewNode``).
                 Script nodes are keyed by their script id, all other
                 requested resources by generated keys "r1", "r2", ...
        """
        print(threading.current_thread().name + ':' + filename)
        service = dict()
        # to find a script's parent, we need to map script_url to script_id
        script2id = {}
        data = self.loader.readGraph(filename)

        # resource serial number generator
        niter = iter(NumGen())

        # third-party checker for the visited page
        checker = Extracter(data["url"])
        timeline = data["timeline"]

        # first, find all script nodes and add them into service
        for node in timeline:
            script_id = node.get("script_id")
            script_url = node.get("script_url")

            # a ScriptEval node is the child of another script and did
            # something in the page on its own, so it always gets a node
            if node.get("event_type") == "ScriptEval":
                service[script_id] = self.addNewNode(
                    rtype=1, isEval=1, actor=node["script_parent_id"])
                script2id[script_url] = script_id
                continue

            # a normal script node (compile / execute): register it once
            if not script_url or script_id in service:
                continue
            _, isthird = checker.isThirdParty(script_url)
            service[script_id] = self.addNewNode(
                url=script_url, rtype=1, isthird=isthird, actor="0")
            script2id[script_url] = script_id

        # second, find all network request nodes and build the dependence chain
        # third, add events to the script node that triggered them
        for node in timeline:
            # .get keeps this pass consistent with the first pass and with
            # calEventsNum: a node without event_type is simply not counted
            event_type = node.get("event_type")
            if event_type == "ScriptEval":
                continue

            rtype = self.event2Num(event_type)
            actor = node.get("actor_id", "0")

            # network request node
            if rtype != 0:
                url = node.get("request_url")
                if not url:
                    continue
                # link requests are only kept when they fetch a stylesheet
                if rtype == 5 and not url.endswith(".css"):
                    continue
                # a script request only links an already known script to the
                # actor that requested it; it creates no node and is not
                # counted as an event of the actor
                if rtype == 1:
                    if url in script2id:
                        service[script2id[url]]["parent"] = actor
                    continue

                _, isthird = checker.isThirdParty(url)
                service[next(niter)] = self.addNewNode(url, actor, rtype, isthird)

            if actor != "0" and actor in service:
                events = service[actor]["events"]
                # event types without a counter are skipped, as they are in
                # calEventsNum, instead of raising KeyError
                if event_type in events:
                    events[event_type] += 1

        denominator = self.calEventsNum(timeline)

        self.calScore(service, denominator, showProcess)

        # save to mysql (currently disabled)
        # f = open("./failed.txt", "a")
        # try:
        #     self.toSQL(service, checker)
        # except Exception as e:
        #     print(e)
        #     f.write(filename + "\n")
        # f.close()

        # save to json file (currently disabled)
        # self.loader.dumpFile(checker.host + ".json", service)
        return service


if __name__ == "__main__":
    # Manual run against one locally stored rendering log.
    path = "/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json"
    a = GetService()
    # The progress flag is passed explicitly; calling run(path) alone raised
    # TypeError because the flag had no default.
    res = a.run(path, showProcess=False)