import collections
import threading
import time

import numpy as np
import pandas as pd
import tqdm

from Tools.domain_extract import Extracter
from DBopt.service2db import Service2DB
from analyzer.file_loader import Loader

# Ordered names of the tracked rendering / network events.  The position of a
# name in this tuple is its slot in every per-script event vector and in the
# denominator vector built by ``GetService.calEventsNum``.
EVENT_TYPES = (
    "NodeCreation",
    "NodeInsertion",
    "NodeRemoval",
    "NodeAttachLater",
    "AttrAddition",
    "AttrModification",
    "AttrRemoval",
    "AttrStyleTextAddition",
    "AttrStyleRemoval",
    "NetworkScriptRequest",
    "NetworkImageRequest",
    "NetworkIframeRequest",
    "NetworkXMLHTTPRequest",
    "NetworkLinkRequest",
    "NetworkVideoRequest",
)


class NumGen:
    """Endless iterator yielding the resource keys "r1", "r2", "r3", ..."""

    def __init__(self):
        # Initialised here as well so next() works even before iter() is called.
        self.i = 0

    def __iter__(self):
        # Starting an iteration restarts the numbering.
        self.i = 0
        return self

    def __next__(self):
        self.i += 1
        return "r" + str(self.i)


class GetService:
    """Build a per-page table of loaded resources and score each of them."""

    def __init__(self):
        self.chain = None
        self.cursor = Service2DB()
        self.loader = Loader()
        # event name -> slot index in the event / denominator vectors
        self.bitmap = {name: slot for slot, name in enumerate(EVENT_TYPES)}

    def toSQL(self, service=None, checker=None):
        """Flatten every node of *service* into a row and write the rows out.

        :param service: mapping of node key -> node dict (see ``addNewNode``)
        :param checker: object providing ``host`` and ``isThirdParty(url)``;
            ``isThirdParty`` is unpacked as a 2-tuple whose first item is used
            as the service host
        :return: None; the frame is handed to ``self.cursor.writeDB``
        """
        dat = pd.DataFrame(self.cursor.dat)
        for value in service.values():
            # Positional layout follows the key order used by addNewNode:
            # [url, type, isThirdParty, parent, isEvalNode, score, events]
            li = list(value.values())
            # Replace the parent key by the parent's url when the parent is known.
            if li[3] != "0" and li[3] in service:
                li[3] = service[li[3]]["resource_url"]
            events = li.pop(-1)
            score = li.pop(-1)
            # One column per event counter, then the score.
            li.extend(list(events.values()))
            li.append(score)
            # category
            li.append("unknown")
            # website
            li.append(checker.host)
            # service host goes first
            host, _ = checker.isThirdParty(li[0])
            li.insert(0, host)
            # append as a new row
            dat.loc[dat.shape[0]] = li
        self.cursor.writeDB(dat)

    @staticmethod
    def addNewNode(url=None, actor=None, rtype=None, isthird=None, isEval=0):
        """Create a fresh resource node.

        :param url: the resource url
        :param actor: key of the resource's parent ("0" is used by ``run``
            for "no parent")
        :param rtype: numeric resource type; ``run`` passes 1 for scripts and
            otherwise the code returned by ``event2Num`` (1 script, 2 image,
            3 iframe, 4 XMLHTTP, 5 link, 6 video)
        :param isthird: third-party flag as returned by the checker
        :param isEval: 1 when the node was created from a ScriptEval event
        :return: the node dict, with score 0.0 and every event counter at 0
        """
        node = {
            "resource_url": url,
            "resource_type": rtype,
            "isThirdParty": isthird,
            "parent": actor,
            "isEvalNode": isEval,
            "score": 0.0,
            "events": dict.fromkeys(EVENT_TYPES, 0),
        }
        return node

    def calEventsNum(self, graph):
        """Count how often each tracked event type occurs in *graph*.

        :param graph: iterable of timeline nodes (dicts with "event_type")
        :return: integer vector with one slot per tracked event type (15).
            Every slot starts at 1, so an event type that never occurs still
            gives a non-zero divisor in ``calScore``.
        """
        denominator = np.ones(len(EVENT_TYPES), dtype=int)
        for node in graph:
            slot = self.bitmap.get(node.get("event_type"), -1)
            if slot == -1:
                # not one of the tracked event types
                continue
            denominator[slot] += 1
        return denominator

    @staticmethod
    def calScore(service, denominator, showProcess):
        """Compute the score of every node in *service* in place.

        for javascript: divide the script's event vector element-wise by
            *denominator* and sum the quotients,
            e.g. [1 2 3 4]/[2 3 4 5] gives 1/2 + 2/3 + 3/4 + 4/5
        for other resources (keys starting with "r"): 1 / the denominator
            slot of the matching network request type
        for the chain: a script's final score is its own score plus the own
            scores of all scripts below it,
            e.g. a.com -> b.com -> c.com:
                a's score = a's score + b's score + c's score
                b's score = b's score + c's score
                c's score = c's score

        :param service: mapping of node key -> node dict (see ``addNewNode``)
        :param denominator: vector returned by ``calEventsNum``
        :param showProcess: when true, print a banner and show a progress bar
        :return: None
        """
        if showProcess:
            print("计算js评分:")
            print("计算资源渲染行为比重:NodeCreation, NodeInsertion, "
                  "NodeRemoval, NodeAttachLater, AttrAddition, AttrModification, AttrRemoval, "
                  "AttrStyleTextAddition, NetworkScriptRequest, NetworkImageRequest, "
                  "NetworkIframeRequest, NetworkXMLHTTPRequest, NetworkLinkRequest, NetworkVideoRequest")
            entries = tqdm.tqdm(service.items())
        else:
            entries = service.items()

        # Pass 1: every node gets its own score.  Doing this before any
        # propagation means a parent visited after its child can no longer
        # overwrite what the child already contributed.
        own_scores = {}
        for key, prop in entries:
            if showProcess:
                # slows the loop down so the progress bar is visible
                time.sleep(0.05)
            if isinstance(key, str) and key.startswith("r"):
                # Resource types 1..6 correspond to denominator slots 9..14
                # (the Network*Request entries), hence the offset of 8.
                slot = prop["resource_type"] + 8
                prop["score"] = float(1 / denominator[slot])
                continue
            event_vec = np.array(list(prop["events"].values()))
            own = float(np.sum(event_vec / denominator))
            prop["score"] = own
            own_scores[key] = own

        # Pass 2: add each script's own score to every ancestor up the chain.
        # NOTE(review): as before, "r" resources are not propagated to their
        # parent - confirm whether that exclusion is intended.
        for key, own in own_scores.items():
            visited = {key}  # guards against cyclic parent links
            ancestor = service[key]["parent"]
            while ancestor != "0" and ancestor in service and ancestor not in visited:
                service[ancestor]["score"] += own
                visited.add(ancestor)
                ancestor = service[ancestor]["parent"]

    @staticmethod
    def event2Num(event_type):
        """Map a network request event name to its numeric resource type.

        :param event_type: event name taken from a timeline node
        :return: 1..6 for the six network request events, 0 for anything else
        """
        e2n = {
            "NetworkScriptRequest": 1,
            "NetworkImageRequest": 2,
            "NetworkIframeRequest": 3,
            "NetworkXMLHTTPRequest": 4,
            "NetworkLinkRequest": 5,
            "NetworkVideoRequest": 6,
        }
        return e2n.get(event_type, 0)

    def run(self, filename, showProcess=False):
        """Analyse one recorded graph file and return its scored service table.

        :param filename: path handed to ``self.loader.readGraph``; the loaded
            data must provide "url" and "timeline"
        :param showProcess: forwarded to ``calScore`` (progress output)
        :return: dict of node key -> node dict; scripts are keyed by their
            script id, other resources by generated keys "r1", "r2", ...
        """
        print(threading.current_thread().name + ':' + filename)
        service = dict()
        # to find a script's parent, we need to map script_url to script_id
        script2id = collections.defaultdict(str)
        data = self.loader.readGraph(filename)
        # resource serial number generator
        niter = iter(NumGen())
        # third-party checker
        checker = Extracter(data["url"])

        # first, find all script nodes and add them into service
        for node in data["timeline"]:
            script_id = node.get("script_id", None)
            script_url = node.get("script_url", None)
            event_type = node.get("event_type")
            # the script eval node is special, it is some script's son,
            # and it actually did something in the page
            if event_type == "ScriptEval":
                parent = node["script_parent_id"]
                service[script_id] = self.addNewNode(rtype=1, isEval=1, actor=parent)
                script2id[script_url] = script_id
                continue
            if not script_url:
                continue
            # the normal script node, compile and execute
            if script_id not in service:
                _, isthird = checker.isThirdParty(script_url)
                service[script_id] = self.addNewNode(
                    url=script_url, rtype=1, isthird=isthird, actor="0")
                script2id[script_url] = script_id

        # second, find all network request nodes and build the dependence chain
        # third, add events to script nodes
        for node in data["timeline"]:
            event_type = node["event_type"]
            if event_type == "ScriptEval":
                continue
            rtype = self.event2Num(event_type)
            actor = node.get("actor_id", "0")
            # network request node
            if rtype != 0:
                url = node["request_url"]
                if not url:
                    continue
                # link requests are only kept when they fetch a stylesheet
                if rtype == 5 and not url.endswith(".css"):
                    continue
                # a script request only tells us who loaded an already known script
                if rtype == 1:
                    if url not in script2id:
                        continue
                    service[script2id[url]]["parent"] = actor
                    continue
                _, isthird = checker.isThirdParty(url)
                service[next(niter)] = self.addNewNode(url, actor, rtype, isthird)
            if actor != "0" and actor in service:
                counters = service[actor]["events"]
                # Timeline events outside the tracked set have no counter;
                # skip them instead of failing with a KeyError.
                if event_type in counters:
                    counters[event_type] += 1

        denominator = self.calEventsNum(data["timeline"])
        self.calScore(service, denominator, showProcess)

        # Persisting the result is optional and currently disabled:
        #   save to mysql:     self.toSQL(service, checker)
        #   save to json file: self.loader.dumpFile(checker.host + ".json", service)
        return service


if __name__ == "__main__":
    path = "/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json"
    a = GetService()
    res = a.run(path)