This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
zhuyujia-webhopper/analyzer/get_chain.py

303 lines
10 KiB
Python
Raw Normal View History

2022-05-05 20:41:28 +08:00
import collections
import pandas as pd
import numpy as np
import tqdm
import time
2022-05-05 20:41:28 +08:00
import threading
from Tools.domain_extract import Extracter
from DBopt.service2db import Service2DB
from analyzer.file_loader import Loader
class NumGen:
    """Endless iterator producing the resource keys ``"r1"``, ``"r2"``, ``"r3"``, ...

    The keys are used to name non-script resources, which have no script id
    of their own.
    """

    def __init__(self):
        # Initialise the counter here as well, so that next() works on a
        # freshly constructed instance even if iter() was never called on it.
        self.i = 0

    def __iter__(self):
        # Calling iter() (re)starts the numbering from "r1".
        self.i = 0
        return self

    def __next__(self):
        self.i += 1
        return "r" + str(self.i)
class GetService:
    """Build a resource dependency chain from a rendering timeline and score it.

    ``run`` loads one timeline log, creates a node for every script and for
    every image / iframe / XHR / ``.css`` link / video request found in it,
    links each node to the script that caused it, counts the DOM and network
    events each script triggered, and assigns a score to every node.
    """

    # Event types counted per script. The position of a name in this tuple is
    # its index in the score vectors. The six network request types must stay
    # last: calScore addresses them relative to the end of the vector.
    _EVENT_TYPES = (
        "NodeCreation",
        "NodeInsertion",
        "NodeRemoval",
        "NodeAttachLater",
        "AttrAddition",
        "AttrModification",
        "AttrRemoval",
        "AttrStyleTextAddition",
        "AttrStyleRemoval",
        "NetworkScriptRequest",
        "NetworkImageRequest",
        "NetworkIframeRequest",
        "NetworkXMLHTTPRequest",
        "NetworkLinkRequest",
        "NetworkVideoRequest",
    )

    # Network request event type -> resource type code. Code 0 is reserved by
    # event2Num for "not a network request".
    _REQUEST_TYPE_CODES = {
        "NetworkScriptRequest": 1,
        "NetworkImageRequest": 2,
        "NetworkIframeRequest": 3,
        "NetworkXMLHTTPRequest": 4,
        "NetworkLinkRequest": 5,
        "NetworkVideoRequest": 6,
    }

    def __init__(self):
        self.chain = None
        self.cursor = Service2DB()
        self.loader = Loader()
        # event type name -> index in the 15-element event/denominator vectors
        self.bitmap = {name: index for index, name in enumerate(self._EVENT_TYPES)}

    def toSQL(self, service=None, checker=None):
        """
        Flatten every node of ``service`` into one table row and write the table.

        Row layout: service host, resource url, resource type, isThirdParty,
        parent, isEvalNode, the 15 event counters, score, category (always
        "unknown"), website host.

        :param service: dict of node key -> node, as returned by ``run`` (required
            despite the ``None`` default)
        :param checker: object providing ``host`` and ``isThirdParty(url)``; the
            first element of the pair returned by ``isThirdParty`` is stored as
            the service host (required despite the ``None`` default)
        """
        dat = pd.DataFrame(self.cursor.dat)
        for node in service.values():
            # Store the parent's url instead of its id whenever the parent is a known node.
            parent = node["parent"]
            if parent != "0" and parent in service:
                parent = service[parent]["resource_url"]
            # NOTE(review): resource_url is None for ScriptEval nodes - confirm
            # that isThirdParty accepts None.
            host, _ = checker.isThirdParty(node["resource_url"])
            row = [
                host,
                node["resource_url"],
                node["resource_type"],
                node["isThirdParty"],
                parent,
                node["isEvalNode"],
                *node["events"].values(),
                node["score"],
                "unknown",
                checker.host,
            ]
            # Append as a new row labelled with the current row count.
            dat.loc[dat.shape[0]] = row
        self.cursor.writeDB(dat)

    @staticmethod
    def addNewNode(url=None, actor=None, rtype=None, isthird=None, isEval=0):
        """
        Create a fresh node for the service chain.

        :param url: the resource url
        :param actor: key of the node's parent ("0" is used by ``run`` for "no parent")
        :param rtype: resource type code as produced by ``event2Num``:
            script(1), image(2), iframe(3), xhr(4), link(5), video(6)
        :param isthird: whether the resource is a third-party resource
        :param isEval: 1 if the node was created for a ScriptEval event, else 0
        :return: the node dict; its score is 0.0 and all event counters are 0
        """
        # The key order matters: event counters are read positionally when scoring.
        return {
            "resource_url": url,
            "resource_type": rtype,
            "isThirdParty": isthird,
            "parent": actor,
            "isEvalNode": isEval,
            "score": 0.0,
            "events": dict.fromkeys(GetService._EVENT_TYPES, 0),
        }

    def calEventsNum(self, graph):
        """
        Count how often each tracked event type occurs in the timeline.

        :param graph: the timeline, i.e. ``data["timeline"]``
        :return: the denominator, a vector with one entry per tracked event type
            (15 entries); every entry starts at 1, so it is never zero when used
            as a divisor
        """
        denominator = np.ones(len(self.bitmap), dtype=int)
        for node in graph:
            index = self.bitmap.get(node.get("event_type"))
            if index is None:
                # event type that is not tracked (or missing)
                continue
            denominator[index] += 1
        return denominator

    @staticmethod
    def calScore(service, denominator, showProcess):
        """
        Calculate the score of every node in ``service`` (in place).

        for scripts: the node's event vector is divided element-wise by the
            denominator and the quotients are added together
            e.g. [1 2 3 4]/[2 3 4 5], the score is 1/2 + 2/3 + 3/4 + 4/5
        for other resources (keys starting with "r"): 1 / denominator entry of
            the request type that matches the resource type
        for the chain: a script's score is its own score plus the scores of all
            scripts below it
            e.g. a.com -> b.com -> c.com: a's score = a's score + b's score + c's score
                                          b's score = b's score + c's score
                                          c's score = c's score

        Own scores are computed for all nodes first and propagated afterwards,
        so the result does not depend on the iteration order of ``service``.

        :param service: dict of node key -> node, modified in place
        :param denominator: vector returned by ``calEventsNum``
        :param showProcess: when true, print a header and show a progress bar
        """
        if showProcess:
            print("计算js评分:")
            print("计算资源渲染行为比重NodeCreation, NodeInsertion, "
                  "NodeRemoval, NodeAttachLater, AttrAddition, AttrModification, AttrRemoval, "
                  "AttrStyleTextAddition, NetworkScriptRequest, NetworkImageRequest, "
                  "NetworkIframeRequest, NetworkXMLHTTPRequest, NetworkLinkRequest, NetworkVideoRequest")
            items = tqdm.tqdm(service.items())
        else:
            items = service.items()

        # Pass 1: every node gets its own score.
        own_scores = {}
        for key, prop in items:
            if showProcess:
                # deliberate delay so the progress bar stays readable
                time.sleep(0.05)
            if key.startswith("r"):
                # Resource type codes 2..6 minus 7 give -5..-1, i.e. the last
                # five entries of the denominator: the Image, Iframe, XMLHTTP,
                # Link and Video request counters.
                prop["score"] = float(1 / denominator[prop["resource_type"] - 7])
                continue
            event_vec = np.array(list(prop["events"].values()))
            prop["score"] = float(np.sum(np.divide(event_vec, denominator)))
            own_scores[key] = prop["score"]

        # Pass 2: add each script's own score to every ancestor up the chain.
        # Resource ("r...") nodes are not propagated, as before.
        # NOTE(review): confirm that resource scores should stay out of the chain.
        for key, score in own_scores.items():
            visited = {key}  # guards against cyclic parent links
            parent = service[key]["parent"]
            while parent != "0" and parent in service and parent not in visited:
                service[parent]["score"] += score
                visited.add(parent)
                parent = service[parent]["parent"]

    @staticmethod
    def event2Num(event_type):
        """
        Map a network request event type to its resource type code.

        :param event_type: the event type name
        :return: 1..6 for the six network request types, 0 for anything else
        """
        return GetService._REQUEST_TYPE_CODES.get(event_type, 0)

    def _buildService(self, timeline, checker):
        """
        Build the unscored service dict (node key -> node) from the timeline.

        :param timeline: list of event dicts, i.e. ``data["timeline"]``
        :param checker: object whose ``isThirdParty(url)`` returns a pair; the
            second element is stored as the node's ``isThirdParty`` value
        :return: the service dict
        """
        service = dict()
        # script url -> script id, needed to find the node of a requested script
        script2id = dict()
        # serial number generator for non-script resources
        niter = iter(NumGen())

        # first, find all script nodes and add them to service
        for node in timeline:
            script_id = node.get("script_id", None)
            script_url = node.get("script_url", None)
            # A ScriptEval node is special: it is the child of some script and
            # it actually did something in the page.
            if node.get("event_type") == "ScriptEval":
                service[script_id] = self.addNewNode(
                    rtype=1, isEval=1, actor=node["script_parent_id"])
                script2id[script_url] = script_id
                continue
            if not script_url or script_id in service:
                continue
            # a normal script node (compile and execute)
            _, isthird = checker.isThirdParty(script_url)
            service[script_id] = self.addNewNode(
                url=script_url, rtype=1, isthird=isthird, actor="0")
            script2id[script_url] = script_id

        # second, find all network request nodes and build the dependency chain
        # third, add the events to the script node that triggered them
        for node in timeline:
            event_type = node.get("event_type")
            if event_type == "ScriptEval":
                continue
            rtype = self.event2Num(event_type)
            actor = node.get("actor_id", "0")
            if rtype != 0:
                # network request node
                url = node.get("request_url")
                if not url:
                    continue
                # link requests are only kept when they point to a .css file
                if rtype == 5 and not url.endswith(".css"):
                    continue
                if rtype == 1:
                    # A script request creates no new node; it only records
                    # the requesting actor as the parent of the known script.
                    if url in script2id:
                        service[script2id[url]]["parent"] = actor
                    continue
                _, isthird = checker.isThirdParty(url)
                service[next(niter)] = self.addNewNode(url, actor, rtype, isthird)
            if actor != "0" and actor in service:
                events = service[actor]["events"]
                # Event types that are not tracked are skipped, as in calEventsNum.
                if event_type in events:
                    events[event_type] += 1
        return service

    def run(self, filename, showProcess=False):
        """
        Analyse one timeline log and return its scored service chain.

        :param filename: path of the log, passed to ``self.loader.readGraph``; the
            loaded data must provide the keys "url" and "timeline"
        :param showProcess: when true, scoring prints a header and a progress bar
        :return: dict of node key -> node; script nodes are keyed by script id,
            other resources by "r1", "r2", ...
        """
        print(threading.current_thread().name + ':' + filename)
        data = self.loader.readGraph(filename)
        # third-party checker
        checker = Extracter(data["url"])
        timeline = data["timeline"]

        service = self._buildService(timeline, checker)
        denominator = self.calEventsNum(timeline)
        self.calScore(service, denominator, showProcess)

        # save to mysql (disabled)
        # f = open("./failed.txt", "a")
        # try:
        #     self.toSQL(service, checker)
        # except Exception as e:
        #     print(e)
        #     f.write(filename + "\n")
        # f.close()

        # save to json file (disabled)
        # self.loader.dumpFile(checker.host + ".json", service)
        return service
if __name__ == "__main__":
    # Manual entry point: analyse a single log file.
    # The log path may be given as the first command line argument; without
    # one, the developer's sample log below is used.
    import sys

    # path = "../log_yiminjiayuan.com_1636696236.549183的副本.json"
    default_path = "/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    a = GetService()
    # run() takes showProcess as its second argument; pass it explicitly
    # (False = no progress output) so the call matches the signature.
    res = a.run(path, False)