This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
zhuyujia-webhopper/analyzer/get_chain.py

303 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import collections
import pandas as pd
import numpy as np
import tqdm
import time
import threading
from Tools.domain_extract import Extracter
from DBopt.service2db import Service2DB
from analyzer.file_loader import Loader
class NumGen:
    """Endless iterator producing the resource ids "r1", "r2", "r3", ...

    The ids are used as keys for non-script resources, which have no
    script id of their own.
    """

    def __init__(self):
        # Initialise the counter up front so next() works even when the
        # caller never went through iter() first.
        self.i = 0

    def __iter__(self):
        # Every iter() call restarts the numbering at "r1".
        self.i = 0
        return self

    def __next__(self):
        self.i += 1
        return "r" + str(self.i)
class GetService:
    """Build and score the resource dependency map of one rendering log.

    The result (called ``service`` throughout) is a dict whose keys are
    script ids (for scripts) or generated ids "r1", "r2", ... (for every
    other requested resource) and whose values are the node dicts created
    by :meth:`addNewNode`.
    """

    # Event types counted per script. The position of a name in this tuple
    # is its index in the event vector / denominator vector.
    EVENT_TYPES = (
        "NodeCreation",
        "NodeInsertion",
        "NodeRemoval",
        "NodeAttachLater",
        "AttrAddition",
        "AttrModification",
        "AttrRemoval",
        "AttrStyleTextAddition",
        "AttrStyleRemoval",
        "NetworkScriptRequest",
        "NetworkImageRequest",
        "NetworkIframeRequest",
        "NetworkXMLHTTPRequest",
        "NetworkLinkRequest",
        "NetworkVideoRequest",
    )

    # Network request event type -> numeric resource type stored in
    # node["resource_type"].
    REQUEST_TYPES = {
        "NetworkScriptRequest": 1,
        "NetworkImageRequest": 2,
        "NetworkIframeRequest": 3,
        "NetworkXMLHTTPRequest": 4,
        "NetworkLinkRequest": 5,
        "NetworkVideoRequest": 6,
    }

    def __init__(self):
        self.chain = None
        self.cursor = Service2DB()
        self.loader = Loader()
        # event type name -> index in the 15-dimension event vector
        self.bitmap = {name: idx for idx, name in enumerate(self.EVENT_TYPES)}

    def toSQL(self, service=None, checker=None):
        """Flatten every node of ``service`` into a row and write them to the DB.

        Row layout (same order as before): service host, resource_url,
        resource_type, isThirdParty, parent (replaced by the parent's
        resource_url when the parent is a known node), isEvalNode, the event
        counters in ``EVENT_TYPES`` order, score, category ("unknown"),
        website host.

        :param service: the node map produced by :meth:`run`
        :param checker: object exposing ``host`` and ``isThirdParty(url)``
        :raises ValueError: if ``service`` or ``checker`` is missing
        """
        if service is None or checker is None:
            raise ValueError("toSQL requires both a service map and a checker")

        dat = pd.DataFrame(self.cursor.dat)
        for node in service.values():
            parent = node["parent"]
            # Store the parent's URL instead of its id when we know the parent.
            if parent != "0" and parent in service:
                parent = service[parent]["resource_url"]
            host, _ = checker.isThirdParty(node["resource_url"])
            row = [
                host,
                node["resource_url"],
                node["resource_type"],
                node["isThirdParty"],
                parent,
                node["isEvalNode"],
                *node["events"].values(),
                node["score"],
                "unknown",
                checker.host,
            ]
            # Append the row at the next free integer label.
            dat.loc[dat.shape[0]] = row
        self.cursor.writeDB(dat)

    @staticmethod
    def addNewNode(url=None, actor=None, rtype=None, isthird=None, isEval=0):
        """Create a fresh node dict with all event counters set to zero.

        :param url: the resource url
        :param actor: id of the resource's parent (the script that caused it);
            "0" is used by callers for "no parent"
        :param rtype: numeric resource type as produced by :meth:`event2Num`:
            script(1), image(2), iframe(3), XMLHttpRequest(4), link(5), video(6)
        :param isthird: whether the resource is a third-party resource
        :param isEval: 1 if the node represents a ScriptEval, else 0
        :return: the node dict; its "events" counters are only ever
            incremented for script nodes
        """
        return {
            "resource_url": url,
            "resource_type": rtype,
            "isThirdParty": isthird,
            "parent": actor,
            "isEvalNode": isEval,
            "score": 0.0,
            # A new dict per node so counters are never shared between nodes.
            "events": dict.fromkeys(GetService.EVENT_TYPES, 0),
        }

    def calEventsNum(self, graph):
        """Count how often each tracked event type occurs in the timeline.

        :param graph: iterable of timeline nodes (dicts with "event_type")
        :return: integer numpy vector with one entry per tracked event type
            (15 entries). Every entry starts at 1, so it is safe to divide by.
        """
        denominator = np.ones(len(self.bitmap), dtype=int)
        for node in graph:
            idx = self.bitmap.get(node.get("event_type"))
            if idx is not None:
                denominator[idx] += 1
        return denominator

    @staticmethod
    def calScore(service, denominator, showProcess):
        """Compute the score of every node in ``service`` in place.

        * script node: element-wise ``events / denominator`` summed up,
          e.g. [1 2 3 4]/[2 3 4 5] -> 1/2 + 2/3 + 3/4 + 4/5
        * other resource (key starts with "r"): ``1 / denominator`` entry of
          the matching network request type
        * chain: a script's final score is its own score plus the own scores
          of all its script descendants, e.g. for a -> b -> c:
          a = a + b + c, b = b + c, c = c

        The chain is resolved in a second pass so the result does not depend
        on the iteration order of ``service``.

        :param service: node map, modified in place
        :param denominator: 15-entry vector from :meth:`calEventsNum`
        :param showProcess: print progress information (and slow down so the
            progress bar is visible)
        """
        if showProcess:
            print("计算js评分:")
            print("计算资源渲染行为比重NodeCreation, NodeInsertion, "
                  "NodeRemoval, NodeAttachLater, AttrAddition, AttrModification, AttrRemoval, "
                  "AttrStyleTextAddition, NetworkScriptRequest, NetworkImageRequest, "
                  "NetworkIframeRequest, NetworkXMLHTTPRequest, NetworkLinkRequest, NetworkVideoRequest")

        items = service.items()
        if showProcess:
            items = tqdm.tqdm(items)

        # Pass 1: every node gets its own score.
        own_scores = {}
        for key, prop in items:
            if showProcess:
                time.sleep(0.05)
            if isinstance(key, str) and key.startswith("r"):
                # resource_type 2..6 minus 7 is -5..-1, i.e. the last five
                # entries of the vector: image, iframe, XHR, link, video.
                offset = prop["resource_type"]
                prop["score"] = float(1 / denominator[offset - 7])
                continue
            event_vec = np.array(list(prop["events"].values()))
            own = float(np.sum(np.divide(event_vec, denominator)))
            prop["score"] = own
            own_scores[key] = own

        # Pass 2: add each script's own score to all of its ancestors.
        for key, own in own_scores.items():
            visited = {key}  # guards against parent cycles
            parent = service[key]["parent"]
            while parent != "0" and parent in service and parent not in visited:
                ancestor = service[parent]
                ancestor["score"] += own
                visited.add(parent)
                parent = ancestor["parent"]

    @staticmethod
    def event2Num(event_type):
        """Map a network request event type to its numeric resource type.

        :param event_type: event type name from the timeline
        :return: 1..6 for the known network request types, 0 otherwise
        """
        return GetService.REQUEST_TYPES.get(event_type, 0)

    def _collectScripts(self, timeline, checker):
        """Create one node per script in the timeline.

        :return: ``(service, script2id)`` where ``script2id`` maps a script
            url to its script id so requests can later be tied to scripts
        """
        service = {}
        script2id = {}
        for node in timeline:
            script_id = node.get("script_id")
            script_url = node.get("script_url")
            # A ScriptEval node is the child of another script and acts on
            # the page itself, so it always gets its own node.
            if node.get("event_type") == "ScriptEval":
                parent = node.get("script_parent_id", "0")
                service[script_id] = self.addNewNode(rtype=1, isEval=1, actor=parent)
                if script_url:
                    script2id[script_url] = script_id
                continue
            if not script_url or script_id in service:
                continue
            # Normal script node (compile / execute); parent is filled in
            # later when the matching script request is seen.
            _, isthird = checker.isThirdParty(script_url)
            service[script_id] = self.addNewNode(
                url=script_url, rtype=1, isthird=isthird, actor="0"
            )
            script2id[script_url] = script_id
        return service, script2id

    def _linkRequests(self, timeline, checker, service, script2id):
        """Add requested resources to ``service`` and count events per script."""
        resource_ids = iter(NumGen())
        for node in timeline:
            event_type = node.get("event_type")
            if event_type == "ScriptEval":
                continue
            actor = node.get("actor_id", "0")
            rtype = self.event2Num(event_type)
            if rtype != 0:
                url = node.get("request_url")
                if not url:
                    continue
                # Of the link requests only stylesheets are kept.
                if rtype == 5 and not url.endswith(".css"):
                    continue
                if rtype == 1:
                    # A script request only sets the parent of the already
                    # known script; it is not stored as a separate resource.
                    if url in script2id:
                        service[script2id[url]]["parent"] = actor
                    continue
                _, isthird = checker.isThirdParty(url)
                service[next(resource_ids)] = self.addNewNode(url, actor, rtype, isthird)
            if actor != "0" and actor in service:
                events = service[actor]["events"]
                # Untracked event types are skipped, as in calEventsNum.
                if event_type in events:
                    events[event_type] += 1

    def run(self, filename, showProcess=False):
        """Load one rendering log, build its service map and score it.

        :param filename: path handed to ``self.loader.readGraph``; the loaded
            data must provide "url" and "timeline"
        :param showProcess: forward to :meth:`calScore` to show progress
        :return: the scored service map
        """
        print(threading.current_thread().name + ':' + filename)
        data = self.loader.readGraph(filename)
        timeline = data["timeline"]
        # third-party checker for the visited website
        checker = Extracter(data["url"])

        # first, find all script nodes
        service, script2id = self._collectScripts(timeline, checker)
        # second, attach network requests; third, count events per script
        self._linkRequests(timeline, checker, service, script2id)

        denominator = self.calEventsNum(timeline)
        self.calScore(service, denominator, showProcess)

        # NOTE: persistence is currently disabled. Earlier revisions stored
        # the result here with self.toSQL(service, checker) (logging failing
        # filenames to ./failed.txt) or with
        # self.loader.dumpFile(checker.host + ".json", service).
        return service
if __name__ == "__main__":
    # Manual smoke run against a locally stored rendering log.
    # Alternative sample input: "../log_yiminjiayuan.com_1636696236.549183的副本.json"
    path = "/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json"
    a = GetService()
    # run() needs the showProcess flag; omitting it raised a TypeError.
    res = a.run(path, showProcess=False)