303 lines
10 KiB
Python
303 lines
10 KiB
Python
import collections
|
||
import pandas as pd
|
||
import numpy as np
|
||
import tqdm
|
||
import time
|
||
import threading
|
||
from Tools.domain_extract import Extracter
|
||
from DBopt.service2db import Service2DB
|
||
from analyzer.file_loader import Loader
|
||
|
||
|
||
class NumGen:
    """Endless generator of resource serial keys: "r1", "r2", "r3", ...

    Calling ``iter()`` on an instance rewinds the counter, so every new
    iteration starts again at "r1". The iterator never raises
    ``StopIteration``.
    """

    def __init__(self):
        # Initialise the counter here as well, so that next() works even when
        # the caller never went through iter() first.
        self.i = 0

    def __iter__(self):
        """Reset the counter and return the generator itself."""
        self.i = 0
        return self

    def __next__(self):
        """Advance the counter and return the next key ("r" + number)."""
        self.i += 1
        return "r" + str(self.i)
|
||
|
||
|
||
class GetService:
    """Build a per-resource "service" table from a recorded page timeline.

    Every script and every network resource found in the timeline becomes a
    node (see ``addNewNode``). Scripts are credited with the events they
    triggered, and every node receives a score (see ``calScore``).
    """

    # Tracked event types. The position in this tuple is the index used in
    # the event-count vectors, so the order must not change.
    EVENT_TYPES = (
        "NodeCreation",
        "NodeInsertion",
        "NodeRemoval",
        "NodeAttachLater",
        "AttrAddition",
        "AttrModification",
        "AttrRemoval",
        "AttrStyleTextAddition",
        "AttrStyleRemoval",
        "NetworkScriptRequest",
        "NetworkImageRequest",
        "NetworkIframeRequest",
        "NetworkXMLHTTPRequest",
        "NetworkLinkRequest",
        "NetworkVideoRequest",
    )

    # Network request event type -> numeric resource type code.
    REQUEST_TYPE_CODES = {
        "NetworkScriptRequest": 1,
        "NetworkImageRequest": 2,
        "NetworkIframeRequest": 3,
        "NetworkXMLHTTPRequest": 4,
        "NetworkLinkRequest": 5,
        "NetworkVideoRequest": 6,
    }

    def __init__(self):
        self.chain = None
        self.cursor = Service2DB()
        self.loader = Loader()
        # event type name -> index in the event-count vector
        self.bitmap = {name: i for i, name in enumerate(self.EVENT_TYPES)}

    def toSQL(self, service=None, checker=None):
        """Flatten every node of ``service`` into a row and write it out.

        Row layout: service host, resource_url, resource_type, isThirdParty,
        parent, isEvalNode, the event counters (in node order), score,
        category (always "unknown"), website host.

        :param service: mapping of node key -> node, as returned by ``run``
        :param checker: object exposing ``host`` and ``isThirdParty(url)``
        """
        dat = pd.DataFrame(self.cursor.dat)
        service = service or {}
        for node in service.values():
            # Replace the parent id by the parent's url when it is known.
            parent = node["parent"]
            if parent != "0" and parent in service:
                parent = service[parent]["resource_url"]

            host, _ = checker.isThirdParty(node["resource_url"])
            row = [
                host,
                node["resource_url"],
                node["resource_type"],
                node["isThirdParty"],
                parent,
                node["isEvalNode"],
            ]
            row.extend(node["events"].values())
            row.extend([node["score"], "unknown", checker.host])

            # Append the row at the end of the frame.
            dat.loc[dat.shape[0]] = row

        self.cursor.writeDB(dat)

    @staticmethod
    def addNewNode(url=None, actor=None, rtype=None, isthird=None, isEval=0):
        """Create a fresh resource node.

        :param url: the resource url
        :param actor: id of the resource's parent ("0" is used in this file
            when there is none)
        :param rtype: resource type code as used in this file: script(1),
            image(2), iframe(3), xmlhttp(4), link(5), video(6)
        :param isthird: whether the resource is a third-party resource
        :param isEval: 1 when the node stands for a ScriptEval, else 0
        :return: the node dict; all event counters start at 0, score at 0.0
        """
        return {
            "resource_url": url,
            "resource_type": rtype,
            "isThirdParty": isthird,
            "parent": actor,
            "isEvalNode": isEval,
            "score": 0.0,
            # A new dict per node, so counters are never shared.
            "events": dict.fromkeys(GetService.EVENT_TYPES, 0),
        }

    def calEventsNum(self, graph):
        """Count how often each tracked event type occurs in the timeline.

        :param graph: the timeline, an iterable of event dicts
        :return: integer vector with one slot per entry of ``self.bitmap``
            (15 here). Every slot starts at 1, presumably so that dividing
            by it later can never hit zero; the value is therefore the
            number of occurrences plus one.
        """
        denominator = np.ones(len(self.bitmap), dtype=int)
        for node in graph:
            index = self.bitmap.get(node.get("event_type"), -1)
            if index == -1:
                # not one of the tracked event types
                continue
            denominator[index] += 1
        return denominator

    @staticmethod
    def calScore(service, denominator, showProcess=False):
        """Compute the score of every node in ``service`` (in place).

        Scripts: the node's event vector is divided element-wise by
        ``denominator`` and the quotients are summed,
        e.g. [1 2 3 4]/[2 3 4 5] -> 1/2 + 2/3 + 3/4 + 4/5.

        Other resources (keys starting with "r"): 1 / the denominator slot
        of the matching network request type.

        Chain: a script's own score is added to every ancestor script,
        e.g. a -> b -> c: a = a + b + c, b = b + c, c = c.
        The result does not depend on the order of ``service``.

        :param service: mapping of node key -> node
        :param denominator: vector returned by ``calEventsNum``
        :param showProcess: print progress messages and show a tqdm bar
        """
        if showProcess:
            print("计算js评分:")
            print("计算资源渲染行为比重:NodeCreation, NodeInsertion, "
                  "NodeRemoval, NodeAttachLater, AttrAddition, AttrModification, AttrRemoval, "
                  "AttrStyleTextAddition, NetworkScriptRequest, NetworkImageRequest, "
                  "NetworkIframeRequest, NetworkXMLHTTPRequest, NetworkLinkRequest, NetworkVideoRequest")
        items = tqdm.tqdm(service.items()) if showProcess else service.items()

        # Phase 1: every node gets its own score.
        own_scores = {}
        for key, prop in items:
            if str(key).startswith("r"):
                # Resource type codes 2..6 minus 7 give -5..-1, i.e. the last
                # five slots of the 15-slot vector: the Network*Request
                # counters for image, iframe, xmlhttp, link and video.
                prop["score"] = float(1 / denominator[prop["resource_type"] - 7])
                continue

            event_vec = np.array(list(prop["events"].values()))
            own = float(np.sum(np.divide(event_vec, denominator)))
            prop["score"] = own
            own_scores[key] = own

        # Phase 2: push each script's own score up the whole parent chain.
        for key, own in own_scores.items():
            seen = {key}  # guards against cyclic parent links
            parent = service[key]["parent"]
            while parent != "0" and parent in service and parent not in seen:
                service[parent]["score"] += own
                seen.add(parent)
                parent = service[parent]["parent"]

    @staticmethod
    def event2Num(event_type):
        """Map a network request event type to its resource type code.

        :param event_type: event type name
        :return: 1..6 for the six Network*Request types, 0 for anything else
        """
        return GetService.REQUEST_TYPE_CODES.get(event_type, 0)

    def run(self, filename, showProcess=False):
        """Analyse one recorded graph file and return its service table.

        Reads ``data["url"]`` and ``data["timeline"]`` from the loaded graph.

        :param filename: path handed to ``self.loader.readGraph``
        :param showProcess: forwarded to ``calScore``
        :return: dict mapping script ids and "r<n>" resource keys to nodes
        """
        print(threading.current_thread().name + ':' + filename)
        service = {}
        # to find a script's parent, we need to map script_url to script_id
        script2id = {}
        data = self.loader.readGraph(filename)
        timeline = data["timeline"]

        # resource serial number generator
        niter = iter(NumGen())

        # third-party checker
        checker = Extracter(data["url"])

        # First pass: register every script node.
        for node in timeline:
            script_id = node.get("script_id", None)
            script_url = node.get("script_url", None)

            # A ScriptEval node is the child of another script and always
            # (re)defines the entry for its script id.
            if node.get("event_type") == "ScriptEval":
                service[script_id] = self.addNewNode(
                    rtype=1, isEval=1, actor=node["script_parent_id"])
                script2id[script_url] = script_id
                continue

            if not script_url:
                continue

            # A normal script node: only the first sighting creates the entry.
            if script_id not in service:
                _, isthird = checker.isThirdParty(script_url)
                service[script_id] = self.addNewNode(
                    url=script_url, rtype=1, isthird=isthird, actor="0")
                script2id[script_url] = script_id

        # Second pass: register network resources, link scripts to the script
        # that requested them, and credit events to the acting script.
        for node in timeline:
            event_type = node.get("event_type")
            if event_type == "ScriptEval":
                continue

            rtype = self.event2Num(event_type)
            actor = node.get("actor_id", "0")

            # NOTE(review): every `continue` in this branch also skips the
            # event counter below, so e.g. NetworkScriptRequest is never
            # credited to the actor -- confirm this is intended.
            if rtype != 0:
                url = node.get("request_url")
                if not url:
                    continue
                # Link requests are only kept when they fetch a stylesheet.
                if rtype == 5 and not url.endswith(".css"):
                    continue
                # A script request only records who requested a known script.
                if rtype == 1:
                    if url in script2id:
                        service[script2id[url]]["parent"] = actor
                    continue

                _, isthird = checker.isThirdParty(url)
                service[next(niter)] = self.addNewNode(url, actor, rtype, isthird)

            if actor != "0" and actor in service:
                events = service[actor]["events"]
                # Event types outside the tracked set are ignored, matching
                # calEventsNum.
                if event_type in events:
                    events[event_type] += 1

        denominator = self.calEventsNum(timeline)
        self.calScore(service, denominator, showProcess)

        # Persisting the result (self.toSQL(service, checker) or
        # self.loader.dumpFile(checker.host + ".json", service)) is currently
        # disabled; the table is only returned.
        return service
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run: analyse one recorded graph file.
    import sys

    # Alternative sample input:
    # "../log_yiminjiayuan.com_1636696236.549183的副本.json"
    default_path = "/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json"

    # An optional first command-line argument overrides the sample path.
    path = sys.argv[1] if len(sys.argv) > 1 else default_path

    a = GetService()
    # run() takes a showProcess flag; pass it explicitly (no progress bar).
    res = a.run(path, False)
|
||
|
||
|