2022-05-05 20:41:28 +08:00
|
|
|
|
import collections
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
import numpy as np
|
2022-06-14 15:48:57 +08:00
|
|
|
|
import tqdm
|
|
|
|
|
|
import time
|
2022-05-05 20:41:28 +08:00
|
|
|
|
import threading
|
|
|
|
|
|
from Tools.domain_extract import Extracter
|
|
|
|
|
|
from DBopt.service2db import Service2DB
|
|
|
|
|
|
from analyzer.file_loader import Loader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NumGen:
    """Endless iterator producing serial resource labels: "r1", "r2", "r3", ...

    The labels are used as dictionary keys for non-script resources, so every
    label starts with the letter "r" followed by a 1-based counter.
    """

    def __init__(self):
        # Initialise the counter up front so that next() also works on an
        # instance that has not been passed through iter() yet.
        self.i = 0

    def __iter__(self):
        # Calling iter() restarts the numbering; kept as-is because existing
        # callers obtain their iterator through iter(NumGen()).
        self.i = 0
        return self

    def __next__(self):
        # Never raises StopIteration: the sequence is unbounded.
        self.i += 1
        return "r" + str(self.i)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GetService:
    """Build and score the resource ("service") map of one recorded page load.

    A service map is a dict whose values are node dicts created by
    :meth:`addNewNode`.  Script nodes are keyed by their script id, every
    other resource is keyed by a serial label "r1", "r2", ... (see ``NumGen``).
    """

    # Tracked event names.  The position of a name in this tuple is its index
    # in the denominator vector returned by calEventsNum().
    EVENT_TYPES = (
        "NodeCreation",
        "NodeInsertion",
        "NodeRemoval",
        "NodeAttachLater",
        "AttrAddition",
        "AttrModification",
        "AttrRemoval",
        "AttrStyleTextAddition",
        "AttrStyleRemoval",
        "NetworkScriptRequest",
        "NetworkImageRequest",
        "NetworkIframeRequest",
        "NetworkXMLHTTPRequest",
        "NetworkLinkRequest",
        "NetworkVideoRequest",
    )

    # Network request event name -> resource type code (0 means "not a request").
    REQUEST_TYPES = {
        "NetworkScriptRequest": 1,
        "NetworkImageRequest": 2,
        "NetworkIframeRequest": 3,
        "NetworkXMLHTTPRequest": 4,
        "NetworkLinkRequest": 5,
        "NetworkVideoRequest": 6,
    }

    # resource type code + this offset == index of the matching "Network*Request"
    # event in EVENT_TYPES (e.g. script: 1 + 8 == 9 == "NetworkScriptRequest").
    _REQUEST_EVENT_OFFSET = 8

    def __init__(self):
        self.chain = None
        self.cursor = Service2DB()
        self.loader = Loader()
        # event name -> position in the 15-element event vector
        self.bitmap = {name: idx for idx, name in enumerate(self.EVENT_TYPES)}

    def toSQL(self, service=None, checker=None):
        """Flatten every node of ``service`` into one row and write all rows to the DB.

        Row layout (must line up with the columns of ``self.cursor.dat``):
        service host, resource_url, resource_type, isThirdParty, parent url,
        isEvalNode, the 15 event counters, score, category, website host.

        :param service: service map as built by :meth:`run`
        :param checker: object exposing ``host`` and ``isThirdParty(url)``;
                        ``isThirdParty`` is unpacked as a 2-tuple whose first
                        item is used as the service host
        """
        dat = pd.DataFrame(self.cursor.dat)
        for value in service.values():
            # Relies on the key order produced by addNewNode():
            # [url, type, isThirdParty, parent, isEvalNode, score, events]
            row = list(value.values())

            # Replace the parent id by the parent's url when the parent is known.
            parent = row[3]
            if parent != "0" and parent in service:
                row[3] = service[parent]["resource_url"]

            # Move the score behind the expanded event counters.
            events = row.pop(-1)
            score = row.pop(-1)
            row.extend(events.values())
            row.append(score)
            # category
            row.append("unknown")
            # website
            row.append(checker.host)
            # service host goes first
            host, _ = checker.isThirdParty(row[0])
            row.insert(0, host)

            # Append as a new last row of the dataframe.
            dat.loc[dat.shape[0]] = row

        self.cursor.writeDB(dat)

    @staticmethod
    def addNewNode(url=None, actor=None, rtype=None, isthird=None, isEval=0):
        """Create a fresh node dict for the service map.

        :param url: the resource url
        :param actor: id of the resource's parent (the script that caused it);
                      "0" is used by the callers in this class for "no parent"
        :param rtype: resource type code as produced by :meth:`event2Num`:
                      script(1), image(2), iframe(3), xmlhttp(4), link(5), video(6)
        :param isthird: whether the resource is a third-party resource
        :param isEval: 1 if the node stems from a ScriptEval event, else 0
        :return: the node dict; its score is 0.0 and all event counters are 0
        """
        return {
            "resource_url": url,
            "resource_type": rtype,
            "isThirdParty": isthird,
            "parent": actor,
            "isEvalNode": isEval,
            "score": 0.0,
            # One independent counter dict per node, in EVENT_TYPES order.
            "events": dict.fromkeys(GetService.EVENT_TYPES, 0),
        }

    def calEventsNum(self, graph):
        """Count how often each tracked event type occurs in the timeline.

        :param graph: iterable of timeline nodes (dicts), i.e. data["timeline"]
        :return: integer vector with one entry per key of ``self.bitmap``
                 (15 entries).  Every entry starts at 1, so the vector can be
                 used as a divisor without hitting zero.
        """
        denominator = np.array([1] * len(self.bitmap))
        for node in graph:
            idx = self.bitmap.get(node.get("event_type"))
            if idx is None:
                # untracked or missing event type
                continue
            denominator[idx] += 1
        return denominator

    @staticmethod
    def calScore(service, denominator, showProcess):
        """Compute the score of every node in ``service`` in place.

        for javascript: divide the node's event vector element-wise by the
            denominator and add the quotients together
            e.g. [1 2 3 4]/[2 3 4 5], the score is 1/2 + 2/3 + 3/4 + 4/5

        for other resources (keys starting with "r"): 1 / the number of
            requests of that resource type
            e.g. for an image node with x image requests in total, the score is 1/x

        for the chain: a script's score is its own score plus the own scores of
            all scripts below it
            e.g. a.com -> b.com -> c.com: a's score = a + b + c
                                          b's score = b + c
                                          c's score = c
            As before, non-script resources do not add to their parent.

        :param service: service map, modified in place
        :param denominator: 15-element vector from :meth:`calEventsNum`
        :param showProcess: print progress information and show a progress bar
        """
        if showProcess:
            print("计算js评分:")
            print("计算资源渲染行为比重:NodeCreation, NodeInsertion, "
                  "NodeRemoval, NodeAttachLater, AttrAddition, AttrModification, AttrRemoval, "
                  "AttrStyleTextAddition, NetworkScriptRequest, NetworkImageRequest, "
                  "NetworkIframeRequest, NetworkXMLHTTPRequest, NetworkLinkRequest, NetworkVideoRequest")

        entries = service.items()
        if showProcess:
            entries = tqdm.tqdm(entries)

        # Pass 1: every node gets its own score.  Script scores are remembered
        # separately so that pass 2 does not depend on the iteration order.
        own_script_scores = {}
        for key, prop in entries:
            if showProcess:
                # Artificial delay so that the progress bar stays visible.
                time.sleep(0.05)

            # other resource
            if str(key).startswith("r"):
                idx = prop["resource_type"] + GetService._REQUEST_EVENT_OFFSET
                prop["score"] = float(1 / denominator[idx])
                continue

            # script resource
            event_vec = np.array(list(prop["events"].values()))
            own = float(np.sum(np.divide(event_vec, denominator)))
            prop["score"] = own
            own_script_scores[key] = own

        # Pass 2: add each script's own score to all of its ancestors.
        for key, own in own_script_scores.items():
            seen = {key}  # guards against self-references and parent cycles
            parent = service[key]["parent"]
            while parent != "0" and parent in service and parent not in seen:
                service[parent]["score"] += own
                seen.add(parent)
                parent = service[parent]["parent"]

    @staticmethod
    def event2Num(event_type):
        """Translate a network request event name into a resource type code.

        :param event_type: event name, e.g. "NetworkImageRequest"
        :return: script(1), image(2), iframe(3), xmlhttp(4), link(5), video(6);
                 0 for every other value
        """
        return GetService.REQUEST_TYPES.get(event_type, 0)

    def run(self, filename, showProcess=False):
        """Build the scored service map for one recorded timeline file.

        :param filename: file handed to ``self.loader.readGraph``; the result
                         is indexed with "url" and "timeline"
        :param showProcess: forwarded to :meth:`calScore`
        :return: the service map (dict of node dicts)
        """
        print(threading.current_thread().name + ':' + filename)
        service = dict()
        # to find a script's parent, we need to map script_url to script_id
        script2id = {}
        data = self.loader.readGraph(filename)
        timeline = data["timeline"]

        # resource serial number generator
        niter = iter(NumGen())

        # third-party checker
        checker = Extracter(data["url"])

        # first, find all script nodes and add them into service
        for node in timeline:
            script_id = node.get("script_id", None)
            script_url = node.get("script_url", None)
            event_type = node.get("event_type")

            # the script eval node is special: it is some script's child and it
            # actually did something in the page
            if event_type == "ScriptEval":
                parent = node["script_parent_id"]
                service[script_id] = self.addNewNode(rtype=1, isEval=1, actor=parent)
                script2id[script_url] = script_id
                continue

            if not script_url:
                continue

            # the normal script node, compile and execute
            if script_id not in service:
                _, isthird = checker.isThirdParty(script_url)
                service[script_id] = self.addNewNode(
                    url=script_url, rtype=1, isthird=isthird, actor="0")
                # NOTE(review): assumed to belong to this branch, i.e. the url
                # is mapped only when the script is first seen - confirm.
                script2id[script_url] = script_id

        # second, find all network request nodes and build the dependence chain
        # third, add events to the script nodes
        for node in timeline:
            # .get() keeps this pass consistent with the first pass and with
            # calEventsNum(), which both tolerate a missing event_type.
            event_type = node.get("event_type")
            if event_type == "ScriptEval":
                continue

            rtype = self.event2Num(event_type)
            actor = node.get("actor_id", "0")

            # network request node
            if rtype != 0:
                url = node["request_url"]
                if not url:
                    continue
                # link requests are only kept when they fetch a stylesheet
                if rtype == 5 and not url.endswith(".css"):
                    continue
                # a script request only tells us who loaded an already known script
                if rtype == 1:
                    if url not in script2id:
                        continue
                    service[script2id[url]]["parent"] = actor
                    continue

                _, isthird = checker.isThirdParty(url)
                service[next(niter)] = self.addNewNode(url, actor, rtype, isthird)

            if actor != "0" and actor in service:
                events = service[actor]["events"]
                # Untracked event types are skipped, as in calEventsNum().
                if event_type in events:
                    events[event_type] += 1

        denominator = self.calEventsNum(timeline)
        self.calScore(service, denominator, showProcess)

        # Optional persistence, currently disabled:
        #   save to mysql:     self.toSQL(service, checker)
        #   save to json file: self.loader.dumpFile(checker.host + ".json", service)
        return service
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run against one locally recorded timeline file.
    # Alternative sample input:
    # path = "../log_yiminjiayuan.com_1636696236.549183的副本.json"
    path = "/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json"
    a = GetService()
    # run() takes a showProcess flag; calling it with the path alone raises
    # TypeError, so the flag is passed explicitly (no progress output).
    res = a.run(path, showProcess=False)
|
|
|
|
|
|
|
|
|
|
|
|
|