Remove the component detection feature, add JS printing, add easylist updating

This commit is contained in:
little_stone
2022-06-14 15:48:57 +08:00
parent bd2d50cf35
commit cc4f977db2
110 changed files with 386 additions and 15386 deletions


@@ -3,10 +3,10 @@ import os
import pandas as pd
import threading
import requests
import builtwith
import whois
import argparse
import collections
import tqdm
from collector.Peeper import Peeper
from analyzer.get_chain import GetService
from Infra_analyzer.caLookup import CertResolver
@@ -14,7 +14,6 @@ from Infra_analyzer.dnsLookup import DNSResolver
from concurrent.futures import ThreadPoolExecutor
from Tools.adt.ATFilter import AdFilter, TrackerFilter
lock = threading.Lock()
@@ -22,20 +21,23 @@ class ToCSV:
def __init__(self):
self.df = pd.DataFrame(columns=["resource_url", "isThirdParty", "resource_type",
"CA_url", "Issuer", "OCSP", "CDP", "NS", "isAd", "isTracker"])
"CA_url", "Issuer", "OCSP", "CDP", "NS", "isAd", "isTracker", "score"])
self.crtlook = CertResolver()
self.dnslook = DNSResolver()
self.ad = AdFilter()
self.tr = TrackerFilter()
self.ad = AdFilter(0)  # pass 1 to refresh the easylist rules; updating about once a week is recommended
self.tr = TrackerFilter(0)  # same as above
def lookUp(self, value):
ca_url, issuer, ocsp, crl = self.crtlook.get_CRL_OSCP(value["resource_url"])
ns = self.dnslook.get_NS(value["resource_url"])
"""
判断是否为广告/跟踪器,没啥必要就注掉,慢得很,浪费时间
"""
# is_ad = self.ad.blocker.should_block(value["resource_url"])
# is_tr = self.tr.blocker.should_block(value["resource_url"])
lock.acquire()
self.df.loc[self.df.shape[0]] = [value["resource_url"], value["isThirdParty"], value["resource_type"],
ca_url, issuer, ocsp, str(crl), str(ns), False, False]
ca_url, issuer, ocsp, str(crl), str(ns), False, False, round(value["score"], 4)]
lock.release()
@@ -65,7 +67,7 @@ def chainAna(results):
def page_resource(path, dirname, sp):
dumper = ToCSV()
ana = GetService()
results = ana.run(path)
results = ana.run(path, 0)  # 0 = do not print the scoring process, 1 = print it; the score is just a division, so printing adds little information and only makes the output easier to read
js_rank = []
pool = ThreadPoolExecutor(max_workers=7)
seen = set()
@@ -78,9 +80,9 @@ def page_resource(path, dirname, sp):
pool.submit(dumper.lookUp, value)
pool.shutdown()
js_rank.sort(key=lambda x: x[1], reverse=True)
print("-----------------js排名情况------------------")
print("-----------------js排名情况(评分保留4为小数)-----------------")
for js, _ in js_rank:
print(js)
print("score is:", round(_, 4), "js:", js)
dumper.df.to_csv(sp + dirname + ".csv", index=False)
print("-----------------引用链------------------")
@@ -115,20 +117,12 @@ def run(domain):
"headers": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
"Version/14.1.2 Safari/605.1.15"
}
req = requests.get(url, headers=header)
for key, value in req.headers.items():
print(key, value)
f = open(sp + "header", "w")
f.write(str(req.headers))
f.close()
print("-----------------组件使用情况------------------")
components = builtwith.parse(url)
for key, value in components.items():
print(key, value)
f = open(sp + "component", "w")
f.write(str(components))
f.close()
# req = requests.get(url, headers=header)
# for key, value in req.headers.items():
# print(key, value)
# f = open(sp + "header", "w")
# f.write(str(req.headers))
# f.close()
# page_resource("/Users/mazeyu/rendering_stream/www.shandong-energy.com/log_www.shandong-energy.com_1649208675.692799.json", "www.shandong-energy.com")
@@ -143,8 +137,8 @@ if __name__ == "__main__":
easylist update
domain name
"""
# run("www.baidu.com")
run("wanfangdata.com.cn")
# run("csdn.net")
# run("wanfangdata.com.cn")
# run("wanfangdata.com.cn")
run("csdn.net")
# run("www.bilibili.com")
# run("www.piaoliang.com")