This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
wujiating-detection/main.py
nanwct 8165bf52b6 abc
2022-05-19 14:40:58 +08:00

65 lines
2.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Takes a set of pcap files (labels are encoded in each file name) and writes one CSV built from a pandas DataFrame.
from time import time
from configparser import ConfigParser
from multiprocessing import cpu_count
import pandas as pd
import sys
import os
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import re
# Serializes writes to the shared result DataFrame from the worker threads.
lock = Lock()

# Parses "<dir><sep><domain>_<doh_uri>_<browser>_<time>.pcap".
# A raw string is used so that "\." is a regex escape rather than an invalid
# Python string escape. Both "/" and "\" are accepted as the directory
# separator, because os.path.join produces backslash-separated paths on
# Windows and a "/"-only pattern would never match there.
pcap_name_complier = re.compile(
    r".*[/\\](?P<domain>.*)_(?P<doh_uri>.*)_(?P<browser>.*)_(?P<time>.*)\.pcap"
)
def multi_write(input_filename, id):
    """Extract the features of one pcap file and store them as row ``id`` of ``df``.

    The labels (domain, doh_uri, browser) are parsed from the file name with
    ``pcap_name_complier``; when the name does not match, all three are None.

    Args:
        input_filename: Path of the pcap file to process.
        id: Row label under which the result is written into the
            module-level DataFrame ``df``.

    Returns:
        None. Nothing is written when ``lengths`` is empty or the summed
        time lags are below 1e-7.

    NOTE(review): ``trans_pcap_to_row`` and ``extract_statistic_feature`` are
    neither defined nor imported in the visible part of this file - confirm
    where they come from.
    """
    match_result = pcap_name_complier.match(input_filename)
    if match_result:
        domain = match_result["domain"]
        doh_uri = match_result["doh_uri"]
        browser = match_result["browser"]
    else:
        domain = doh_uri = browser = None

    lengths, time_lags, directions = trans_pcap_to_row(input_filename)
    if len(lengths) == 0 or sum(time_lags) < 1e-7:
        return

    # Sign every length / time lag with its direction value.
    # Assumes the three sequences are parallel (same length) - TODO confirm.
    flow_serial = [length * direction for length, direction in zip(lengths, directions)]
    time_serial = [lag * direction for lag, direction in zip(time_lags, directions)]
    features = extract_statistic_feature(lengths, time_lags, directions)

    # Use the lock as a context manager so that it is released even when the
    # DataFrame assignment raises; otherwise every other worker would block
    # forever on acquire().
    with lock:
        df.loc[id] = [domain, doh_uri, browser, flow_serial, time_serial, features]
def run(input_filepath, max_workers=30):
    """Submit every ``.pcap`` file found directly in a directory to ``multi_write``.

    Each pcap file gets a consecutive row id starting at 0, in the order
    returned by ``os.listdir``. A running count of submitted files is printed
    to stdout. The function returns once all submitted tasks have finished.

    Args:
        input_filepath: Directory to scan (not recursive).
        max_workers: Size of the thread pool; defaults to the previously
            hard-coded value of 30.

    Returns:
        None.
    """
    pcap_names = [
        name for name in os.listdir(input_filepath) if name.endswith(".pcap")
    ]

    submitted = []
    with ThreadPoolExecutor(max_workers=max_workers) as thread_pool:
        for row_id, name in enumerate(pcap_names):
            full_file = os.path.join(input_filepath, name)
            future = thread_pool.submit(multi_write, full_file, row_id)
            submitted.append((full_file, future))
            print(row_id + 1)

    # Leaving the "with" block waits for all tasks. An exception raised inside
    # a task is only stored on its future, so report it here instead of
    # dropping it silently.
    for full_file, future in submitted:
        error = future.exception()
        if error is not None:
            print(f"failed to process {full_file}: {error!r}", file=sys.stderr)
if __name__ == "__main__":
    # Usage: main.py [input_dir] [output_csv]
    # Both arguments are optional; the defaults are the paths that were
    # previously hard-coded.
    input_filepath = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "E:\\doh&web\\201102\\win10\\web_traffic_after_predeal"
    )
    output_filename = sys.argv[2] if len(sys.argv) > 2 else "./result/web_feature_2.csv"

    # Create the output directory before the run, so that the final to_csv()
    # cannot fail on a missing directory after all the work is done.
    output_dir = os.path.dirname(output_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    start_time = time()
    # df must be a module-level global: multi_write() writes its rows into it.
    df = pd.DataFrame(columns=["domain", "doh_uri", "browser", "flow_serial", "time_serial", "features"])
    run(input_filepath)
    df.to_csv(output_filename)
    end_time = time()
    print("总耗时:", end_time - start_time)