# Input: a series of pcap files and their corresponding labels (taken from the file names).
# Output: a CSV file holding a pandas DataFrame of per-flow features.
from time import time
from configparser import ConfigParser
from multiprocessing import cpu_count
import pandas as pd
import sys
import os
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import re

lock = Lock()

# File names are expected to follow <domain>_<doh_uri>_<browser>_<time>.pcap.
# Use a raw string for the pattern and accept both "/" and "\" as path
# separators so that Windows paths (as used in __main__) also match.
pcap_name_complier = re.compile(r".*[/\\](?P<domain>.*)_(?P<doh_uri>.*)_(?P<browser>.*)_(?P<time>.*)\.pcap")


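# NOTE: trans_pcap_to_row() and extract_statistic_feature() are used below but are not
# defined in this file; they are assumed to live in a companion feature-extraction module.
# The sketches below are only illustrative stand-ins (an assumption, not the original
# implementation): they read the pcap with scapy and compute a few simple statistics,
# and the direction heuristic (dport 443 == outgoing) is likewise assumed.
from scapy.all import rdpcap
from scapy.layers.inet import TCP


def trans_pcap_to_row(pcap_path):
    """Return (lengths, time_lags, directions) for the TCP packets of one pcap."""
    lengths, time_lags, directions = [], [], []
    prev_ts = None
    for pkt in rdpcap(pcap_path):
        if not pkt.haslayer(TCP):
            continue
        ts = float(pkt.time)
        lengths.append(len(pkt))
        time_lags.append(0.0 if prev_ts is None else ts - prev_ts)
        # Assumed heuristic: packets sent towards port 443 count as outgoing (+1).
        directions.append(1 if pkt[TCP].dport == 443 else -1)
        prev_ts = ts
    return lengths, time_lags, directions


def extract_statistic_feature(lengths, time_lags, directions):
    """Return a flat list of simple per-flow statistics (illustrative feature set)."""
    sizes = pd.Series(lengths)
    lags = pd.Series(time_lags)
    outgoing_ratio = sum(1 for d in directions if d > 0) / len(directions)
    return [len(lengths), sizes.mean(), sizes.std(ddof=0), sizes.max(),
            lags.mean(), lags.std(ddof=0), outgoing_ratio]

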
def multi_write(input_filename, id):
    # Parse the labels (domain, DoH URI, browser) out of the pcap file name.
    match_result = pcap_name_complier.match(input_filename)
    if match_result:
        domain = match_result["domain"]
        doh_uri = match_result["doh_uri"]
        browser = match_result["browser"]
    else:
        domain = None
        doh_uri = None
        browser = None

    lengths, time_lags, directions = trans_pcap_to_row(input_filename)
    # Skip empty flows and flows whose total duration is effectively zero.
    if len(lengths) == 0 or sum(time_lags) < 1e-7:
        return

    # Signed series: packet length / inter-arrival time multiplied by direction (+1/-1).
    flow_serial = [lengths[i] * directions[i] for i in range(len(lengths))]
    time_serial = [time_lags[i] * directions[i] for i in range(len(time_lags))]
    features = extract_statistic_feature(lengths, time_lags, directions)

    # df is shared by all worker threads, so guard the row insertion with the lock.
    with lock:
        df.loc[id] = [domain, doh_uri, browser, flow_serial, time_serial, features]


def run(input_filepath):
    files = os.listdir(input_filepath)
    with ThreadPoolExecutor(max_workers=30) as thread_pool:
        id = 0
        for file in files:
            full_file = os.path.join(input_filepath, file)
            if not full_file.endswith(".pcap"):
                continue
            # multi_write(full_file, id)
            thread_pool.submit(multi_write, full_file, id)
            id += 1
        # Number of pcap files submitted to the thread pool.
        print(id)


if __name__ == "__main__":
    # input_filepath = sys.argv[1]
    # print(input_filepath)
    # output_filename = sys.argv[2]
    input_filepath = "E:\\doh&web\\201102\\win10\\web_traffic_after_predeal"
    output_filename = "./result/web_feature_2.csv"

    start_time = time()
    # Shared result table filled in by the worker threads in multi_write().
    df = pd.DataFrame(columns=["domain", "doh_uri", "browser", "flow_serial", "time_serial", "features"])
    run(input_filepath)
    df.to_csv(output_filename)
    end_time = time()
    print("Total elapsed time:", end_time - start_time)
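
# Usage sketch (assuming the sys.argv branch above is re-enabled instead of the
# hard-coded paths; the script name below is only a placeholder):
#
#   python pcap_to_csv.py <pcap_directory> <output_csv>
#
# e.g. python pcap_to_csv.py ./web_traffic_after_predeal ./result/web_feature_2.csv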