This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
dengzeyi-sequenceshield/代码/sequenceShield/cicids2017/parse_file.py
2022-11-21 12:08:58 +08:00

104 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import os
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
# Rows that are ready to be written out; each row is [label, token, token, ...].
feature_ls: list = []
# In-progress packet sequences, one entry per pair of endpoints.
flow: dict = {}
# Length threshold a sequence is compared against before it is flushed.
# The first element of every sequence is the label.
feature_latitude = 1 + 40
# Output CSV that receives the labelled sequences.
packet_label_path = "cicids2017/friday-dos-label.csv"
# Input CSV with one packet per row.
packet_path = "cicids2017/friday-dos.csv"
def wed_get_label(srcIP, dstIP, time):
    """Return True when a Wednesday packet falls inside a labelled window.

    A packet is labelled True only if both conditions hold:

    * ``time`` (minutes since midnight) lies inside one of the four closed
      intervals listed below, and
    * the packet travels between 172.16.0.1 and 192.168.10.50, in either
      direction.

    NOTE(review): the intervals and the two addresses presumably come from
    the dataset's published attack schedule -- confirm against it.
    """
    host_a = "172.16.0.1"
    host_b = "192.168.10.50"
    # Closed [start, end] intervals, expressed in minutes since midnight.
    windows = (
        (9 * 60 + 47, 10 * 60 + 10),
        (10 * 60 + 14, 10 * 60 + 35),
        (10 * 60 + 43, 11 * 60 + 0),
        (11 * 60 + 10, 11 * 60 + 23),
    )
    inside_window = any(begin <= time <= end for begin, end in windows)
    between_hosts = (srcIP, dstIP) in ((host_a, host_b), (host_b, host_a))
    return inside_window and between_hosts
def fri_get_label(srcIP, dstIP, time):
    """Return True when a Friday packet falls inside the labelled window.

    ``time`` is expressed in minutes since midnight. The packet is labelled
    True only when ``time`` is within 15:56-16:16 (both ends included) and
    the packet travels between 172.16.0.1 and 192.168.10.50 in either
    direction; every other packet is labelled False.
    """
    window_start = 15 * 60 + 56
    window_end = 16 * 60 + 16
    if not (window_start <= time <= window_end):
        return False

    host_a = "172.16.0.1"
    host_b = "192.168.10.50"
    a_to_b = srcIP == host_a and dstIP == host_b
    b_to_a = srcIP == host_b and dstIP == host_a
    return a_to_b or b_to_a
def pre_process(input_path, output_path):
    """Turn a per-packet CSV into labelled, fixed-length protocol sequences.

    The input CSV must provide the columns ``Source``, ``Destination``,
    ``Time`` and ``Protocol``. Packets exchanged between the same two
    endpoints (in either direction) are collected into one sequence of the
    form ``[label, token, token, ...]``. A token is the packet's protocol
    prefixed with ``"-"`` when the packet travels in the direction of the
    first packet seen for that endpoint pair, and ``"+"`` for the opposite
    direction. The label of a sequence is True as soon as any of its packets
    is labelled True by ``fri_get_label``.

    Once a sequence holds more than ``feature_latitude`` entries it is moved
    to ``feature_ls`` and a new sequence is started for that endpoint pair.
    At the end ``feature_ls`` is written to ``output_path`` as CSV. Sequences
    still in progress are not written; only their count and label
    distribution are printed.

    Differences from the previous implementation:

    * The packet that triggers a flush is stored once in the new sequence
      (it used to be stored twice).
    * Endpoint pairs are keyed by tuple instead of by string concatenation,
      so distinct address pairs whose concatenations coincide are no longer
      merged into one flow.

    The module-level ``flow`` and ``feature_ls`` are mutated in place and are
    not reset, so state carries over between calls.

    Args:
        input_path: Path of the per-packet CSV to read.
        output_path: Path of the CSV that receives the finished sequences.
    """
    df = pd.read_csv(input_path)
    packets = zip(df["Source"], df["Destination"], df["Time"], df["Protocol"])
    for index, (srcIP, dstIP, timestamp, protocol) in enumerate(packets):
        if index % 20000 == 0:
            print(index)  # coarse progress indicator

        # Keep only the clock part after the last space and convert it to
        # minutes since midnight, the unit fri_get_label works in.
        clock_fields = timestamp.split(" ")[-1].split(":")
        minute_of_day = int(clock_fields[0]) * 60 + int(clock_fields[1])
        label = fri_get_label(srcIP, dstIP, minute_of_day)

        key = (srcIP, dstIP)
        reverse_key = (dstIP, srcIP)
        # A packet counts as "reverse" only when a flow already exists under
        # the swapped key and none exists under its own key.
        if key in flow or reverse_key not in flow:
            flow_key, direction = key, "-"
        else:
            flow_key, direction = reverse_key, "+"

        sequence = flow.get(flow_key)
        # NOTE(review): with ">" a flushed row holds feature_latitude + 1
        # entries (label included); kept as-is so the output width does not
        # change -- confirm whether ">=" was intended.
        if sequence is not None and len(sequence) > feature_latitude:
            feature_ls.append(flow.pop(flow_key))
            sequence = None
        if sequence is None:
            # Start with a neutral label; the packet's own label is OR-ed in
            # below, and its token is appended exactly once.
            sequence = flow[flow_key] = [False]
        sequence.append(direction + protocol)
        # One True packet is enough to mark the whole sequence True.
        sequence[0] = sequence[0] or label

    # Report the sequences that never reached the flush threshold.
    print("字典中剩余个数:%d" % (len(flow)))
    true_num = sum(1 for sequence in flow.values() if sequence[0])
    false_num = len(flow) - true_num
    print("true num:%d" % (true_num))
    print("false num:%d" % (false_num))

    # write to csv file
    total_data = pd.DataFrame(data=feature_ls)
    total_data.to_csv(output_path, index=False,
                      encoding="utf-8", sep=',', mode='w', header=True)
# Run the conversion with the module-level default paths whenever this file
# is executed (or imported).
pre_process(input_path=packet_path, output_path=packet_label_path)