This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
dengzeyi-sequenceshield/代码/sequenceShield/cicids2017/parse_file.py

104 lines
3.5 KiB
Python
Raw Normal View History

2022-11-21 12:08:58 +08:00
import pandas as pd
import os
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
# Input CSV of per-packet records; passed to pre_process() as input_path.
packet_path = "cicids2017/friday-dos.csv"
# Output CSV that pre_process() writes the completed flow rows to.
packet_label_path = "cicids2017/friday-dos-label.csv"
# Flows still being assembled by pre_process(): one list per endpoint pair,
# holding the flow label followed by direction-signed protocol names.
flow = {}
# Length threshold that pre_process() compares each flow list against before
# flushing it to feature_ls.
feature_latitude = 40 + 1 # the first element is the label
# Completed flow lists, in the order they were flushed; written out as CSV rows.
feature_ls = []
def wed_get_label(srcIP, dstIP, time):
    """Return True when a Wednesday packet falls inside a labelled window.

    A packet is labelled True when it travels between 172.16.0.1 and
    192.168.10.50 (either direction) and ``time`` lies inside one of the
    four inclusive windows below.

    Args:
        srcIP: Source address of the packet, as a string.
        dstIP: Destination address of the packet, as a string.
        time: Minute of the day (hour * 60 + minute).

    Returns:
        True if the packet matches both the endpoint pair and a window,
        otherwise False.
    """
    endpoint_a = "172.16.0.1"
    endpoint_b = "192.168.10.50"
    # Inclusive (start, end) windows, expressed in minutes since midnight.
    windows = (
        (9 * 60 + 47, 10 * 60 + 10),
        (10 * 60 + 14, 10 * 60 + 35),
        (10 * 60 + 43, 11 * 60 + 0),
        (11 * 60 + 10, 11 * 60 + 23),
    )

    forward = srcIP == endpoint_a and dstIP == endpoint_b
    backward = dstIP == endpoint_a and srcIP == endpoint_b
    if not (forward or backward):
        return False
    return any(start <= time <= end for start, end in windows)
def fri_get_label(srcIP, dstIP, time):
    """Return True when a Friday packet falls inside the labelled window.

    A packet is labelled True when ``time`` lies in the inclusive window
    15:56-16:16 and it travels between 172.16.0.1 and 192.168.10.50 in
    either direction.

    Args:
        srcIP: Source address of the packet, as a string.
        dstIP: Destination address of the packet, as a string.
        time: Minute of the day (hour * 60 + minute).

    Returns:
        True if the packet matches both the window and the endpoint pair,
        otherwise False.
    """
    window_start = 15 * 60 + 56
    window_end = 16 * 60 + 16
    if not (window_start <= time <= window_end):
        return False

    endpoints = (srcIP, dstIP)
    return (
        endpoints == ("172.16.0.1", "192.168.10.50")
        or endpoints == ("192.168.10.50", "172.16.0.1")
    )
def pre_process(input_path, output_path, label_fn=None):
    """Group a per-packet CSV into bidirectional flows and write them as CSV.

    Every packet is assigned to the flow of its endpoint pair; both
    directions share one flow. A flow is a list whose first element is the
    label and whose remaining elements are protocol names prefixed with
    "-" (same direction as the packet that opened the flow) or "+"
    (opposite direction). A flow is labelled True as soon as any of its
    packets is labelled True.

    When a packet arrives for a flow whose list is already longer than
    ``feature_latitude``, that list is moved to ``feature_ls`` and a new
    flow is started with the arriving packet. Flows that never reach that
    length stay in ``flow``; they are counted and reported but not written.

    The module-level ``flow`` and ``feature_ls`` are updated in place, so
    state carries over between calls.

    Args:
        input_path: CSV with "Source", "Destination", "Time" and "Protocol"
            columns. "Time" must be a string whose last space-separated
            token starts with "HH:MM".
        output_path: Destination CSV for the rows collected in
            ``feature_ls``.
        label_fn: Optional callable ``(srcIP, dstIP, minute_of_day) -> bool``
            used to label each packet. Defaults to ``fri_get_label``.
    """
    if label_fn is None:
        label_fn = fri_get_label

    df = pd.read_csv(input_path)
    packets = zip(df["Source"], df["Destination"], df["Time"], df["Protocol"])
    for index, (src_ip, dst_ip, stamp, protocol) in enumerate(packets):
        if index % 20000 == 0:
            print(index)  # coarse progress indicator

        # Reduce the timestamp to minutes since midnight.
        clock_parts = stamp.split(" ")[-1].split(":")
        minute_of_day = int(clock_parts[0]) * 60 + int(clock_parts[1])
        label = label_fn(src_ip, dst_ip, minute_of_day)

        # Tuple keys keep the two addresses distinct; concatenating the
        # strings can make different endpoint pairs collide.
        forward_key = (src_ip, dst_ip)
        reverse_key = (dst_ip, src_ip)
        if forward_key in flow:
            key, sign = forward_key, "-"
        elif reverse_key in flow:
            key, sign = reverse_key, "+"
        else:
            # First packet seen for this endpoint pair: open a new flow.
            flow[forward_key] = [label, "-" + protocol]
            continue

        if len(flow[key]) > feature_latitude:
            # NOTE(review): with ">" a flushed row holds the label plus
            # feature_latitude features (feature_latitude + 1 values).
            # Confirm this is the intended row width before changing it.
            feature_ls.append(flow.pop(key))
            # Start the next flow with only the label; the packet itself is
            # appended exactly once below instead of being recorded twice.
            flow[key] = [label]
        flow[key].append(sign + protocol)
        # A single True packet marks the whole flow as True.
        flow[key][0] = flow[key][0] or label

    print("字典中剩余个数:%d" % (len(flow)))
    true_num = sum(1 for packets_of_flow in flow.values() if packets_of_flow[0])
    false_num = len(flow) - true_num
    print("true num:%d" % (true_num))
    print("false num:%d" % (false_num))

    # Write the completed flows to the CSV file.
    total_data = pd.DataFrame(data=feature_ls)
    total_data.to_csv(output_path, index=False,
                      encoding="utf-8", sep=',', mode='w', header=True)
    return
# Script entry point: runs whenever this module is executed or imported,
# reading packet_path and writing the flow rows to packet_label_path.
pre_process(packet_path, packet_label_path)