# Python source, 104 lines, 3.5 KiB.
import pandas as pd
|
||
import os
|
||
from sklearn.metrics import classification_report
|
||
from sklearn.metrics import roc_curve, auc, roc_auc_score
|
||
|
||
|
||
from sklearn.metrics import confusion_matrix
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
|
||
|
||
# Packet-level CSV that is read, and the CSV the per-flow rows are written to.
packet_path = "cicids2017/friday-dos.csv"
packet_label_path = "cicids2017/friday-dos-label.csv"

# Flows still being assembled, keyed by the flow's source/destination
# addresses. Each value is a list: the label first, then one signed protocol
# token per packet.
flow = {}
# 40 features + 1: the first element of every flow list is the label.
feature_latitude = 40 + 1
# Flows that were flushed from `flow` and are waiting to be written out.
feature_ls = []


def wed_get_label(srcIP, dstIP, time):
    """Return True when a packet falls inside a Wednesday labelled window.

    Args:
        srcIP: Source address of the packet, as a string.
        dstIP: Destination address of the packet, as a string.
        time: Time of day in minutes since midnight (hour * 60 + minute).

    Returns:
        True if the packet travels, in either direction, between
        172.16.0.1 and 192.168.10.50 and ``time`` lies inside one of the
        four inclusive windows listed below; False otherwise.
    """
    endpoints = ("172.16.0.1", "192.168.10.50")
    # Only traffic between these two hosts is ever labelled, in either direction.
    if (srcIP, dstIP) != endpoints and (dstIP, srcIP) != endpoints:
        return False

    # Inclusive (start, end) windows, in minutes since midnight.
    windows = (
        (9 * 60 + 47, 10 * 60 + 10),
        (10 * 60 + 14, 10 * 60 + 35),
        (10 * 60 + 43, 11 * 60 + 0),
        (11 * 60 + 10, 11 * 60 + 23),
    )
    return any(start <= time <= end for start, end in windows)


def fri_get_label(srcIP, dstIP, time):
    """Return True when a packet falls inside the Friday labelled window.

    Args:
        srcIP: Source address of the packet, as a string.
        dstIP: Destination address of the packet, as a string.
        time: Time of day in minutes since midnight (hour * 60 + minute).

    Returns:
        True if the packet travels, in either direction, between
        172.16.0.1 and 192.168.10.50 and ``time`` lies in the inclusive
        window 15:56-16:16; False otherwise.
    """
    endpoints = ("172.16.0.1", "192.168.10.50")
    # Only traffic between these two hosts is ever labelled, in either direction.
    if (srcIP, dstIP) != endpoints and (dstIP, srcIP) != endpoints:
        return False
    return 15 * 60 + 56 <= time <= 16 * 60 + 16


def _record_packet(flow_key, token, label):
    """Add one packet's token to ``flow[flow_key]``, flushing a full flow first.

    A flow whose list already holds more than ``feature_latitude`` entries is
    moved to ``feature_ls`` and a fresh flow is started with this packet.
    """
    packets = flow.get(flow_key)
    # NOTE(review): the `>` test means flushed rows hold feature_latitude + 1
    # entries (label + 41 tokens); confirm this width is intended.
    if packets is not None and len(packets) > feature_latitude:
        feature_ls.append(flow.pop(flow_key))
        packets = None

    if packets is None:
        # Start a flow with exactly one token for this packet. The previous
        # code appended the token a second time after a flush.
        flow[flow_key] = [label, token]
        return

    packets.append(token)
    # One labelled packet is enough to label the whole flow.
    packets[0] = packets[0] or label


def pre_process(input_path, output_path):
    """Group packets of a capture into labelled flows and write them as CSV.

    Reads the "Source", "Destination", "Time" and "Protocol" columns of the
    CSV at ``input_path``. Packets between the same two addresses share one
    flow regardless of direction: packets in the direction that opened the
    flow are recorded as ``"-" + protocol``, packets in the opposite
    direction as ``"+" + protocol``. Labels always come from
    ``fri_get_label``, whatever file is passed in.

    Only flows that were flushed to the module-level ``feature_ls`` are
    written; flows still in ``flow`` at the end are only counted and printed.
    Both module-level containers are mutated and never reset, so state
    carries over between calls.

    Args:
        input_path: Path of the packet-level CSV to read.
        output_path: Path of the CSV to write (overwritten).
    """
    df = pd.read_csv(input_path)
    # Iterate only the columns that are used; this avoids building a Series
    # per row as iterrows() does. read_csv yields a default 0..n-1 index, so
    # enumerate() reproduces the index that was printed before.
    columns = zip(df["Source"], df["Destination"], df["Time"], df["Protocol"])
    for index, (srcIP, dstIP, timestamp, protocol) in enumerate(columns):
        if index % 20000 == 0:
            print(index)

        # The time of day is the last space-separated field, "HH:MM[:...]".
        clock = timestamp.split(" ")[-1]
        hour, minute = (int(part) for part in clock.split(":")[:2])
        label = fri_get_label(srcIP, dstIP, hour * 60 + minute)

        # Tuple keys: concatenated address strings can collide, e.g.
        # "1.1.1.1" + "11.1.1.1" == "1.1.1.11" + "1.1.1.1".
        key = (srcIP, dstIP)
        reverse_key = (dstIP, srcIP)
        if key in flow or reverse_key not in flow:
            # Known in this direction, or a brand-new flow.
            _record_packet(key, "-" + protocol, label)
        else:
            # The flow was opened by the peer; this is the reverse direction.
            _record_packet(reverse_key, "+" + protocol, label)

    print("字典中剩余个数:%d" % (len(flow)))
    true_num = sum(1 for packets in flow.values() if packets[0])
    false_num = len(flow) - true_num
    print("true num:%d" % (true_num))
    print("false num:%d" % (false_num))

    # Write the flushed flows, one row per flow.
    total_data = pd.DataFrame(data=feature_ls)
    total_data.to_csv(output_path, index=False,
                      encoding="utf-8", sep=',', mode='w', header=True)


# Script entry: process the configured capture and write the per-flow CSV.
pre_process(packet_path, packet_label_path)