104 lines
3.5 KiB
Python
104 lines
3.5 KiB
Python
|
|
import pandas as pd
|
|||
|
|
import os
|
|||
|
|
from sklearn.metrics import classification_report
|
|||
|
|
from sklearn.metrics import roc_curve, auc, roc_auc_score
|
|||
|
|
|
|||
|
|
|
|||
|
|
from sklearn.metrics import confusion_matrix
|
|||
|
|
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
|
|||
|
|
|
|||
|
|
# CSV of captured packets that pre_process() reads.
packet_path = "cicids2017/friday-dos.csv"
# CSV that pre_process() writes the completed flow records to.
packet_label_path = "cicids2017/friday-dos-label.csv"

# Flows still being assembled: concatenated-endpoint key -> [label, token, ...].
flow: dict = {}
# Record-size threshold; the first element of every record is the label.
feature_latitude = 1 + 40
# Completed flow records, in the order they were flushed out of `flow`.
feature_ls: list = []
|
|||
|
|
|
|||
|
|
|
|||
|
|
def wed_get_label(srcIP, dstIP, time):
    """Return the Wednesday label for one packet.

    The label is True only when the packet travels between 172.16.0.1 and
    192.168.10.50 (in either direction) and ``time`` falls inside one of the
    four inclusive windows listed below.

    Args:
        srcIP: Source address of the packet, as a string.
        dstIP: Destination address of the packet, as a string.
        time: Time of day expressed as ``hour * 60 + minute``.

    Returns:
        True when both the host pair and the time window match, else False.
    """
    # for wednesday
    host_a, host_b = "172.16.0.1", "192.168.10.50"
    # Every window applies to the same host pair, so check it once up front.
    if (srcIP, dstIP) not in ((host_a, host_b), (host_b, host_a)):
        return False

    # Inclusive (start, end) bounds, in minutes since midnight.
    windows = (
        (9 * 60 + 47, 10 * 60 + 10),
        (10 * 60 + 14, 10 * 60 + 35),
        (10 * 60 + 43, 11 * 60 + 0),
        (11 * 60 + 10, 11 * 60 + 23),
    )
    return any(start <= time <= end for start, end in windows)
|
|||
|
|
|
|||
|
|
def fri_get_label(srcIP, dstIP, time):
    """Return the Friday label for one packet.

    The label is True only when ``time`` (``hour * 60 + minute``) lies in the
    inclusive window 15:56-16:16 and the packet travels between 172.16.0.1
    and 192.168.10.50 in either direction.
    """
    # for friday
    window_start = 15 * 60 + 56
    window_end = 16 * 60 + 16
    if not (window_start <= time <= window_end):
        return False

    forward = srcIP == "172.16.0.1" and dstIP == "192.168.10.50"
    backward = dstIP == "172.16.0.1" and srcIP == "192.168.10.50"
    if forward or backward:
        return True
    return False
|
|||
|
|
|
|||
|
|
def _minute_of_day(timestamp):
    """Return ``hour * 60 + minute`` parsed from the last space-separated field."""
    clock = timestamp.split(" ")[-1].split(":")
    return int(clock[0]) * 60 + int(clock[1])


def _append_packet(flow_key, token, label):
    """Add one direction-tagged protocol token to the flow stored under ``flow_key``."""
    record = flow[flow_key]
    # NOTE(review): with `>` a record is flushed once it holds
    # feature_latitude + 1 entries (label + 41 tokens). If exactly
    # feature_latitude entries were intended this should be `>=`; confirm
    # against whatever consumes the CSV before changing the row width.
    if len(record) > feature_latitude:
        # Record is full: emit it and let the current packet open the next
        # one. The packet is stored exactly once in the new record.
        feature_ls.append(flow.pop(flow_key))
        flow[flow_key] = [label, token]
        return
    record.append(token)
    # A single True packet marks the whole flow as True.
    record[0] = record[0] or label


def pre_process(input_path, output_path):
    """Group packets from a CSV into labelled flow records and write them out.

    The input CSV must provide ``Source``, ``Destination``, ``Time`` and
    ``Protocol`` columns. Packets between the same two endpoints, in either
    direction, are collected in the module-level ``flow`` dict as
    ``[label, token, ...]``. A token is the protocol prefixed with ``-`` for
    the direction that opened the flow and ``+`` for the opposite direction.
    Each packet is labelled with ``fri_get_label``.

    When a record has grown past ``feature_latitude`` entries it is moved to
    the module-level ``feature_ls`` list and a new record is started. Records
    still in ``flow`` when the input ends are only counted and printed; they
    are not written to the output file.

    Args:
        input_path: Path of the packet CSV to read.
        output_path: Path of the CSV that receives the completed records.

    Returns:
        None.
    """
    df = pd.read_csv(input_path)
    columns = (df["Source"], df["Destination"], df["Time"], df["Protocol"])

    for index, (srcIP, dstIP, timestamp, protocol) in enumerate(zip(*columns)):
        if index % 20000 == 0:
            print(index)

        # get label
        label = fri_get_label(srcIP, dstIP, _minute_of_day(timestamp))

        # key
        # NOTE(review): the two addresses are concatenated without a
        # separator, so distinct endpoint pairs could in principle produce
        # the same key.
        key = srcIP + dstIP
        reverse_key = dstIP + srcIP

        if key in flow:
            # Same direction as the packet that opened the flow.
            _append_packet(key, "-" + protocol, label)
        elif reverse_key in flow:
            # Opposite direction of an existing flow.
            _append_packet(reverse_key, "+" + protocol, label)
        else:
            # Create a new flow.
            flow[key] = [label, "-" + protocol]

    print("字典中剩余个数:%d" % (len(flow)))
    true_num = sum(1 for record in flow.values() if record[0])
    false_num = len(flow) - true_num
    print("true num:%d" % (true_num))
    print("false num:%d" % (false_num))

    # write to csv file
    total_data = pd.DataFrame(data=feature_ls)
    total_data.to_csv(output_path, index=False,
                      encoding="utf-8", sep=',', mode='w', header=True)
    return
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the preprocessing: read the packet CSV and write the flow-record CSV.
pre_process(input_path=packet_path, output_path=packet_label_path)
|