"""Build labelled, direction-tagged protocol sequences from a packet CSV."""

import os

import numpy as np
import pandas as pd

# The scikit-learn metrics are not referenced by the code in this file, so a
# missing installation must not stop the preprocessing from running.
try:
    from sklearn.metrics import classification_report
    from sklearn.metrics import roc_curve, auc, roc_auc_score
    from sklearn.metrics import confusion_matrix
except ImportError:
    pass

packet_path = "cicids2017/friday-dos.csv"
packet_label_path = "cicids2017/friday-dos-label.csv"

# Flows still being assembled: (first-seen source, first-seen destination)
# -> [label, tagged protocol, tagged protocol, ...].
flow = {}
feature_latitude = 40 + 1  # the first element of every flow list is the label
# Flow lists that outgrew the length limit and are ready to be written out.
feature_ls = []

# Traffic between these two hosts, in either direction, is labelled True when
# it falls inside one of the time windows below.
_HOST_A = "172.16.0.1"
_HOST_B = "192.168.10.50"

# Inclusive (start, end) windows, expressed in minutes since midnight.
_WED_WINDOWS = (
    (9 * 60 + 47, 10 * 60 + 10),
    (10 * 60 + 14, 10 * 60 + 35),
    (10 * 60 + 43, 11 * 60 + 0),
    (11 * 60 + 10, 11 * 60 + 23),
)
_FRI_WINDOWS = ((15 * 60 + 56, 16 * 60 + 16),)


def _is_labelled_pair(srcIP, dstIP):
    """Return True when the packet travels between the two labelled hosts."""
    endpoints = (srcIP, dstIP)
    return endpoints == (_HOST_A, _HOST_B) or endpoints == (_HOST_B, _HOST_A)


def _label_in_windows(windows, srcIP, dstIP, time):
    """Return True when *time* is inside any window and the hosts match."""
    in_window = any(start <= time <= end for start, end in windows)
    return in_window and _is_labelled_pair(srcIP, dstIP)


def wed_get_label(srcIP: str, dstIP: str, time: int) -> bool:
    """Label one packet using the Wednesday time windows.

    Args:
        srcIP: Source address of the packet.
        dstIP: Destination address of the packet.
        time: Time of day in minutes since midnight.

    Returns:
        True when the packet is exchanged between 172.16.0.1 and
        192.168.10.50 within one of the Wednesday windows (bounds
        inclusive), otherwise False.
    """
    return _label_in_windows(_WED_WINDOWS, srcIP, dstIP, time)


def fri_get_label(srcIP: str, dstIP: str, time: int) -> bool:
    """Label one packet using the Friday time window.

    Args:
        srcIP: Source address of the packet.
        dstIP: Destination address of the packet.
        time: Time of day in minutes since midnight.

    Returns:
        True when the packet is exchanged between 172.16.0.1 and
        192.168.10.50 between 15:56 and 16:16 (bounds inclusive),
        otherwise False.
    """
    return _label_in_windows(_FRI_WINDOWS, srcIP, dstIP, time)


def _minute_of_day(timestamp):
    """Convert '<anything> HH:MM[:SS]' into minutes since midnight."""
    clock_parts = timestamp.split(" ")[-1].split(":")
    return int(clock_parts[0]) * 60 + int(clock_parts[1])


def _add_packet(srcIP, dstIP, protocol, label):
    """Append one packet to its flow, flushing the flow first if it is full."""
    # Tuple keys keep the two addresses separate; concatenating them would
    # let different address pairs collapse into the same string.
    key = (srcIP, dstIP)
    reverse_key = (dstIP, srcIP)

    if key in flow:
        owner, direction = key, "-"
    elif reverse_key in flow:
        owner, direction = reverse_key, "+"
    else:
        # First packet of a new flow; its direction defines "-".
        flow[key] = [label, "-" + protocol]
        return

    if len(flow[owner]) > feature_latitude:
        feature_ls.append(flow.pop(owner))
        # Start the replacement flow empty: the current packet is appended
        # exactly once just below.
        flow[owner] = [label]

    packets = flow[owner]
    packets.append(direction + protocol)
    # A single True packet marks the whole flow as True.
    packets[0] = packets[0] or label


def pre_process(input_path, output_path):
    """Group the packets of a CSV into flows and write the full ones out.

    The input CSV must provide the columns "Source", "Destination", "Time"
    and "Protocol". Packets are labelled with ``fri_get_label`` and grouped
    by address pair regardless of direction. Each packet contributes its
    protocol prefixed with "-" when it travels in the direction of the
    flow's first packet and with "+" otherwise.

    A flow is moved to ``feature_ls`` when a packet arrives for it while it
    already holds more than ``feature_latitude`` entries; only those flows
    are written to *output_path*. Flows still in ``flow`` at the end are
    counted and reported but not written. Both module-level containers are
    updated in place, so repeated calls accumulate.

    NOTE(review): because the length test is ``>``, every written row has
    ``feature_latitude + 1`` entries (label included). This is kept as-is so
    the width of the output file does not change - confirm it is intended.

    Args:
        input_path: Path of the packet CSV to read.
        output_path: Path of the flow CSV to write.
    """
    df = pd.read_csv(input_path)
    packets = zip(df["Source"], df["Destination"], df["Time"], df["Protocol"])
    for index, (srcIP, dstIP, timestamp, protocol) in enumerate(packets):
        if index % 20000 == 0:
            print(index)  # progress indicator
        label = fri_get_label(srcIP, dstIP, _minute_of_day(timestamp))
        _add_packet(srcIP, dstIP, protocol, label)

    print("字典中剩余个数:%d" % (len(flow)))
    true_num = sum(1 for entries in flow.values() if entries[0])
    false_num = len(flow) - true_num
    print("true num:%d" % (true_num))
    print("false num:%d" % (false_num))

    # Write the completed flows to the CSV file.
    total_data = pd.DataFrame(data=feature_ls)
    total_data.to_csv(output_path, index=False, encoding="utf-8",
                      sep=',', mode='w', header=True)
    return


if __name__ == "__main__":
    pre_process(packet_path, packet_label_path)