# -*- coding: utf-8 -*-
"""Turn per-packet CSV captures into fixed-length, labelled protocol sequences.

Every file found in ``input_dir`` is read as a CSV with ``Source``,
``Destination`` and ``Protocol`` columns. Packets are grouped by unordered
(source, destination) address pair; each group accumulates a list of the form
``[label, "-PROTO", "+PROTO", ...]`` where ``-`` marks packets travelling in
the direction of the first packet seen for the pair and ``+`` marks the
opposite direction. Each time a list reaches ``feature_latitude`` entries it
is emitted as one output row and a new list is started for that pair.
"""
import os
import re

import pandas as pd

DOS2019_FLOWS = {'attackers': ['172.16.0.5'],
                 'victims': ['192.168.50.1', '192.168.50.4']}

input_dir = "cicddos2019/input"
label_dir = "cicddos2019/label"
feature_latitude = 40 + 1  # the first element is the label

# Compiled once: the same pattern is applied to both addresses of every row.
_IPV4_RE = re.compile(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$")
_REQUIRED_COLUMNS = ("Source", "Destination", "Protocol")


def get_label(srcIP, dstIP):
    """Return 1 for attacker<->victim traffic (either direction), else 0.

    The attacker and victim addresses are taken from ``DOS2019_FLOWS``.
    """
    attackers = DOS2019_FLOWS['attackers']
    victims = DOS2019_FLOWS['victims']
    if srcIP in attackers and dstIP in victims:
        return 1
    if srcIP in victims and dstIP in attackers:
        return 1
    return 0


def _file_order(filename):
    """Sort key: numeric value of the leading digits; other names go last."""
    match = re.match(r"\d+", filename)
    if match:
        return (0, int(match.group()))
    return (1, 0)


def _collect_flows(df):
    """Return the completed ``[label, protocol, ...]`` rows found in *df*.

    Only lists that reached ``feature_latitude`` entries are returned; pairs
    that never accumulate that many entries are left out, as in the original
    implementation.
    """
    flows = {}
    completed = []
    packets = zip(df.index, df["Source"], df["Destination"], df["Protocol"])
    for index, raw_src, raw_dst, raw_protocol in packets:
        if index % 100000 == 0:
            print("processing index:{}".format(index))

        src_ip = str(raw_src)
        dst_ip = str(raw_dst)
        # Both endpoints must look like dotted-quad IPv4 addresses.
        if not _IPV4_RE.match(src_ip) or not _IPV4_RE.match(dst_ip):
            continue

        # str() keeps numeric / missing protocol values from raising on "+".
        protocol = str(raw_protocol)
        label = get_label(src_ip, dst_ip)

        # Tuple keys avoid the collisions plain string concatenation allows
        # (e.g. "1.1.1.1" + "11.1.1.1" == "1.1.1.11" + "1.1.1.1").
        key = (src_ip, dst_ip)
        reverse_key = (dst_ip, src_ip)
        if key in flows:
            flow_key, direction = key, "-"
        elif reverse_key in flows:
            flow_key, direction = reverse_key, "+"
        else:
            # First packet of this address pair: open a new flow.
            flows[key] = [label, "-" + protocol]
            continue

        record = flows[flow_key]
        if len(record) >= feature_latitude:
            # The flow is full: emit it and start a fresh one. The current
            # packet is appended exactly once, below.
            completed.append(record)
            record = flows[flow_key] = [label]
        record.append(direction + protocol)
        # A flow is labelled 1 as soon as any of its packets is.
        record[0] = record[0] or label
    return completed


def pre_process(dir, label_dir):
    """Convert every CSV in *dir* into a labelled flow CSV in *label_dir*.

    Args:
        dir: Directory containing the input CSV files. (The name shadows the
            builtin but is kept so existing keyword callers keep working.)
        label_dir: Directory receiving one output CSV per input file, with
            the same file name. It is created if it does not exist.

    Files are processed in order of the number their name starts with. A file
    lacking one of the ``Source``/``Destination``/``Protocol`` columns is
    reported and skipped.
    """
    files = sorted(os.listdir(dir), key=_file_order)
    os.makedirs(label_dir, exist_ok=True)
    for filename in files:
        input_path = os.path.join(dir, filename)
        df = pd.read_csv(input_path, encoding='ISO-8859-1')
        print("------------")
        print("processing file: {} ".format(input_path))

        missing = [name for name in _REQUIRED_COLUMNS if name not in df.columns]
        if missing:
            print("skipping file {}: missing columns {}".format(input_path, missing))
            continue

        feature_ls = _collect_flows(df)

        # write to csv file
        total_data = pd.DataFrame(data=feature_ls)
        label_path = os.path.join(label_dir, filename)
        total_data.to_csv(label_path, index=False, encoding="utf-8", sep=',',
                          mode='w', header=True)
    return


if __name__ == "__main__":
    # Guarded so importing this module does not start processing.
    pre_process(input_dir, label_dir)