84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
import pandas as pd
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
|
||
|
|
|
||
|
|
# Host addresses grouped by role ("attackers" / "victims").
# NOTE(review): presumably the hosts of the CIC-DDoS2019 capture -- confirm.
DOS2019_FLOWS = {
    "attackers": ["172.16.0.5"],
    "victims": ["192.168.50.1", "192.168.50.4"],
}

# Default directories handed to pre_process: where the packet CSVs are read
# from and where the generated CSVs are written.
input_dir = "cicddos2019/input"
label_dir = "cicddos2019/label"

# Number of elements in one flow record: the first element is the label,
# the remaining 40 are the per-packet entries.
feature_latitude = 1 + 40
|
||
|
|
|
||
|
|
|
||
|
|
def get_label(srcIP, dstIP):
    """Return 1 for traffic between the attacker and a victim host, else 0.

    The direction does not matter: a packet from the attacker to a victim
    and a packet from a victim back to the attacker are both labelled 1.

    Args:
        srcIP: source address of the packet, as a dotted-quad string.
        dstIP: destination address of the packet, as a dotted-quad string.

    Returns:
        1 if the address pair is (attacker, victim) in either order, 0 otherwise.
    """
    attacker = "172.16.0.5"
    victims = ("192.168.50.1", "192.168.50.4")

    attacker_to_victim = srcIP == attacker and dstIP in victims
    victim_to_attacker = dstIP == attacker and srcIP in victims
    return 1 if attacker_to_victim or victim_to_attacker else 0
|
||
|
|
|
||
|
|
def _numeric_prefix_key(filename):
    """Sort key ordering file names by their leading run of digits.

    Names without a leading digit sort after all numbered names instead of
    raising, and multi-digit prefixes ("10_x.csv") are compared as whole
    numbers rather than by their first character.
    """
    prefix = re.match(r"[0-9]+", filename)
    if prefix is None:
        return (1, 0, filename)
    return (0, int(prefix.group()), filename)


def _collect_full_flows(packets, row_length):
    """Group packets into bidirectional flows and return the completed ones.

    Args:
        packets: iterable of ``(source, destination, protocol)`` triples in
            capture order.
        row_length: number of elements of a complete flow record (the label
            plus the per-packet entries).

    Returns:
        A list of flow records. Each record is a list whose first element is
        the label and whose remaining elements are protocol names prefixed
        with "-" (same direction as the packet that opened the flow) or "+"
        (opposite direction). Flows that never reach ``row_length`` elements
        are discarded so every returned record has the same length.
    """
    ipv4 = re.compile(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$")
    finished = []
    open_flows = {}

    for index, (source, destination, protocol) in enumerate(packets):
        if index % 100000 == 0:
            print("processing index:{}".format(index))

        src_ip = str(source)
        dst_ip = str(destination)
        # Both endpoints must look like IPv4 addresses; this also drops
        # rows whose address cell is empty (NaN becomes "nan").
        if not ipv4.match(src_ip) or not ipv4.match(dst_ip):
            continue

        label = get_label(src_ip, dst_ip)

        # Tuple keys: concatenated strings can collide between different
        # address pairs.
        forward = (src_ip, dst_ip)
        backward = (dst_ip, src_ip)
        if forward in open_flows or backward not in open_flows:
            key, entry = forward, "-" + protocol
        else:
            key, entry = backward, "+" + protocol

        record = open_flows.get(key)
        if record is None:
            # First packet seen for this address pair.
            open_flows[key] = [label, entry]
        elif len(record) >= row_length:
            # The record is full: emit it and restart the flow with the
            # current packet, recorded exactly once.
            finished.append(record)
            open_flows[key] = [label, entry]
        else:
            record.append(entry)
            # The label is sticky: once a flow is marked 1 it stays 1.
            record[0] = record[0] or label

    # A flow that filled up on its very last packet has no later packet to
    # trigger its emission above, so emit it here.
    finished.extend(
        record for record in open_flows.values() if len(record) >= row_length
    )
    return finished


def pre_process(dir, label_dir):
    """Turn per-packet CSV files into fixed-length labelled flow records.

    Every file in ``dir`` is read as a CSV with ``Source``, ``Destination``
    and ``Protocol`` columns. Its packets are grouped into bidirectional
    flows of ``feature_latitude`` elements (label first), and the completed
    flows are written to a file of the same name in ``label_dir``.

    Files are processed in the order of their numeric name prefix. A file
    lacking one of the required columns is reported and skipped.

    Args:
        dir: directory containing the input CSV files.
        label_dir: existing directory that receives the output CSV files.

    Returns:
        None.
    """
    required_columns = ("Source", "Destination", "Protocol")

    for filename in sorted(os.listdir(dir), key=_numeric_prefix_key):
        input_path = os.path.join(dir, filename)
        df = pd.read_csv(input_path, encoding='ISO-8859-1')
        print("------------")
        print("processing file: {} ".format(input_path))

        missing = [name for name in required_columns if name not in df.columns]
        if missing:
            print("skipping {}: missing column(s) {}".format(
                input_path, ", ".join(missing)))
            continue

        # Iterating the three columns directly avoids building a Series per
        # row, which is what makes DataFrame.iterrows slow on large captures.
        packets = zip(df["Source"], df["Destination"], df["Protocol"])
        feature_ls = _collect_full_flows(packets, feature_latitude)

        # write to csv file
        label_path = os.path.join(label_dir, filename)
        pd.DataFrame(data=feature_ls).to_csv(
            label_path, index=False,
            encoding="utf-8", sep=',', mode='w', header=True)
    return
|
||
|
|
|
||
|
|
|
||
|
|
# Runs as soon as the module is executed (or imported): process the default
# input directory and write the results to the default output directory.
pre_process(dir=input_dir, label_dir=label_dir)
|