This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
dengzeyi-sequenceshield/代码/sequenceShield/cicddos2019/script/parse.py

84 lines
2.8 KiB
Python
Raw Normal View History

2022-11-21 12:08:58 +08:00
# -*- coding: utf-8 -*-
import pandas as pd
import os
import re
# IP addresses of the attacking and the attacked hosts.
DOS2019_FLOWS = {
    'attackers': ['172.16.0.5'],
    'victims': ['192.168.50.1', '192.168.50.4'],
}

# Directory holding the input CSV files, and the directory the per-file
# results are written to.
input_dir = "cicddos2019/input"
label_dir = "cicddos2019/label"

# Length of one flow record: 40 entries plus one, because the first element
# of every record is the label.
feature_latitude = 40 + 1
def get_label(srcIP, dstIP, flows=None):
    """Return 1 when a packet travels between an attacker and a victim, else 0.

    Direction is ignored: attacker -> victim and victim -> attacker are both
    labelled 1. Any other pair of endpoints is labelled 0.

    Args:
        srcIP: Source IP address of the packet, as a string.
        dstIP: Destination IP address of the packet, as a string.
        flows: Optional mapping with 'attackers' and 'victims' keys, each a
            collection of IP address strings. When omitted, the module-level
            DOS2019_FLOWS table is used, so the hosts are declared in one
            place only.

    Returns:
        1 if one endpoint is an attacker and the other one a victim,
        otherwise 0.
    """
    if flows is None:
        flows = DOS2019_FLOWS
    attackers = flows['attackers']
    victims = flows['victims']

    attacker_to_victim = srcIP in attackers and dstIP in victims
    victim_to_attacker = dstIP in attackers and srcIP in victims
    return 1 if attacker_to_victim or victim_to_attacker else 0
def _leading_number(filename):
    """Sort key: order files by their full leading integer, other names last."""
    digits = re.match(r"\d+", filename)
    if digits is None:
        # No numeric prefix: keep the file, but sort it after the numbered
        # ones instead of crashing on int('').
        return (1, 0, filename)
    return (0, int(digits.group()), filename)


def _extract_sequences(df):
    """Collect the completed protocol sequences found in one capture table.

    Packets are grouped by unordered IP pair. Each group is a list whose
    first element is the label and whose remaining elements are protocol
    names prefixed with "-" (same direction as the first packet seen for the
    pair) or "+" (opposite direction). A group is emitted once it holds
    feature_latitude elements; groups that never fill up are discarded.
    """
    required = ("Source", "Destination", "Protocol")
    missing = [name for name in required if name not in df.columns]
    if missing:
        print("missing columns {}, no rows processed".format(missing))
        return []

    # Compiled once instead of once per row.
    is_ipv4 = re.compile(r"^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$").match

    sequences = []
    flow = {}
    packets = zip(df["Source"], df["Destination"], df["Protocol"])
    for index, (source, destination, protocol) in enumerate(packets):
        if index % 100000 == 0:
            print("processing index:{}".format(index))

        srcIP = str(source)
        dstIP = str(destination)
        # Both endpoints must look like dotted IPv4 addresses; anything else
        # (IPv6, MAC addresses, missing values) is skipped.
        if not is_ipv4(srcIP) or not is_ipv4(dstIP):
            continue

        # str() keeps a missing protocol value from raising on concatenation.
        protocol = str(protocol)
        label = get_label(srcIP, dstIP)

        # Tuple keys cannot collide the way concatenated address strings can
        # (e.g. "1.1.1.11" + "1.1.1.1" == "1.1.1.1" + "11.1.1.1").
        key = (srcIP, dstIP)
        reverse_key = (dstIP, srcIP)
        if key in flow:
            flow_key, direction = key, "-"
        elif reverse_key in flow:
            flow_key, direction = reverse_key, "+"
        else:
            # First packet between these two hosts: open a new flow.
            flow[key] = [label, "-" + protocol]
            continue

        if len(flow[flow_key]) >= feature_latitude:
            # The flow is full: emit it and restart with only the label, so
            # the current packet is appended exactly once below.
            sequences.append(flow.pop(flow_key))
            flow[flow_key] = [label]
        record = flow[flow_key]
        record.append(direction + protocol)
        # Once any packet of the flow is labelled 1 the flow stays 1.
        record[0] = record[0] or label

    # Flows that filled up with their very last packet were never followed by
    # another packet to trigger the flush above; emit them here.
    sequences.extend(
        record for record in flow.values() if len(record) >= feature_latitude
    )
    return sequences


def pre_process(dir, label_dir):
    """Convert every capture CSV in *dir* into labelled protocol sequences.

    Files are processed in order of their leading number. Each file is read
    as ISO-8859-1 CSV and must provide the columns "Source", "Destination"
    and "Protocol". The sequences extracted from a file are written as a
    UTF-8 CSV with the same file name into *label_dir*; every row holds
    feature_latitude values, the first being the label returned by
    get_label.

    Args:
        dir: Directory containing the input CSV files.
        label_dir: Directory the output CSV files are written to. It is
            created if it does not exist yet.
    """
    os.makedirs(label_dir, exist_ok=True)

    for filename in sorted(os.listdir(dir), key=_leading_number):
        input_path = os.path.join(dir, filename)
        df = pd.read_csv(input_path, encoding='ISO-8859-1')
        print("------------")
        print("processing file: {} ".format(input_path))

        total_data = pd.DataFrame(data=_extract_sequences(df))

        # write to csv file
        label_path = os.path.join(label_dir, filename)
        total_data.to_csv(label_path, index=False,
                          encoding="utf-8", sep=',', mode='w', header=True)
# Script entry point: process every file in input_dir and write the results
# to label_dir.
# NOTE(review): this is a bare top-level call, so it also runs when the module
# is imported; consider an `if __name__ == "__main__":` guard.
pre_process(input_dir, label_dir)