dengzeyi-sequenceshield/代码/sequenceShield/cicddos2019/run/run.py
# coding=UTF-8
import logging
import os
import sys

import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from torch.utils.data import Dataset, random_split
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler

sys.path.append("model")
from model import LSTM, GRU, RNN

class MyDataset(Dataset):
    def __init__(self, label_file_path, protocols, sequence_len):
        self.sequence_len = sequence_len
        self.protocols = protocols
        self.input_size = len(self.protocols) + 1  # last slot encodes the direction bit
        self.df = pd.read_csv(label_file_path)
        self.number_1 = 0
        self.number_0 = 0
        self.dataset_ls = []
        self.parse_file()
        self.len = len(self.dataset_ls)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return self.transfor(self.dataset_ls[index])

    def parse_file(self):
        # collect rows and count the two classes
        for index, row in self.df.iterrows():
            label = int(row.iloc[0])
            if label == 1:
                self.number_1 += 1
            else:
                self.number_0 += 1
            self.dataset_ls.append(row)
        logging.info("before sampling number_1:{}".format(self.number_1))
        logging.info("before sampling number_0:{}".format(self.number_0))
        return

    def transfor(self, row):
        label = int(row.iloc[0])  # label
        y = torch.zeros(1)
        y[0] = label
        x = torch.zeros(self.sequence_len, self.input_size)
        # token columns (positions 1 .. sequence_len-1)
        features = row.iloc[1:self.sequence_len]
        for feature_idx, feature in enumerate(features):
            # direction: a leading '-' marks the reverse direction
            direction = 1
            if feature[0] == '-':
                direction = 0
            # to ignore direction entirely, force it instead:
            # direction = 1
            feature = feature[1:]  # strip the direction prefix
            code_idx = self.protocols.index(feature)
            x[feature_idx][code_idx] = 1
            x[feature_idx][-1] = direction
        return x, y
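
# A minimal usage sketch for MyDataset. The CSV layout is an assumption taken
# from the parsing above (column 0 is the 0/1 label, the remaining columns are
# direction-prefixed protocol tokens such as "+TCP" or "-DNS"); the file name
# "example.csv" is hypothetical:
#
#   ds = MyDataset("example.csv", protocols=["TCP", "DNS"], sequence_len=4)
#   x, y = ds[0]
#   # x has shape (4, 3): columns 0-1 one-hot encode TCP/DNS, column 2 holds
#   # the direction bit; y is a one-element tensor holding the label.
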
def get_protocols(dir, sequence_len):
    logging.info("get protocols:")
    file_list = os.listdir(dir)
    # sort
    # file_list.sort(key=lambda x: int(x[0:1]))
    protocols = []
    for i in range(len(file_list)):
        logging.info("reading csv file: {}".format(file_list[i]))
        df = pd.read_csv(dir + '/' + file_list[i])
        # collect the protocol vocabulary; input_size for one-hot encoding is derived from it
        for index, row in df.iterrows():
            features = row.iloc[1:sequence_len]
            for feature in features:
                protocol = feature[1:]  # strip the direction prefix
                if protocol not in protocols:
                    protocols.append(protocol)
    logging.info("protocols num: {}".format(len(protocols)))
    logging.info("protocols : {}".format(protocols))
    return protocols
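
# Sketch of the expected behaviour (the token format is an assumption based on
# the parsing above, which strips a one-character direction prefix from each
# cell): given rows like "1,+TCP,-DNS,+TCP", get_protocols returns
# ["TCP", "DNS"], and the module-level input_size = len(protocols) + 1
# computed below becomes 3.
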
def get_sampler(dataset):
    # count class frequencies
    number_1 = 0
    number_0 = 0
    for x, y in dataset:
        if y == 1:
            number_1 += 1
        else:
            number_0 += 1
    # weight every sample inversely to its class frequency
    weight_1 = 1. / number_1
    weight_0 = 1. / number_0
    samples_weight = torch.zeros(len(dataset), dtype=torch.float)
    index = 0
    for x, y in dataset:
        if y == 1:
            samples_weight[index] = weight_1
        else:
            samples_weight[index] = weight_0
        index += 1
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler
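
# Example of the weighting above: with 900 negative and 100 positive samples,
# weight_0 = 1/900 and weight_1 = 1/100, so WeightedRandomSampler (which draws
# with replacement by default) yields roughly class-balanced batches over
# len(dataset) draws per epoch.
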
# Preprocess a label file into train/test DataLoader objects
def pre_process(path, sequence_len, train_portion, protocols, bs):
    dataset = MyDataset(label_file_path=path, protocols=protocols, sequence_len=sequence_len)
    # split into training and test sets
    train_len = int(len(dataset) * train_portion)
    test_len = len(dataset) - train_len
    train_dataset, test_dataset = random_split(dataset, [train_len, test_len])
    train_sampler = get_sampler(train_dataset)
    test_sampler = get_sampler(test_dataset)
    # loaders
    train_loader = DataLoader(train_dataset, batch_size=bs, num_workers=0, sampler=train_sampler)
    test_loader = DataLoader(test_dataset, batch_size=bs, num_workers=0, sampler=test_sampler)
    return train_loader, test_loader
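
# Usage sketch (the file path here is hypothetical; the real paths are built
# from dataset_dir below):
#
#   train_loader, test_loader = pre_process(path="./cicddos2019/label/example.csv",
#                                           sequence_len=40, train_portion=0.8,
#                                           protocols=protocols, bs=100)
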
def train(train_loader, criterion, optimizer, sequence_length, input_size):
    # model, num_epochs and device are read from the module-level globals below
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.reshape(-1, sequence_length, input_size).to(device)
            labels = labels.reshape(-1).to(device)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels.long())
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # log every 200 steps/iterations
            if (i + 1) % 200 == 0:
                logging.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                             .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
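
# Shape walkthrough for one training step (with batch_size=100,
# sequence_length=40, input_size=len(protocols)+1): images arrive as
# (100, 40, input_size) and labels as (100, 1); reshape(-1) flattens labels to
# (100,), so CrossEntropyLoss compares a vector of class indices against
# (100, num_classes) logits.
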
LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(filename="./cicddos2019/log/ss-RNN-cicddos2019.log", level=logging.DEBUG,
                    format=LOG_FORMAT, datefmt='%a, %d %b %Y %H:%M:%S', filemode="w")
logging.info("-------------")
# Device configuration
logging.info("cuda is_available:{}".format(torch.cuda.is_available()))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset_dir = "./cicddos2019/label"
# Hyper-parameters
sequence_length = 40
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 100
num_epochs = 10
learning_rate = 0.001
train_portion = 0.8
# log the parameters
logging.info("parameters - batch_size:{}, num_epochs:{}, learning_rate:{}".format(
    batch_size, num_epochs, learning_rate))
# build the protocol vocabulary for one-hot encoding
protocols = get_protocols(dataset_dir, sequence_length)
input_size = len(protocols) + 1  # last slot marks direction; the rest one-hot encode the protocol
model = RNN(input_size, hidden_size, num_layers, num_classes, device).to(device)
# model = LSTM(input_size, hidden_size, num_layers, num_classes, device).to(device)
# model = GRU(input_size, hidden_size, num_layers, num_classes, device).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
test_loader_ls = []
file_list = os.listdir(dataset_dir)
# file_list.sort(key=lambda x: int(x[0:1]))  # sort
for i in range(len(file_list)):
    file_path = dataset_dir + '/' + file_list[i]
    logging.info("preprocessing file {}".format(file_path))
    train_loader, test_loader = pre_process(path=file_path, sequence_len=sequence_length,
                                            train_portion=train_portion, protocols=protocols, bs=batch_size)
    # train on this file, keeping its test split for the final evaluation
    logging.info("start training {}".format(file_path))
    train(train_loader, criterion=criterion, optimizer=optimizer,
          sequence_length=sequence_length, input_size=input_size)
    test_loader_ls.append(test_loader)
# Test the model
model.eval()
logging.info("start testing on test dataset")
with torch.no_grad():
    pred_list = []
    label_list = []
    for test_loader in test_loader_ls:
        for images, labels in test_loader:
            images = images.reshape(-1, sequence_length, input_size).to(device)
            labels = labels.reshape(-1).to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            pred_list.extend(predicted.data.tolist())
            label_list.extend(labels.data.tolist())
f1 = f1_score(label_list, pred_list)
logging.info("\nf1:{}".format(f1))
logging.info("\n{}".format(classification_report(
    label_list, pred_list, digits=4)))
logging.info("\n{}".format(confusion_matrix(label_list, pred_list)))
# Save the model checkpoint
# torch.save(model.state_dict(), './cicddos2019/model/model.ckpt')