# coding=UTF-8
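"""Train a sequence model (RNN/LSTM/GRU) on per-flow protocol sequences from
CIC-DDoS2019 label CSVs and evaluate it with F1, a classification report, and
a confusion matrix. Each sample is a sequence of protocol tokens with a leading
direction sign, encoded per time step as a one-hot protocol vector plus one
direction bit."""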
import torch
import torch.nn as nn
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from random import randint
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
from torch.utils.data import TensorDataset, Dataset
import logging
from collections import Counter
import os

import sys
sys.path.append("model")
from model import LSTM, GRU, RNN


class MyDataset(Dataset):
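    """Dataset built from one label CSV.

    Each row is "label, token_1, token_2, ...", where every token is a
    direction sign followed by a protocol name (e.g. a hypothetical '-TCP').
    __getitem__ returns (x, y): x has shape (sequence_len, input_size) with a
    one-hot protocol encoding per time step and the last position holding the
    direction bit (0 for '-', 1 otherwise); y is a 1-element label tensor.
    """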

    def __init__(self, label_file_path, protocols, sequence_len):
        self.sequence_len = sequence_len
        self.protocols = protocols
        self.input_size = len(self.protocols) + 1  # last position encodes the direction
        self.df = pd.read_csv(label_file_path)
        self.number_1 = 0
        self.number_0 = 0
        self.dataset_ls = []
        self.parse_file()
        self.len = len(self.dataset_ls)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return self.transform(self.dataset_ls[index])

    def parse_file(self):
        for index, row in self.df.iterrows():
            label = int(row[0])
            if label == 1:
                self.number_1 += 1
            else:
                self.number_0 += 1
            self.dataset_ls.append(row)
        logging.info("before sampling number_1:{}".format(self.number_1))
        logging.info("before sampling number_0:{}".format(self.number_0))
        return

    def transform(self, row):
        label = int(row[0])  # label
        y = torch.zeros(1)
        y[0] = label
        x = torch.zeros(self.sequence_len, self.input_size)
        # features
        features = row[1:self.sequence_len]
        for feature_idx, feature in enumerate(features):
            # direction
            direction = 1
            if feature[0] == '-':
                direction = 0
            # strip the direction information
            # direction = 1
            feature = feature[1:]
            code_idx = self.protocols.index(feature)
            x[feature_idx][code_idx] = 1
            x[feature_idx][-1] = direction
        return x, y


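# Scan every label CSV once to build the protocol vocabulary; its length plus
# one direction bit determines the model's input_size.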
def get_protocols(dir, sequence_len):
    logging.info("get protocols:")
    file_list = os.listdir(dir)
    # sort
    # file_list.sort(key=lambda x: int(x[0:1]))
    protocols = []
    for i in range(len(file_list)):
        logging.info("reading csv file: {}".format(file_list[i]))
        df = pd.read_csv(dir + '/' + file_list[i])
        # protocols are one-hot encoded; this vocabulary determines input_size
        for index, row in df.iterrows():
            features = row[1:sequence_len]
            for feature in features:
                protocol = feature[1:]  # strip the direction information
                if protocol not in protocols:
                    protocols.append(protocol)
    logging.info("protocols num: {}".format(len(protocols)))
    logging.info("protocols : {}".format(protocols))
    return protocols


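# Weight every sample by the inverse frequency of its class so the
# WeightedRandomSampler draws roughly balanced batches. For example, with
# 900 negatives and 100 positives the weights are 1/900 and 1/100.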
def get_sampler(dataset):
    number_1 = 0
    number_0 = 0
    for x, y in dataset:
        if y == 1:
            number_1 += 1
        else:
            number_0 += 1
    weight_1 = 1. / number_1
    weight_0 = 1. / number_0
    samples_weight = torch.zeros(len(dataset), dtype=torch.float)
    index = 0
    for x, y in dataset:
        if y == 1:
            samples_weight[index] = weight_1
        else:
            samples_weight[index] = weight_0
        index += 1
    sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    return sampler


# Preprocess one label file and return train/test DataLoader objects
def pre_process(path, sequence_len, train_portion, protocols, bs):
    dataset = MyDataset(label_file_path=path, protocols=protocols, sequence_len=sequence_len)
    # split into training and test sets
    train_len = int(len(dataset) * train_portion)
    test_len = len(dataset) - train_len
    train_dataset, test_dataset = random_split(dataset, [train_len, test_len])
    train_sampler = get_sampler(train_dataset)
    test_sampler = get_sampler(test_dataset)
    # loader
    train_loader = DataLoader(train_dataset, batch_size=bs, num_workers=0, sampler=train_sampler)
    test_loader = DataLoader(test_dataset, batch_size=bs, num_workers=0, sampler=test_sampler)
    return train_loader, test_loader


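# Train for num_epochs over one DataLoader. Note that num_epochs, device and
# model are module-level globals defined further below.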
def train(train_loader, criterion, optimizer, sequence_length, input_size):
    total_step = len(train_loader)
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # enumerate yields the batch index together with the batch itself
            images = images.reshape(-1, sequence_length, input_size).to(device)
            labels = labels.reshape(-1).to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels.long())

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # log every 200 steps/iterations
            if (i + 1) % 200 == 0:
                logging.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                             .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))


LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(filename="./cicddos2019/log/ss-RNN-cicddos2019.log", level=logging.DEBUG,
                    format=LOG_FORMAT, datefmt='%a, %d %b %Y %H:%M:%S', filemode="w")
logging.info("-------------")

# Device configuration
logging.info("cuda is_available:{}".format(torch.cuda.is_available()))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_dir = "./cicddos2019/label"
# Hyper-parameters
sequence_length = 40
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 100
num_epochs = 10
learning_rate = 0.001
train_portion = 0.8
# log the hyper-parameters
logging.info("parameters - batch_size:{}, num_epochs:{}, learning_rate:{}".format(
    batch_size, num_epochs, learning_rate))

# build the protocol vocabulary used for one-hot encoding
protocols = get_protocols(dataset_dir, sequence_length)
input_size = len(protocols) + 1  # last position encodes the direction, the rest one-hot encode the protocol

model = RNN(input_size, hidden_size, num_layers, num_classes, device).to(device)
# model = LSTM(input_size, hidden_size, num_layers, num_classes, device).to(device)
# model = GRU(input_size, hidden_size, num_layers, num_classes, device).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


test_loader_ls = []
file_list = os.listdir(dataset_dir)
# file_list.sort(key=lambda x: int(x[0:1]))  # sort
for i in range(len(file_list)):
    file_path = dataset_dir + '/' + file_list[i]
    logging.info("preprocessing file {}".format(file_path))
    train_loader, test_loader = pre_process(path=file_path, sequence_len=sequence_length, train_portion=train_portion, protocols=protocols, bs=batch_size)
    # train
    logging.info("start training {}".format(file_path))
    train(train_loader, criterion=criterion, optimizer=optimizer, sequence_length=sequence_length, input_size=input_size)
    test_loader_ls.append(test_loader)


# Test the model
model.eval()
logging.info("start testing on test dataset")
with torch.no_grad():
    pred_list = []
    label_list = []
    for test_loader in test_loader_ls:
        for images, labels in test_loader:
            images = images.reshape(-1, sequence_length, input_size).to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            pred_temp = predicted.data.tolist()
            label_temp = labels.data.tolist()
            pred_list.extend(pred_temp)
            label_list.extend(label_temp)
    f1 = f1_score(label_list, pred_list)
    logging.info("\nf1:{}".format(f1))
    logging.info("\n{}".format(classification_report(
        label_list, pred_list, digits=4)))
    logging.info("\n{}".format(confusion_matrix(label_list, pred_list)))

# Save the model checkpoint
# torch.save(model.state_dict(), './cicddos2019/model/model.ckpt')
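
# A minimal sketch (assuming the checkpoint above was actually saved and the
# same hyper-parameters and protocol vocabulary are rebuilt first) of how the
# model could be reloaded for inference later:
# model = RNN(input_size, hidden_size, num_layers, num_classes, device).to(device)
# model.load_state_dict(torch.load('./cicddos2019/model/model.ckpt', map_location=device))
# model.eval()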