# coding=UTF-8
import torch
import torch.nn as nn
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
from torch.utils.data import TensorDataset
import logging
from collections import Counter
import datetime

import sys
sys.path.append("model")
from model import LSTM, GRU, RNN


# Preprocess the label file and build DataLoader objects
def pre_process(path, sequence_len, train_portion, bs):
    dataset_ls = []
    protocols = []

    logging.info("reading csv file")
    df = pd.read_csv(path)
    logging.info("csv file already loaded")

    # One-hot-encode the protocols and derive input_size from how many appear
    for index, row in df.iterrows():
        dataset_ls.append(row)
        features = row[1:sequence_len]
        for i in features:
            protocol = i[1:]  # strip the direction marker
            if protocol not in protocols:
                protocols.append(protocol)
    input_size = len(protocols) + 1  # last slot encodes the direction; the rest one-hot encode the protocol
    logging.info("protocols num: {}".format(len(protocols)))
    logging.info("protocols : {}".format(protocols))

    data = torch.zeros([len(dataset_ls), sequence_len, input_size])  # inputs
    target = torch.zeros([len(dataset_ls)], dtype=torch.long)  # labels
    # detailed labels -- normal: 0, high: 1, low: 2
    target_detail = torch.zeros([len(dataset_ls)], dtype=torch.long)
    # class counts in the raw dataset
    dataset_high_num = 0
    dataset_low_num = 0
    dataset_normal_num = 0
    for i in range(len(dataset_ls)):
        row = dataset_ls[i]
        # label
        label = row[0]
        label_detail = 0
        if label == "normal":
            dataset_normal_num += 1
            label = 0
            label_detail = 0
        elif label == "high":
            dataset_high_num += 1
            label = 1
            label_detail = 1
        else:
            dataset_low_num += 1
            label = 1
            label_detail = 2
        target[i] = label
        target_detail[i] = label_detail
        # features
        features = row[1:sequence_len]
        for feature_idx, feature in enumerate(features):
            # direction
            direction = 1
            if feature[0] == '-':
                direction = 0
            # uncomment to discard the direction information:
            # direction = 1

            feature = feature[1:]
            code_idx = protocols.index(feature)
            data[i][feature_idx][code_idx] = 1
            data[i][feature_idx][-1] = direction

    logging.info("raw dataset counts (high,low,normal):({},{},{})".format(
        dataset_high_num, dataset_low_num, dataset_normal_num))
    # split into training and test sets
    train_len = int(len(data)*train_portion)
    test_len = len(data)-train_len
    dataset = TensorDataset(data, target, target_detail)
    train_dataset, test_dataset = random_split(dataset, [train_len, test_len])

    # training set
    train_x_ls = []
    train_y_ls = []
    for (x, y, y_detail) in train_dataset:
        train_x_ls.append(x)
        train_y_ls.append(y)
    # sampling
    train_y = torch.tensor(train_y_ls, dtype=torch.long)
    # Compute sample weights (each sample gets its own weight)
    class_sample_count = torch.tensor(
        [(train_y == t).sum() for t in torch.unique(train_y, sorted=True)])
    weight = 1. / class_sample_count.float()
    samples_weight = torch.tensor([weight[y] for y in train_y])
    # Create sampler, dataset, loader
    train_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    train_loader = DataLoader(
        train_dataset, batch_size=bs, num_workers=0, sampler=train_sampler)
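
    # Note: WeightedRandomSampler draws with replacement by default, so these
    # inverse-frequency weights make each class equally likely per draw. With
    # hypothetical counts of 900 normal and 100 attack samples, the weights
    # 1/900 and 1/100 give every batch a roughly 50/50 class mix in expectation.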

    # test set
    test_x_ls = []
    test_y_ls = []
    for (x, y, y_detail) in test_dataset:
        test_x_ls.append(x)
        test_y_ls.append(y)
    # sampling
    test_y = torch.tensor(test_y_ls, dtype=torch.long)
    # Compute sample weights (each sample gets its own weight)
    class_sample_count = torch.tensor(
        [(test_y == t).sum() for t in torch.unique(test_y, sorted=True)])
    weight = 1. / class_sample_count.float()
    samples_weight = torch.tensor([weight[y] for y in test_y])
    # Create sampler, dataset, loader
    test_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    test_loader = DataLoader(test_dataset, batch_size=bs,
                             num_workers=0, sampler=test_sampler)

    return train_loader, test_loader, input_size
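

# Expected CSV layout, inferred from the parsing in pre_process and stated here
# as an assumption: column 0 holds the label string ("normal", "high", or a
# low-rate label), and columns 1..sequence_len-1 hold direction-prefixed
# protocol tokens (e.g. a hypothetical "+TCP" or "-DNS").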

LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(filename="./cicdos2017/log/ss-GRU-cicdos2017_1_1.log", level=logging.DEBUG,
                    format=LOG_FORMAT, datefmt='%a, %d %b %Y %H:%M:%S', filemode="w")
logging.info("-------------")

# Device configuration
logging.info("cuda is_available:{}".format(torch.cuda.is_available()))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_path = "./cicdos2017/label/label.csv"
dataset_test_path = "./cicdos2017/label/label_test.csv"


# Hyper-parameters
sequence_length = 40
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 100
num_epochs = 25
learning_rate = 0.001
train_portion = 0.8
# log the parameters
logging.info("model parameters - num_layers:{}, sequence_len:{}".format(num_layers, sequence_length))
logging.info("parameters - batch_size:{}, num_epochs:{}, learning_rate:{}".format(
    batch_size, num_epochs, learning_rate))

train_loader, test_loader, input_size = pre_process(
    dataset_path, sequence_len=sequence_length, train_portion=train_portion, bs=batch_size)
logging.info("loader already prepared")

# check how many high/low samples ended up in the test loader
loader_high_num = 0
loader_low_num = 0
label_detail_ls = []
for images, labels, labels_detail in test_loader:
    label_detail_ls.extend(labels_detail.data.tolist())

logging.info("loader counts (0:normal,1:high,2:low):{}".format(
    Counter(label_detail_ls)))


# model = RNN(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
# model = LSTM(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
model = GRU(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
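
# The model classes come from model/model.py, which is not shown in this file.
# For orientation only, a minimal sketch of a compatible GRU wrapper -- an
# assumption, not the actual implementation -- could look like this:
#
# class GRU(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, num_classes, device):
#         super().__init__()
#         self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, num_classes)
#
#     def forward(self, x):
#         out, _ = self.gru(x)           # out: [batch, seq_len, hidden_size]
#         return self.fc(out[:, -1, :])  # classify from the last time step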

# Loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

start_time = datetime.datetime.now()
# Train the model
logging.info("start training")
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels, labels_detail) in enumerate(train_loader):
        # enumerate yields each batch together with its running index
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels.long())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # log every 50 steps/iterations
        if (i+1) % 50 == 0:
            logging.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                         .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

end_time = datetime.datetime.now()
# print((end_time-start_time).total_seconds())

# Test the model
logging.info("start testing on test dataset")
model.eval()
start_time = datetime.datetime.now()
with torch.no_grad():
    pred_list = []
    label_list = []
    label_detail_list = []
    correct = 0
    total = 0
    for images, labels, labels_detail in test_loader:
        total += len(labels)
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        pred_temp = predicted.data.tolist()
        label_temp = labels.data.tolist()
        label_detail_tmp = labels_detail.data.tolist()
        pred_list.extend(pred_temp)
        label_list.extend(label_temp)
        label_detail_list.extend(label_detail_tmp)

f1 = f1_score(label_list, pred_list)
logging.info("f1 :{}".format(f1))
logging.info("\n{}".format(classification_report(
    label_list, pred_list, digits=4)))
logging.info("\n{}".format(confusion_matrix(label_list, pred_list)))

# time cost (this timer was restarted before the test loop, so it measures test time)
end_time = datetime.datetime.now()
print("test_time:{}".format((end_time-start_time).total_seconds()))
print("test_packet_num:{}".format(total * sequence_length))

# break down detection results for high-rate and low-rate attacks
high_num = 0
high_true = 0
high_false = 0
low_num = 0
low_true = 0
low_false = 0
for index, i in enumerate(label_detail_list):
    if i == 1:
        # high-rate ddos
        high_num += 1
        if pred_list[index] == 1:
            high_true += 1
        else:
            high_false += 1
    elif i == 2:
        # low-rate ddos
        low_num += 1
        if pred_list[index] == 1:
            low_true += 1
        else:
            low_false += 1
    else:
        # normal
        continue

logging.info("high_num,high_true,high_false:{},{},{}".format(
    high_num, high_true, high_false))
logging.info("low_num,low_true,low_false:{},{},{}".format(
    low_num, low_true, low_false))
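# Here *_true counts attack windows correctly flagged as attacks, while *_false
# counts attack windows misclassified as normal (i.e. false negatives).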

# Save the model checkpoint
# torch.save(model.state_dict(), './cicdos2017/model/model.ckpt')