dengzeyi-sequenceshield/代码/sequenceShield/cicdos2017/run/run.py

# coding=UTF-8
import torch
import torch.nn as nn
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
from torch.utils.data import TensorDataset
import logging
from collections import Counter
import datetime
import sys
sys.path.append("model")
from model import LSTM, GRU, RNN
# Pre-process the label file into DataLoader objects
def pre_process(path, sequence_len, train_portion, bs):
    dataset_ls = []
    protocols = []
    logging.info("reading csv file")
    df = pd.read_csv(path)
    logging.info("csv file already loaded")
    # Build the protocol vocabulary; input_size is derived from its one-hot encoding
    for index, row in df.iterrows():
        dataset_ls.append(row)
        features = row[1:sequence_len]
        for i in features:
            protocol = i[1:]  # strip the leading direction flag
            if protocol not in protocols:
                protocols.append(protocol)
    input_size = len(protocols) + 1  # last slot encodes direction, the rest one-hot encode the protocol
    logging.info("protocols num: {}".format(len(protocols)))
    logging.info("protocols : {}".format(protocols))
    data = torch.zeros([len(dataset_ls), sequence_len, input_size])  # inputs
    target = torch.zeros([len(dataset_ls)], dtype=torch.long)  # labels
    # detailed labels: normal=0, high=1, low=2
    target_detail = torch.zeros([len(dataset_ls)], dtype=torch.long)
    # class counts in the raw dataset
    dataset_high_num = 0
    dataset_low_num = 0
    dataset_normal_num = 0
    for i in range(len(dataset_ls)):
        row = dataset_ls[i]
        # label: normal -> 0, any DDoS (high- or low-rate) -> 1
        label = row[0]
        label_detail = 0
        if label == "normal":
            dataset_normal_num += 1
            label = 0
            label_detail = 0
        elif label == "high":
            dataset_high_num += 1
            label = 1
            label_detail = 1
        else:
            dataset_low_num += 1
            label = 1
            label_detail = 2
        target[i] = label
        target_detail[i] = label_detail
        # features: columns 1..sequence_len-1 of the row
        features = row[1:sequence_len]
        for feature_idx, feature in enumerate(features):
            # direction: '+' -> 1, '-' -> 0
            direction = 1
            if feature[0] == '-':
                direction = 0
            # strip the direction flag
            feature = feature[1:]
            code_idx = protocols.index(feature)
            data[i][feature_idx][code_idx] = 1
            data[i][feature_idx][-1] = direction
    logging.info("raw dataset counts (high,low,normal): ({},{},{})".format(
        dataset_high_num, dataset_low_num, dataset_normal_num))
    # split into training and test sets
    train_len = int(len(data) * train_portion)
    test_len = len(data) - train_len
    dataset = TensorDataset(data, target, target_detail)
    train_dataset, test_dataset = random_split(dataset, [train_len, test_len])
    # training set
    train_x_ls = []
    train_y_ls = []
    for (x, y, y_detail) in train_dataset:
        train_x_ls.append(x)
        train_y_ls.append(y)
    # weighted sampling to balance the classes
    train_y = torch.tensor(train_y_ls, dtype=torch.long)
    # Compute sample weights (each sample gets its own weight)
    class_sample_count = torch.tensor(
        [(train_y == t).sum() for t in torch.unique(train_y, sorted=True)])
    weight = 1. / class_sample_count.float()
    samples_weight = torch.tensor([weight[y] for y in train_y])
    # Create sampler, dataset, loader
    train_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    train_loader = DataLoader(
        train_dataset, batch_size=bs, num_workers=0, sampler=train_sampler)
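    # Minimal sketch of why WeightedRandomSampler rebalances the classes
    # (hypothetical tensors, for illustration only):
    #   y = torch.tensor([0, 0, 0, 1])        # 3 normal samples, 1 attack
    #   w = 1. / torch.tensor([3., 1.])       # inverse class frequency
    #   sw = w[y]                             # per-sample weights
    #   WeightedRandomSampler(sw, len(sw))    # draws with replacement, so both
    #                                         # classes are picked about equally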
    # test set
    test_x_ls = []
    test_y_ls = []
    for (x, y, y_detail) in test_dataset:
        test_x_ls.append(x)
        test_y_ls.append(y)
    # weighted sampling to balance the classes
    test_y = torch.tensor(test_y_ls, dtype=torch.long)
    # Compute sample weights (each sample gets its own weight)
    class_sample_count = torch.tensor(
        [(test_y == t).sum() for t in torch.unique(test_y, sorted=True)])
    weight = 1. / class_sample_count.float()
    samples_weight = torch.tensor([weight[y] for y in test_y])
    # Create sampler, dataset, loader
    test_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    test_loader = DataLoader(test_dataset, batch_size=bs,
                             num_workers=0, sampler=test_sampler)
    return train_loader, test_loader, input_size

LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(filename="./cicdos2017/log/ss-GRU-cicdos2017_1_1.log", level=logging.DEBUG,
                    format=LOG_FORMAT, datefmt='%a, %d %b %Y %H:%M:%S', filemode="w")
logging.info("-------------")
# Device configuration
logging.info("cuda is_available:{}".format(torch.cuda.is_available()))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset_path = "./cicdos2017/label/label.csv"
dataset_test_path = "./cicdos2017/label/label_test.csv"
# Hyper-parameters
sequence_length = 40
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 100
num_epochs = 25
learning_rate = 0.001
train_portion = 0.8
# log the hyper-parameters
logging.info("model parameters - num_layers:{}, sequence_len:{}".format(num_layers, sequence_length))
logging.info("parameters - batch_size:{}, num_epochs:{}, learning_rate:{}".format(
    batch_size, num_epochs, learning_rate))
train_loader, test_loader, input_size = pre_process(
    dataset_path, sequence_len=sequence_length, train_portion=train_portion, bs=batch_size)
logging.info("loader already prepared")
# check the counts of high/low labels in the test loader
loader_high_num = 0
loader_low_num = 0
label_detail_ls = []
for images, labels, labels_detail in test_loader:
    label_detail_ls.extend(labels_detail.data.tolist())
logging.info("label counts in loader (0:normal, 1:high, 2:low): {}".format(
    Counter(label_detail_ls)))
# Pick one of the three recurrent backbones (GRU is used for this run)
# model = RNN(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
# model = LSTM(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
model = GRU(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)
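# Note: nn.CrossEntropyLoss applies log-softmax internally, so the model is
# expected to return raw, unnormalized logits of shape (batch, num_classes).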
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
start_time = datetime.datetime.now()
# Train the model
logging.info("start training")
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels, labels_detail) in enumerate(train_loader):
        # enumerate yields the step index alongside each batch
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels.long())
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # log once every 50 steps/iterations
        if (i + 1) % 50 == 0:
            logging.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                         .format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))
end_time = datetime.datetime.now()
#print((end_time-start_time).total_seconds())
# Test the model
logging.info("start testing on test dataset")
model.eval()
start_time = datetime.datetime.now()
with torch.no_grad():
    pred_list = []
    label_list = []
    label_detail_list = []
    correct = 0
    total = 0
    for images, labels, labels_detail in test_loader:
        total += len(labels)
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # argmax over the class dimension gives the predicted label
        _, predicted = torch.max(outputs.data, 1)
        pred_temp = predicted.data.tolist()
        label_temp = labels.data.tolist()
        label_detail_tmp = labels_detail.data.tolist()
        pred_list.extend(pred_temp)
        label_list.extend(label_temp)
        label_detail_list.extend(label_detail_tmp)
f1 = f1_score(label_list, pred_list)
logging.info("f1 :{}".format(f1))
logging.info("\n{}".format(classification_report(
    label_list, pred_list, digits=4)))
# confusion matrix: rows are true labels, columns are predicted labels
logging.info("\n{}".format(confusion_matrix(label_list, pred_list)))
# time consumption of the test pass (start_time was reset before testing)
end_time = datetime.datetime.now()
print("test_time:{}".format((end_time - start_time).total_seconds()))
print("test_packet_num:{}".format(total * sequence_length))
# per-class breakdown: how many high/low DDoS samples were detected
high_num = 0
high_true = 0
high_false = 0
low_num = 0
low_true = 0
low_false = 0
for index, i in enumerate(label_detail_list):
    if i == 1:
        # high-rate DDoS
        high_num += 1
        if pred_list[index] == 1:
            high_true += 1
        else:
            high_false += 1
    elif i == 2:
        # low-rate DDoS
        low_num += 1
        if pred_list[index] == 1:
            low_true += 1
        else:
            low_false += 1
    else:
        # normal traffic
        continue
logging.info("high_num,high_true,high_false:{},{},{}".format(
    high_num, high_true, high_false))
logging.info("low_num,low_true,low_false:{},{},{}".format(
    low_num, low_true, low_false))
# Save the model checkpoint
# torch.save(model.state_dict(), './cicdos2017/model/model.ckpt')
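# A checkpoint saved above could later be restored like this (sketch, assuming
# the same constructor arguments and checkpoint path as in the commented line):
#   model = GRU(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
#   model.load_state_dict(torch.load('./cicdos2017/model/model.ckpt'))
#   model.eval()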