# coding=UTF-8
import torch
import torch.nn as nn
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import random_split
from torch.utils.data import TensorDataset
import logging
from collections import Counter
import datetime

import sys
sys.path.append("model")
from model import LSTM, GRU, RNN


# Preprocess the label file and build DataLoader objects
def pre_process(path, sequence_len, train_portion, bs):
    dataset_ls = []
    protocols = []

    logging.info("reading csv file")
    df = pd.read_csv(path)
    logging.info("csv file already loaded")

    # One-hot-encode the protocols and derive input_size from how many appear
    for index, row in df.iterrows():
        dataset_ls.append(row)
        features = row[1:sequence_len]
        for i in features:
            protocol = i[1:]  # strip the direction marker
            if protocol not in protocols:
                protocols.append(protocol)
    input_size = len(protocols) + 1  # last slot encodes the direction; the rest one-hot encode the protocol
    logging.info("protocols num: {}".format(len(protocols)))
    logging.info("protocols : {}".format(protocols))

    data = torch.zeros([len(dataset_ls), sequence_len, input_size])  # inputs
    target = torch.zeros([len(dataset_ls)], dtype=torch.long)  # labels
    # detailed labels -- normal: 0, high: 1, low: 2
    target_detail = torch.zeros([len(dataset_ls)], dtype=torch.long)
    # class counts in the raw dataset
    dataset_high_num = 0
    dataset_low_num = 0
    dataset_normal_num = 0
    for i in range(len(dataset_ls)):
        row = dataset_ls[i]
        # label
        label = row[0]
        label_detail = 0
        if label == "normal":
            dataset_normal_num += 1
            label = 0
            label_detail = 0
        elif label == "high":
            dataset_high_num += 1
            label = 1
            label_detail = 1
        else:
            dataset_low_num += 1
            label = 1
            label_detail = 2
        target[i] = label
        target_detail[i] = label_detail
        # features
        features = row[1:sequence_len]
        for feature_idx, feature in enumerate(features):
            # direction
            direction = 1
            if feature[0] == '-':
                direction = 0
            # uncomment to discard the direction information:
            # direction = 1

            feature = feature[1:]
            code_idx = protocols.index(feature)
            data[i][feature_idx][code_idx] = 1
            data[i][feature_idx][-1] = direction

    logging.info("raw dataset counts (high,low,normal):({},{},{})".format(
        dataset_high_num, dataset_low_num, dataset_normal_num))
    # split into training and test sets
    train_len = int(len(data)*train_portion)
    test_len = len(data)-train_len
    dataset = TensorDataset(data, target, target_detail)
    train_dataset, test_dataset = random_split(dataset, [train_len, test_len])

    # training set
    train_x_ls = []
    train_y_ls = []
    for (x, y, y_detail) in train_dataset:
        train_x_ls.append(x)
        train_y_ls.append(y)
    # sampling
    train_y = torch.tensor(train_y_ls, dtype=torch.long)
    # Compute sample weights (each sample gets its own weight)
    class_sample_count = torch.tensor(
        [(train_y == t).sum() for t in torch.unique(train_y, sorted=True)])
    weight = 1. / class_sample_count.float()
    samples_weight = torch.tensor([weight[y] for y in train_y])
    # Create sampler, dataset, loader
    train_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    train_loader = DataLoader(
        train_dataset, batch_size=bs, num_workers=0, sampler=train_sampler)
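
    # Note: WeightedRandomSampler draws with replacement by default, so these
    # inverse-frequency weights make each class equally likely per draw. With
    # hypothetical counts of 900 normal and 100 attack samples, the weights
    # 1/900 and 1/100 give every batch a roughly 50/50 class mix in expectation.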

    # test set
    test_x_ls = []
    test_y_ls = []
    for (x, y, y_detail) in test_dataset:
        test_x_ls.append(x)
        test_y_ls.append(y)
    # sampling
    test_y = torch.tensor(test_y_ls, dtype=torch.long)
    # Compute sample weights (each sample gets its own weight)
    class_sample_count = torch.tensor(
        [(test_y == t).sum() for t in torch.unique(test_y, sorted=True)])
    weight = 1. / class_sample_count.float()
    samples_weight = torch.tensor([weight[y] for y in test_y])
    # Create sampler, dataset, loader
    test_sampler = WeightedRandomSampler(samples_weight, len(samples_weight))
    test_loader = DataLoader(test_dataset, batch_size=bs,
                             num_workers=0, sampler=test_sampler)

    return train_loader, test_loader, input_size
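

# Expected CSV layout, inferred from the parsing in pre_process and stated here
# as an assumption: column 0 holds the label string ("normal", "high", or a
# low-rate label), and columns 1..sequence_len-1 hold direction-prefixed
# protocol tokens (e.g. a hypothetical "+TCP" or "-DNS").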

LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(filename="./cicdos2017/log/ss-GRU-cicdos2017_1_1.log", level=logging.DEBUG,
                    format=LOG_FORMAT, datefmt='%a, %d %b %Y %H:%M:%S', filemode="w")
logging.info("-------------")

# Device configuration
logging.info("cuda is_available:{}".format(torch.cuda.is_available()))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset_path = "./cicdos2017/label/label.csv"
dataset_test_path = "./cicdos2017/label/label_test.csv"


# Hyper-parameters
sequence_length = 40
hidden_size = 128
num_layers = 1
num_classes = 2
batch_size = 100
num_epochs = 25
learning_rate = 0.001
train_portion = 0.8
# log the parameters
logging.info("model parameters - num_layers:{}, sequence_len:{}".format(num_layers, sequence_length))
logging.info("parameters - batch_size:{}, num_epochs:{}, learning_rate:{}".format(
    batch_size, num_epochs, learning_rate))

train_loader, test_loader, input_size = pre_process(
    dataset_path, sequence_len=sequence_length, train_portion=train_portion, bs=batch_size)
logging.info("loader already prepared")

# check how many high/low samples ended up in the test loader
loader_high_num = 0
loader_low_num = 0
label_detail_ls = []
for images, labels, labels_detail in test_loader:
    label_detail_ls.extend(labels_detail.data.tolist())

logging.info("loader counts (0:normal,1:high,2:low):{}".format(
    Counter(label_detail_ls)))


# model = RNN(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
# model = LSTM(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
model = GRU(input_size, hidden_size, num_layers, num_classes, device=device).to(device)
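
# The model classes come from model/model.py, which is not shown in this file.
# For orientation only, a minimal sketch of a compatible GRU wrapper -- an
# assumption, not the actual implementation -- could look like this:
#
# class GRU(nn.Module):
#     def __init__(self, input_size, hidden_size, num_layers, num_classes, device):
#         super().__init__()
#         self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, num_classes)
#
#     def forward(self, x):
#         out, _ = self.gru(x)           # out: [batch, seq_len, hidden_size]
#         return self.fc(out[:, -1, :])  # classify from the last time step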

# Loss and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

start_time = datetime.datetime.now()
# Train the model
logging.info("start training")
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels, labels_detail) in enumerate(train_loader):
        # enumerate yields each batch together with its running index
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels.long())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # log every 50 steps/iterations
        if (i+1) % 50 == 0:
            logging.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                         .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

end_time = datetime.datetime.now()
# print((end_time-start_time).total_seconds())

# Test the model
logging.info("start testing on test dataset")
model.eval()
start_time = datetime.datetime.now()
with torch.no_grad():
    pred_list = []
    label_list = []
    label_detail_list = []
    correct = 0
    total = 0
    for images, labels, labels_detail in test_loader:
        total += len(labels)
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        pred_temp = predicted.data.tolist()
        label_temp = labels.data.tolist()
        label_detail_tmp = labels_detail.data.tolist()
        pred_list.extend(pred_temp)
        label_list.extend(label_temp)
        label_detail_list.extend(label_detail_tmp)

f1 = f1_score(label_list, pred_list)
logging.info("f1 :{}".format(f1))
logging.info("\n{}".format(classification_report(
    label_list, pred_list, digits=4)))
logging.info("\n{}".format(confusion_matrix(label_list, pred_list)))

# time cost (this timer was restarted before the test loop, so it measures test time)
end_time = datetime.datetime.now()
print("test_time:{}".format((end_time-start_time).total_seconds()))
print("test_packet_num:{}".format(total * sequence_length))

# break down detection results for high-rate and low-rate attacks
high_num = 0
high_true = 0
high_false = 0
low_num = 0
low_true = 0
low_false = 0
for index, i in enumerate(label_detail_list):
    if i == 1:
        # high-rate ddos
        high_num += 1
        if pred_list[index] == 1:
            high_true += 1
        else:
            high_false += 1
    elif i == 2:
        # low-rate ddos
        low_num += 1
        if pred_list[index] == 1:
            low_true += 1
        else:
            low_false += 1
    else:
        # normal
        continue

logging.info("high_num,high_true,high_false:{},{},{}".format(
    high_num, high_true, high_false))
logging.info("low_num,low_true,low_false:{},{},{}".format(
    low_num, low_true, low_false))
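# Here *_true counts attack windows correctly flagged as attacks, while *_false
# counts attack windows misclassified as normal (i.e. false negatives).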

# Save the model checkpoint
# torch.save(model.state_dict(), './cicdos2017/model/model.ckpt')