dengzeyi-sequenceshield/代码/sequenceShield/cicids2017/parse_file.py

import pandas as pd
import os
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score


from sklearn.metrics import confusion_matrix

import numpy as np
import pandas as pd


# Input packet capture CSV and output CSV; both are handed to pre_process()
# by the module-level call at the bottom of this file.
packet_path = "cicids2017/friday-dos.csv"
packet_label_path = "cicids2017/friday-dos-label.csv"

# In-progress flows, keyed by host pair. Each value is a list whose first
# element is the flow's boolean label, followed by direction-prefixed
# protocol tokens appended by pre_process().
flow = {}
# Length threshold that pre_process() compares each flow list against before
# flushing it.
feature_latitude = 40 + 1  # the first element is the label
# Flushed (completed) flow lists; pre_process() writes these out as CSV rows.
feature_ls = []


def wed_get_label(srcIP, dstIP, time):
    """Return True when a packet falls inside one of the Wednesday label windows.

    A packet is labelled True only if ``time`` lies in one of the inclusive
    windows below AND the packet travels between 172.16.0.1 and
    192.168.10.50 (in either direction).

    Args:
        srcIP: Source address of the packet.
        dstIP: Destination address of the packet.
        time: Time of day expressed as ``hour * 60 + minute``.

    Returns:
        bool: True for a packet inside a window between the two hosts,
        otherwise False.
    """
    # Inclusive (start, end) windows, each written as hour * 60 + minute.
    windows = (
        (9 * 60 + 47, 10 * 60 + 10),
        (10 * 60 + 14, 10 * 60 + 35),
        (10 * 60 + 43, 11 * 60 + 0),
        (11 * 60 + 10, 11 * 60 + 23),
    )
    in_window = any(start <= time <= end for start, end in windows)
    if not in_window:
        return False

    # Every window uses the same host pair, matched in both directions.
    endpoints = ("172.16.0.1", "192.168.10.50")
    return (srcIP, dstIP) in (endpoints, endpoints[::-1])

def fri_get_label(srcIP, dstIP, time):
    """Return True when a packet falls inside the Friday label window.

    The window is inclusive, from 15:56 to 16:16 (as ``hour * 60 + minute``),
    and only traffic between 172.16.0.1 and 192.168.10.50, in either
    direction, is labelled True.

    Args:
        srcIP: Source address of the packet.
        dstIP: Destination address of the packet.
        time: Time of day expressed as ``hour * 60 + minute``.

    Returns:
        bool: True for a packet inside the window between the two hosts,
        otherwise False.
    """
    window_start = 15 * 60 + 56
    window_end = 16 * 60 + 16
    if not (window_start <= time <= window_end):
        return False

    # Accept the host pair regardless of which side sent the packet.
    endpoints = ("172.16.0.1", "192.168.10.50")
    return (srcIP, dstIP) in (endpoints, endpoints[::-1])

def pre_process(input_path, output_path):
    """Group packets into labelled per-host-pair protocol sequences and save them.

    Reads the packet CSV at ``input_path`` (columns ``Source``,
    ``Destination``, ``Time`` and ``Protocol`` are used), labels every packet
    with ``fri_get_label`` and appends a direction-prefixed protocol token to
    the flow shared by the two hosts: ``"-"`` when the packet travels in the
    direction that first created the flow, ``"+"`` for the opposite
    direction. A flow is labelled True as soon as any of its packets is.

    When a packet arrives for a flow whose list already holds more than
    ``feature_latitude`` entries, that list is moved to the module-level
    ``feature_ls`` and a fresh flow is started with the current packet.
    Only the lists in ``feature_ls`` are written to ``output_path``; flows
    still sitting in the module-level ``flow`` dict at the end are counted
    and reported but not written.

    Both ``flow`` and ``feature_ls`` are module-level, so state accumulates
    across repeated calls.

    Args:
        input_path: Path of the packet CSV to read.
        output_path: Path of the CSV file to write the sequences to.
    """
    df = pd.read_csv(input_path)
    # Iterating the needed columns in lockstep avoids building a Series per
    # row, which is what DataFrame.iterrows() does.
    packets = zip(df["Source"], df["Destination"], df["Time"], df["Protocol"])
    for index, (srcIP, dstIP, timestamp, protocol) in enumerate(packets):
        if index % 20000 == 0:
            print(index)

        # Keep only the clock part of the timestamp and reduce it to
        # hour * 60 + minute, the unit the label functions compare against.
        clock = timestamp.split(" ")[-1].split(":")
        minute_of_day = int(clock[0]) * 60 + int(clock[1])
        label = fri_get_label(srcIP, dstIP, minute_of_day)

        # Tuple keys keep the two addresses separate; plain concatenation
        # would merge distinct pairs such as ("1.1.1.1", "11.1.1.1") and
        # ("1.1.1.11", "1.1.1.1") into one key.
        key = (srcIP, dstIP)
        reverse_key = (dstIP, srcIP)
        if key in flow or reverse_key not in flow:
            # Existing flow in this direction, or a brand-new flow.
            flow_key, token = key, "-" + protocol
        else:
            # The flow was created by traffic in the opposite direction.
            flow_key, token = reverse_key, "+" + protocol

        sequence = flow.get(flow_key)
        # NOTE(review): with ">" a flushed row holds the label plus
        # feature_latitude tokens (feature_latitude + 1 entries); confirm
        # whether ">=" was intended before changing the output width.
        if sequence is not None and len(sequence) > feature_latitude:
            feature_ls.append(sequence)
            sequence = None
        if sequence is None:
            # Start with the label only, so the current packet's token is
            # appended exactly once below.
            sequence = flow[flow_key] = [label]

        sequence.append(token)
        # One True packet is enough to mark the whole flow True.
        sequence[0] = sequence[0] or label

    print("字典中剩余个数：%d" % (len(flow)))
    true_num = sum(1 for sequence in flow.values() if sequence[0])
    false_num = len(flow) - true_num
    print("true num:%d" % (true_num))
    print("false num:%d" % (false_num))

    # Write the completed sequences, one per CSV row.
    total_data = pd.DataFrame(data=feature_ls)
    total_data.to_csv(output_path, index=False,
                      encoding="utf-8", sep=',', mode='w', header=True)


# Module-level call: the conversion runs as soon as this file is executed
# or imported, reading packet_path and writing packet_label_path.
pre_process(packet_path, packet_label_path)