206 lines
7.2 KiB
Python
206 lines
7.2 KiB
Python
|
|
import pandas as pd
|
|||
|
|
from sklearn.model_selection import StratifiedKFold
|
|||
|
|
from sklearn.svm import OneClassSVM
|
|||
|
|
from sklearn.ensemble import IsolationForest
|
|||
|
|
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix
|
|||
|
|
import sys
|
|||
|
|
import _pickle as pkl
|
|||
|
|
import numpy as np
|
|||
|
|
import warnings
|
|||
|
|
|
|||
|
|
# Silence warnings (e.g. sklearn convergence/deprecation noise) for cleaner CLI output.
warnings.filterwarnings('ignore')

# Names of the per-flow statistical features, in vector order.
# Indexed by `print_important_feature` to map importance ranks back to names.
# NOTE(review): naming appears to follow CICFlowMeter output columns — confirm
# against the feature-extraction step that produced the pickled datasets.
features_name = [
    "Flow Duration",
    "Total Fwd Packet",
    "Total Bwd packets",
    "Total Length of Fwd Packet",
    "Total Length of Bwd Packet",
    "Fwd Packet Length Max",
    "Fwd Packet Length Min",
    "Fwd Packet Length Mean",
    "Fwd Packet Length Std",
    "Bwd Packet Length Max",
    "Bwd Packet Length Min",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Flow Packets/s",
    "Flow IAT Mean",
    "Flow IAT Std",
    "Flow IAT Max",
    "Flow IAT Min",
    "Fwd IAT Total",
    "Fwd IAT Mean",
    "Fwd IAT Std",
    "Fwd IAT Max",
    "Fwd IAT Min",
    "Bwd IAT Total",
    "Bwd IAT Mean",
    "Bwd IAT Std",
    "Bwd IAT Max",
    "Bwd IAT Min",
    "Fwd PSH Flags",
    "Bwd PSH Flags",
    "Fwd URG Flags",
    "Bwd URG Flags",
    "Fwd Header Length",
    "Bwd Header Length",
    "Fwd Packets/s",
    "Bwd Packets/s",
    "Packet Length Min",
    "Packet Length Max",
    "Packet Length Mean",
    "Packet Length Std",
    "Packet Length Variance",
    "FIN Flag Count",
    "SYN Flag Count",
    "RST Flag Count",
    "PSH Flag Count",
    "ACK Flag Count",
    "URG Flag Count",
    "CWR Flag Count",
    "ECE Flag Count",
    "Down/Up Ratio",
    "Average Packet Size",
    "Fwd Segment Size Avg",
    "Bwd Segment Size Avg",
    "Fwd Bytes/Bulk Avg",
    "Fwd Packet/Bulk Avg",
    "Fwd Bulk Rate Avg",
    "Bwd Bytes/Bulk Avg",
    "Bwd Packet/Bulk Avg",
    "Bwd Bulk Rate Avg",
    "Subflow Fwd Packets",
    "Subflow Fwd Bytes",
    "Subflow Bwd Packets",
    "Subflow Bwd Bytes",
    "FWD Init Win Bytes",
    "Bwd Init Win Bytes",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Active Mean",
    "Active Std",
    "Active Max",
    "Active Min",
    "Idle Mean",
    "Idle Std",
    "Idle Max",
    "Idle Min",
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def print_important_feature(sort_index, num=10):
    """Print the names of the top-ranked features.

    `sort_index` holds feature indices ordered from most to least
    important; only the first `num` are reported, resolved through
    the module-level `features_name` list.
    """
    print("top important feature is:")
    top_indices = sort_index[:num]
    for feature_index in top_indices:
        print(features_name[feature_index])
|
|||
|
|
|
|||
|
|
|
|||
|
|
def ocsvm_classifier(train, test, test_ow="ndarray"):
    """Train a linear-kernel One-Class SVM and predict on the test set(s).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must expose a ``features`` column (list-like feature vectors) and a
        ``label`` column (labels are not used for fitting — one-class setting).
    test_ow : pandas.DataFrame or str, optional
        Optional open-world test set. The default string is a "not provided"
        sentinel, detected via ``isinstance(test_ow, str)`` (kept for
        backward compatibility with existing callers).

    Returns
    -------
    ndarray of +1/-1 predictions for ``test``; a ``(closed, open)`` pair of
    prediction arrays when ``test_ow`` is a DataFrame.
    """
    X = train.features.tolist()
    test_X = test.features.tolist()

    ocsvm = OneClassSVM(kernel="linear")
    # OneClassSVM is unsupervised: fit on features only. (The original code
    # also passed the labels, which sklearn silently ignores for this
    # estimator — dropping them makes the one-class setting explicit.)
    ocsvm.fit(X)

    pred_ret = ocsvm.predict(test_X)
    if not isinstance(test_ow, str):
        # Open-world set supplied: predict on it as well.
        return pred_ret, ocsvm.predict(test_ow.features.tolist())
    return pred_ret
|
|||
|
|
|
|||
|
|
|
|||
|
|
def svdd_classifier(train, test, test_ow="ndarray"):
    """Train an RBF-kernel One-Class SVM (SVDD-equivalent) and predict.

    With the RBF kernel, One-Class SVM is equivalent to Support Vector Data
    Description, hence the function name.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must expose a ``features`` column (list-like feature vectors) and a
        ``label`` column (labels are not used for fitting — one-class setting).
    test_ow : pandas.DataFrame or str, optional
        Optional open-world test set. The default string is a "not provided"
        sentinel, detected via ``isinstance(test_ow, str)`` (kept for
        backward compatibility with existing callers).

    Returns
    -------
    ndarray of +1/-1 predictions for ``test``; a ``(closed, open)`` pair of
    prediction arrays when ``test_ow`` is a DataFrame.
    """
    X = train.features.tolist()
    test_X = test.features.tolist()

    ocsvm = OneClassSVM(kernel="rbf")
    # OneClassSVM is unsupervised: fit on features only. (The original code
    # also passed the labels, which sklearn silently ignores for this
    # estimator.)
    ocsvm.fit(X)

    pred_ret = ocsvm.predict(test_X)
    if not isinstance(test_ow, str):
        # Open-world set supplied: predict on it as well.
        return pred_ret, ocsvm.predict(test_ow.features.tolist())
    return pred_ret
|
|||
|
|
|
|||
|
|
|
|||
|
|
def isolation_forest(train, test, test_ow="ndarray"):
    """Train an Isolation Forest anomaly detector and predict on the test set(s).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must expose a ``features`` column (list-like feature vectors) and a
        ``label`` column (labels are not used for fitting — anomaly-detection
        setting).
    test_ow : pandas.DataFrame or str, optional
        Optional open-world test set. The default string is a "not provided"
        sentinel, detected via ``isinstance(test_ow, str)`` (kept for
        backward compatibility with existing callers).

    Returns
    -------
    ndarray of +1 (inlier) / -1 (outlier) predictions for ``test``; a
    ``(closed, open)`` pair of prediction arrays when ``test_ow`` is a
    DataFrame.
    """
    X = train.features.tolist()
    test_X = test.features.tolist()

    ifc = IsolationForest()
    # IsolationForest is unsupervised: fit on features only. (The original
    # code also passed the labels, which sklearn silently ignores for this
    # estimator.)
    ifc.fit(X)

    pred_ret = ifc.predict(test_X)
    if not isinstance(test_ow, str):
        # Open-world set supplied: predict on it as well.
        return pred_ret, ifc.predict(test_ow.features.tolist())
    return pred_ret
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # 5-fold stratified CV; only the first fold is evaluated per dataset
    # (see the `break` at the bottom of the fold loop).
    kf = StratifiedKFold(n_splits=5, shuffle=True)

    # DoH flows form the positive (inlier) class, relabelled to +1.
    # NOTE: pickle deserialization can execute arbitrary code — only load
    # trusted, locally produced files.
    with open("./result/doh_features.pkl", "rb") as fh:
        doh_dataset = pkl.load(fh)
    doh_dataset['label'] = doh_dataset['label'].map(lambda x: 1)

    for file in ["./result/web_features.pkl", "./result/chat_features.pkl", "./result/email_features.pkl",
                 "./result/voip_features.pkl", "./result/file_features.pkl"]:
        print(file)
        # One negative (outlier, -1) traffic class per iteration,
        # downsampled to at most 1/5 of the positive-class size.
        with open(file, "rb") as fh:
            web_dataset = pkl.load(fh)
        web_dataset = web_dataset.sample(min(len(web_dataset), len(doh_dataset) // 5))
        web_dataset['label'] = web_dataset['label'].map(lambda x: -1)

        print("数据集组成如下:")
        print(f"封闭数据集中正负样本比例为1:{len(web_dataset) // len(doh_dataset)},"
              f"正样本数量为{len(doh_dataset)},负样本数量为{len(web_dataset)}")

        print("load data suc!")
        # Train only on the positive (DoH) class — one-class setting; the
        # negatives appear only in the test split below.
        cw_dataset = pd.concat([doh_dataset])

        for classify in [isolation_forest, svdd_classifier, ocsvm_classifier]:
            # Stratified split over the positive set; each fold's test part
            # is augmented with the (unseen) negative samples.
            for train, test in kf.split(cw_dataset, list(cw_dataset.label)):
                test_dataset = pd.concat([cw_dataset.iloc[test], web_dataset])
                predict_results = classify(cw_dataset.iloc[train], test_dataset)
                gt_Y = test_dataset.label.tolist()
                # NOTE(review): labels are {+1, -1}; with average=None sklearn
                # reports per-class scores and ignores pos_label — confirm
                # pos_label=0 is intentional (it is a no-op here).
                precision = precision_score(gt_Y, predict_results, pos_label=0, average=None)
                recall = recall_score(gt_Y, predict_results, pos_label=0, average=None)
                f1 = f1_score(gt_Y, predict_results, pos_label=0, average=None)
                acc = accuracy_score(gt_Y, predict_results)
                print(confusion_matrix(gt_Y, predict_results))
                print("封闭测试集准确率: ", precision, end="\t")
                print("封闭测试集召回率: ", recall, end="\t")
                print("封闭测试集f1值: ", f1, end="\t")
                print("封闭测试集acc: ", acc)
                # Only the first fold is evaluated (quick sanity run).
                break

    # Open-world evaluation (the `test_ow` path of the classifiers, fed from
    # ./result/ow_*_features.pkl) is currently disabled.