206 lines
7.2 KiB
Python
206 lines
7.2 KiB
Python
import pandas as pd
|
||
from sklearn.model_selection import StratifiedKFold
|
||
from sklearn.svm import OneClassSVM
|
||
from sklearn.ensemble import IsolationForest
|
||
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix
|
||
import sys
|
||
import _pickle as pkl
|
||
import numpy as np
|
||
import warnings
|
||
|
||
# Suppress every warning for the whole process; this also hides any warnings
# that pandas / scikit-learn would otherwise emit further down in this script.
warnings.filterwarnings('ignore')
|
||
|
||
# Ordered names of the flow features. ``print_important_feature`` indexes this
# list by feature position, so the order must match the order of the values in
# each sample's feature vector.
# NOTE(review): the names look like CICFlowMeter output columns - confirm
# against the feature-extraction step that produced the pickled datasets.
features_name = [
    "Flow Duration",
    "Total Fwd Packet",
    "Total Bwd packets",
    "Total Length of Fwd Packet",
    "Total Length of Bwd Packet",
    "Fwd Packet Length Max",
    "Fwd Packet Length Min",
    "Fwd Packet Length Mean",
    "Fwd Packet Length Std",
    "Bwd Packet Length Max",
    "Bwd Packet Length Min",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Flow Packets/s",
    "Flow IAT Mean",
    "Flow IAT Std",
    "Flow IAT Max",
    "Flow IAT Min",
    "Fwd IAT Total",
    "Fwd IAT Mean",
    "Fwd IAT Std",
    "Fwd IAT Max",
    "Fwd IAT Min",
    "Bwd IAT Total",
    "Bwd IAT Mean",
    "Bwd IAT Std",
    "Bwd IAT Max",
    "Bwd IAT Min",
    "Fwd PSH Flags",
    "Bwd PSH Flags",
    "Fwd URG Flags",
    "Bwd URG Flags",
    "Fwd Header Length",
    "Bwd Header Length",
    "Fwd Packets/s",
    "Bwd Packets/s",
    "Packet Length Min",
    "Packet Length Max",
    "Packet Length Mean",
    "Packet Length Std",
    "Packet Length Variance",
    "FIN Flag Count",
    "SYN Flag Count",
    "RST Flag Count",
    "PSH Flag Count",
    "ACK Flag Count",
    "URG Flag Count",
    "CWR Flag Count",
    "ECE Flag Count",
    "Down/Up Ratio",
    "Average Packet Size",
    "Fwd Segment Size Avg",
    "Bwd Segment Size Avg",
    "Fwd Bytes/Bulk Avg",
    "Fwd Packet/Bulk Avg",
    "Fwd Bulk Rate Avg",
    "Bwd Bytes/Bulk Avg",
    "Bwd Packet/Bulk Avg",
    "Bwd Bulk Rate Avg",
    "Subflow Fwd Packets",
    "Subflow Fwd Bytes",
    "Subflow Bwd Packets",
    "Subflow Bwd Bytes",
    "FWD Init Win Bytes",
    "Bwd Init Win Bytes",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Active Mean",
    "Active Std",
    "Active Max",
    "Active Min",
    "Idle Mean",
    "Idle Std",
    "Idle Max",
    "Idle Min",
]
|
||
|
||
|
||
def print_important_feature(sort_index, num=10, names=None):
    """Print the names of the first ``num`` features listed in ``sort_index``.

    Args:
        sort_index: Sliceable sequence of integer feature positions. Names are
            printed in the order given, so callers should pass the positions
            sorted from most to least important.
        num: Maximum number of feature names to print.
        names: Optional sequence of feature names to index into. When omitted,
            the module-level ``features_name`` list is used.

    Raises:
        IndexError: If a position in ``sort_index`` is outside ``names``.
    """
    if names is None:
        names = features_name
    print("top important feature is:")
    for index in sort_index[:num]:
        print(names[index])
|
||
|
||
|
||
def ocsvm_classifier(train, test, test_ow="ndarray"):
    """Fit a linear-kernel One-Class SVM on ``train`` and predict ``test``.

    Args:
        train: Dataset whose ``features`` attribute supports ``.tolist()`` and
            yields one feature vector per sample (e.g. a DataFrame column).
        test: Dataset with the same ``features`` layout to be predicted.
        test_ow: Optional second dataset to predict with the same fitted
            model. The default is a string sentinel; any non-string value is
            treated as a dataset with a ``features`` attribute.

    Returns:
        The predictions for ``test`` when ``test_ow`` is left at its string
        default, otherwise the tuple ``(test predictions, test_ow
        predictions)``. scikit-learn's ``predict`` yields +1 for inliers and
        -1 for outliers.
    """
    model = OneClassSVM(kernel="linear")
    # One-class models are unsupervised: scikit-learn documents the ``y``
    # argument of ``fit`` as ignored, so the training labels are not passed.
    model.fit(train.features.tolist())
    pred_ret = model.predict(test.features.tolist())

    if isinstance(test_ow, str):
        return pred_ret
    return pred_ret, model.predict(test_ow.features.tolist())
|
||
|
||
|
||
def svdd_classifier(train, test, test_ow="ndarray"):
    """Fit an RBF-kernel One-Class SVM on ``train`` and predict ``test``.

    This is the model the script refers to as "SVDD"; it differs from
    ``ocsvm_classifier`` only in the kernel passed to ``OneClassSVM``.

    Args:
        train: Dataset whose ``features`` attribute supports ``.tolist()`` and
            yields one feature vector per sample (e.g. a DataFrame column).
        test: Dataset with the same ``features`` layout to be predicted.
        test_ow: Optional second dataset to predict with the same fitted
            model. The default is a string sentinel; any non-string value is
            treated as a dataset with a ``features`` attribute.

    Returns:
        The predictions for ``test`` when ``test_ow`` is left at its string
        default, otherwise the tuple ``(test predictions, test_ow
        predictions)``. scikit-learn's ``predict`` yields +1 for inliers and
        -1 for outliers.
    """
    model = OneClassSVM(kernel="rbf")
    # One-class models are unsupervised: scikit-learn documents the ``y``
    # argument of ``fit`` as ignored, so the training labels are not passed.
    model.fit(train.features.tolist())
    pred_ret = model.predict(test.features.tolist())

    if isinstance(test_ow, str):
        return pred_ret
    return pred_ret, model.predict(test_ow.features.tolist())
|
||
|
||
|
||
def isolation_forest(train, test, test_ow="ndarray"):
    """Fit an Isolation Forest on ``train`` and predict ``test``.

    The forest is built with scikit-learn's default parameters and no fixed
    ``random_state``, so results can differ between runs.

    Args:
        train: Dataset whose ``features`` attribute supports ``.tolist()`` and
            yields one feature vector per sample (e.g. a DataFrame column).
        test: Dataset with the same ``features`` layout to be predicted.
        test_ow: Optional second dataset to predict with the same fitted
            model. The default is a string sentinel; any non-string value is
            treated as a dataset with a ``features`` attribute.

    Returns:
        The predictions for ``test`` when ``test_ow`` is left at its string
        default, otherwise the tuple ``(test predictions, test_ow
        predictions)``. scikit-learn's ``predict`` yields +1 for inliers and
        -1 for outliers.
    """
    model = IsolationForest()
    # Isolation Forest is unsupervised: scikit-learn documents the ``y``
    # argument of ``fit`` as ignored, so the training labels are not passed.
    model.fit(train.features.tolist())
    pred_ret = model.predict(test.features.tolist())

    if isinstance(test_ow, str):
        return pred_ret
    return pred_ret, model.predict(test_ow.features.tolist())
|
||
|
||
|
||
def _load_pickle(path):
    """Load one pickled object from ``path`` and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load feature files
    # that were produced locally by this project.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def main():
    """Evaluate the one-class detectors on DoH versus other traffic types.

    DoH samples are labelled 1 and form the closed-world set. For each other
    traffic file a random sample of at most ``len(doh) // 5`` rows is labelled
    -1. Each detector is fitted on a DoH training fold and scored on the
    held-out DoH fold concatenated with that negative sample; the confusion
    matrix, per-class precision / recall / F1 and overall accuracy are
    printed.
    """
    negative_files = [
        "./result/web_features.pkl",
        "./result/chat_features.pkl",
        "./result/email_features.pkl",
        "./result/voip_features.pkl",
        "./result/file_features.pkl",
    ]
    classifiers = [isolation_forest, svdd_classifier, ocsvm_classifier]
    # Explicit label order for the per-class metrics and confusion matrix.
    # This is the sorted order scikit-learn uses by default, spelled out so
    # the printed arrays are unambiguous: index 0 is -1, index 1 is +1.
    class_labels = [-1, 1]

    kf = StratifiedKFold(n_splits=5, shuffle=True)
    doh_dataset = _load_pickle("./result/doh_features.pkl")
    doh_dataset['label'] = 1
    # The closed-world set currently consists of the DoH samples only.
    cw_dataset = pd.concat([doh_dataset])

    for path in negative_files:
        print(path)
        web_dataset = _load_pickle(path)
        web_dataset = web_dataset.sample(min(len(web_dataset), len(doh_dataset) // 5))
        web_dataset['label'] = -1

        print("数据集组成如下:")
        # The negative sample is capped at len(doh) // 5, so integer division
        # always printed 0 here; report the real ratio instead.
        print(f"封闭数据集中正负样本比例为1:{len(web_dataset) / len(doh_dataset):.2f},"
              f"正样本数量为{len(doh_dataset)},负样本数量为{len(web_dataset)}")

        print("load data suc!")

        for classify in classifiers:
            for train_idx, test_idx in kf.split(cw_dataset, list(cw_dataset.label)):
                test_dataset = pd.concat([cw_dataset.iloc[test_idx], web_dataset])
                predict_results = classify(cw_dataset.iloc[train_idx], test_dataset)
                gt_Y = test_dataset.label.tolist()

                # ``pos_label`` only applies to average="binary" and label 0
                # never occurs, so it is dropped; average=None returns one
                # score per class in ``class_labels`` order.
                precision = precision_score(gt_Y, predict_results, labels=class_labels, average=None)
                recall = recall_score(gt_Y, predict_results, labels=class_labels, average=None)
                f1 = f1_score(gt_Y, predict_results, labels=class_labels, average=None)
                acc = accuracy_score(gt_Y, predict_results)

                print(confusion_matrix(gt_Y, predict_results, labels=class_labels))
                print("封闭测试集准确率: ", precision, end="\t")
                print("封闭测试集召回率: ", recall, end="\t")
                print("封闭测试集f1值: ", f1, end="\t")
                print("封闭测试集acc: ", acc)
                # Only the first fold is evaluated for each classifier.
                # NOTE(review): the original indentation was lost, so the
                # nesting level of this ``break`` (and of the loops above) is
                # reconstructed - confirm it belongs to the fold loop.
                break

    # TODO: open-world evaluation is disabled. The removed commented-out code
    # loaded ./result/ow_doh_features.pkl and ./result/ow_web_features.pkl,
    # concatenated them into an open-world set, passed it as ``test_ow`` to
    # the classifier, and printed precision / recall / F1 / accuracy for that
    # set in the same way as the closed-world metrics above.


if __name__ == "__main__":
    main()