abc
This commit is contained in:
205
occ.py
Normal file
205
occ.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.svm import OneClassSVM
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix
|
||||
import sys
|
||||
import _pickle as pkl
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
# Human-readable names of the flow features, in positional order.
# print_important_feature() indexes into this list to turn a feature position
# into a name, so the order must not be changed.
# NOTE(review): the names look like CICFlowMeter column headers - confirm
# against the feature-extraction step that produces the *.pkl files.
features_name = [
    "Flow Duration",
    "Total Fwd Packet",
    "Total Bwd packets",
    "Total Length of Fwd Packet",
    "Total Length of Bwd Packet",
    "Fwd Packet Length Max",
    "Fwd Packet Length Min",
    "Fwd Packet Length Mean",
    "Fwd Packet Length Std",
    "Bwd Packet Length Max",
    "Bwd Packet Length Min",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Flow Packets/s",
    "Flow IAT Mean",
    "Flow IAT Std",
    "Flow IAT Max",
    "Flow IAT Min",
    "Fwd IAT Total",
    "Fwd IAT Mean",
    "Fwd IAT Std",
    "Fwd IAT Max",
    "Fwd IAT Min",
    "Bwd IAT Total",
    "Bwd IAT Mean",
    "Bwd IAT Std",
    "Bwd IAT Max",
    "Bwd IAT Min",
    "Fwd PSH Flags",
    "Bwd PSH Flags",
    "Fwd URG Flags",
    "Bwd URG Flags",
    "Fwd Header Length",
    "Bwd Header Length",
    "Fwd Packets/s",
    "Bwd Packets/s",
    "Packet Length Min",
    "Packet Length Max",
    "Packet Length Mean",
    "Packet Length Std",
    "Packet Length Variance",
    "FIN Flag Count",
    "SYN Flag Count",
    "RST Flag Count",
    "PSH Flag Count",
    "ACK Flag Count",
    "URG Flag Count",
    "CWR Flag Count",
    "ECE Flag Count",
    "Down/Up Ratio",
    "Average Packet Size",
    "Fwd Segment Size Avg",
    "Bwd Segment Size Avg",
    "Fwd Bytes/Bulk Avg",
    "Fwd Packet/Bulk Avg",
    "Fwd Bulk Rate Avg",
    "Bwd Bytes/Bulk Avg",
    "Bwd Packet/Bulk Avg",
    "Bwd Bulk Rate Avg",
    "Subflow Fwd Packets",
    "Subflow Fwd Bytes",
    "Subflow Bwd Packets",
    "Subflow Bwd Bytes",
    "FWD Init Win Bytes",
    "Bwd Init Win Bytes",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Active Mean",
    "Active Std",
    "Active Max",
    "Active Min",
    "Idle Mean",
    "Idle Std",
    "Idle Max",
    "Idle Min",
]
|
||||
|
||||
|
||||
def print_important_feature(sort_index, num=10):
    """Print the names of the first ``num`` features listed in ``sort_index``.

    Args:
        sort_index: Sliceable sequence of integer positions into the
            module-level ``features_name`` list. Only ``sort_index[:num]``
            is read, in the order given.
        num: Upper bound on how many names are printed (default 10).

    Raises:
        IndexError: If a position does not exist in ``features_name``.
    """
    print("top important feature is:")
    for index in sort_index[:num]:
        print(features_name[index])
|
||||
|
||||
|
||||
def _fit_and_predict(model, train, test, test_ow="ndarray"):
    """Fit ``model`` on ``train.features`` and predict the test set(s).

    Args:
        model: Unfitted estimator exposing ``fit(X)`` and ``predict(X)``.
        train: Object whose ``features`` attribute supports ``.tolist()``;
            used as the training samples.
        test: Same shape of object as ``train``; its features are predicted.
        test_ow: Optional extra test set. Any ``str`` value (the default
            ``"ndarray"`` sentinel included) means "no extra set".

    Returns:
        ``model.predict`` output for ``test``; or, when ``test_ow`` is not a
        string, the tuple ``(test predictions, test_ow predictions)``.
    """
    # One-class estimators learn from the samples alone; scikit-learn documents
    # the ``y`` argument of their ``fit`` as ignored, so no labels are passed.
    model.fit(train.features.tolist())
    pred_ret = model.predict(test.features.tolist())
    if isinstance(test_ow, str):
        return pred_ret
    return pred_ret, model.predict(test_ow.features.tolist())


def ocsvm_classifier(train, test, test_ow="ndarray"):
    """Fit a linear-kernel ``OneClassSVM`` on ``train`` and predict ``test``.

    Args:
        train: Training set; only ``train.features`` is read.
        test: Test set; only ``test.features`` is read.
        test_ow: Optional extra test set; leave as the default string sentinel
            to skip it.

    Returns:
        Predictions for ``test`` (scikit-learn uses +1 for inliers and -1 for
        outliers), or a ``(test, test_ow)`` tuple of predictions when
        ``test_ow`` is supplied.
    """
    return _fit_and_predict(OneClassSVM(kernel="linear"), train, test, test_ow)


def svdd_classifier(train, test, test_ow="ndarray"):
    """Fit an RBF-kernel ``OneClassSVM`` on ``train`` and predict ``test``.

    Args:
        train: Training set; only ``train.features`` is read.
        test: Test set; only ``test.features`` is read.
        test_ow: Optional extra test set; leave as the default string sentinel
            to skip it.

    Returns:
        Predictions for ``test`` (scikit-learn uses +1 for inliers and -1 for
        outliers), or a ``(test, test_ow)`` tuple of predictions when
        ``test_ow`` is supplied.
    """
    return _fit_and_predict(OneClassSVM(kernel="rbf"), train, test, test_ow)


def isolation_forest(train, test, test_ow="ndarray"):
    """Fit a default ``IsolationForest`` on ``train`` and predict ``test``.

    The forest is created without a ``random_state``, so results can differ
    between runs.

    Args:
        train: Training set; only ``train.features`` is read.
        test: Test set; only ``test.features`` is read.
        test_ow: Optional extra test set; leave as the default string sentinel
            to skip it.

    Returns:
        Predictions for ``test`` (scikit-learn uses +1 for inliers and -1 for
        outliers), or a ``(test, test_ow)`` tuple of predictions when
        ``test_ow`` is supplied.
    """
    return _fit_and_predict(IsolationForest(), train, test, test_ow)
|
||||
|
||||
|
||||
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # that this project generated itself.
    with open(path, "rb") as handle:
        return pkl.load(handle)


def _report_fold(classify, train_set, test_set):
    """Run ``classify(train_set, test_set)`` and print its metrics.

    Per-class precision / recall / F1 are printed as arrays ordered
    ``[-1, 1]`` (negative traffic first, DoH second), followed by the overall
    accuracy. The confusion matrix uses the same row/column order.
    """
    predictions = classify(train_set, test_set)
    gt_y = test_set.label.tolist()
    # Pin the class order so the printed arrays always mean the same thing,
    # even if one class happens to be missing from the predictions.
    # ``pos_label`` is omitted: scikit-learn ignores it when ``average=None``.
    class_order = [-1, 1]
    precision = precision_score(gt_y, predictions, labels=class_order, average=None)
    recall = recall_score(gt_y, predictions, labels=class_order, average=None)
    f1 = f1_score(gt_y, predictions, labels=class_order, average=None)
    acc = accuracy_score(gt_y, predictions)
    print(confusion_matrix(gt_y, predictions, labels=class_order))
    print("封闭测试集准确率: ", precision, end="\t")
    print("封闭测试集召回率: ", recall, end="\t")
    print("封闭测试集f1值: ", f1, end="\t")
    print("封闭测试集acc: ", acc)


def main():
    """Evaluate three one-class detectors on DoH vs. other traffic types.

    DoH samples (label 1) are the only training data. For each negative
    traffic file a random subset (label -1) is appended to the held-out DoH
    fold, and every classifier is fitted and scored once.

    Raises:
        ValueError: If the DoH feature file contains no samples.
    """
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    doh_dataset = _load_pickle("./result/doh_features.pkl")
    if len(doh_dataset) == 0:
        raise ValueError("./result/doh_features.pkl contains no samples")
    doh_dataset['label'] = 1

    # The closed-world set is the DoH data only; build it once instead of
    # once per negative file.
    cw_dataset = pd.concat([doh_dataset])
    cw_labels = cw_dataset.label.tolist()

    negative_files = (
        "./result/web_features.pkl",
        "./result/chat_features.pkl",
        "./result/email_features.pkl",
        "./result/voip_features.pkl",
        "./result/file_features.pkl",
    )
    for file in negative_files:
        print(file)
        web_dataset = _load_pickle(file)
        # Cap the negatives at one fifth of the DoH sample count.
        web_dataset = web_dataset.sample(min(len(web_dataset), len(doh_dataset) // 5))
        web_dataset['label'] = -1

        print("数据集组成如下:")
        # True division: the negatives never outnumber the positives, so the
        # former integer division always printed a ratio of 1:0.
        print(f"封闭数据集中正负样本比例为1:{len(web_dataset) / len(doh_dataset):.2f},"
              f"正样本数量为{len(doh_dataset)},负样本数量为{len(web_dataset)}")

        print("load data suc!")
        for classify in (isolation_forest, svdd_classifier, ocsvm_classifier):
            # Only the first fold of a fresh shuffled split is evaluated.
            # NOTE(review): the original looped over the folds and hit a
            # `break` after the first one; its nesting level is inferred
            # because the indentation was lost - confirm.
            train_idx, test_idx = next(kf.split(cw_dataset, cw_labels))
            test_dataset = pd.concat([cw_dataset.iloc[test_idx], web_dataset])
            _report_fold(classify, cw_dataset.iloc[train_idx], test_dataset)
    # TODO: open-world evaluation (separate ow_* feature files) is not
    # implemented; the classifiers already accept a ``test_ow`` argument.


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user