import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, confusion_matrix
import pickle as pkl
import numpy as np

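# Assumed input schema (inferred from usage below, not stated explicitly in the
# original): each pickled file holds a pandas DataFrame with a "features"
# column (one numeric vector per flow, ordered as in `features_name`) and a
# "label" column, where 0 marks the positive (DoH) class; see pos_label=0 in
# the metric calls.
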
features_name = [
    "Flow Duration",
    "Total Fwd Packet",
    "Total Bwd packets",
    "Total Length of Fwd Packet",
    "Total Length of Bwd Packet",
    "Fwd Packet Length Max",
    "Fwd Packet Length Min",
    "Fwd Packet Length Mean",
    "Fwd Packet Length Std",
    "Bwd Packet Length Max",
    "Bwd Packet Length Min",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Flow Packets/s",
    "Flow IAT Mean",
    "Flow IAT Std",
    "Flow IAT Max",
    "Flow IAT Min",
    "Fwd IAT Total",
    "Fwd IAT Mean",
    "Fwd IAT Std",
    "Fwd IAT Max",
    "Fwd IAT Min",
    "Bwd IAT Total",
    "Bwd IAT Mean",
    "Bwd IAT Std",
    "Bwd IAT Max",
    "Bwd IAT Min",
    "Fwd PSH Flags",
    "Bwd PSH Flags",
    "Fwd URG Flags",
    "Bwd URG Flags",
    "Fwd Header Length",
    "Bwd Header Length",
    "Fwd Packets/s",
    "Bwd Packets/s",
    "Packet Length Min",
    "Packet Length Max",
    "Packet Length Mean",
    "Packet Length Std",
    "Packet Length Variance",
    "FIN Flag Count",
    "SYN Flag Count",
    "RST Flag Count",
    "PSH Flag Count",
    "ACK Flag Count",
    "URG Flag Count",
    "CWR Flag Count",
    "ECE Flag Count",
    "Down/Up Ratio",
    "Average Packet Size",
    "Fwd Segment Size Avg",
    "Bwd Segment Size Avg",
    "Fwd Bytes/Bulk Avg",
    "Fwd Packet/Bulk Avg",
    "Fwd Bulk Rate Avg",
    "Bwd Bytes/Bulk Avg",
    "Bwd Packet/Bulk Avg",
    "Bwd Bulk Rate Avg",
    "Subflow Fwd Packets",
    "Subflow Fwd Bytes",
    "Subflow Bwd Packets",
    "Subflow Bwd Bytes",
    "FWD Init Win Bytes",
    "Bwd Init Win Bytes",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Active Mean",
    "Active Std",
    "Active Max",
    "Active Min",
    "Idle Mean",
    "Idle Std",
    "Idle Max",
    "Idle Min",
]
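# These 76 names appear to follow the CICFlowMeter flow-feature naming scheme
# (as used, e.g., by the CIRA-CIC-DoHBrw-2020 dataset); the exact provenance
# is an assumption, since the original file does not say where they come from.

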
def print_important_feature(sort_index, num=10):
    """Print the names of the top `num` features, given feature indices
    sorted by descending importance."""
    print(f"Top {num} most important features:")
    for index in sort_index[:num]:
        print(features_name[index])
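
# Example usage (a sketch mirroring the commented-out importance lines in
# random_forest below; `model` stands for any fitted estimator that exposes
# feature_importances_, e.g. a RandomForestClassifier):
#   importance = model.feature_importances_
#   sort_index = np.flipud(importance.argsort())
#   print_important_feature(sort_index, num=10)
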
def random_forest(train, test, test_ow=None):
    """Fit a random forest on `train` and predict on `test`; if an open-world
    set `test_ow` is given, also return predictions for it."""
    X = train.features.tolist()
    Y = train.label.tolist()
    test_X = test.features.tolist()
    rf = RandomForestClassifier()
    rf.fit(X, Y)
    # importance = rf.feature_importances_
    # sort_index = np.flipud(importance.argsort())
    # print_important_feature(sort_index)
    pred_ret = rf.predict(test_X)
    if test_ow is not None:
        ow_X = test_ow.features.tolist()
        return pred_ret, rf.predict(ow_X)
    return pred_ret


def naive_bayesian(train, test, test_ow=None):
    """Fit a Gaussian naive Bayes model on `train` and predict on `test`; if
    an open-world set `test_ow` is given, also return predictions for it."""
    X = train.features.tolist()
    Y = train.label.tolist()
    test_X = test.features.tolist()
    nb = GaussianNB()
    nb.fit(X, Y)
    pred_ret = nb.predict(test_X)
    if test_ow is not None:
        ow_X = test_ow.features.tolist()
        return pred_ret, nb.predict(ow_X)
    return pred_ret


def decision_tree_classifier(train, test, test_ow=None):
    """Fit a decision tree on `train` and predict on `test`; if an open-world
    set `test_ow` is given, also return predictions for it."""
    X = train.features.tolist()
    Y = train.label.tolist()
    test_X = test.features.tolist()
    dt = DecisionTreeClassifier()
    dt.fit(X, Y)
    # importance = dt.feature_importances_
    # sort_index = np.flipud(importance.argsort())
    # print_important_feature(sort_index)
    pred_ret = dt.predict(test_X)
    if test_ow is not None:
        ow_X = test_ow.features.tolist()
        return pred_ret, dt.predict(ow_X)
    return pred_ret


def svm_classifier(train, test, test_ow=None):
    """Fit an SVM (RBF kernel by default) on `train` and predict on `test`;
    if an open-world set `test_ow` is given, also return predictions for
    it."""
    X = train.features.tolist()
    Y = train.label.tolist()
    test_X = test.features.tolist()
    svm = SVC()
    svm.fit(X, Y)
    pred_ret = svm.predict(test_X)
    if test_ow is not None:
        ow_X = test_ow.features.tolist()
        return pred_ret, svm.predict(ow_X)
    return pred_ret


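# The four wrappers above share identical fit/predict logic. A minimal generic
# alternative (a sketch, not part of the original pipeline; `model` is any
# scikit-learn estimator) would be:
def run_classifier(model, train, test, test_ow=None):
    model.fit(train.features.tolist(), train.label.tolist())
    pred_ret = model.predict(test.features.tolist())
    if test_ow is not None:
        return pred_ret, model.predict(test_ow.features.tolist())
    return pred_ret

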
if __name__ == "__main__":
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    for file in ["./result/web_features.pkl", "./result/chat_features.pkl", "./result/email_features.pkl",
                 "./result/voip_features.pkl", "./result/file_features.pkl"]:
        doh_dataset = pkl.load(open("./result/doh_features.pkl", "rb"))
        print("Negative-sample source:", file)
        web_dataset = pkl.load(open(file, "rb"))
        # Downsample both classes to the size of the smaller one so the
        # closed-world set is roughly balanced.
        doh_dataset = doh_dataset.sample(min(len(web_dataset), len(doh_dataset)))
        web_dataset = web_dataset.sample(min(len(web_dataset), len(doh_dataset)))
        # cw_file_dataset = pkl.load(open("./result/cw_file_features.pkl", "rb"))
        # cw_voip_dataset = pkl.load(open("./result/cw_voip_features.pkl", "rb"))
        # ow_doh_dataset = pkl.load(open("./result/ow_doh_features.pkl", "rb"))
        # ow_web_dataset = pkl.load(open("./result/ow_web_features.pkl", "rb"))
        print("Dataset composition:")
        print(f"positive:negative ratio in the closed-world set is 1:{len(web_dataset) // len(doh_dataset)}, "
              f"{len(doh_dataset)} positive samples, {len(web_dataset)} negative samples")

        print("data loaded successfully!")
        cw_dataset = pd.concat([web_dataset, doh_dataset])
        # ow_dataset = pd.concat([ow_web_dataset, ow_doh_dataset])
        for clf in [naive_bayesian, svm_classifier, decision_tree_classifier, random_forest]:
            print("classifier:", clf.__name__)

            for train, test in kf.split(cw_dataset, list(cw_dataset.label)):
                predict_results = clf(cw_dataset.iloc[train], cw_dataset.iloc[test])
                gt_Y = cw_dataset.iloc[test].label.tolist()
                # The DoH class is labelled 0, so the binary metrics below use
                # pos_label=0.
                precision = precision_score(gt_Y, predict_results, pos_label=0, average="binary")
                recall = recall_score(gt_Y, predict_results, pos_label=0, average="binary")
                f1 = f1_score(gt_Y, predict_results, pos_label=0, average="binary")
                acc = accuracy_score(gt_Y, predict_results)
                # Rows are true labels, columns are predicted labels.
                print(confusion_matrix(gt_Y, predict_results))
                print("closed-world precision: ", precision, end="\t")
                print("closed-world recall: ", recall, end="\t")
                print("closed-world F1: ", f1, end="\t")
                print("closed-world accuracy: ", acc)
                # Only the first of the five stratified folds is evaluated.
                break
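
            # A possible extension (a sketch, commented out like the
            # open-world block below): evaluate all five folds and report the
            # mean score instead of stopping after the first fold.
            # fold_f1 = []
            # for train, test in kf.split(cw_dataset, list(cw_dataset.label)):
            #     preds = clf(cw_dataset.iloc[train], cw_dataset.iloc[test])
            #     gt = cw_dataset.iloc[test].label.tolist()
            #     fold_f1.append(f1_score(gt, preds, pos_label=0, average="binary"))
            # print("mean F1 over 5 folds:", np.mean(fold_f1))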

            # ow_gt_Y = ow_dataset.label.tolist()
            # precision = precision_score(ow_gt_Y, ow_predict_result, pos_label=0, average="binary")
            # recall = recall_score(ow_gt_Y, ow_predict_result, pos_label=0, average="binary")
            # f1 = f1_score(ow_gt_Y, ow_predict_result, pos_label=0, average="binary")
            # acc = accuracy_score(ow_gt_Y, ow_predict_result)
            # print("open-world precision: ", precision, end="\t")
            # print("open-world recall: ", recall, end="\t")
            # print("open-world F1: ", f1, end="\t")
            # print("open-world accuracy: ", acc)