# Name:fang xiaoyu
# Time: 2023/3/11 22:35
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix , precision_score, recall_score, f1_score
import xgboost as xgb
from sklearn.metrics import classification_report

# Load the flow-feature CSV; the 'class1' column carries the traffic label.
df = pd.read_csv('sufshark_openvpn_tcp+youdao_header.csv')


# Binarise the label: rows tagged 'VPN' become 1, everything else becomes 0.
df['class1'] = np.where(df['class1'] == 'VPN', 1, 0)


# Hold out 20% of the rows as the final test set.
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Carve a validation set out of the training portion. Early stopping must be
# monitored on data that is NOT used for the final report; otherwise the
# stopping round is tuned on the test set and the metrics are optimistic.
train_fit, valid = train_test_split(train, test_size=0.2, random_state=42)

# Convert each split to XGBoost's DMatrix format (features without the label
# column, label passed separately).
train_dmatrix = xgb.DMatrix(data=train_fit.drop(['class1'], axis=1), label=train_fit['class1'])
valid_dmatrix = xgb.DMatrix(data=valid.drop(['class1'], axis=1), label=valid['class1'])
test_dmatrix = xgb.DMatrix(data=test.drop(['class1'], axis=1), label=test['class1'])

# XGBoost hyper-parameters for binary classification, evaluated with AUC.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train for at most 100 rounds, stopping once the validation AUC has not
# improved for 10 consecutive rounds.
xgb_model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=100,
    early_stopping_rounds=10,
    evals=[(valid_dmatrix, 'validation')]
)

# xgb.train returns the booster from the LAST round, not the best one, so
# restrict prediction to the trees up to the best iteration when it is known.
best_iteration = getattr(xgb_model, 'best_iteration', None)
if best_iteration is not None:
    y_pred = xgb_model.predict(test_dmatrix, iteration_range=(0, best_iteration + 1))
else:
    y_pred = xgb_model.predict(test_dmatrix)

# Threshold the predicted probabilities at 0.5 to obtain class labels.
y_pred_label = np.where(y_pred > 0.5, 1, 0)

# Report metrics on the untouched test set.
accuracy = accuracy_score(test['class1'], y_pred_label)
precision = precision_score(test['class1'], y_pred_label)
recall = recall_score(test['class1'], y_pred_label)
f1 = f1_score(test['class1'], y_pred_label)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(classification_report(test['class1'], y_pred_label))
# Results recorded from an earlier run (presumably with early stopping
# monitored on the test set, so not directly comparable to the current setup):
# Accuracy: 0.9139125799573561
# Precision: 0.8972275334608031
# Recall: 0.9455919395465995
# F1-score: 0.9207750797154771