# # Name: fang xiaoyu
# # Time: 2023/3/10 23:53
# # Import required libraries
# import io
# import pandas as pd
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# from subprocess import run, PIPE
#
# # Analyze a PCAP file with Tranalyzer2 and extract TCP flow features
# def extract_features(pcap_file):
#     # Tranalyzer2 command-line arguments
#     tranalyzer_args = [
#         "t2build", "-x", "tcp", "--no-tests", "--no-progress",
#         "--tcp-fields", "sip dip sport dport tcp_flags tcp_flags_str bytes",
#         "--histo", "sip dip sport dport", "--top", "sip dip sport dport bytes",
#         "--both-ways", "--export", "csv", "-w", "-"
#     ]
#     # Filter TCP traffic out of the capture with tshark, pipe it through
#     # Tranalyzer2, and capture the resulting CSV on stdout
#     result = run(["sudo", "tshark", "-r", pcap_file, "-w", "-", "-F", "pcapng", "-Y", "tcp"],
#                  stdout=PIPE, check=True)
#     result = run(["sudo", "tranalyzer2", "-r", "-", *tranalyzer_args],
#                  input=result.stdout, stdout=PIPE, check=True)
#     result_csv = result.stdout.decode()
#     # Parse the CSV output and return a feature dataframe
#     features_df = pd.read_csv(io.StringIO(result_csv))
#     return features_df
#
# # Load the training dataset and extract features
# train_pcap = "20230309_fxy_psiphon_operation.pcapng"
# train_labels = pd.read_csv("/path/to/train_labels.csv")
# train_features = extract_features(train_pcap)
#
# # Merge labels and features into a single dataframe
# train_data = pd.merge(train_features, train_labels, on="flow_key")
#
# # Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(
#     train_data.drop(["flow_key", "label"], axis=1), train_data["label"], test_size=0.3)
#
# # Train a KNN model
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train, y_train)
#
# # Evaluate model performance on the test set
# accuracy = knn.score(X_test, y_test)
# print("Accuracy:", accuracy)
#
#
'''
import subprocess
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Step 1: Install Tranalyzer2 and the required Python modules
# Step 2: Extract features using Tranalyzer2
pcap_file = '20230309_fxy_psiphon_operation.pcapng'
output_file = 'output.csv'
command = f'sudo t2 -r {pcap_file} -w {output_file} -c basic'
subprocess.run(command, shell=True, check=True)

# Step 3: Load the features into a pandas DataFrame and convert to a NumPy array
data = pd.read_csv(output_file)
features = np.array(data)

# Step 4: Prepare the dataset (assumes the last column holds the class label)
X = features[:, :-1]
y = features[:, -1]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Step 5: Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
'''
import pandas as pd
import numpy as np
import os
import glob
import subprocess
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Extract flow features
def extract_features(pcap_file):
    # Run the tranalyzer2 command-line tool
    command = f"t2 -r {pcap_file} -w {pcap_file}.csv -f features.csv"
    subprocess.run(command, shell=True, check=True)
    # Read the feature data
    features = pd.read_csv('features.csv', skiprows=6, header=None, delimiter=';', index_col=0)
    # Drop unneeded columns
    features.drop([1, 2, 3, 4, 5], axis=1, inplace=True)
    # Rename the remaining columns
    features.columns = ['duration', 'protocol', 'src_ip', 'src_port', 'dst_ip', 'dst_port', 'packets', 'bytes',
                        'flows', 'flags', 'tos', 'class']
    # Drop the class column since it is not used here
    features.drop(['class'], axis=1, inplace=True)
    return features

# Collect all pcap files
# pcap_files = []
# for file in os.listdir('.'):
#     if file.endswith('.pcap'):
#         pcap_files.append(file)
folder_path = "wcx-抓包-用于模型复现"
pcap_files = []
for file in glob.glob(os.path.join(folder_path, "*.pcap")):
    pcap_files.append(file)

# Extract features from every pcap file
features_list = []
for pcap_file in pcap_files:
    features = extract_features(pcap_file)
    features_list.append(features)

# Concatenate all features into one DataFrame
all_features = pd.concat(features_list)

# Standardize the feature data; only numeric columns can be scaled, so
# string-valued columns such as src_ip and dst_ip are dropped first
numeric_features = all_features.select_dtypes(include=[np.number])
mean = numeric_features.mean()
std = numeric_features.std()
normalized_features = (numeric_features - mean) / std
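
# The same standardization can also be done with scikit-learn; a minimal sketch
# for comparison, assuming the numeric_features DataFrame from above. Note that
# StandardScaler divides by the population std (ddof=0) while pandas' .std()
# uses the sample std (ddof=1), so the two results differ slightly:
#
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# normalized_features = pd.DataFrame(scaler.fit_transform(numeric_features),
#                                    columns=numeric_features.columns,
#                                    index=numeric_features.index)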

# Fit an SVM on the standardized features
# NOTE: np.zeros() yields a placeholder label vector containing a single class;
# scikit-learn's SVC requires at least two classes, so real labels must be
# supplied before this model can actually be trained.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features.values, np.zeros(len(normalized_features)),
    test_size=0.2, random_state=42)
clf = SVC()
# Evaluate model performance with cross-validation
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"Cross Validation Scores: {scores}")
print(f"Mean Score: {np.mean(scores)}")
print(f"Std Score: {np.std(scores)}")

# Test on the held-out set and compute accuracy
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")