Delete test_2.py

This commit is contained in:
fang xiaoyu
2023-03-16 15:00:55 +00:00
parent 8a0493110b
commit 9e686da745

167
test_2.py
View File

@@ -1,167 +0,0 @@
# # Name:fang xiaoyu
# # Time: 2023/3/10 23:53
# # 导入所需库
# import pandas as pd
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# from subprocess import run
#
# # 使用Tranalyzer2分析PCAP文件并提取TCP流量特征
# def extract_features(pcap_file):
# # 定义Tranalyzer2命令行参数
# tranalyzer_args = [
# "t2build", "-x", "tcp", "--no-tests", "--no-progress",
# "--tcp-fields", "sip dip sport dport tcp_flags tcp_flags_str bytes",
# "--histo", "sip dip sport dport", "--top", "sip dip sport dport bytes",
# "--both-ways", "--export", "csv", "-w", "-"
# ]
# # 运行Tranalyzer2命令行并将结果保存为CSV文件
# result = run(["sudo", "tshark", "-r", pcap_file, "-w", "-", "-F", "pcapng", "-Y", "tcp"],
# stdout=PIPE, check=True)
# result = run(["sudo", "tranalyzer2", "-r", "-", *tranalyzer_args], input=result.stdout, stdout=PIPE, check=True)
# result_csv = result.stdout.decode()
# # 解析CSV文件并返回特征数据框
# features_df = pd.read_csv(pd.compat.StringIO(result_csv))
# return features_df
#
# # 加载训练数据集,并提取特征
# train_pcap = "20230309_fxy_psiphon_operation.pcapng"
# train_labels = pd.read_csv("/path/to/train_labels.csv")
# train_features = extract_features(train_pcap)
#
# # 将标签与特征合并为单个数据框
# train_data = pd.merge(train_features, train_labels, on="flow_key")
#
# # 分割数据集为训练集和测试集
# X_train, X_test, y_train, y_test = train_test_split(train_data.drop(["flow_key", "label"], axis=1), train_data["label"], test_size=0.3)
#
# # 训练KNN模型
# knn = KNeighborsClassifier(n_neighbors=5)
# knn.fit(X_train, y_train)
#
# # 在测试集上评估模型性能
# accuracy = knn.score(X_test, y_test)
# print("Accuracy:", accuracy)
#
#
'''
import subprocess
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Step 1: Install Tranalyzer2 and required Python modules
# Step 2: Extract features using Tranalyzer2
pcap_file = '20230309_fxy_psiphon_operation.pcapng'
output_file = 'output.csv'
command = f'sudo t2 -r {pcap_file} -w {output_file} -c basic'
subprocess.call(command, shell=True)
# Step 3: Load features into a Pandas dataframe and convert to NumPy array
data = pd.read_csv(output_file)
features = np.array(data)
# Step 4: Prepare the dataset
X = features[:, :-1]
y = features[:, -1]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Step 5: Train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# Step 6: Evaluate the model
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
'''
import pandas as pd
import numpy as np
import os
import glob
import subprocess
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# 提取流量特征
def extract_features(pcap_file: str) -> pd.DataFrame:
    """Run the ``t2`` command-line tool on one capture and load its feature table.

    ``t2`` is invoked with ``-r <pcap_file> -w <pcap_file>.csv -f features.csv``.
    The table is then read from ``features.csv`` in the current working
    directory: the first six lines are skipped, fields are split on ``;`` and
    the first field becomes the index. Columns 1-5 are discarded, the
    remaining twelve columns are named, and the ``class`` column is dropped.

    NOTE(review): the ``t2`` flags are carried over unchanged from the
    original command; confirm that ``-f features.csv`` really makes the tool
    write the file that is read below. Because every call reads the same
    ``features.csv``, calls must not run concurrently in one directory.

    Args:
        pcap_file: Path of the capture file handed to ``t2 -r``.

    Returns:
        A DataFrame with the columns ``duration``, ``protocol``, ``src_ip``,
        ``src_port``, ``dst_ip``, ``dst_port``, ``packets``, ``bytes``,
        ``flows``, ``flags`` and ``tos``.

    Raises:
        FileNotFoundError: If the ``t2`` executable cannot be found.
        subprocess.CalledProcessError: If ``t2`` exits with a non-zero status.
    """
    # The original built this as a plain (non-f) string, so the shell saw the
    # literal text "{pcap_file}". An argument list interpolates the real path
    # and needs no shell, so spaces or metacharacters in the path are safe.
    command = [
        "t2",
        "-r", pcap_file,
        "-w", f"{pcap_file}.csv",
        "-f", "features.csv",
    ]
    # check=True: a failed run must not fall through to parsing a stale
    # features.csv left behind by an earlier capture.
    subprocess.run(command, check=True)

    features = pd.read_csv(
        "features.csv",
        skiprows=6,
        header=None,
        delimiter=";",
        index_col=0,
    )
    # With header=None the columns keep their integer positions; 1-5 are unused.
    features = features.drop(columns=[1, 2, 3, 4, 5])
    features.columns = [
        "duration", "protocol", "src_ip", "src_port", "dst_ip", "dst_port",
        "packets", "bytes", "flows", "flags", "tos", "class",
    ]
    # The class column is not used downstream, so it is not returned.
    return features.drop(columns=["class"])
# Collect every capture in the dataset folder. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them to keep the row order of
# the combined frame (and therefore the seeded train/test split) reproducible.
folder_path = "wcx-抓包-用于模型复现"
pcap_files = sorted(glob.glob(os.path.join(folder_path, "*.pcap")))
if not pcap_files:
    # pd.concat([]) would otherwise fail with an opaque
    # "No objects to concatenate" message.
    raise ValueError(f"No .pcap files found in {folder_path!r}")

# Extract one feature frame per capture, then stack them into one DataFrame.
features_list = [extract_features(pcap_file) for pcap_file in pcap_files]
all_features = pd.concat(features_list)
# Standardise every feature column with the mean and standard deviation of
# the combined frame.
# NOTE(review): a column whose std is 0 turns into NaN/inf after the division
# below - confirm such columns cannot occur in the extracted features.
mean = all_features.mean()
std = all_features.std()
centered = all_features - mean
normalized_features = centered / std

# NOTE(review): every label is 0, so the data contains a single class; SVC
# normally needs at least two classes to fit - confirm where the real labels
# are supposed to come from.
labels = np.zeros(len(normalized_features))
clf = SVC()

# Hold out 20% of the rows for the final check; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features.values,
    labels,
    test_size=0.2,
    random_state=42,
)

# 5-fold cross-validation on the training portion only.
scores = cross_val_score(clf, X_train, y_train, cv=5)
mean_score = np.mean(scores)
std_score = np.std(scores)
print(f"Cross Validation Scores: {scores}")
print(f"Mean Score: {mean_score}")
print(f"Std Score: {std_score}")

# Fit on the whole training portion and score the held-out rows.
y_pred = clf.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")