This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
grityu-model-duplication/test4.py
2023-03-16 22:42:35 +08:00

92 lines
2.6 KiB
Python

# Name:fang xiaoyu
# Time: 2023/3/11 18:43
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dpkt
import socket
import struct
import binascii
from scapy.all import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
#import tshark
import scapy
# Read traffic data from a pcap file
def read_pcap(filename):
    """Group the TCP-over-IPv4 packets of a capture file into flows.

    Args:
        filename: Path of the capture file; handed unchanged to ``rdpcap``.

    Returns:
        dict: maps ``(src_ip, dst_ip, src_port, dst_port)`` to the list of
        packets carrying that 4-tuple, in the order ``rdpcap`` returned
        them. The key is directional, so the two directions of one
        connection end up as two separate flows.
    """
    flows = {}
    for packet in rdpcap(filename):
        # A TCP segment is not guaranteed to sit on an IPv4 layer (it may be
        # IPv6, for instance); indexing packet[IP] would then raise
        # IndexError, so such packets are skipped.
        if not (packet.haslayer(TCP) and packet.haslayer(IP)):
            continue
        ip_layer = packet[IP]
        tcp_layer = packet[TCP]
        key = (ip_layer.src, ip_layer.dst, tcp_layer.sport, tcp_layer.dport)
        flows.setdefault(key, []).append(packet)
    return flows
# Extract features from the traffic data
def extract_features(flow):
    """Summarise one flow as ``[total_length, packet_count, duration]``.

    Args:
        flow: Sequence of packets. Each packet must support ``len()`` and
            expose a ``time`` attribute.

    Returns:
        list: the sum of ``len(packet)`` over the flow, the number of
        packets, and the difference between the last and the first packet's
        ``time`` as a ``float``. An empty flow yields ``[0, 0, 0.0]``.
    """
    if not flow:
        # No first/last packet to take timestamps from.
        return [0, 0, 0.0]
    total_len = sum(len(packet) for packet in flow)
    # NOTE(review): scapy appears to report ``time`` as a Decimal-like value;
    # casting keeps the feature vector plain numeric so that np.array() does
    # not fall back to an object dtype.
    duration = float(flow[-1].time - flow[0].time)
    return [total_len, len(flow), duration]
# Convert the feature vectors into a numpy array
def vectorize_data(data):
    """Return a new numpy array built from *data*.

    Args:
        data: Anything ``np.array`` accepts, e.g. a list of feature lists.

    Returns:
        numpy.ndarray: the array produced by ``np.array(data)``.
    """
    feature_matrix = np.array(data)
    return feature_matrix
# Read the VPN and non-VPN traffic data
vpn_flows = read_pcap('vpn_traffic.pcap')
nonvpn_flows = read_pcap('nonvpn_traffic.pcap')
# Extract one feature vector per flow and convert the result to numpy arrays
vpn_traffic = vectorize_data(
    [extract_features(flow) for flow in vpn_flows.values()]
)
nonvpn_traffic = vectorize_data(
    [extract_features(flow) for flow in nonvpn_flows.values()]
)
# Merge the VPN and non-VPN data; VPN flows are labelled 1, non-VPN flows 0
X = np.concatenate((vpn_traffic, nonvpn_traffic))
y = np.concatenate((np.ones(len(vpn_traffic)), np.zeros(len(nonvpn_traffic))))
# Use cross-validation to choose the best K
CV_FOLDS = 5
MAX_K = 30
# With CV_FOLDS splits the smallest training split holds
# floor(n * (CV_FOLDS - 1) / CV_FOLDS) samples. A k above that cannot be
# evaluated (scikit-learn either raises or scores it as NaN), so the sweep is
# capped there instead of always running to MAX_K.
max_k = max(1, min(MAX_K, len(X) * (CV_FOLDS - 1) // CV_FOLDS))
k_values = range(1, max_k + 1)
cv_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=CV_FOLDS, scoring='accuracy')
    cv_scores.append(scores.mean())
# Visualise the cross-validation results
plt.plot(k_values, cv_scores)
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.show()
# Train the final model with the best K.
# nanargmax ignores folds that failed to score; plain argmax would pick the
# first NaN entry as the "best" one.
best_k = k_values[int(np.nanargmax(cv_scores))]
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X, y)
# Predict on new data
new_flows = read_pcap('new_traffic.pcap')
new_data = vectorize_data(
    [extract_features(flow) for flow in new_flows.values()]
)
prediction = knn.predict(new_data)
print(prediction)