Add midterm experiment data, code, and PPT

崔一鸣
2019-12-23 01:20:51 +08:00
parent 5508ddeca0
commit bfc0df0f0d
35 changed files with 307836 additions and 271 deletions


@@ -99,6 +99,7 @@ def main():
                 stream = li[3]
                 host = li[4]
                 if(stream.split(' ')[4] != '443'):
+                    traceback.print_exc()
                     continue
                 ua = ""
                 for index in range(5, len(li), 1):


@@ -4,11 +4,15 @@ import traceback
 filenameList = [
     #"http.log.test",
     "./log/2019-12-06/http.log.2019-12-06-0",
-    "./log/2019-12-04/http2.log.2019-12-06-0",
+    "./log/2019-12-20_21/http.log.2019-12-20",
+    "./log/2019-12-20_21/http2.log.2019-12-20",
+    "./log/2019-12-20_21/http.log.2019-12-21",
+    "./log/2019-12-20_21/http2.log.2019-12-21",
 ]
 outputFile = "./result.txt"
+'''
 appDict = {
     "wechat" : ["wechat", "MicroMessenger Client", "MicroMessenger"],
     "qq" : ["qq", "TencentMidasConnect"],
@@ -34,7 +38,15 @@ appDict = {
     "safari" : ["Version/12.1.2", "MobileSafari"],
     "firefox" : ["FxiOS"],
 }
+'''
+appDict = {
+    "douyin" : ["Aweme", "ttplayer"],
+    "weibo" : ["weibo", "微博", "afma-sdk-onShow-v", "SensorsAnalytics"],
+    "toutiao" : ["News", "今日头条"],
+    "hupu" : ["hupu", "prokanqiu", "虎扑", "AVMDL"],
+    "zhihu": ["osee2unifiedRelease",]
+}

 def getAppName(ua):
     for name, ids in appDict.items():
@@ -74,6 +86,7 @@ filterUaList = {
     "swcd",
     "null",
     "SafariSafeBrowsing",
+    "CriOS"
 }

 def handleUnknownApp(host, stream, ua):
@@ -91,16 +104,19 @@ def main():
     stm2app_dict = dict()
     with open(outputFile, "w+") as f1:
         for filename in filenameList:
-            with open(filename) as f:
+            with open(filename, errors='ignore') as f:
                 logs = f.readlines()
                 for log in logs:
                     try:
                         li = log.split(',')
                         stream = li[3]
                         host = li[4]
-                        if(stream.split(' ')[4] != '443'):
-                            continue
-                        ua = ""
+                        try:
+                            if(stream.split(' ')[4] != '443'):
+                                continue
+                        except:
+                            continue
+                        ua = ""
                         for index in range(5, len(li), 1):
                             ua += li[index]
                         host = host.strip()
@@ -108,14 +124,22 @@ def main():
                         ua = ua.strip()
                         appName = getAppName(ua)
                         if appName != None:
-                            stm2app_dict[stream] = appName
+                            if stream not in stm2app_dict.keys():
+                                stm2app_dict[stream] = set()
+                            stm2app_dict[stream].add(appName)
                         else:
                             handleUnknownApp(host, stream, ua)
                     except:
+                        print("log: " + log)
                         traceback.print_exc()
-        for stream, app in stm2app_dict.items():
-            f1.write(stream + ": " + app + "\n")
+        for stream, apps in stm2app_dict.items():
+            if len(apps) > 1:
+                continue
+            f1.write(stream + " ")
+            for app in apps:
+                f1.write(app + " ")
+            f1.write("\n")

 if __name__ == '__main__':
     main()
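For orientation, a minimal sketch of what this tagging script now does, assuming the comma-separated log layout implied by the indexing above (helper and variable names here are illustrative): match User-Agent keywords against appDict, collect labels per stream, and keep only streams that map to exactly one app.

def get_app_name(ua):
    # First app whose keyword list has a hit in the User-Agent wins.
    for name, ids in appDict.items():
        if any(key in ua for key in ids):
            return name
    return None

stm2app = {}
for line in open("./log/2019-12-20_21/http.log.2019-12-20", errors='ignore'):
    fields = line.split(',')
    if len(fields) < 6:
        continue
    stream, ua = fields[3], ','.join(fields[5:]).strip()
    app = get_app_name(ua)
    if app is not None:
        stm2app.setdefault(stream, set()).add(app)

# Streams seen with more than one app label are ambiguous and dropped.
unambiguous = {s: next(iter(apps)) for s, apps in stm2app.items() if len(apps) == 1}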

(4 file diffs suppressed because they are too large)

DataSet/DataTag/result.txt (new file, 10540 lines; diff suppressed because it is too large)

DataSet/DataTag/test.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+import sys
+import traceback
+filename = "./log/2019-12-20_21/http2.log.2019-12-21"
+with open(filename) as f:
+    lines = f.readlines()
+    print(len(lines))


@@ -122,7 +122,7 @@ extern "C" unsigned char sslstat_entry(stSessionInfo *session_info, void **param

 extern "C" int sslstat_init(){
-    g_fp = fopen("./ssl_stat.txt", "w+");
+    g_fp = fopen("./ssl_stat.txt", "a+");
     return 0;
 }


@@ -78,6 +78,7 @@ struct tls_message_type g_tls_types[] = {
     {23, 23, 0, "application_data"},
     {24, 24, 0, "heartbeat"},
     {25, 25, 0, "tls12_cid"},
+    {26, 22, -1, "handshake_unknown"},
 };

 struct pkt_stat_info{
@@ -127,6 +128,13 @@ struct pme_info{
     struct ssl_chello chello;
     int tls_message_count;
     struct tls_message_info tls_info_list[STREAM_PACKET_COUNT_MAX];
+    unsigned char c2s_tls_payload[1500];
+    int c2s_tls_last_segment_len;
+    int c2s_tls_current_segment_offset;
+    unsigned char s2c_tls_payload[1500];
+    int s2c_tls_last_segment_len;
+    int s2c_tls_current_segment_offset;
+    int has_fin_rst;
 };
int ipv4_header_parse(const void *a_packet, struct pkt_parsed_info* pktinfo){
@@ -200,43 +208,103 @@ int get_tls_message_type(int content_type, int handshake_type){
             return i;
         }
     }
+    if(content_type == 22){
+        return type_count - 1;
+    }
     return -1;
 }

 int tls_header_parse(struct streaminfo *stream, struct pme_info *pmeinfo, struct pkt_parsed_info *pktinfo){
-    unsigned char *buff = (unsigned char*)pktinfo->data;
-    int len = pktinfo->data_len;
+    int curdir = stream->curdir;
+    unsigned char *buff = NULL;
+    int len = 0;
+    if(curdir == 1){
+        if(pmeinfo->c2s_tls_current_segment_offset >= pktinfo->data_len){
+            pmeinfo->c2s_tls_current_segment_offset -= pktinfo->data_len;
+            return 0;
+        }
+        memcpy((char*)pmeinfo->c2s_tls_payload + pmeinfo->c2s_tls_last_segment_len,
+            pktinfo->data + pmeinfo->c2s_tls_current_segment_offset, pktinfo->data_len - pmeinfo->c2s_tls_current_segment_offset);
+        buff = pmeinfo->c2s_tls_payload;
+        len = pktinfo->data_len + pmeinfo->c2s_tls_last_segment_len - pmeinfo->c2s_tls_current_segment_offset;
+    }
+    if(curdir == 2){
+        if(pmeinfo->s2c_tls_current_segment_offset >= pktinfo->data_len){
+            pmeinfo->s2c_tls_current_segment_offset -= pktinfo->data_len;
+            return 0;
+        }
+        memcpy((char*)pmeinfo->s2c_tls_payload + pmeinfo->s2c_tls_last_segment_len,
+            pktinfo->data + pmeinfo->s2c_tls_current_segment_offset, pktinfo->data_len - pmeinfo->s2c_tls_current_segment_offset);
+        buff = pmeinfo->s2c_tls_payload;
+        len = pktinfo->data_len + pmeinfo->s2c_tls_last_segment_len - pmeinfo->s2c_tls_current_segment_offset;
+    }
     int i = 0;
+    int flag = 0;
     while(i < len){
         if(i + 4 >= len){
-            return -1;
+            flag = 1;
+            break;
         }
         int content_type = buff[i];
         int handshake_type = 0;
         if(buff[i] == 0x16){
             if(i + 5 >= len){
-                return -1;
+                flag = 1;
+                break;
             }
             handshake_type = buff[i + 5];
         }
         int message_type = get_tls_message_type(content_type, handshake_type);
         if(message_type < 0){
-            return -1;
+            LOG_ERROR(g_logger, "message_type unknown, value = %02x %02x %02x %02x %02x\n", buff[i], buff[i + 1], buff[i + 2], buff[i + 3], buff[i + 4]);
+            flag = 2;
+            break;
         }
         int version = (uint16_t)(buff[i + 1] << 8) + (uint8_t)buff[i + 2];
         if(version < 0x0300 || version > 0x0304){
-            return -1;
+            LOG_ERROR(g_logger, "version unknown, value = %02x %02x\n", buff[i + 1], buff[i + 2]);
+            flag = 2;
+            break;
         }
         int len = (uint16_t)(buff[i + 3] << 8) + (uint8_t)buff[i + 4];
         if(len < 0){
             printf("%02hhx %02hhx\n", buff[i + 3], buff[i + 4]);
         }
         pmeinfo->tls_info_list[pmeinfo->tls_message_count].dir = stream->curdir;
         pmeinfo->tls_info_list[pmeinfo->tls_message_count].type = message_type;
         pmeinfo->tls_info_list[pmeinfo->tls_message_count].length = len;
         pmeinfo->tls_message_count++;
         i += (5 + len);
     }
+    if(flag == 1){
+        if(curdir == 1){
+            memcpy((char*)pmeinfo->c2s_tls_payload, pktinfo->data, len - i);
+            pmeinfo->c2s_tls_last_segment_len = len - i;
+            pmeinfo->c2s_tls_current_segment_offset = 0;
+        }
+        if(curdir == 2){
+            memcpy((char*)pmeinfo->s2c_tls_payload, pktinfo->data, len - i);
+            pmeinfo->s2c_tls_last_segment_len = len - i;
+            pmeinfo->s2c_tls_current_segment_offset = 0;
+        }
+        return -1;
+    }
+    if(flag == 2){
+        if(curdir == 1){
+            pmeinfo->c2s_tls_last_segment_len = 0;
+            pmeinfo->c2s_tls_current_segment_offset = 0;
+        }
+        if(curdir == 2){
+            pmeinfo->s2c_tls_last_segment_len = 0;
+            pmeinfo->s2c_tls_current_segment_offset = 0;
+        }
+        return -2;
+    }
+    if(curdir == 1){
+        pmeinfo->c2s_tls_last_segment_len = 0;
+        pmeinfo->c2s_tls_current_segment_offset = i - len;
+    }
+    if(curdir == 2){
+        pmeinfo->s2c_tls_last_segment_len = 0;
+        pmeinfo->s2c_tls_current_segment_offset = i - len;
+    }
     return 0;
 }
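The substance of this hunk: tls_header_parse now reassembles TLS records across TCP segments per direction. An incomplete record header is carried over in c2s_tls_payload/s2c_tls_payload and retried on the next packet (flag == 1), a record body that runs past the segment is skipped via *_current_segment_offset (i - len), and a parse desync resets the buffers and returns -2 (flag == 2). A simplified Python sketch of the same framing idea, not the plugin's API:

def parse_tls_records(data):
    # Frame TLS records in one buffered direction. Each record starts with a
    # 5-byte header: content type (1 byte), version (2), length (2), big-endian.
    # Returns (records, leftover, skip): leftover header bytes to prepend to the
    # next segment, or skip = bytes of the current record still to arrive.
    records, i = [], 0
    while i < len(data):
        if i + 5 > len(data):                       # incomplete header: carry it over
            return records, data[i:], 0
        content_type = data[i]
        version = (data[i + 1] << 8) | data[i + 2]  # e.g. 0x0303 for TLS 1.2
        length = (data[i + 3] << 8) | data[i + 4]
        if not (0x0300 <= version <= 0x0304):       # desync: drop buffered state
            return records, b"", 0
        records.append((content_type, length))
        i += 5 + length                             # may point past this buffer
    return records, b"", i - len(data)              # skip into the next segment

On the next segment the caller would prepend leftover, or first discard skip bytes, which is exactly what the *_last_segment_len and *_current_segment_offset fields track in the C code.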
@@ -256,6 +324,10 @@ int packet_need_filter(struct pkt_parsed_info *pktinfo){
 }

 char pending_opstate(struct streaminfo *stream, struct pme_info *pmeinfo, struct pkt_parsed_info *pktinfo){
+    struct tcphdr *_tcphdr = pktinfo->tcphdr;
+    if(_tcphdr->fin || _tcphdr->rst){
+        pmeinfo->has_fin_rst = 1;
+    }
     pmeinfo->last_c2s_pkt_index = -1;
     pmeinfo->last_s2c_pkt_index = -1;
     get_rawpkt_opt_from_streaminfo(stream, RAW_PKT_GET_TIMESTAMP, &(pmeinfo->start_time));
@@ -280,6 +352,10 @@ char pending_opstate(struct streaminfo *stream, struct pme_info *pmeinfo, struct
 char data_opstate(struct streaminfo *stream, struct pme_info *pmeinfo, struct pkt_parsed_info *pktinfo){
     get_rawpkt_opt_from_streaminfo(stream, RAW_PKT_GET_TIMESTAMP, &(pmeinfo->end_time));
+    struct tcphdr *_tcphdr = pktinfo->tcphdr;
+    if(_tcphdr->fin || _tcphdr->rst){
+        pmeinfo->has_fin_rst = 1;
+    }
     if(packet_need_filter(pktinfo) == 0){
         tls_header_parse(stream, pmeinfo, pktinfo);
         int ret = packet_stat(stream, pmeinfo, pktinfo);
@@ -303,6 +379,9 @@ void time_tostring(struct timeval tv, char *buf, int buflen){
 }

 void output_result(struct pme_info *pmeinfo){
+    if(pmeinfo->has_fin_rst == 0){
+        return;
+    }
     cJSON *log_obj = cJSON_CreateObject();
     cJSON_AddStringToObject(log_obj, "sip", pmeinfo->sip);
     cJSON_AddNumberToObject(log_obj, "sport", pmeinfo->sport);
@@ -373,6 +452,10 @@ void output_result(struct pme_info *pmeinfo){
 char close_opstate(struct streaminfo *stream, struct pme_info *pmeinfo, struct pkt_parsed_info *pktinfo, const void *a_packet){
     if(a_packet != NULL){
         get_rawpkt_opt_from_streaminfo(stream, RAW_PKT_GET_TIMESTAMP, &(pmeinfo->end_time));
+        struct tcphdr *_tcphdr = pktinfo->tcphdr;
+        if(_tcphdr->fin || _tcphdr->rst){
+            pmeinfo->has_fin_rst = 1;
+        }
         if(packet_need_filter(pktinfo) == 0){
             tls_header_parse(stream, pmeinfo, pktinfo);
             packet_stat(stream, pmeinfo, pktinfo);
@@ -438,7 +521,7 @@ extern "C" int stmstat_init(){
     char *log_path = (char*)"./stream_stat.log";
     int log_level = 10;
     g_logger = MESA_create_runtime_log_handle(log_path, log_level);
-    g_fp = fopen("./stream_stat.txt", "w+");
+    g_fp = fopen("./stream_stat.txt", "a+");
     return 0;
 }
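Two smaller changes round out this plugin: g_tls_types gains a catch-all handshake_unknown entry so handshake records (content type 22) with unrecognized subtypes no longer abort parsing, and both plugins now open their output files with "a+" instead of "w+" so results append across restarts rather than being truncated. A rough Python rendering of the lookup (table abbreviated; the exact matching rule inside the C loop is not shown in the diff, so the one below is an assumption):

TLS_TYPES = [
    # (id, content_type, handshake_type, name); layout as in g_tls_types
    (23, 23, 0, "application_data"),
    (24, 24, 0, "heartbeat"),
    (25, 25, 0, "tls12_cid"),
    (26, 22, -1, "handshake_unknown"),  # new catch-all entry, kept last
]

def get_tls_message_type(content_type, handshake_type):
    # Matching rule assumed: an exact (content_type, handshake_type) hit wins.
    for i, (_id, ct, ht, _name) in enumerate(TLS_TYPES):
        if ct == content_type and ht == handshake_type:
            return i
    if content_type == 22:               # any unrecognized handshake record
        return len(TLS_TYPES) - 1        # index of "handshake_unknown"
    return -1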

(13 file diffs suppressed because they are too large or one or more lines are too long)


@@ -1,77 +0,0 @@
import sys
import os
import json
import pandas as pd
import numpy as np

PREFIX_DIR = "/Users/Leo/Documents/github/GradProj/"

def label_dict_build(date):
    example_label_file = PREFIX_DIR + 'DataSet/result/' + date + '/stream_tag.txt'
    example_label_df = pd.read_table(example_label_file, sep='\s+', header=None)
    example_label_df[3] = 443
    example_label = {tuple(example_label_df.iloc[i, 0:4].values): example_label_df.iloc[i, 4] for i in example_label_df.index}
    return example_label

app_cert = dict()

def main():
    date = sys.argv[1]
    example_label = label_dict_build(date)
    #print(example_label)
    row_count = 1771
    column_count = 25
    example_json_file = PREFIX_DIR + 'DataSet/result/' + date + '/ssl_stat.txt'
    example_json_f = open(example_json_file, 'r')
    array_shape = (row_count, column_count)
    result_data = np.zeros(array_shape)
    result_label = list()
    i = 0
    for line in example_json_f.readlines():
        example_json = json.loads(line)
        # label
        try:
            flow_key = (example_json['sip'], example_json['sport'], example_json['dip'], example_json['dport'])
            label = example_label[flow_key]
        except Exception:
            #traceback.print_exc()
            continue
        # expert features
        result_label.append(label)
        san_count = 0
        if 'san' in example_json.keys():
            san = example_json['san']
            san_count = len(san.split(';'))
        cert_count = example_json['Cert']['cert_count']
        '''
        cert_len_str = ''
        for cert in example_json['Cert']['cert_list']:
            cert_len_str += (str(cert['length']) + ',')
        if label not in app_cert.keys():
            app_cert[label] = set()
        app_cert[label].add(cert_len_str)
        '''
        if label not in app_cert.keys():
            app_cert[label] = set()
        app_cert[label].add(san_count)
        #result_data[i,:] = result
        i += 1
    print(i)
    for k, v in app_cert.items():
        print(k)
        print(v)
    '''
    print('row = ' + str(row_count))
    print("result_label = " + str(len(result_label)))
    base_head = ['cert_count', 'cert_len', 'san_len', 's2c_pkts']
    header = base_head
    result_df = pd.DataFrame(result_data, columns=header)
    result_df['label'] = np.array(result_label)
    example_csv_file = PREFIX_DIR + 'Experiment/statFeature/csvFile/' + date + '/examples.csv'
    result_df.to_csv(example_csv_file, index=False)
    '''

if __name__ == '__main__':
    main()
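For reference, a compact equivalent of the per-app SAN-count grouping this deleted script performed, using collections.defaultdict (record_san_count is a hypothetical helper name):

from collections import defaultdict

app_cert = defaultdict(set)

def record_san_count(label, flow_json):
    # 'san' holds semicolon-separated subjectAltName entries; absent means zero.
    san_count = len(flow_json['san'].split(';')) if 'san' in flow_json else 0
    app_cert[label].add(san_count)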

(3 file diffs suppressed because they are too large or one or more lines are too long)


@@ -1,51 +0,0 @@
import sys
import random
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
# %matplotlib inline  (notebook magic; a no-op outside Jupyter)

PREFIX_DIR = "/Users/Leo/Documents/github/GradProj/"

def RF(score_df, f1_score_list, recall_score_list, precision_score_list,
       x_train, y_train, x_test, y_test):
    classifer = RandomForestClassifier()
    classifer.fit(x_train, y_train)
    y_pred = classifer.predict(x_test)
    f1_score_list.append(f1_score(y_test, y_pred, average='micro'))
    recall_score_list.append(recall_score(y_test, y_pred, average='micro'))
    precision_score_list.append(precision_score(y_test, y_pred, average='micro'))
    scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]
    score_df.loc['RandomForest'] = scores
    score_df.plot.bar()
    print(scores)

def main():
    date = sys.argv[1]
    example_csv_file = PREFIX_DIR + 'Experiment/statFeature/csvFile/' + date + '/examples.csv'
    examples_df = pd.read_csv(example_csv_file)
    class_counts = examples_df['label'].value_counts().plot.bar()
    examples = examples_df.values.copy()
    score_df = pd.DataFrame(np.zeros((5, 3)), index=['LogisticRegression', 'SVM', 'GaussianNB', 'tree', 'RandomForest'],
                            columns=['precision', 'recall', 'f1'])
    f1_score_list = list()
    recall_score_list = list()
    precision_score_list = list()
    for i in range(50):
        np.random.shuffle(examples)
        examples_train = examples[:int(len(examples) * 0.75)]
        examples_test = examples[int(len(examples) * 0.75):]
        x_train = examples_train[:, 0:-1]
        y_train = examples_train[:, -1]
        x_test = examples_test[:, 0:-1]
        y_test = examples_test[:, -1]
        RF(score_df, f1_score_list, recall_score_list, precision_score_list,
           x_train, y_train, x_test, y_test)

if __name__ == '__main__':
    main()
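As written, score_df indexes five classifiers but only RandomForest is wired up. A sketch of a generic helper covering all five with the same micro-averaged metrics (evaluate is a hypothetical name, not part of the original script):

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

CLASSIFIERS = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(),
    'GaussianNB': GaussianNB(),
    'tree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
}

def evaluate(name, x_train, y_train, x_test, y_test):
    # Fit one named classifier and return (precision, recall, f1), micro-averaged.
    clf = CLASSIFIERS[name]
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    return (precision_score(y_test, y_pred, average='micro'),
            recall_score(y_test, y_pred, average='micro'),
            f1_score(y_test, y_pred, average='micro'))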


@@ -1,124 +0,0 @@
import sys
import os
import json
import pandas as pd
import numpy as np

PREFIX_DIR = "/Users/Leo/Documents/github/GradProj/"

def label_dict_build(date):
    example_label_file = PREFIX_DIR + 'DataSet/result/' + date + '/stream_tag.txt'
    example_label_df = pd.read_table(example_label_file, sep='\s+', header=None)
    example_label_df[3] = 443
    example_label = {tuple(example_label_df.iloc[i, 0:4].values): example_label_df.iloc[i, 4] for i in example_label_df.index}
    return example_label

def main():
    date = sys.argv[1]
    example_label = label_dict_build(date)
    row_count = 1771
    column_count = 25
    example_json_file = PREFIX_DIR + 'DataSet/result/' + date + '/stream_feature.txt'
    example_json_f = open(example_json_file, 'r')
    array_shape = (row_count, column_count)
    result_data = np.zeros(array_shape)
    result_label = list()
    i = 0
    for line in example_json_f.readlines():
        example_json = json.loads(line)
        # label
        try:
            flow_key = (example_json['sip'], example_json['sport'], example_json['dip'], example_json['dport'])
            result_label.append(example_label[flow_key])
        except Exception:
            continue
        # statistical features
        packets = example_json['packets']
        c2s_packets_bytes = list()
        s2c_packets_bytes = list()
        c2s_packets_intervals = list()
        s2c_packets_intervals = list()
        for packet in packets:
            if packet['dir'] == 1:
                c2s_packets_bytes.append(packet['bytes'])
                c2s_packets_intervals.append(packet['interval'])
            elif packet['dir'] == 2:
                s2c_packets_bytes.append(packet['bytes'])
                s2c_packets_intervals.append(packet['interval'])
        c2s_bytes = example_json['c2s_bytes']
        s2c_bytes = example_json['s2c_bytes']
        c2s_pkts = example_json['c2s_pkts']
        s2c_pkts = example_json['s2c_pkts']
        duration = example_json['duration']
        c2s_packets_bytes_mean = 0
        c2s_packets_bytes_median = 0
        c2s_packets_bytes_std = 0
        c2s_packets_bytes_max = 0
        c2s_packets_bytes_min = 0
        c2s_packets_intervals_mean = 0
        c2s_packets_intervals_median = 0
        c2s_packets_intervals_std = 0
        c2s_packets_intervals_max = 0
        c2s_packets_intervals_min = 0
        s2c_packets_bytes_mean = 0
        s2c_packets_bytes_median = 0
        s2c_packets_bytes_std = 0
        s2c_packets_bytes_max = 0
        s2c_packets_bytes_min = 0
        s2c_packets_intervals_mean = 0
        s2c_packets_intervals_median = 0
        s2c_packets_intervals_std = 0
        s2c_packets_intervals_max = 0
        s2c_packets_intervals_min = 0
        if c2s_bytes > 0:
            c2s_packets_bytes_mean = np.mean(c2s_packets_bytes)
            c2s_packets_bytes_median = np.median(c2s_packets_bytes)
            c2s_packets_bytes_std = np.std(c2s_packets_bytes)
            c2s_packets_bytes_max = np.max(c2s_packets_bytes)
            c2s_packets_bytes_min = np.min(c2s_packets_bytes)
            c2s_packets_intervals_mean = np.mean(c2s_packets_intervals)
            c2s_packets_intervals_median = np.median(c2s_packets_intervals)
            c2s_packets_intervals_std = np.std(c2s_packets_intervals)
            c2s_packets_intervals_max = np.max(c2s_packets_intervals)
            c2s_packets_intervals_min = np.min(c2s_packets_intervals)
        if s2c_bytes > 0:
            s2c_packets_bytes_mean = np.mean(s2c_packets_bytes)
            s2c_packets_bytes_median = np.median(s2c_packets_bytes)
            s2c_packets_bytes_std = np.std(s2c_packets_bytes)
            s2c_packets_bytes_max = np.max(s2c_packets_bytes)
            s2c_packets_bytes_min = np.min(s2c_packets_bytes)
            s2c_packets_intervals_mean = np.mean(s2c_packets_intervals)
            s2c_packets_intervals_median = np.median(s2c_packets_intervals)
            s2c_packets_intervals_std = np.std(s2c_packets_intervals)
            s2c_packets_intervals_max = np.max(s2c_packets_intervals)
            s2c_packets_intervals_min = np.min(s2c_packets_intervals)
        result = [c2s_bytes, c2s_pkts, s2c_bytes, s2c_pkts, duration, c2s_packets_bytes_mean, c2s_packets_bytes_median, c2s_packets_bytes_std,\
            c2s_packets_bytes_max, c2s_packets_bytes_min, c2s_packets_intervals_mean, c2s_packets_intervals_median, c2s_packets_intervals_std,\
            c2s_packets_intervals_max, c2s_packets_intervals_min, s2c_packets_bytes_mean, s2c_packets_bytes_median, s2c_packets_bytes_std,\
            s2c_packets_bytes_max, s2c_packets_bytes_min, s2c_packets_intervals_mean, s2c_packets_intervals_median, s2c_packets_intervals_std,\
            s2c_packets_intervals_max, s2c_packets_intervals_min]
        result_data[i, :] = result
        i += 1
    print('row = ' + str(row_count))
    print("result_label = " + str(len(result_label)))
    base_head = ['c2s_bytes', 'c2s_pkts', 's2c_bytes', 's2c_pkts', 'duration', 'c2s_packets_bytes_mean', 'c2s_packets_bytes_median', 'c2s_packets_bytes_std',\
        'c2s_packets_bytes_max', 'c2s_packets_bytes_min', 'c2s_packets_intervals_mean', 'c2s_packets_intervals_median', 'c2s_packets_intervals_std',\
        'c2s_packets_intervals_max', 'c2s_packets_intervals_min', 's2c_packets_bytes_mean', 's2c_packets_bytes_median', 's2c_packets_bytes_std',\
        's2c_packets_bytes_max', 's2c_packets_bytes_min', 's2c_packets_intervals_mean', 's2c_packets_intervals_median', 's2c_packets_intervals_std',\
        's2c_packets_intervals_max', 's2c_packets_intervals_min']
    header = base_head
    result_df = pd.DataFrame(result_data, columns=header)
    result_df['label'] = np.array(result_label)
    example_csv_file = PREFIX_DIR + 'Experiment/statFeature/csvFile/' + date + '/examples.csv'
    result_df.to_csv(example_csv_file, index=False)

if __name__ == '__main__':
    main()
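The twenty per-direction statistics above all follow one pattern. As a design note, the same feature vector can be built by looping over statistic functions instead of naming twenty variables; a condensed sketch (field names taken from the JSON records above, helper name illustrative):

import numpy as np

STATS = [np.mean, np.median, np.std, np.max, np.min]

def directional_features(packets, direction):
    # Per-direction packet-size and inter-arrival statistics; zeros if no packets.
    sizes = [p['bytes'] for p in packets if p['dir'] == direction]
    gaps = [p['interval'] for p in packets if p['dir'] == direction]
    feats = []
    for series in (sizes, gaps):
        feats += [float(f(series)) if series else 0.0 for f in STATS]
    return feats   # 10 values: 5 size stats, then 5 interval stats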

5 image files added (36 KiB, 34 KiB, 39 KiB, 37 KiB, 38 KiB); image previews and binary diffs omitted.