This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
cuiyiming-gradproj/Experiment/statFeature/StatFeature.ipynb

616 lines
42 KiB
Plaintext
Raw Normal View History

2019-12-23 01:20:51 +08:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import pandas as pd\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Cipher-suite code (4 hex digits, upper case) -> index in the cipher indicator vector.\n",
"# The index of a code is simply its position in the ordered list below.\n",
"ciper_suits = {\n",
"    code: index\n",
"    for index, code in enumerate([\n",
"        '1305', 'C030', 'C02C', 'C028', 'C024', 'C014', 'C00A', '00A5',  # 0-7\n",
"        '00A3', '00A1', '009F', '006B', '006A', '0069', '0068', '0039',  # 8-15\n",
"        '0038', '0037', '0036', '0088', '0087', '0086', '0085', 'C019',  # 16-23\n",
"        '00A7', '006D', '003A', '0089', 'C032', 'C02E', 'C02A', 'C026',  # 24-31\n",
"        'C00F', 'C005', '009D', '003D', '0035', '0084', '008D', 'C02F',  # 32-39\n",
"        'C02B', 'C027', 'C023', 'C013', 'C009', '00A4', '00A2', '00A0',  # 40-47\n",
"        '009E', '0067', '0040', '003F', '003E', '0033', '0032', '0031',  # 48-55\n",
"        '0030', '009A', '0099', '0098', '0097', '0045', '0044', '0043',  # 56-63\n",
"        '0042', 'C018', '00A6', '006C', '0034', '009B', '0046', 'C031',  # 64-71\n",
"        'C02D', 'C029', 'C025', 'C00E', 'C004', '009C', '003C', '002F',  # 72-79\n",
"        '0096', '0041', '008C', 'C012', 'C008', '0016', '0013', '0010',  # 80-87\n",
"        '000D', 'C017', '001B', 'C00D', 'C003', '000A', '0007', '008B',  # 88-95\n",
"        '0021', '001F', '0025', '0023', 'C011', 'C007', 'C016', '0018',  # 96-103\n",
"        'C00C', 'C002', '0005', '0004', '008A', '0020', '0024', 'C010',  # 104-111\n",
"        'C006', 'C015', 'C00B', 'C001', '003B', '0002', '0001', '1301',  # 112-119\n",
"        '1302', '1303', '1304',                                          # 120-122\n",
"    ])\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# TLS extension type -> position in the extension indicator vector.\n",
"# Types 0-31 keep their own value as position; 35 and 65281 take the next two slots.\n",
"extensions = {\n",
"    ext_type: position\n",
"    for position, ext_type in enumerate(list(range(32)) + [35, 65281])\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Build the per-flow feature table: 25 flow statistics, cipher-suite and extension\n",
"# indicator vectors, and the app label; then save it as examples.csv.\n",
"date = '2019-12-20_21'\n",
"root_dir = '/Users/Leo/Documents/github/GradProj/'\n",
"example_label_file = root_dir + 'DataSet/result/' + date + '/stream_tag.txt'\n",
"example_json_file = root_dir + 'DataSet/result/' + date + '/stream_stat.txt'\n",
"example_csv_file = root_dir + 'Experiment/StatFeature/CsvFile/' + date + '/examples.csv'\n",
"\n",
"\n",
"def _five_stats(values):\n",
"    \"\"\"Return [mean, median, std, max, min] of `values`, or five zeros when it is empty.\"\"\"\n",
"    if len(values) == 0:\n",
"        # np.max / np.min raise on an empty sequence, so fall back to the zero defaults.\n",
"        return [0, 0, 0, 0, 0]\n",
"    return [np.mean(values), np.median(values), np.std(values), np.max(values), np.min(values)]\n",
"\n",
"\n",
"def _direction_stats(total_bytes, packet_bytes, packet_intervals):\n",
"    \"\"\"Return the ten per-packet size and interval statistics for one direction.\n",
"\n",
"    All ten values are zero when the flow reports no bytes in that direction.\n",
"    \"\"\"\n",
"    if total_bytes > 0:\n",
"        return _five_stats(packet_bytes) + _five_stats(packet_intervals)\n",
"    return [0] * 10\n",
"\n",
"\n",
"# Label lookup: (sip, sport, dip, dport) -> label, read from whitespace-separated rows.\n",
"example_label_df = pd.read_table(example_label_file, sep=r'\\s+', header=None)\n",
"example_label = {tuple(example_label_df.iloc[i,0:4].values):example_label_df.iloc[i,4] for i in example_label_df.index}\n",
"\n",
"result_data = list()\n",
"result_label = list()\n",
"# One JSON object per line; the context manager closes the file even if parsing fails.\n",
"with open(example_json_file, 'r') as example_json_f:\n",
"    for line in example_json_f:\n",
"        if not line.strip():\n",
"            continue  # tolerate blank lines\n",
"        example_json = json.loads(line)\n",
"\n",
"        # Label: flows that have no entry in the tag file are skipped.\n",
"        try:\n",
"            flow_key = (example_json['sip'], example_json['sport'], example_json['dip'], example_json['dport'])\n",
"            label = example_label[flow_key]\n",
"        except (KeyError, TypeError):\n",
"            continue\n",
"\n",
"        # Statistical features: split per-packet sizes and intervals by direction\n",
"        # (dir == 1 feeds the c2s lists, dir == 2 the s2c lists).\n",
"        c2s_packets_bytes = list()\n",
"        s2c_packets_bytes = list()\n",
"        c2s_packets_intervals = list()\n",
"        s2c_packets_intervals = list()\n",
"        for packet in example_json['packets']:\n",
"            if packet['dir'] == 1:\n",
"                c2s_packets_bytes.append(packet['bytes'])\n",
"                c2s_packets_intervals.append(packet['interval'])\n",
"            elif packet['dir'] == 2:\n",
"                s2c_packets_bytes.append(packet['bytes'])\n",
"                s2c_packets_intervals.append(packet['interval'])\n",
"        c2s_bytes = example_json['c2s_bytes']\n",
"        s2c_bytes = example_json['s2c_bytes']\n",
"        c2s_pkts = example_json['c2s_pkts']\n",
"        s2c_pkts = example_json['s2c_pkts']\n",
"        duration = example_json['duration']\n",
"\n",
"        # Same column order as base_head below.\n",
"        result = [c2s_bytes, c2s_pkts, s2c_bytes, s2c_pkts, duration]\n",
"        result += _direction_stats(c2s_bytes, c2s_packets_bytes, c2s_packets_intervals)\n",
"        result += _direction_stats(s2c_bytes, s2c_packets_bytes, s2c_packets_intervals)\n",
"\n",
"        # TLS: mark which known extensions and cipher suites the flow carries.\n",
"        tls = example_json['tls']\n",
"        extensions_arr = np.zeros(len(extensions), dtype=np.uint8)\n",
"        cipher_suits_arr = np.zeros(len(ciper_suits), dtype=np.uint8)\n",
"        for extension in tls['extensions_list']:\n",
"            try:\n",
"                extensions_arr[extensions[extension]] = 1\n",
"            except (KeyError, TypeError):\n",
"                pass  # extension type that is not tracked\n",
"        for cipher in tls['cipher_suites']:\n",
"            try:\n",
"                cipher_suits_arr[ciper_suits[cipher.upper()]] = 1\n",
"            except (KeyError, AttributeError):\n",
"                pass  # cipher suite that is not tracked, or not a string\n",
"        result += list(cipher_suits_arr)\n",
"        result += list(extensions_arr)\n",
"\n",
"        # Append row and label together so the two lists always stay aligned.\n",
"        result_data.append(result)\n",
"        result_label.append(label)\n",
"\n",
"base_head = ['c2s_bytes', 'c2s_pkts', 's2c_bytes', 's2c_pkts', 'duration',\n",
"             'c2s_packets_bytes_mean', 'c2s_packets_bytes_median', 'c2s_packets_bytes_std',\n",
"             'c2s_packets_bytes_max', 'c2s_packets_bytes_min',\n",
"             'c2s_packets_intervals_mean', 'c2s_packets_intervals_median', 'c2s_packets_intervals_std',\n",
"             'c2s_packets_intervals_max', 'c2s_packets_intervals_min',\n",
"             's2c_packets_bytes_mean', 's2c_packets_bytes_median', 's2c_packets_bytes_std',\n",
"             's2c_packets_bytes_max', 's2c_packets_bytes_min',\n",
"             's2c_packets_intervals_mean', 's2c_packets_intervals_median', 's2c_packets_intervals_std',\n",
"             's2c_packets_intervals_max', 's2c_packets_intervals_min']\n",
"cipher_head = ['cipher'+str(i) for i in range(len(ciper_suits))]\n",
"extensions_head = ['extension'+str(i) for i in range(len(extensions))]\n",
"header = base_head+cipher_head+extensions_head\n",
"result_df = pd.DataFrame(result_data, columns=header)\n",
"result_df['label'] = np.array(result_label)\n",
"# Create the dated output folder on first use instead of failing in to_csv.\n",
"os.makedirs(os.path.dirname(example_csv_file), exist_ok=True)\n",
"result_df.to_csv(example_csv_file, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hupu: 489846\n",
"weibo: 897897\n",
"douyin: 158497\n",
"toutiao: 213989\n",
"zhihu: 968036\n"
]
}
],
"source": [
"# Count the total number of packets per app\n",
"date = '2019-12-20_21'\n",
"root_dir = '/Users/Leo/Documents/github/GradProj/'\n",
"exmaples_file = root_dir + 'Experiment/StatFeature/CsvFile/' + date + '/examples.csv'\n",
"app2pktsDict = dict()\n",
"with open(exmaples_file) as f:\n",
"    next(f, None)  # skip the CSV header row\n",
"    for line in f:\n",
"        if not line.strip():\n",
"            continue  # tolerate blank lines\n",
"        # Strip the line terminator before splitting so the label (last column) is a\n",
"        # clean key even when the final row has no trailing newline.\n",
"        fields = line.rstrip('\\r\\n').split(',')\n",
"        pkts = int(fields[1]) + int(fields[3])  # columns 1 and 3: c2s_pkts + s2c_pkts\n",
"        appName = fields[-1]\n",
"        app2pktsDict[appName] = app2pktsDict.get(appName, 0) + pkts\n",
"for appName, pkts in app2pktsDict.items():\n",
"    print(appName + ': ', pkts)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"zhihu 6403\n",
"weibo 5487\n",
"douyin 3964\n",
"hupu 2304\n",
"toutiao 520\n",
"Name: label, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEVCAYAAADpbDJPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFx1JREFUeJzt3Xu0nXV95/H3B6hirUDQwFASDbZZXuoIYgRmvIxKy9UR\nxkoHV9XU0mYu1LFTZ2x0Zg0VdYm2asdebBlBo2OLqFWoMGqKqKWjSLgYUXQSESULK9EA2nphxX7n\nj/07ZBNOcvYJh/2c8Hu/1jprP8/v+e29v88m7M9+fs8tVYUkqT/7DF2AJGkYBoAkdcoAkKROGQCS\n1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpU/sNXcDuPOpRj6oVK1YMXYYk7VWuvfba71TV0rn6LeoA\nWLFiBRs2bBi6DEnaqyT5xiT9HAKSpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoA\nkKROLeozgRfCirWXDV0CALecd+rQJUjSvbgFIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhS\npwwASerURAGQ5KAkH0zylSQ3JfkXSQ5Osj7Jpva4pPVNkrcn2ZxkY5Kjx15ndeu/KcnqB2qlJElz\nm3QL4H8CH6uqxwNHAjcBa4ErqmolcEWbBzgZWNn+1gDvAEhyMHAOcCxwDHDOTGhIkqZvzgBIcgDw\nLOACgKq6u6ruBE4D1rVu64DT2/RpwHtq5HPAQUkOA04E1lfVtqq6A1gPnLSgayNJmtgkWwCPBbYC\n70pyfZJ3Jnk4cGhVfQugPR7S+h8O3Dr2/C2tbVftkqQBTBIA+wFHA++oqqcA/8iO4Z7ZZJa22k37\nvZ+crEmyIcmGrVu3TlCeJGlPTBIAW4AtVXV1m/8go0D4dhvaoT3ePtZ/+djzlwG37ab9Xqrq/Kpa\nVVWrli5dOp91kSTNw5wBUFV/D9ya5HGt6Xjgy8ClwMyRPKuBS9r0pcBL29FAxwF3tSGijwMnJFnS\ndv6e0NokSQOY9H4ALwfel+QhwM3AyxiFx8VJzgK+CZzR+l4OnAJsBn7Q+lJV25K8Drim9Tu3qrYt\nyFpIkuZtogCoqhuAVbMsOn6WvgWcvYvXuRC4cD4FSpIeGJ4JLEmdMgAkqVMGgCR1ygCQpE4ZAJLU\nKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdWrSi8HpQWDF2suGLgGAW847degSJOEWgCR1ywCQ\npE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1KmJAiDJLUm+mOSGJBta\n28FJ1ifZ1B6XtPYkeXuSzUk2Jjl67HVWt/6bkqx+YFZJkjSJ+WwBPKeqjqqqVW1+LXBFVa0Ermjz\nACcDK9vfGuAdMAoM4BzgWOAY4JyZ0JAkTd/9GQI6DVjXptcBp4+1v6dGPgcclOQw4ERgfVVtq6o7\ngPXASffj/SVJ98OkAVDAJ5Jcm2RNazu0qr4F0B4Pae2HA7eOPXdLa9tVuyRpAJPeD+DpVXVbkkOA\n9Um+spu+maWtdtN+7yePAmYNwKMf/egJy5MkzddEWwBVdVt7vB34MKMx/G+3oR3a4+2t+xZg+djT\nlwG37aZ95/c6v6pWVdWqpUuXzm9tJEkTmzMAkjw8ySNmpoETgBuBS4GZI3lWA5e06UuBl7ajgY4D\n7mpDRB8HTkiypO38PaG1SZIGMMkQ0KHAh5PM9P+LqvpYkmuAi5OcBXwTOKP1vxw4BdgM/AB4GUBV\nbUvyOuCa1u/cqtq2YGsiSZqXOQOgqm4Gjpyl/bvA8bO0F3D2Ll7rQuDC+ZcpSVpongksSZ0yACSp\nUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjpl\nAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKRO
GQCS1CkDQJI6ZQBIUqcMAEnq1MQBkGTfJNcn+Wib\nPyLJ1Uk2JXl/koe09oe2+c1t+Yqx13h1a/9qkhMXemUkSZObzxbAK4CbxubfBLytqlYCdwBntfaz\ngDuq6ueBt7V+JHkicCbwC8BJwJ8m2ff+lS9J2lMTBUCSZcCpwDvbfIDnAh9sXdYBp7fp09o8bfnx\nrf9pwEVV9eOq+jqwGThmIVZCkjR/k24B/CHwKuCf2vwjgTuranub3wIc3qYPB24FaMvvav3vaZ/l\nOfdIsibJhiQbtm7dOo9VkSTNx35zdUjyPOD2qro2ybNnmmfpWnMs291zdjRUnQ+cD7Bq1ar7LJcW\nwoq1lw1dArecd+rQJahzcwYA8HTg+UlOAfYHDmC0RXBQkv3ar/xlwG2t/xZgObAlyX7AgcC2sfYZ\n48+RJE3ZnENAVfXqqlpWVSsY7cT9ZFX9KnAl8MLWbTVwSZu+tM3Tln+yqqq1n9mOEjoCWAl8fsHW\nRJI0L5NsAezK7wIXJXk9cD1wQWu/AHhvks2MfvmfCVBVX0pyMfBlYDtwdlX95H68vyTpfphXAFTV\np4BPtembmeUonqr6EXDGLp7/BuAN8y1SkrTwPBNYkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoA\nkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ\n6pQBIEmdMgAkqVMGgCR1ygCQpE7NGQBJ9k/y+SRfSPKlJK9t7UckuTrJpiTvT/KQ1v7QNr+5LV8x\n9lqvbu1fTXLiA7VSkqS5TbIF8GPguVV1JHAUcFKS44A3AW+rqpXAHcBZrf9ZwB1V9fPA21o/kjwR\nOBP4BeAk4E+T7LuQKyNJmtycAVAj/9Bmf6r9FfBc4IOtfR1weps+rc3Tlh+fJK39oqr6cVV9HdgM\nHLMgayFJmreJ9gEk2TfJDcDtwHrga8CdVbW9ddkCHN6mDwduBWjL7wIeOd4+y3MkSVM2UQBU1U+q\n6ihgGaNf7U+YrVt7zC6W7ar9XpKsSbIhyYatW7dOUp4kaQ/M6yigqroT+BRwHHBQkv3aomXAbW16\nC7AcoC0/ENg23j7Lc8bf4/yqWlVVq5YuXTqf8iRJ8zDJUUBLkxzUph8G/CJwE3Al8MLWbTVwSZu+\ntM3Tln+yqqq1n9mOEjoCWAl8fqFWRJI0P/vN3YXDgHXtiJ19gIur6qNJvgxclOT1wPXABa3/BcB7\nk2xm9Mv/TICq+lKSi4EvA9uBs6vqJwu7OpKkSc0ZAFW1EXjKLO03M8tRPFX1I+CMXbzWG4A3zL9M\nSdJC80xgSeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaA\nJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6tQkt4SU9CC2Yu1lQ5fALeedOnQJXXILQJI6ZQBIUqcM\nAEnqlAEgSZ0yACSpU3MGQJLlSa5MclOSLyV5RWs/OMn6JJva45LWniRvT7I5ycYkR4+91urWf1OS\n1Q/cakmS5jLJFsB24JVV9QTgOODsJE8E1gJXVNVK4Io2D3AysLL9rQHeAaPAAM4BjgWOAc6ZCQ1J\n0vTNGQBV9a2quq5Nfx+4CTgcOA1Y17qtA05v06cB76mRzwEHJTkMOBFYX1XbquoOYD1w0oKujSRp\nYvPaB5BkBfAU4Grg0Kr6FoxCAjikdTscuHXsaVta267aJUkDmDgAkvwM8CHgt6vqe7vrOktb7aZ9\n5/dZk2RDkg1bt26dtDxJ0jxNFABJforRl//7quqvWvO329AO7fH21r4FWD729GXAbbtpv5eqOr+q\nVlXVqqVLl85nXSRJ8zDJUUABLgBuqqq3ji26FJg5kmc1cMlY+0vb0UDHAXe1IaKPAyckWdJ2/p7Q\n2iRJA5jk
YnBPB14CfDHJDa3tNcB5wMVJzgK+CZzRll0OnAJsBn4AvAygqrYleR1wTet3blVtW5C1\nkCTN25wBUFVXMfv4PcD
"text/plain": [
"<matplotlib.figure.Figure at 0x10ac1f320>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the feature table and show the number of flows per app label:\n",
"# a bar chart first, then the raw counts as the cell result.\n",
"examples_df = pd.read_csv(exmaples_file)\n",
"class_counts = examples_df['label'].value_counts()\n",
"class_counts.plot(kind='bar')\n",
"class_counts"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"183\n",
" precision recall f1\n",
"LogisticRegression 0.773019 0.773019 0.773019\n",
"SVM 0.841328 0.841328 0.841328\n",
"GaussianNB 0.730193 0.730193 0.730193\n",
"tree 0.984154 0.984154 0.984154\n",
"RandomForest 0.988651 0.988651 0.988651\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAFcCAYAAAAzq/4LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmYVOWZ/vHvTSvgbiJoDCigQQWVTRAQt2icqOOgiYlI\nQozBJYrEdRYy5pc4ZjIxmwZHxWhwjeIyjooGIxpRo0aHRXABMUha7RgVcQFEZPH5/XFOQ9E0dDUW\nfarPuT/XVZd1lqp6qqTvOvWe97yvIgIzM8uXNlkXYGZmledwNzPLIYe7mVkOOdzNzHLI4W5mlkMO\ndzOzHHK4m5nlkMPdqp6kf5f024187BJJu22CmnaS9LikxZJ+VennN/u05IuYbFOSdBHwhYgYUeb+\nhwK/i4jOG/Faj6aP3agvgma+1v8D+gLHx6f8I5J0A1AXET+oRG1m4CN3s43VBZj9aYO9EiRtlnUN\nVoUiwjffKnID/g34G7AYmAv8I7AcWAEsAWal+30HmJPuNx/4brp+K+Aj4JN0/yXA54GLSI7IAdoD\nvwMWAu8DU4GdgJ8Aq4Bl6eOuSPcPkl8OAFsAvwJeBT4AnkjXNfqcG3ifN6TvaXn6Wl8iOVAaA7yS\nPs8dwGdLHnMn8Gb6uo8De6frT2/wXPc1rLvkNf8zvX8oUJd+3m8CN6frjwFmpu/hKaBX1v8mfMvu\n5m98qwhJewKjgQER8YakrkAN8F+s2yzzNkkQzQcOBh6QNDUiZkg6igbNMpJKX+rbwHbALsDHQB/g\no4i4UNIQNtws80tgb+AAklAcSPJFcnJjz7m+9xoRJ6c1rW5KkXQucBxwCLAAuBy4EhiePuwBYCRJ\niP8MuAXoExHXSDqA5jfLfA74LMkviDaS+gHXAf8ETANGABMl7RkRHzfjeS0n3CxjlbIKaAf0lLR5\nRNRGxCuN7RgRv4+IVyLxGDAZOKjM11kB7EDyhbEqIqZHxKKmHiSpDUm4nhMRf0sf+1QafBv1nA18\nF7gwIurS57wI+Fp9k0lEXBcRi0u29Za0XTNfo9QnwI8i4uOI+Ag4DfhNRDyTvocbSb6oBn2K17BW\nzOFuFRER84BzSYLrbUm3Sfp8Y/tKOkrS05LelfQ+cDTQocyXuhl4ELhN0huSfi5p8zIe14Gk+aWx\nL5yNfc5SXYC7Jb2fvqc5JF94O0mqkXSJpFckLQJqS2raWAsiYlmD17+g/vXTGnYhadayAnK4W8VE\nxK0RcSBJ0ARJ88NaJxwltQPuImki2SkitgcmAfVtLxs8QRkRKyLiPyKiJ0nzyjHASWU89h2S9vjd\nm/mc5XodOCoiti+5tY+IvwHfAI4laZvfDuiaPmZD73kpsGXJ8ucalt3I6/+kwetvGRETmvk+LCcc\n7lYRkvaUdFga3stI2qxXAW8BXdNmEYC2JM03C4CVaRv7P5Q81VvADutrspD0RUn7SqoBFpE0qawq\neWyjfdoj4hOSNulLJX0+PZoeLKldE89ZrquBn0jqktbZUdKx6bZtSJpIFpIE9n81eGxjdc8EvpHW\neSRJW/6GXAucIWmgEltJ+kdJ2zTzfVhOONytUtoBl5AcIb8J7Aj8O0kvEYCFkmZExGLgbJLeJO+R\nHNVOrH+SiHgJmADMT5sXGjYrfA74H5IQngM8RtLTBWAsSTv3e5Iub6TGfwaeJ+kN8y7JL4s2TTxn\nucam72OypMXA0yQnbAFuIumh8zdgdrqt1HiScxXvS7onXXcOycnR94FvAvewARExjaTd/QqSz3Ue\nyYliKyhfxGRmlkM+cjczyyH3czdbD0lL1rPpqIj4U4sWY9ZMbpYxM8shN8uYmeVQZs0yHTp0iK5d\nu2b18mZmrdL06dPfiYiOTe2XWbh37dqVadOmZfXyZmatkqRXy9nPzTJmZjnkcDczy6Emw13SdZLe\nlvTCerZL0uWS5kl6Lh161MzMMlROm/sNJJc0
37Se7UcB3dPbQGAcay67NlvHihUrqKurY9myZU3v\nbOto3749nTt3ZvPNmztwpRVJk+EeEY+nEy+sz7HATZF0mH9a0vaSdo6Iv1eoRsuZuro6ttlmG7p2\n7dpwIg5rQkSwcOFC6urq6NatW9blWBWrRJt7J5LhRuvVpevMGrVs2TJ22GEHB/tGkMQOO+zgXz3W\npEqEe2N/oY1e9irpdEnTJE1bsGBBBV7aWisH+8bzZ2flqES415HM+FKvM/BGYztGxDUR0T8i+nfs\n2GQffLNW54ADDtjg9qOPPpr333+/haqxIqvERUwTgdGSbiM5kfqB29utObqO+X1Fn6/2kn+syPOs\nWrWKmpqaZj3mqaee2uD2SZMmfZqS7FNozr+z2vbfKHvffbvtWva+z3/7+bL3/bSaDHdJE4BDgQ6S\n6oAfAZsDRMTVJFOkHU0yOcBS4DubqlizSqmtreXII49k4MCBPPvss+yxxx7cdNNN9OzZk5EjRzJ5\n8mRGjx7NgAEDOOuss1iwYAFbbrkl1157LXvttRdvvfUWZ5xxBvPnzwdg3LhxHHDAAWy99dYsWbKE\nv//97wwbNoxFixaxcuVKxo0bx0EHHbT6yuwOHTpw6aWXct111wFw6qmncu6551JbW8tRRx3FgQce\nyFNPPUWnTp2499572WKLLTbqfRYt0GyNcnrLDG9iewBnVawisxYyd+5cxo8fz5AhQxg5ciRXXXUV\nkHQ1fOKJJwA4/PDDufrqq+nevTvPPPMMo0aN4pFHHuHss8/mkEMO4e6772bVqlUsWbL26MC33nor\nX/7yl7nwwgtZtWoVS5cuXWv79OnTuf7663nmmWeICAYOHMghhxzCZz7zGf7yl78wYcIErr32Wk44\n4QTuuusuRowY0TIfiuWGx3O3wtpll10YMmQIACNGjODyy5OZ+YYNGwbAkiVLeOqpp/j617+++jEf\nf/wxAI888gg33ZRc+lFTU8N226095euAAQMYOXIkK1as4LjjjqNPnz5rbX/iiSf4yle+wlZbbQXA\nV7/6Vf70pz8xdOhQunXrtnr//fbbj9ra2gq/cysCDz9ghdWw10n9cn3gfvLJJ2y//fbMnDlz9W3O\nnDllPffBBx/M448/TqdOnfjWt761+oug3obmUWjXrt3q+zU1NaxcubKs1zQr5XC3wnrttdf485//\nDMCECRM48MAD19q+7bbb0q1bN+68M5njOyKYNWsWkDTXjBs3DkhOvC5atGitx7766qvsuOOOnHba\naZxyyinMmDFjre0HH3ww99xzD0uXLuXDDz/k7rvv5qCDDtok79OKyeFuhdWjRw9uvPFGevXqxbvv\nvsuZZ565zj633HIL48ePp3fv3uy9997ce++9AIwdO5YpU6aw7777st9++/Hiiy+u9bhHH32UPn36\n0LdvX+666y7OOeectbb369ePk08+mf3335+BAwdy6qmn0rdv3033Zq1wMptmr3///uHx3Itpzpw5\n9OjRI9MaamtrOeaYY3jhhUbHw6t65X6G7i2zRl4+C0nTI6J/U/v5yN3MLIcc7lZIXbt2bbVH7Wbl\ncLibmeWQw93MLIcc7mZmOeRwNzPLIYe7WYXU1tayzz77AEk/92OOOSbjiqzIPLaMZe+i7Zrep1nP\n90Gzdo8IIoI2bXysY/nhf81WSLW1tfTo0YNRo0bRr18/br75ZgYPHky/fv34+te/vnqUx6lTp3LA\nAQfQu3dv9t9/fxYvXkxtbS0HHXQQ/fr1o1+/fk2O4W6WBYe7FdbcuXM56aSTeOihhxg/fjwPP/ww\nM2bMoH///lx66aUsX76cYcOGMXbsWGbNmsXDDz/MFltswY477shDDz3EjBkzuP322zn77LOzfitm\n63CzjBVWly5dGDRoEPfffz+zZ89ePfzv8uXLGTx4MHPnzmXnnXdmwIABQDKQGMCHH37I6NGjmTlz\nJjU1Nbz8
8suZvQez9XG4W2HVD+0bERxxxBFMmDBhre3PPfdco5NRX3bZZey0007MmjWLTz75hPbt\n27dIvWbN4WYZK7xBgwb
"text/plain": [
"<matplotlib.figure.Figure at 0x112e0e550>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn import tree\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import f1_score,recall_score,precision_score\n",
"import random\n",
"\n",
"SEED = 0               # fixes the shuffle and the tree-based models so a re-run reproduces the table\n",
"TRAIN_FRACTION = 0.75  # share of the shuffled flows used for training\n",
"N_FLOW_FEATURES = 25   # use only the 25 leading flow-statistics columns\n",
"# Micro-averaged precision, recall and F1 all equal plain accuracy on single-label\n",
"# multiclass data, which made the three columns identical. Macro averaging weights\n",
"# every app equally and keeps the three metrics informative despite class imbalance.\n",
"AVERAGE = 'macro'\n",
"\n",
"print(examples_df.shape[1])\n",
"# Numeric feature matrix and label vector taken directly from the frame, so the\n",
"# estimators receive floats rather than a mixed object array.\n",
"features = examples_df.iloc[:, :N_FLOW_FEATURES].values.astype(float)\n",
"labels = examples_df.iloc[:, -1].values\n",
"\n",
"# One seeded shuffle and one split, shared by every classifier.\n",
"order = np.random.RandomState(SEED).permutation(len(features))\n",
"n_train = int(len(features) * TRAIN_FRACTION)\n",
"x_train, y_train = features[order[:n_train]], labels[order[:n_train]]\n",
"x_test, y_test = features[order[n_train:]], labels[order[n_train:]]\n",
"\n",
"\n",
"def evaluate(classifier):\n",
"    \"\"\"Fit `classifier` on the training split and return [precision, recall, f1] on the test split.\"\"\"\n",
"    classifier.fit(x_train, y_train)\n",
"    y_pred = classifier.predict(x_test)\n",
"    return [precision_score(y_test, y_pred, average=AVERAGE),\n",
"            recall_score(y_test, y_pred, average=AVERAGE),\n",
"            f1_score(y_test, y_pred, average=AVERAGE)]\n",
"\n",
"\n",
"classifiers = [\n",
"    ('LogisticRegression', LogisticRegression()),\n",
"    ('SVM', SVC()),\n",
"    ('GaussianNB', GaussianNB()),\n",
"    ('tree', tree.DecisionTreeClassifier(random_state=SEED)),\n",
"    ('RandomForest', RandomForestClassifier(random_state=SEED)),\n",
"]\n",
"score_df = pd.DataFrame([evaluate(model) for _, model in classifiers],\n",
"                        index=[name for name, _ in classifiers],\n",
"                        columns=['precision', 'recall', 'f1'])\n",
"print(score_df)\n",
"ax = score_df.plot.bar(title='statistics_feature')\n",
"fig = ax.get_figure()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}