This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
cuiyiming-gradproj/Experiment/statFeature/.ipynb_checkpoints/StatFeature-checkpoint.ipynb
2019-12-23 01:20:51 +08:00

612 lines
41 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import pandas as pd\n",
"import numpy as np\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Maps an upper-case hex cipher-suite code to its column in the one-hot cipher\n",
"# vector. The column is simply the position of the code in the tuple below;\n",
"# the trailing comments give the column range covered by each row.\n",
"ciper_suits = {\n",
"    code: column\n",
"    for column, code in enumerate((\n",
"        '1305', 'C030', 'C02C', 'C028', 'C024', 'C014', 'C00A', '00A5', '00A3', '00A1',  # 0-9\n",
"        '009F', '006B', '006A', '0069', '0068', '0039', '0038', '0037', '0036', '0088',  # 10-19\n",
"        '0087', '0086', '0085', 'C019', '00A7', '006D', '003A', '0089', 'C032', 'C02E',  # 20-29\n",
"        'C02A', 'C026', 'C00F', 'C005', '009D', '003D', '0035', '0084', '008D', 'C02F',  # 30-39\n",
"        'C02B', 'C027', 'C023', 'C013', 'C009', '00A4', '00A2', '00A0', '009E', '0067',  # 40-49\n",
"        '0040', '003F', '003E', '0033', '0032', '0031', '0030', '009A', '0099', '0098',  # 50-59\n",
"        '0097', '0045', '0044', '0043', '0042', 'C018', '00A6', '006C', '0034', '009B',  # 60-69\n",
"        '0046', 'C031', 'C02D', 'C029', 'C025', 'C00E', 'C004', '009C', '003C', '002F',  # 70-79\n",
"        '0096', '0041', '008C', 'C012', 'C008', '0016', '0013', '0010', '000D', 'C017',  # 80-89\n",
"        '001B', 'C00D', 'C003', '000A', '0007', '008B', '0021', '001F', '0025', '0023',  # 90-99\n",
"        'C011', 'C007', 'C016', '0018', 'C00C', 'C002', '0005', '0004', '008A', '0020',  # 100-109\n",
"        '0024', 'C010', 'C006', 'C015', 'C00B', 'C001', '003B', '0002', '0001', '1301',  # 110-119\n",
"        '1302', '1303', '1304',  # 120-122\n",
"    ))\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Maps a TLS extension number to its column in the one-hot extension vector:\n",
"# extensions 0..31 keep their own number, 35 and 65281 take columns 32 and 33.\n",
"extensions = {\n",
"    extension: column\n",
"    for column, extension in enumerate(list(range(32)) + [35, 65281])\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"date = '2019-12-20_21'\n",
"example_label_file = '../../DataSet/result/' + date + '/stream_tag.txt'\n",
"example_json_file = '../../DataSet/result/' + date + '/stream_stat.txt'\n",
"example_csv_file = './CsvFile/' + date + '/examples.csv'\n",
"\n",
"# Per-direction summary statistics, in the order they appear in the CSV columns.\n",
"_STAT_NAMES = ('mean', 'median', 'std', 'max', 'min')\n",
"\n",
"\n",
"def _summarize(values):\n",
"    \"\"\"Return [mean, median, std, max, min] of values, or five zeros if values is empty.\"\"\"\n",
"    if len(values) == 0:\n",
"        # np.max and np.min raise ValueError on an empty sequence.\n",
"        return [0] * len(_STAT_NAMES)\n",
"    return [np.mean(values), np.median(values), np.std(values), np.max(values), np.min(values)]\n",
"\n",
"\n",
"def _one_hot(items, index_of, normalize=None):\n",
"    \"\"\"Return a uint8 vector holding 1 at index_of[item] for every item found in index_of.\n",
"\n",
"    normalize, when given, is applied to each item before the lookup. The vector\n",
"    has len(index_of) entries, so index_of is expected to map onto 0..len(index_of)-1.\n",
"    \"\"\"\n",
"    encoded = np.zeros(len(index_of), dtype=np.uint8)\n",
"    for item in items:\n",
"        try:\n",
"            key = normalize(item) if normalize is not None else item\n",
"            encoded[index_of[key]] = 1\n",
"        except (LookupError, TypeError, AttributeError):\n",
"            # Best effort: unknown or malformed entries are skipped on purpose.\n",
"            continue\n",
"    return encoded\n",
"\n",
"\n",
"# Map every labelled flow (sip, sport, dip, dport) to its label (fifth column).\n",
"example_label_df = pd.read_table(example_label_file, sep=r'\\s+', header=None)\n",
"example_label = {\n",
"    tuple(row[:4]): row[4]\n",
"    for row in example_label_df.itertuples(index=False, name=None)\n",
"}\n",
"\n",
"result_data = list()\n",
"result_label = list()\n",
"zero_stats = [0] * len(_STAT_NAMES)\n",
"# The with-block closes the input file even if a record fails to parse.\n",
"with open(example_json_file, 'r') as example_json_f:\n",
"    for line in example_json_f:\n",
"        if not line.strip():\n",
"            # A blank line (e.g. at the end of the file) is not a JSON record.\n",
"            continue\n",
"        example_json = json.loads(line)\n",
"\n",
"        # Label: flows that have no entry in the tag file are skipped.\n",
"        try:\n",
"            flow_key = (example_json['sip'], example_json['sport'], example_json['dip'], example_json['dport'])\n",
"            label = example_label[flow_key]\n",
"        except (KeyError, TypeError):\n",
"            continue\n",
"\n",
"        # Statistical features: split packet sizes and intervals by direction\n",
"        # (dir == 1 goes to the c2s lists, dir == 2 to the s2c lists).\n",
"        c2s_packets_bytes = list()\n",
"        s2c_packets_bytes = list()\n",
"        c2s_packets_intervals = list()\n",
"        s2c_packets_intervals = list()\n",
"        for packet in example_json['packets']:\n",
"            if packet['dir'] == 1:\n",
"                c2s_packets_bytes.append(packet['bytes'])\n",
"                c2s_packets_intervals.append(packet['interval'])\n",
"            elif packet['dir'] == 2:\n",
"                s2c_packets_bytes.append(packet['bytes'])\n",
"                s2c_packets_intervals.append(packet['interval'])\n",
"\n",
"        c2s_bytes = example_json['c2s_bytes']\n",
"        s2c_bytes = example_json['s2c_bytes']\n",
"        result = [c2s_bytes, example_json['c2s_pkts'], s2c_bytes, example_json['s2c_pkts'], example_json['duration']]\n",
"        # A direction that carried no bytes keeps all ten of its statistics at 0.\n",
"        if c2s_bytes > 0:\n",
"            result += _summarize(c2s_packets_bytes) + _summarize(c2s_packets_intervals)\n",
"        else:\n",
"            result += zero_stats + zero_stats\n",
"        if s2c_bytes > 0:\n",
"            result += _summarize(s2c_packets_bytes) + _summarize(s2c_packets_intervals)\n",
"        else:\n",
"            result += zero_stats + zero_stats\n",
"\n",
"        # TLS features: one-hot vectors of the offered cipher suites and extensions.\n",
"        tls = example_json['tls']\n",
"        result += _one_hot(tls['cipher_suites'], ciper_suits, normalize=str.upper).tolist()\n",
"        result += _one_hot(tls['extensions_list'], extensions).tolist()\n",
"\n",
"        # Append features and label together so the two lists stay aligned.\n",
"        result_data.append(result)\n",
"        result_label.append(label)\n",
"\n",
"base_head = ['c2s_bytes', 'c2s_pkts', 's2c_bytes', 's2c_pkts', 'duration']\n",
"base_head += [\n",
"    direction + '_packets_' + kind + '_' + stat\n",
"    for direction in ('c2s', 's2c')\n",
"    for kind in ('bytes', 'intervals')\n",
"    for stat in _STAT_NAMES\n",
"]\n",
"cipher_head = ['cipher' + str(column) for column in range(len(ciper_suits))]\n",
"extensions_head = ['extension' + str(column) for column in range(len(extensions))]\n",
"header = base_head + cipher_head + extensions_head\n",
"result_df = pd.DataFrame(result_data, columns=header)\n",
"result_df['label'] = np.array(result_label)\n",
"# Create the output directory first; to_csv does not create missing directories.\n",
"os.makedirs(os.path.dirname(example_csv_file), exist_ok=True)\n",
"result_df.to_csv(example_csv_file, index=False)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import os\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"hupu: 489846\n",
"weibo: 897897\n",
"douyin: 158497\n",
"toutiao: 213989\n",
"zhihu: 968036\n"
]
}
],
"source": [
"# Count the packets (c2s_pkts + s2c_pkts) recorded for each app label.\n",
"date = '2019-12-20_21'\n",
"exmaples_file = './CsvFile/' + date + '/examples.csv'\n",
"app2pktsDict = dict()\n",
"with open(exmaples_file) as f:\n",
"    next(f, None)  # skip the CSV header row\n",
"    for line in f:\n",
"        if not line.strip():\n",
"            continue\n",
"        # Strip the line terminator here so it never becomes part of the label key,\n",
"        # whether or not the last line of the file ends with a newline.\n",
"        fields = line.rstrip('\\n').split(',')\n",
"        # Columns 1 and 3 hold c2s_pkts and s2c_pkts; the label is the last column.\n",
"        pkts = int(fields[1]) + int(fields[3])\n",
"        appName = fields[-1]\n",
"        app2pktsDict[appName] = app2pktsDict.get(appName, 0) + pkts\n",
"for appName, pkts in app2pktsDict.items():\n",
"    print(appName + ': ', pkts)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"zhihu 6403\n",
"weibo 5487\n",
"douyin 3964\n",
"hupu 2304\n",
"toutiao 520\n",
"Name: label, dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEVCAYAAADpbDJPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFx1JREFUeJzt3Xu0nXV95/H3B6hirUDQwFASDbZZXuoIYgRmvIxKy9UR\nxkoHV9XU0mYu1LFTZ2x0Zg0VdYm2asdebBlBo2OLqFWoMGqKqKWjSLgYUXQSESULK9EA2nphxX7n\nj/07ZBNOcvYJh/2c8Hu/1jprP8/v+e29v88m7M9+fs8tVYUkqT/7DF2AJGkYBoAkdcoAkKROGQCS\n1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpU/sNXcDuPOpRj6oVK1YMXYYk7VWuvfba71TV0rn6LeoA\nWLFiBRs2bBi6DEnaqyT5xiT9HAKSpE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoA\nkKROLeozgRfCirWXDV0CALecd+rQJUjSvbgFIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhS\npwwASerURAGQ5KAkH0zylSQ3JfkXSQ5Osj7Jpva4pPVNkrcn2ZxkY5Kjx15ndeu/KcnqB2qlJElz\nm3QL4H8CH6uqxwNHAjcBa4ErqmolcEWbBzgZWNn+1gDvAEhyMHAOcCxwDHDOTGhIkqZvzgBIcgDw\nLOACgKq6u6ruBE4D1rVu64DT2/RpwHtq5HPAQUkOA04E1lfVtqq6A1gPnLSgayNJmtgkWwCPBbYC\n70pyfZJ3Jnk4cGhVfQugPR7S+h8O3Dr2/C2tbVftkqQBTBIA+wFHA++oqqcA/8iO4Z7ZZJa22k37\nvZ+crEmyIcmGrVu3TlCeJGlPTBIAW4AtVXV1m/8go0D4dhvaoT3ePtZ/+djzlwG37ab9Xqrq/Kpa\nVVWrli5dOp91kSTNw5wBUFV/D9ya5HGt6Xjgy8ClwMyRPKuBS9r0pcBL29FAxwF3tSGijwMnJFnS\ndv6e0NokSQOY9H4ALwfel+QhwM3AyxiFx8VJzgK+CZzR+l4OnAJsBn7Q+lJV25K8Drim9Tu3qrYt\nyFpIkuZtogCoqhuAVbMsOn6WvgWcvYvXuRC4cD4FSpIeGJ4JLEmdMgAkqVMGgCR1ygCQpE4ZAJLU\nKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdWrSi8HpQWDF2suGLgGAW847degSJOEWgCR1ywCQ\npE4ZAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1KmJAiDJLUm+mOSGJBta\n28FJ1ifZ1B6XtPYkeXuSzUk2Jjl67HVWt/6bkqx+YFZJkjSJ+WwBPKeqjqqqVW1+LXBFVa0Ermjz\nACcDK9vfGuAdMAoM4BzgWOAY4JyZ0JAkTd/9GQI6DVjXptcBp4+1v6dGPgcclOQw4ERgfVVtq6o7\ngPXASffj/SVJ98OkAVDAJ5Jcm2RNazu0qr4F0B4Pae2HA7eOPXdLa9tVuyRpAJPeD+DpVXVbkkOA\n9Um+spu+maWtdtN+7yePAmYNwKMf/egJy5MkzddEWwBVdVt7vB34MKMx/G+3oR3a4+2t+xZg+djT\nlwG37aZ95/c6v6pWVdWqpUuXzm9tJEkTmzMAkjw8ySNmpoETgBuBS4GZI3lWA5e06UuBl7ajgY4D\n7mpDRB8HTkiypO38PaG1SZIGMMkQ0KHAh5PM9P+LqvpYkmuAi5OcBXwTOKP1vxw4BdgM/AB4GUBV\nbUvyOuCa1u/cqtq2YGsiSZqXOQOgqm4Gjpyl/bvA8bO0F3D2Ll7rQuDC+ZcpSVpongksSZ0yACSp\nUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6pQBIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjpl\nAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKRO
GQCS1CkDQJI6ZQBIUqcMAEnq1MQBkGTfJNcn+Wib\nPyLJ1Uk2JXl/koe09oe2+c1t+Yqx13h1a/9qkhMXemUkSZObzxbAK4CbxubfBLytqlYCdwBntfaz\ngDuq6ueBt7V+JHkicCbwC8BJwJ8m2ff+lS9J2lMTBUCSZcCpwDvbfIDnAh9sXdYBp7fp09o8bfnx\nrf9pwEVV9eOq+jqwGThmIVZCkjR/k24B/CHwKuCf2vwjgTuranub3wIc3qYPB24FaMvvav3vaZ/l\nOfdIsibJhiQbtm7dOo9VkSTNx35zdUjyPOD2qro2ybNnmmfpWnMs291zdjRUnQ+cD7Bq1ar7LJcW\nwoq1lw1dArecd+rQJahzcwYA8HTg+UlOAfYHDmC0RXBQkv3ar/xlwG2t/xZgObAlyX7AgcC2sfYZ\n48+RJE3ZnENAVfXqqlpWVSsY7cT9ZFX9KnAl8MLWbTVwSZu+tM3Tln+yqqq1n9mOEjoCWAl8fsHW\nRJI0L5NsAezK7wIXJXk9cD1wQWu/AHhvks2MfvmfCVBVX0pyMfBlYDtwdlX95H68vyTpfphXAFTV\np4BPtembmeUonqr6EXDGLp7/BuAN8y1SkrTwPBNYkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoA\nkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKnDABJ\n6pQBIEmdMgAkqVMGgCR1ygCQpE7NGQBJ9k/y+SRfSPKlJK9t7UckuTrJpiTvT/KQ1v7QNr+5LV8x\n9lqvbu1fTXLiA7VSkqS5TbIF8GPguVV1JHAUcFKS44A3AW+rqpXAHcBZrf9ZwB1V9fPA21o/kjwR\nOBP4BeAk4E+T7LuQKyNJmtycAVAj/9Bmf6r9FfBc4IOtfR1weps+rc3Tlh+fJK39oqr6cVV9HdgM\nHLMgayFJmreJ9gEk2TfJDcDtwHrga8CdVbW9ddkCHN6mDwduBWjL7wIeOd4+y3MkSVM2UQBU1U+q\n6ihgGaNf7U+YrVt7zC6W7ar9XpKsSbIhyYatW7dOUp4kaQ/M6yigqroT+BRwHHBQkv3aomXAbW16\nC7AcoC0/ENg23j7Lc8bf4/yqWlVVq5YuXTqf8iRJ8zDJUUBLkxzUph8G/CJwE3Al8MLWbTVwSZu+\ntM3Tln+yqqq1n9mOEjoCWAl8fqFWRJI0P/vN3YXDgHXtiJ19gIur6qNJvgxclOT1wPXABa3/BcB7\nk2xm9Mv/TICq+lKSi4EvA9uBs6vqJwu7OpKkSc0ZAFW1EXjKLO03M8tRPFX1I+CMXbzWG4A3zL9M\nSdJC80xgSeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaA\nJHXKAJCkThkAktQpA0CSOmUASFKnDABJ6tQkt4SU9CC2Yu1lQ5fALeedOnQJXXILQJI6ZQBIUqcM\nAEnqlAEgSZ0yACSpU3MGQJLlSa5MclOSLyV5RWs/OMn6JJva45LWniRvT7I5ycYkR4+91urWf1OS\n1Q/cakmS5jLJFsB24JVV9QTgOODsJE8E1gJXVNVK4Io2D3AysLL9rQHeAaPAAM4BjgWOAc6ZCQ1J\n0vTNGQBV9a2quq5Nfx+4CTgcOA1Y17qtA05v06cB76mRzwEHJTkMOBFYX1XbquoOYD1w0oKujSRp\nYvPaB5BkBfAU4Grg0Kr6FoxCAjikdTscuHXsaVta267aJUkDmDgAkvwM8CHgt6vqe7vrOktb7aZ9\n5/dZk2RDkg1bt26dtDxJ0jxNFABJforRl//7quqvWvO329AO7fH21r4FWD729GXAbbtpv5eqOr+q\nVlXVqqVLl85nXSRJ8zDJUUABLgBuqqq3ji26FJg5kmc1cMlY+0vb0UDHAXe1IaKPAyckWdJ2/p7Q\n2iRJA5jk
YnBPB14CfDHJDa3tNcB5wMVJzgK+CZzRll0OnAJsBn4AvAygqrYleR1wTet3blVtW5C1\nkCTN25wBUFVXMfv4PcDxs/Qv4OxdvNaFwIXzKVCS9MDwTGBJ6pQBIEmdMgAkqVMGgCR1ygCQpE4Z\nAJLUKQNAkjplAEhSpwwASeqUASBJnTIAJKlTBoAkdcoAkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEg\nSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOmUASFKn5gyAJBcmuT3JjWNtBydZn2RTe1zS2pPk7Uk2\nJ9mY5Oix56xu/TclWf3ArI4kaVKTbAG8Gzhpp7a1wBVVtRK4os0DnAysbH9rgHfAKDCAc4BjgWOA\nc2ZCQ5I0jDkDoKo+A2zbqfk0YF2bXgecPtb+nhr5HHBQksOAE4H1VbWtqu4A1nPfUJEkTdGe7gM4\ntKq+BdAeD2nthwO3jvXb0tp21X4fSdYk2ZBkw9atW/ewPEnSXBZ6J3BmaavdtN+3ser8qlpVVauW\nLl26oMVJknbY0wD4dhvaoT3e3tq3AMvH+i0DbttNuyRpIHsaAJcCM0fyrAYuGWt/aTsa6DjgrjZE\n9HHghCRL2s7fE1qbJGkg+83VIclfAs8GHpVkC6Ojec4DLk5yFvBN4IzW/XLgFGAz8APgZQBVtS3J\n64BrWr9zq2rnHcuSpCmaMwCq6kW7WHT8LH0LOHsXr3MhcOG8qpMkPWA8E1iSOmUASFKnDABJ6pQB\nIEmdMgAkqVMGgCR1ygCQpE4ZAJLUKQNAkjplAEhSp+a8FIQk9WLF2suGLoFbzjt1au/lFoAkdcoA\nkKROGQCS1CkDQJI6ZQBIUqcMAEnqlAEgSZ0yACSpUwaAJHXKAJCkThkAktQpA0CSOjX1AEhyUpKv\nJtmcZO2031+SNDLVAEiyL/AnwMnAE4EXJXniNGuQJI1MewvgGGBzVd1cVXcDFwGnTbkGSRLTD4DD\ngVvH5re0NknSlKWqpvdmyRnAiVX1G23+JcAxVfXysT5rgDVt9nHAV6dW4K49CvjO0EUsEn4WO/hZ\n7OBnscNi+CweU1VL5+o07TuCbQGWj80vA24b71BV5wPnT7OouSTZUFWrhq5jMfCz2MHPYgc/ix32\nps9i2kNA1wArkxyR5CHAmcClU65BksSUtwCqanuS3wI+DuwLXFhVX5pmDZKkkanfFL6qLgcun/b7\n3k+LakhqYH4WO/hZ7OBnscNe81lMdSewJGnx8FIQktQpA0CSOmUASFKnpr4TeG+Q5FmztVfVZ6Zd\ni6TFL8nzgZnvjU9X1V8PWc+k3Ak8iyTj//H2Z3QNo2ur6rkDlTSYJAcCvwc8szV9Gji3qu4arKiB\nJHkB8CbgECDtr6rqgEELG0iS/zFbe1WdO+1ahpTkjYy+I97Xml4EbKiqVw9X1WQMgAkkWQ68uape\nNHQt05bkQ8CNwLrW9BLgyKp6wXBVDSPJZuBfV9VNQ9eyGCR55djs/sDzgJuq6tcHKmkQSTYCR1XV\nP7X5fYHrq+rJw1Y2N4eAJrMFeNLQRQzk56rql8fmX5vkhsGqGda3/fLfoareMj6f5A/o98z+g4Bt\nbfrAIQuZDwNgFkn+CJjZNNoHOAr4wnAVDeqHSZ5RVVcBJHk68MOBaxrKhiTvBz4C/Himsar+ariS\nFpWfBh47dBEDeCNwfZIrGQ0LPgtY9MM/4BDQrJKsHpvdDtxSVX83VD1DSnIUo+GfAxn9494GrK6q\njYMWNoAk75qluXob8piR5Ivs+KG0L7CU0f6hPx6uqmEkOQx4GqP/R66uqr8fuKSJGACaSJIDAKrq\ne0PXosUhyWPGZrczGiLbPlQ9Q0qyBFjJaF8IsHccNWgAzKINc/we8BhGw2QzR3t0t3mb5JHAOcAz\nGP3au4rRr7zvDlrYFCV5VVW9eaehwXtU1X8aoKxFIcnRjP3bqKrrBy5p6p
L8BvAKRpe3vwE4Dvjs\n3nDUoPsAZncB8J+Ba4GfDFzL0C4CPgPM7Aj+VeD9wC8OVtH0zez43TBoFYtMOwz0DGBmH8i7k3yg\nql4/YFlDeAWj4Z/PVdVzkjweeO3ANU3ELYBZJLm6qo4duo7FIMm1VfXUndr2mhteLKQkT6qqG4eu\nY7FIchPwlKr6UZt/GHBdVT1h2MqmK8k1VfW0dnTcsVX14yQ3VNVRQ9c2F7cAxrTNWYArk/w+o182\n40d7XDdIYcO6MsmZwMVt/oXAZQPWM6Q/azcyejfwF1V158D1DO0WRmPeP2rzDwW+Nlg1w9mS5CBG\nR4etT3IHO93pcLFyC2BMO4xrV2pvGNNbKEm+z2hcN8DD2TEUti/wDx2f/boS+HVGQx+fB95dVZ8Y\ntqphJPkIo6GP9Yz+rfwSo31Et0Of+0aS/CtGR8x9rKruHrqeuRgA0jy1Mz1PB94OfI9RSL6mt/MB\ndjpc+j6qat3ulu/tkhxQVd9LcvBsy6tq22zti4kBMCbJi6vqfyf5ndmWV9Vbp13TUJI8vqq+MjYs\ndi89DocleTLwMuBURr96L6iq65L8LKOjPh6z2xfQg0qSj1bV85J8nR1by/c87g1HDboP4N4e3h4f\nMWgVi8PvAGuAt8yyrIBuhsPG/DHwTka/9u85G7qqbkvy34craxhjX3z3sjd88S2Eqnpeezxi6Fr2\nlFsAkvZIO0dkxv6M9oscXFWzXiX0wSrJFVV1/Fxti5EBMIskS4HfBFYwtpXU4yn/SX6a0dbAo6tq\nTdsJ+riq+ujApU1d7794J5Hkqqp6xtB1TEOS/Rld/+hK4NmMhn4ADgD+z95wOKxDQLO7BPhb4G/w\nRLB3MToh7l+2+S3AB4DuAgAYP/fhnl+8A9UyuJ32D+3D6PPpafj03wG/DfwsML5P7HvAnwxS0Ty5\nBTCLveUkjmmYOekryfVV9ZTW9oWqOnLo2haDnn7x7qwdNj3zBbKd0XkBf1BV/2+wogaQ5OVV9UdD\n17En3AKY3UeTnFJVlw9dyCJwdzvDswCS/BxjJ8f1xF+893Eyo0uErGDHd8mZQFd3BAPuSvLSnRur\n6j1DFDMfBsCYsZOfAF6T5G7gbvq+9d85wMeA5UneBzwd+LVBKxrO+BFRM794f2WYUhaFjwB3Mhr+\n+NEcfR/MnjY2vT9wPKPPZNEHgENAs0jySeAtVXXZWNv/qqrfHLCsQSR5L/BFRjeBuZnRtc6/M2xV\nWgyS3FhVvd4pb5fafbTfW1XPH7qWuewzdAGL1ArgVTvd9Pqpu+j7YPcuRr9qns/ozNc/T/KKYUsa\nRpIDk7w1yYb295b2P3uv/m+Sfz50EYvQDxjdG2DRcwtgFkmuA45h9IW3HHgxcGVVzXpW7INdu/TB\n04DnAP8e+GFVPX7YqqYvyYeAGxndIQ3gJcCRVfWC4aqavrE7ge3H6IvuZkb7hWaGShf9zdAXUpK/\n5t53RnsCcHFVrR2uqskYALPY6YiXXwNeCSypqmWDFjaAJFcwOkP6s4wOjb2qqm4ftqphzHZ0WI9H\njO10J7D7qKpvTKuWxaBdAG7GduAbVbVlqHrmw53As/uzmYmqenf7xXP2gPUMaSOj4a8nAXcBdyb5\n7PilEDrywyTPqKqr4J47x3X3OfT2BT+Xqvp0kkPZsTN405D1zIdbAJpIkp9hdCG0/wL8s6p66MAl\nTV2SIxkd2TEz7n8HsLqqNg5XlYaW5FeA3wc+xWgY7JnAf62qDw5Z1yQMAO1Wkt9i9A/6qcA3GN0e\n8m+r6pODFjZFO10ddub+CAD/yGjMu5urxOq+knwB+KWZodF2KZm/2RtOlnQISHN5GPBW4Nqq2j50\nMQOZOdnrcYw28y9hFAQvZhSI6ts+O+0X+y57yRGWbgFIE0ryCeCXq+r7bf4RwAeq6qRhK9OQkrwZ\nOBL4y9b0b4GNVfW7w1U1mb0ipaRF4t
GMzgyfcTejc0bUtwL+HHgyoyA4f9hyJucWgDShJP+N0aUf\nPszof/p/A7y/qt44aGEaVJLrdj5HKMnGveF8CANAmod2QbhnttnPVNX1Q9aj4ST5D8B/BB4LfG1s\n0SOAv6uqFw9S2DwYAJK0B9plQJYAbwTGz/r9/t5wQ3gwACSpW+4ElqROGQCS1CkDQJI6ZQBIUqcM\nAEnq1P8HV9/axntrFV8AAAAASUVORK5CYII=\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x1a16bb52e8>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of flows per app label: drawn as a bar chart, then shown as a table.\n",
"examples_df = pd.read_csv(exmaples_file)\n",
"class_counts = examples_df.loc[:, 'label'].value_counts()\n",
"class_counts.plot.bar()\n",
"class_counts"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1\n",
"LogisticRegression 0.767452 0.767452 0.767452\n",
"SVM 0.822912 0.822912 0.822912\n",
"GaussianNB 0.711349 0.711349 0.711349\n",
"tree 0.982013 0.982013 0.982013\n",
"RandomForest 0.985225 0.985225 0.985225\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAFcCAYAAAAzq/4LAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XmYVNWd//H3hxZsRJGMtI4BBSQacWMRFNwT40QdB40m\nASbGGBdGkRijkxkzyZM4JvmZxXGLWzSKSxCXOCoajEvEGDU67ERAFEmjHY1BorIYZfH7++PehqJp\n6Gos+lbf+3k9Tz3WXbr62yX96VPnnnuOIgIzM8uXDlkXYGZmledwNzPLIYe7mVkOOdzNzHLI4W5m\nlkMOdzOzHHK4m5nlkMPdMiOpXtJnqqCOgyW9LGm5pBOyrsesEhzuZnAxcHVEbBsR93+UF6qWP1hm\nDncz6AXMyboIAElbZV2D5YPD3bI2RNJcSW9LGiepVtLHJD0kaXG6/yFJPRu/QNKpkhZKWibpT5K+\nVHLsNEnz0q97RFKvTX1zSa8AuwEPpt0yW0vaXtJNkt6Q9GdJP5BUk57fV9ITkpZIekvSeEnd0mO3\nA7uWvNZ/SDpCUkOT77m2dS/pIkm/kvRLSUuBUyV1kHShpFfS73O3pH+o0PttBeFwt6x9Cfgs0BfY\nA/gOyb/LcSQt6l2BvwNXA0jqAlwFHBMR2wEHATPTYycA/wWcCNQBvwcmbOqbR0Rf4FXgX9JumQ+A\nW4HVwCeAgcA/AWekXyLgEuDjQD9gF+Ci9LW+3OS1flLme3A88CugGzAeOBc4ATg8/T5vA9eU+Vpm\niYjww49MHkA9cFbJ9rHAK82cNwB4O33eBXgHOAno3OS8h4HTS7Y7AO8Bvcqo4zPp852AD0pfGxgF\nTN7I154AzGjutdLtI4CGTXy/i4CnmhyfBxxZsr0zsArYKuv/Z360n4db7pa110qeLwI+LmkbST+X\ntCjtqngK6CapJiJWACOAs4A3JP1a0p7p1/cCrpT0jqR3gL+RtLR7tKKeXkDH9LUbX+fnwI4AknaU\ndGfaXbMU+CXQfbN/+sRrTbZ7AfeVfP95wBqSPzxmZXG4W9Z2KXm+K/A6cAHwSeDAiOgKHJYeF0BE\nPBIRR5G0aF8EbkyPvwb8W0R0K3l0johnW1HPayQt9+4lr9E1IvZOj18CBLBfWtvJjXWlms6hvQLY\npnEj7buva3JO0695jaTbqfTnqI2IP7fi57CCc7hb1s6R1DO9YPhfwF3AdiT97O+k+7/XeLKknSQN\nT/vePwCWk7RqAa4HviVp7/Tc7SV9oTXFRMQbwKPA/0jqml7c7Cvp8PSU7dLv+Y6kHsA3m7zEmyQX\naBu9BNRK+mdJHUmuKWzdQhnXAz9svBgsqU7S8a35Ocwc7pa1O0jCdGH6+AFwBdAZeAt4DvhNyfkd\nSFr2r5N0uxwOjAGIiPuAHwN3pl0mLwDHbEZNpwCdgLkkFzN/RfIpAeC/gUHAu8Cvgf9t8rWXAN9J\nu1T+PSLeTev7BfBnkpZ8A5t2JTAReFTSMpL34MDN+DmswBThlZjMzPLGLXczsxzy3XCWe5IOJRkm\nuYGI2LaNyzFrE+6WMTPLIXfLmJnlUGbdMt27d4/evXtn9e3NzNqladOmvRURTe+V2EBm4d67d2+m\nTp2a1bc3M2uXJC0q5zx3y5iZ5VCL4S7pZkl/lfTCRo5L0lWSFkiaLWlQ5cs0M7PWKKflfgtw9CaO\nHwPsnj5GA9d99LLMzOyjaLHPPSKektR7E6ccD9wWyZjK5yR1k7RzOkeH2QZWrVpFQ0MD77//ftal\ntEu1tbX07NmTjh07Zl2KVbFKXFDtwfpTljak+xzu1qyGhga22247evfujaSWv8DWigiWLFlCQ0MD\nffr0ybocq2KVuKDa3G9ns3dGSRotaaqkqYsXL67At7b26P3332eHHXZwsG8GSeywww7+1GMtqkS4\nN7D+nNw9SWbs20BE3BARgyNicF1di8M0Lccc
7JvP752VoxLhPhE4JR01MxR41/3tVlQHHXTQJo8f\ne+yxvPPOO21UjRVZi33ukiaQrAPZPV3F/Xsky5AREdcDk0jWvlxAsl7lV7dUsZZPvS/8dUVfr/5H\n/1yR11mzZg01NTWt+ppnn930ok+TJk36KCXZR9Caf2f1tf9a9rn79tm17HP/+JU/ln3uR1XOaJlR\nLRwP4JyKVWTWBurr6zn66KM58MADmTFjBnvssQe33XYbe+21F6eddhqPPvooY8eOZciQIZxzzjks\nXryYbbbZhhtvvJE999yTN998k7POOouFCxcCcN1113HQQQex7bbbsnz5ct544w1GjBjB0qVLWb16\nNddddx2HHnro2juzu3fvzmWXXcbNN98MwBlnnMF5551HfX09xxxzDIcccgjPPvssPXr04IEHHqBz\n586b9XMWLdBsHd+haoU1f/58Ro8ezezZs+natSvXXnstkAw1fPrppxk5ciSjR4/mZz/7GdOmTePS\nSy9lzJgxAJx77rkcfvjhzJo1i+nTp7P33nuv99p33HEHn/3sZ5k5cyazZs1iwIAB6x2fNm0a48aN\n4/nnn+e5557jxhtvZMaMGQC8/PLLnHPOOcyZM4du3bpx7733tsG7YXnj+dytsHbZZRcOPvhgAE4+\n+WSuuuoqAEaMGAHA8uXLefbZZ/nCF9Ytw/rBBx8A8MQTT3DbbbcBUFNTw/bbb7/eaw8ZMoTTTjuN\nVatWccIJJ2wQ7k8//TSf+9zn6NKlCwAnnngiv//97xk+fDh9+vRZe/7+++9PfX19hX9yKwK33K2w\nmo46adxuDNwPP/yQbt26MXPmzLWPefPmlfXahx12GE899RQ9evTgy1/+8to/BI02tY7C1luvWz+7\npqaG1atXl/U9zUo53K2wXn31Vf7whz8AMGHCBA455JD1jnft2pU+ffpwzz33AEkgz5o1C4AjjzyS\n665LZtpYs2YNS5cuXe9rFy1axI477siZZ57J6aefzvTp09c7fthhh3H//ffz3nvvsWLFCu677z4O\nPfTQLfJzWjE53K2w+vXrx6233sp+++3H3/72N84+++wNzhk/fjw33XQT/fv3Z++99+aBBx4A4Mor\nr2Ty5Mnsu+++7L///syZM2e9r3vyyScZMGAAAwcO5N577+XrX//6escHDRrEqaeeygEHHMCBBx7I\nGWecwcCBA7fcD2uFk9kye4MHDw7P515M8+bNo1+/fpnWUF9fz3HHHccLLzQ72WnVK/c99GiZdfLy\nXkiaFhGDWzrPLXczsxxyuFsh9e7du9222s3K4XA3M8shh7uZWQ453M3McsjhbmaWQw53swqpr69n\nn332AZJx7scdd1zGFVmReW4Zy95F27d8Tqte791WnR4RRAQdOritY/nhf81WSPX19fTr148xY8Yw\naNAgbr/9doYNG8agQYP4whe+wPLlywGYMmUKBx10EP379+eAAw5g2bJl1NfXc+ihhzJo0CAGDRrU\n4hzuZllwuFthzZ8/n1NOOYXHHnuMm266iccff5zp06czePBgLrvsMlauXMmIESO48sormTVrFo8/\n/jidO3dmxx135LHHHmP69OncddddnHvuuVn/KGYbcLeMFVavXr0YOnQoDz30EHPnzl07/e/KlSsZ\nNmwY8+fPZ+edd2bIkCFAMpEYwIoVKxg7diwzZ86kpqaGl156KbOfwWxjHO5WWI1T+0YERx11FBMm\nTFjv+OzZs5tdjPryyy9np512YtasWXz44YfU1ta2Sb1mreFuGSu8oUOH8swzz7BgwQIA3nvvPV56\n6SX23HNPXn/9daZMmQLAsmXLWL16Ne+++y4777wzHTp04Pbbb2fNmjVZlm/WLIe7FV5dXR233HIL\no0aNYr/99mPo0KG8+OKLdOrUibvuuouvfe1r9O/fn6OOOor333+fMWPGcOuttzJ06FBeeumltZ8A\nzKqJp/y1
NlcNU/62d57yt/Xy8l54yl8zswJzuJuZ5ZDD3cwshxzuZmY55HA3M8shh7uZWQ453K2Q\nrrrqKvr168dJJ53EsGHD2Hrrrbn00kuzLsusYjz9gGVu31v3rejrlTOW+Nprr+Xhhx+mS5cuLFq0\niPvvv7+iNZhlzS13K5yzzjqLhQsXMnz4cMaPH8+QIUPo2LFj1mWZVZRb7lY4119/Pb/5zW+YPHky\n3bt3z7ocsy3CLXczsxxyyz0n8jJvhplVRlktd0lHS5ovaYGkC5s5vqukyZJmSJot6djKl2pmZuVq\nseUuqQa4BjgKaACmSJoYEXNLTvsOcHdEXCdpL2AS0HsL1GtWUX/5y18YPHgwS5cupUOHDlxxxRXM\nnTt37apLZu1VOd0yBwALImIhgKQ7geOB0nAPoPG3YXvg9UoWafmWRXdOfX392ucNDQ1t/v3NtrRy\nwr0H8FrJdgNwYJNzLgIelfQ1oAvwmYpUZ2Zmm6WcPvcNF5FMWuqlRgG3RERP4FjgdkkbvLak0ZKm\nSpq6ePHi1ldrZmZlKSfcG4BdSrZ7smG3y+nA3QAR8QegFthgAHFE3BARgyNicF1d3eZVbGZmLSon\n3KcAu0vqI6kTMBKY2OScV4EjAST1Iwl3N81to7Ja3jEP/N5ZOVoM94hYDYwFHgHmkYyKmSPpYknD\n09MuAM6UNAuYAJwa/hdoG1FbW8uSJUscUpshIliyZAm1tbVZl2JVrqybmCJiEsnwxtJ93y15Phc4\nuLKlWV717NmThoYGfN1l89TW1tKzZ8+sy7Aq5ztUrc117NiRPn36ZF2GWa55bhkzsxxq1y13z6di\nZtY8t9zNzHLI4W5mlkMOdzOzHHK4m5nlkMPdzCyHHO5mZjnkcDczyyGHu5lZDjnczcxyyOFuZpZD\nDnczsxxq13PLmDXHcw6ZueVuZpZLDnczsxxyuJuZ5ZDD3cwshxzuZmY55HA3M8shh7uZWQ453M3M\ncsjhbmaWQw53M7MccribmeWQw93MLIcc7mZmOeRwNzPLIYe7mVkOOdzNzHLI4W5mlkMOdzOzHHK4\nm5nlUFnhLuloSfMlLZB04UbO+aKkuZLmSLqjsmWamVlrtLhAtqQa4BrgKKABmCJpYkTMLTlnd+Bb\nwMER8bakHbdUwWZm1rJyWu4HAAsiYmFErATuBI5vcs6ZwDUR8TZARPy1smWamVlrlBPuPYDXSrYb\n0n2l9gD2kPSMpOckHd3cC0kaLWmqpKmLFy/evIrNzKxF5YS7mtkXTba3AnYHjgBGAb+Q1G2DL4q4\nISIGR8Tgurq61tZqZmZlKifcG4BdSrZ7Aq83c84DEbEqIv4EzCcJezMzy0A54T4F2F1SH0mdgJHA\nxCbn3A98CkBSd5JumoWVLNTMzMrXYrhHxGpgLPAIMA+4OyLmSLpY0vD0tEeAJZLmApOBb0bEki1V\ntJmZbVqLQyEBImISMKnJvu+WPA/g/PRhZmYZ8x2qZmY55HA3M8shh7uZWQ453M3McsjhbmaWQw53\nM7MccribmeWQw93MLIcc7mZmOeRwNzPLIYe7mVkOOdzNzHLI4W5mlkMOdzOzHHK4m5nlkMPdzCyH\nHO5mZjnkcDczyyGHu5lZDjnczcxyyOFuZpZDDnczsxxyuJuZ5ZDD3cwshxzuZmY55HA3M8shh7uZ\nWQ453M3McsjhbmaWQw53M7MccribmeWQw93MLIcc7mZmOVRWuEs6WtJ8SQskXbiJ8z4vKSQNrlyJ\nZmbWWi2Gu6Qa4BrgGGAvYJSkvZo5bzvgXOD5ShdpZmatU07L/QBgQUQsjIiVwJ3A8c2c933gJ8D7\nFazPzMw2Qznh3gN4rWS7Id23lqSBwC4R8dCmXkjSaElTJU1dvHhxq4s1M7PylBPuamZfrD0odQAu\nBy5o6YUi4oaIGBwRg+vq6sqv0szMWqWccG8AdinZ7gm8XrK9HbAP8KSkem
AoMNEXVc3MslNOuE8B\ndpfUR1InYCQwsfFgRLwbEd0jondE9AaeA4ZHxNQtUrGZmbWoxXCPiNXAWOARYB5wd0TMkXSxpOFb\nukAzM2u9rco5KSImAZOa7PvuRs494qOXZWZmH4XvUDUzyyGHu5lZDjnczcxyyOFuZpZDDnczsxxy\nuJuZ5ZDD3cwshxzuZmY55HA3M8shh7uZWQ453M3McsjhbmaWQw53M7MccribmeWQw93MLIcc7mZm\nOeRwNzPLIYe7mVkOOdzNzHLI4W5mlkMOdzOzHHK4m5nlkMPdzCyHHO5mZjnkcDczyyGHu5lZDjnc\nzcxyyOFuZpZDDnczsxxyuJuZ5ZDD3cwshxzuZmY55HA3M8uhssJd0tGS5ktaIOnCZo6fL2mupNmS\nfiupV+VLNTOzcrUY7pJqgGuAY4C9gFGS9mpy2gxgcETsB/wK+EmlCzUzs/KV03I/AFgQEQsjYiVw\nJ3B86QkRMTki3ks3nwN6VrZMMzNrjXLCvQfwWsl2Q7pvY04HHm7ugKTRkqZKmrp48eLyqzQzs1Yp\nJ9zVzL5o9kTpZGAw8NPmjkfEDRExOCIG19XVlV+lmZm1ylZlnNMA7FKy3RN4velJkj4DfBs4PCI+\nqEx5Zma2OcppuU8BdpfUR1InYCQwsfQESQOBnwPDI+KvlS/TzMxao8Vwj4jVwFjgEWAecHdEzJF0\nsaTh6Wk/BbYF7pE0U9LEjbycmZm1gXK6ZYiIScCkJvu+W/L8MxWuy8zMPgLfoWpmlkMOdzOzHHK4\nm5nlkMPdzCyHHO5mZjnkcDczyyGHu5lZDjnczcxyyOFuZpZDDnczsxxyuJuZ5ZDD3cwshxzuZmY5\n5HA3M8shh7uZWQ453M3McsjhbmaWQw53M7MccribmeWQw93MLIcc7mZmOeRwNzPLIYe7mVkOOdzN\nzHLI4W5mlkMOdzOzHHK4m5nlkMPdzCyHHO5mZjnkcDczyyGHu5lZDjnczcxyyOFuZpZDZYW7pKMl\nzZe0QNKFzRzfWtJd6fHnJfWudKFmZla+FsNdUg1wDXAMsBcwStJeTU47HXg7Ij4BXA78uNKFmplZ\n+cppuR8ALIiIhRGxErgTOL7JOccDt6bPfwUcKUmVK9PMzFpDEbHpE6TPA0dHxBnp9peBAyNibMk5\nL6TnNKTbr6TnvNXktUYDo9PNTwLzK/WDfATdgbdaPKsY/F4k/D6s4/dinWp5L3pFRF1LJ21Vxgs1\n1wJv+hehnHOIiBuAG8r4nm1G0tSIGJx1HdXA70XC78M6fi/WaW/vRTndMg3ALiXbPYHXN3aOpK2A\n7YG/VaJAMzNrvXLCfQqwu6Q+kjoBI4GJTc6ZCHwlff554Iloqb/HzMy2mBa7ZSJitaSxwCNADXBz\nRMyRdDEwNSImAjcBt0taQNJiH7kli66wquomypjfi4Tfh3X8XqzTrt6LFi+omplZ++M7VM3Mcsjh\nbmaWQw53M7MccribmW2EpIPL2VeNCnlBVdIewDeBXpSMGIqIT2dWVBuSNHtjh4CIiP3asp4sSaoF\nRgBvAw8C/wEcCrwCfL/pXdZFIGkb4AJg14g4U9LuwCcj4qGMS2tzkqZHxKCW9lWjcu5QzaN7gOuB\nG4E1GdeShQ9J7iC+gyTQ/p5tOZm6DVgFdCEJtBeAq4FDgFuA4zKrLDvjgGnAsHS7geR3pjDhLmkY\ncBBQJ+n8kkNdSYaEV72ihvvqiLgu6yKyEhEDJO0JjCIJ+Lnpfx+NiNWZFtf29oqIfdI7qxsi4vB0\n/28kzcqysAz1jYgRkkYBRMTfCzgRYCdgW5KM3K5k/1KSGzWrXlHD/UFJY4D7gA8ad0ZEYaZMiIgX\nge8B35M0gqQF+2Pgp5kW1vZWwtqb9ZpOq1HET3UAKyV1Jp0fSlJfSn5PiiAifgf8TtItEbEIQFIH\nYNuIWJptdeUpap/7n5rZHRGxW5sXkx
FJPUjuJP4cSX/z3cB9EbE808LamKS/kkxjLZK+9zsbDwFf\njIidsqotK5KOAr5Dsn7Do8DBwKkR8WSWdWVB0h3AWSR/6KeRzJt1WURUfSOokOFedJJ+R/JR826S\n+ffX+8RSpE8wkr6yqeMRceumjueVpB2AoSR/5J4r4oVlAEkz027MLwH7A/8JTGsPgw4KGe6SOgJn\nA4elu54Efh4RqzIrqg1JqmfdlMyl/wAaR8sU5hOMbSjtX/8SsFtEXCxpV+AfI+L/Mi6tzUmaAwwg\nuSZ1dUT8TtKsiOifcWktKmqf+3VAR+DadPvL6b4zMquoDUVE76xrqBaSxtHM2gOpiIjT27KeKnEt\nyYiqTwMXA8uAe4EhWRaVkZ8D9cAs4ClJvUguqla9orbcN/jL217+GleCpLnAL4E7I2Jh1vVkSdJJ\nzezeFTgPqImInm1cUuYax3FLmhERA9N9hfn9aImkrdrDqLKi3qG6Jh0BAICk3SjWyIhRJH3uj0l6\nXtJ5kj6edVFZiIh7Gx/ADJKF4M8GfgQUtXtqlaQa1o2WqSNpyReOpJ0k3STp4XR7L9atXVHVitpy\nP5LkRo2FJP3MvYCvRsTkTAvLgKShJKNETgIWABMi4sZsq2pbkvoB3wYGkgwF/WV7aJltKenFwxHA\nIJKF7z8PfCci7sm0sAykoT4O+HZE9E/vh5gREftmXFqLChnuAJK2JlmkW8CLEVGocbxNSToCuJzk\npp6tMy6nzUi6BxgMXEoyemi9T3BFGjlUKr3J7UiS34/fRsS8jEvKhKQpETGkSRfVzIgYkHVtLSnU\nBVVJn46IJySd2ORQX0lExP9mUlhGJA0h6aI5ieSi0Q0kt5kXyRCS7od/J5l+oPROzKBgXTPpjTqz\nI2If4MWs66kCK9JhoY1dVEOBd7MtqTyFCnfgcOAJ4F+aORZAIcJd0v8Dvgi8Q3LTzsER0ZBtVdnw\nyKH1RcSHkmZJ2jUiXs26nipwPska0X0lPQPU0U6mHyhst0yRSZoE/Cginkq3TyFpvS8CLipwV0QP\nNpwp9KnsKsqGpCdIPtH8H7CicX9EDM+sqAykn2KGkrwPjV2489vL/TBFa7kDIOnrJBdJlpHMDDkI\nuDAiHs20sLbzjySzHyLpMJKRIV8juVnjBtpJy6SSJP2Y5CLiXNb1uwdQuHAnmTCrdDZMkcw7VCjp\np5j/iYhhwJys62mtQoY7cFpEXCnps8COwFdJwr4o4d6hpHU+ArghHQp4r6SZGdaVpRNI5iwv9IX1\n1FbpxFlrpROJFdGj6b0Q/xvtrJujqOHeeNHsWGBcRMwq2JSmW5XciHEkMLr0WEY1ZW0hyV3LhQ13\nSWcDY4Ddmizosh3wTDZVZe58krn+10j6O+um6OiabVktK+ov8jRJjwJ9gG9J2o5i3aQxgWQ607dI\nFur4PYCkT9BORgJsAe8BMyX9lvWngT43u5La3B3Aw8AlwIUl+5cV9TpMRGzX8lnVqZAXVNMLJQOA\nhRHxjqR/AHpGxMaWn8uddEjXziQLdKxI9+1BMl/19EyLy8DGZocs6qyQto6k4ZRMMthelhssargf\nDMyMiBWSTia5oHpl46T8ZmYAkn5EMnJofLprFMmUvxdu/KuqQ1HDfTbQH9gPuB24CTixZIk1K5h0\nEehLSBaoqG3c7+mPiy3NigER8WG6XUMy/UDVz+de1InDVqdXvo8nabFfyfrrJFrxjCOZ9nk18CmS\nZQdvz7QiqxbdSp5vn1kVrVTUC6rLJH2LZB73Q9O/xh0zrsmy1TkifitJaffcRZJ+T7LOrBXXJcAM\nSZNJRsocBnwr25LKU9RwHwH8K8l497+kK81U/ZqItkW9n15of1nSWODPJPdAWIFFxARJT5L0uwv4\nz4j4S7ZVlaeQfe4A6Yoqu0fE45K2IVmYYVnWdVk20knU5pF8BP8+ycfvn0TEc5kWZpmQNDYirk6f\n7x
0R7e4O1UKGu6QzSW7c+YeI6JteTLs+Io7MuDQzqwKNq1E1fd6eFLVb5hzgAOB5gIh4WZI/gheQ\npCsi4jxJD9LMWqpFmyzLmtUu714varh/EBErG2ccSFdXKd5HGIN1I2IuzbQKqzbdJH2OZERh16Zr\nQLSHtR+K2i3zE5K5zE8hmQ1xDDA3Ir6daWFWFSR9DNilSHcs2/okjdvE4YiI09qsmM1U1HDvAJwO\n/BPJR65HgF+0t1nfrHLSERHDST7NzgQWA7+LiPOzrMtscxUu3NMx7bdGxMlZ12LVo3GNTElnkLTa\nvydpdnu4E9G2HEndSD7h92b9RVyqfkK5wvW5R8QaSXWSOkXEyqzrsaqxlaSdSZYfdPecNZoEPAf8\nkXY2c2zhwj1VDzwjaSLrLyN2WWYVWdYuJumeezoipkjaDXg545ose7XttWuucN0yAJKavaU8Iv67\nrWsxs+ol6RvAcuAh1p/nv+rnty9kuJs1lY6g+gHJ4iW/IZk19LyI+GWmhVmmJJ0D/JBkdF1jWEZ7\nmC20kOG+kRtW3gWmAj+PiPfbvirLkqSZETEgHdt8AvANYHJE9M+4NMuQpFeAAyPiraxraa2iTvm7\nkOSj1o3pYynwJrBHum3F0zgr6LHAhPbwsdvaxBySJRjbnaJeUB0YEYeVbD8o6amIOExSu5sgyCri\nQUkvknTLjJFUB/gTnK0hWVt3Mu1sbd2ihnudpF0j4lWAdMrf7ukxD48soIi4UNKPgaXpcNkVJIu5\nWLHdnz7anaKG+wXA02l/moA+JK21LoAXRC4gSaeUPC89dFvbV2PVIiJuldSJpMsWYH5ErMqypnIV\n8oIqgKStgT1Jwv1FX0QtNkk/K9msBY4EpkfE5zMqyaqApCNIGnz1JFmxC/CViHgqw7LKUshwTxfn\nOB/oFRFnpvO5fzIiHsq4NKsSkrYHbveUv8UmaRrwrxExP93eg+SC+/7ZVtayoo6WGUfStz4s3W4g\nGeNs1ug9YPesi7DMdWwMdoCIeIl2st5yUfvc+0bECEmjACLi72rS0WrF0uTehw7AXsDd2VVkVWKq\npJtYN+//l4BpGdZTtqKG+0pJnUl/mSX1pWSYkxVS6WIdq4FFEdGQVTFWNc4mWbntXJI+96eAazOt\nqExF7XM/CvgOSevsUeBg4NSIeDLLuszMKqWQ4Q4gaQdgKMlf4+fa4+3FVjmShgI/A/oBnYAaYEVE\ndM20MMuEpD+yiaU328M8/0XtliEilgC/BpD0SUmXRMSZGZdl2bkaGAncAwwmWaDhE5lWZFk6Lv3v\nOel/S/sfr+T5AAAFR0lEQVTc28V0BIUaLSNpP0mPSnpB0g8k7STpXuC3wNys67NsRcQCoCYi1kTE\nOOBTWddk2YiIRRGxCDg4Iv4jIv6YPi4EPpt1feUoVLiTTAp2B3ASyRqZ00kmEftERFyeZWGWuffS\nOxFnSfpJOo93l6yLssx1kXRI44akg2gn/y4K1efeOK1ryfZrQO+IWJNhWVYFJPUimRm0E8l0v12B\n69LWvBWUpP2Bm4Ht013vAKdFxPTsqipP0frcayUNJLmICsm0v/s1jnFvD//DrLIkHQ/0jIhr0u3f\nATuSXEz7A+BwL7CImAb0l9SVpDH8btY1latoLffJmzgcEfHpNivGqoKkZ4CREfFauj0T+DSwLTAu\nIo7Msj7LVjoH1UlAb0oawxFxcVY1latQLfeI8AUya6pTY7Cnnk4X6vhbOkuoFdsDJKu0TaOd3ehY\nqJZ7o3RdxPER8U66/TFgVES0izvPrHIkLYiIZoc8SnolIvq2dU1WPSS9EBH7ZF3H5ijaaJlGZzYG\nO0BEvA14jHsxPS9pg//3kv4N+L8M6rHq8qykfbMuYnMUteU+G+gf6Q8vqQaYHRF7Z1uZtTVJO5Ks\ntPMBydBYgP2BrYETIuLNrGqz7EmaS3Iz259I/o2I5Ppc1d+hWtRw
/ynJBZLrSUZFnAW8FhEXZFmX\nZUfSp4HGP+5zIuKJLOux6pAOkd1AeoNTVStquHcA/o1ktR2RTB72C493N7PmpJ/wahu3G9dfrmaF\nDHczs3JIGg78D/Bx4K9AL2Bee+jCLdRQSEl3R8QXNzbjW3voRzOzNvV9ktljH4+IgZI+BYzKuKay\nFCrcga+n/z1uk2eZmSVWRcQSSR0kdYiIyZJ+nHVR5SjUUMiIeCN9OqZx1reS2d/GZFmbmVWldyRt\nS7IC03hJV5Ks1FX1CtnnLml6RAxqsm+2u2XMrFR6l/LfSRrCXyKZQGx8uh5EVStUuEs6m6SF3pf1\nJ4TaDngmIk7OpDAzaxfSe2JGRsT4rGtpSdHCfXvgY8AlwIUlh5al84mYmZHOAnkO0AOYCDyWbn8T\nmBkRx2dYXlkKFe6NJPUFGiLiA0lHAPsBt5VOSWBmxSXpAeBtkmmfjyRpFHYCvh4RM7OsrVxFDfeZ\nJOtk9gYeIfnL/MmIODbLusysOkj6Y0Tsmz6vAd4Cdo2IZdlWVr5CjZYp8WFErAZOBK6IiG8AO2dc\nk5lVj1WNT9I71//UnoIdijfOvdEqSaNIVrj/l3RfxwzrMbPq0l/S0vS5gM7pduPEYV2zK608RQ33\nr5JMFvbDiPiTpD7ALzOuycyqRETUZF3DR1XIPnczs7wrVMvdc8uYWVEUquUuaeeIeKM9z9FsZlaO\nQoW7mVlRFKpbppGkZWzYLfMuMBW4ICIWtn1VZmaVU8hwBy4DXgfuIBnaNBL4R2A+cDNwRGaVmZlV\nQCG7ZSQ9HxEHNtn3XEQMlTQrIvpnVZuZWSUU9g5VSV9snIBf0hdLjhXvr52Z5U5RW+67AVcCw9Jd\nfwC+AfwZ2D8ins6qNjOzSihkuJuZ5V0hu2Uk9ZR0n6S/SnpT0r2SemZdl5lZpRQy3IFxJNP8fpxk\nMv4H031mZrlQyG4ZSTMjYkBL+8zM2quittzfknSypJr0cTJQ9QvempmVq6gt912Bq0lGywTwLHBu\nRLyaaWFmZhVSyHBvjqTzIuKKrOswM6sEh3tK0qsRsWvWdZiZVUJR+9ybo6wLMDOrFIf7Ov4IY2a5\nUahZITcy1S+kC+C2cTlmZluM+9zNzHLI3TJmZjnkcDczyyGHu5lZDjnczcxyyOFuZpZD/x/eSOhH\n+sFdGAAAAABJRU5ErkJggg==\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x1a25ab4f60>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn import tree\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import f1_score,recall_score,precision_score\n",
"import random\n",
"examples = examples_df.values.copy()\n",
"#只取25个流统计特征\n",
"examples = np.c_[examples[:,:25].copy(),examples[:,-1].copy()]\n",
"#print(examples)\n",
"score_df = pd.DataFrame(np.zeros((5,3)),index = ['LogisticRegression', 'SVM', 'GaussianNB', 'tree', 'RandomForest'], \\\n",
" columns = ['precision', 'recall', 'f1'])\n",
"#def a():\n",
"\n",
"f1_score_list = list()\n",
"recall_score_list = list()\n",
"precision_score_list = list()\n",
"for i in range(1):\n",
" np.random.shuffle(examples)\n",
" examples_train = examples[:int(len(examples)*0.75)]\n",
" examples_test = examples[int(len(examples)*0.75):]\n",
" x_train = examples_train[:,0:-1]\n",
" y_train = examples_train[:,-1]\n",
" x_test = examples_test[:,0:-1]\n",
" y_test = examples_test[:,-1]\n",
" classifer = LogisticRegression()\n",
" classifer.fit(x_train, y_train)\n",
" y_pred = classifer.predict(x_test)\n",
" f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n",
" recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n",
" precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n",
"scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n",
"score_df.loc['LogisticRegression'] = scores\n",
"\n",
"f1_score_list = list()\n",
"recall_score_list = list()\n",
"precision_score_list = list()\n",
"for i in range(1):\n",
" #np.random.shuffle(examples)\n",
" examples_train = examples[:int(len(examples)*0.75)]\n",
" examples_test = examples[int(len(examples)*0.75):]\n",
" x_train = examples_train[:,0:-1]\n",
" y_train = examples_train[:,-1]\n",
" x_test = examples_test[:,0:-1]\n",
" y_test = examples_test[:,-1]\n",
" classifer = SVC()\n",
" classifer.fit(x_train, y_train)\n",
" y_pred = classifer.predict(x_test)\n",
" f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n",
" recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n",
" precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n",
"scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n",
"score_df.loc['SVM'] = scores\n",
"\n",
"f1_score_list = list()\n",
"recall_score_list = list()\n",
"precision_score_list = list()\n",
"for i in range(1):\n",
" #np.random.shuffle(examples)\n",
" examples_train = examples[:int(len(examples)*0.75)]\n",
" examples_test = examples[int(len(examples)*0.75):]\n",
" x_train = examples_train[:,0:-1]\n",
" y_train = examples_train[:,-1]\n",
" x_test = examples_test[:,0:-1]\n",
" y_test = examples_test[:,-1]\n",
" classifer = GaussianNB()\n",
" classifer.fit(x_train, y_train)\n",
" y_pred = classifer.predict(x_test)\n",
" f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n",
" recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n",
" precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n",
"scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n",
"score_df.loc['GaussianNB'] = scores\n",
"\n",
"f1_score_list = list()\n",
"recall_score_list = list()\n",
"precision_score_list = list()\n",
"for i in range(1):\n",
" #np.random.shuffle(examples)\n",
" examples_train = examples[:int(len(examples)*0.75)]\n",
" examples_test = examples[int(len(examples)*0.75):]\n",
" x_train = examples_train[:,0:-1]\n",
" y_train = examples_train[:,-1]\n",
" x_test = examples_test[:,0:-1]\n",
" y_test = examples_test[:,-1]\n",
" classifer = tree.DecisionTreeClassifier()\n",
" classifer.fit(x_train, y_train)\n",
" y_pred = classifer.predict(x_test)\n",
" f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n",
" recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n",
" precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n",
"scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n",
"score_df.loc['tree'] = scores\n",
"\n",
"f1_score_list = list()\n",
"recall_score_list = list()\n",
"precision_score_list = list()\n",
"for i in range(1):\n",
" #np.random.shuffle(examples)\n",
" examples_train = examples[:int(len(examples)*0.75)]\n",
" examples_test = examples[int(len(examples)*0.75):]\n",
" x_train = examples_train[:,0:-1]\n",
" y_train = examples_train[:,-1]\n",
" x_test = examples_test[:,0:-1]\n",
" y_test = examples_test[:,-1]\n",
" classifer = RandomForestClassifier()\n",
" classifer.fit(x_train, y_train)\n",
" y_pred = classifer.predict(x_test)\n",
" f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n",
" recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n",
" precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n",
"scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n",
"score_df.loc['RandomForest'] = scores\n",
"print(score_df)\n",
"ax = score_df.plot.bar(title='base_feature')\n",
"fig = ax.get_figure()\n",
"#fig.savefig('../figure/base_feature.svg')\n",
"#print(score_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}