{ "cells": [ { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import os\n", "import json\n", "import pandas as pd\n", "import numpy as np\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": true }, "outputs": [], "source": [ "ciper_suits = {\n", " '1305':0,\n", " 'C030':1,\n", "\t'C02C':2,\n", "\t'C028':3,\n", "\t'C024':4,\n", "\t'C014':5,\n", "\t'C00A':6,\n", "\t'00A5':7,\n", "\t'00A3':8,\n", "\t'00A1':9,\n", "\t'009F':10,\n", "\t'006B':11,\n", "\t'006A':12,\n", "\t'0069':13,\n", "\t'0068':14,\n", "\t'0039':15,\n", "\t'0038':16,\n", "\t'0037':17,\n", "\t'0036':18,\n", "\t'0088':19,\n", "\t'0087':20,\n", "\t'0086':21,\n", "\t'0085':22,\n", "\t'C019':23,\n", "\t'00A7':24,\n", "\t'006D':25,\n", "\t'003A':26,\n", "\t'0089':27,\n", "\t'C032':28,\n", "\t'C02E':29,\n", "\t'C02A':30,\n", "\t'C026':31,\n", "\t'C00F':32,\n", "\t'C005':33,\n", "\t'009D':34,\n", "\t'003D':35,\n", "\t'0035':36,\n", "\t'0084':37,\n", "\t'008D':38,\n", "\t'C02F':39,\n", "\t'C02B':40,\n", "\t'C027':41,\n", "\t'C023':42,\n", "\t'C013':43,\n", "\t'C009':44,\n", "\t'00A4':45,\n", "\t'00A2':46,\n", "\t'00A0':47,\n", "\t'009E':48,\n", "\t'0067':49,\n", "\t'0040':50,\n", "\t'003F':51,\n", "\t'003E':52,\n", "\t'0033':53,\n", "\t'0032':54,\n", "\t'0031':55,\n", "\t'0030':56,\n", "\t'009A':57,\n", "\t'0099':58,\n", "\t'0098':59,\n", "\t'0097':60,\n", "\t'0045':61,\n", "\t'0044':62,\n", "\t'0043':63,\n", "\t'0042':64,\n", "\t'C018':65,\n", "\t'00A6':66,\n", "\t'006C':67,\n", "\t'0034':68,\n", "\t'009B':69,\n", "\t'0046':70,\n", "\t'C031':71,\n", "\t'C02D':72,\n", "\t'C029':73,\n", "\t'C025':74,\n", "\t'C00E':75,\n", "\t'C004':76,\n", "\t'009C':77,\n", "\t'003C':78,\n", "\t'002F':79,\n", "\t'0096':80,\n", "\t'0041':81,\n", "\t'008C':82,\n", "\t'C012':83,\n", "\t'C008':84,\n", "\t'0016':85,\n", "\t'0013':86,\n", "\t'0010':87,\n", "\t'000D':88,\n", "\t'C017':89,\n", "\t'001B':90,\n", "\t'C00D':91,\n", "\t'C003':92,\n", "\t'000A':93,\n", "\t'0007':94,\n", "\t'008B':95,\n", "\t'0021':96,\n", "\t'001F':97,\n", "\t'0025':98,\n", "\t'0023':99,\n", "\t'C011':100,\n", "\t'C007':101,\n", "\t'C016':102,\n", "\t'0018':103,\n", "\t'C00C':104,\n", "\t'C002':105,\n", "\t'0005':106,\n", "\t'0004':107,\n", "\t'008A':108,\n", "\t'0020':109,\n", "\t'0024':110,\n", "\t'C010':111,\n", "\t'C006':112,\n", "\t'C015':113,\n", "\t'C00B':114,\n", "\t'C001':115,\n", "\t'003B':116,\n", "\t'0002':117,\n", "\t'0001':118,\n", " '1301':119,\n", "\t'1302':120,\n", "\t'1303':121,\n", "\t'1304':122\n", "}" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "extensions = { \n", " 0:0, \n", " 1:1, \n", " 2:2, \n", " 3:3, \n", " 4:4, \n", " 5:5, \n", " 6:6, \n", " 7:7, \n", " 8:8, \n", " 9:9, \n", " 10:10, \n", " 11:11, \n", " 12:12, \n", " 13:13, \n", " 14:14, \n", " 15:15, \n", " 16:16, \n", " 17:17, \n", " 18:18, \n", " 19:19, \n", " 20:20, \n", " 21:21, \n", " 22:22, \n", " 23:23, \n", " 24:24, \n", " 25:25, \n", " 26:26, \n", " 27:27, \n", " 28:28, \n", " 29:29, \n", " 30:30, \n", " 31:31, \n", " 35:32, \n", " 65281:33 \n", "}" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mexample_label_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mroot_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'DataSet/result/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'noProxy/All'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'/stream_tag.txt'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mexample_label_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexample_label_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\s+'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mexample_label\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexample_label_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mexample_label_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mexample_label_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;31m#example_json_file = root_dir + 'DataSet/result/' + date + '/stream_stat.txt'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mexample_json_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mroot_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'DataSet/result/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'noProxy/All'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'/stream_stat.txt'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mexample_label_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mroot_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'DataSet/result/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'noProxy/All'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'/stream_tag.txt'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mexample_label_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexample_label_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\s+'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mexample_label\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexample_label_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0mexample_label_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mexample_label_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;31m#example_json_file = root_dir + 'DataSet/result/' + date + '/stream_stat.txt'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mexample_json_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mroot_dir\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'DataSet/result/'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'noProxy/All'\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m'/stream_stat.txt'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1320\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1321\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1322\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_scalar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1323\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mKeyError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1324\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_scalar\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1627\u001b[0m \u001b[0;31m# a fast-path to scalar access\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1628\u001b[0m \u001b[0;31m# if not, raise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1629\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_value\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtakeable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1630\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1631\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mget_value\u001b[0;34m(self, index, col, takeable)\u001b[0m\n\u001b[1;32m 1815\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtakeable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1816\u001b[0m \u001b[0mseries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iget_item_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1817\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_maybe_box_datetimelike\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1818\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1819\u001b[0m \u001b[0mseries\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_item_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36m_values\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 385\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 386\u001b[0m \u001b[0;34m\"\"\" return the internal repr of this data \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 387\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minternal_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 388\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36minternal_values\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 4220\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4221\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0minternal_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4222\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_block\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minternal_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4223\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4224\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36minternal_values\u001b[0;34m(self, dtype)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 154\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0minternal_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 155\u001b[0m \"\"\" return an internal format, currently just the ndarray\n\u001b[1;32m 156\u001b[0m \u001b[0mthis\u001b[0m \u001b[0mshould\u001b[0m \u001b[0mbe\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mpure\u001b[0m \u001b[0minternal\u001b[0m \u001b[0mAPI\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "date = '2019-12-20_21'\n", "root_dir = '/Users/Leo/Documents/github/GradProj/'\n", "#example_label_file = root_dir + 'DataSet/result/' + date + '/stream_tag.txt'\n", "example_label_file = root_dir + 'DataSet/result/' + 'noProxy/All' + '/stream_tag.txt'\n", "example_label_df = pd.read_table(example_label_file, sep='\\s+', header=None)\n", "example_label = {tuple(example_label_df.iloc[i,0:4].values):example_label_df.iloc[i,4] for i in example_label_df.index}\n", "#example_json_file = root_dir + 'DataSet/result/' + date + '/stream_stat.txt'\n", "example_json_file = root_dir + 'DataSet/result/' + 'noProxy/All' + '/stream_stat.txt'\n", "example_json_f = open(example_json_file, 'r')\n", "result_data = list()\n", "result_label = list()\n", "i = 0\n", "for line in example_json_f.readlines():\n", " example_json = json.loads(line)\n", " #标签\n", " try:\n", " flow_key = (example_json['sip'], example_json['sport'], example_json['dip'], example_json['dport'])\n", " result_label.append(example_label[flow_key])\n", " except Exception:\n", " continue\n", " \n", " #统计特征\n", " packets = example_json['packets']\n", " c2s_packets_bytes = list()\n", " s2c_packets_bytes = list()\n", " c2s_packets_intervals = list()\n", " s2c_packets_intervals = list()\n", " for packet in packets:\n", " if packet['dir'] == 1:\n", " c2s_packets_bytes.append(packet['bytes'])\n", " c2s_packets_intervals.append(packet['interval'])\n", " elif packet['dir'] == 2:\n", " s2c_packets_bytes.append(packet['bytes'])\n", " s2c_packets_intervals.append(packet['interval'])\n", " c2s_bytes = example_json['c2s_bytes']\n", " s2c_bytes = example_json['s2c_bytes']\n", " c2s_pkts = example_json['c2s_pkts']\n", " s2c_pkts = example_json['s2c_pkts']\n", " duration = example_json['duration']\n", " c2s_packets_bytes_mean = 0\n", " c2s_packets_bytes_median = 0\n", " c2s_packets_bytes_std = 0\n", " c2s_packets_bytes_max = 0\n", " c2s_packets_bytes_min = 0\n", "\n", " c2s_packets_intervals_mean = 0\n", " c2s_packets_intervals_median = 0\n", " c2s_packets_intervals_std = 0\n", " c2s_packets_intervals_max = 0\n", " c2s_packets_intervals_min = 0\n", "\n", " s2c_packets_bytes_mean = 0\n", " s2c_packets_bytes_median = 0\n", " s2c_packets_bytes_std = 0\n", " s2c_packets_bytes_max = 0\n", " s2c_packets_bytes_min = 0\n", "\n", " s2c_packets_intervals_mean = 0\n", " s2c_packets_intervals_median = 0\n", " s2c_packets_intervals_std = 0\n", " s2c_packets_intervals_max = 0\n", " s2c_packets_intervals_min = 0\n", " \n", " if c2s_bytes > 0:\n", " c2s_packets_bytes_mean = np.mean(c2s_packets_bytes)\n", " c2s_packets_bytes_median = np.median(c2s_packets_bytes)\n", " c2s_packets_bytes_std = np.std(c2s_packets_bytes)\n", " c2s_packets_bytes_max = np.max(c2s_packets_bytes)\n", " c2s_packets_bytes_min = np.min(c2s_packets_bytes)\n", "\n", " c2s_packets_intervals_mean = np.mean(c2s_packets_intervals)\n", " c2s_packets_intervals_median = np.median(c2s_packets_intervals)\n", " c2s_packets_intervals_std = np.std(c2s_packets_intervals)\n", " c2s_packets_intervals_max = np.max(c2s_packets_intervals)\n", " c2s_packets_intervals_min = np.min(c2s_packets_intervals)\n", " \n", " if s2c_bytes > 0:\n", " s2c_packets_bytes_mean = np.mean(s2c_packets_bytes)\n", " s2c_packets_bytes_median = np.median(s2c_packets_bytes)\n", " s2c_packets_bytes_std = np.std(s2c_packets_bytes)\n", " s2c_packets_bytes_max = np.max(s2c_packets_bytes)\n", " s2c_packets_bytes_min = np.min(s2c_packets_bytes)\n", "\n", " s2c_packets_intervals_mean = np.mean(s2c_packets_intervals)\n", " s2c_packets_intervals_median = np.median(s2c_packets_intervals)\n", " s2c_packets_intervals_std = np.std(s2c_packets_intervals)\n", " s2c_packets_intervals_max = np.max(s2c_packets_intervals)\n", " s2c_packets_intervals_min = np.min(s2c_packets_intervals)\n", "\n", " #tls\n", " tls = example_json['tls']\n", " extensions_list = tls['extensions_list']\n", " #print(extensions_list)\n", " ciphers = tls['cipher_suites']\n", " #print(ciphers)\n", " extensions_arr = np.zeros(34, dtype=np.uint8)\n", " cipher_suits_arr = np.zeros(123, dtype=np.uint8)\n", " for extension in extensions_list:\n", " try:\n", " extensions_arr[extensions[extension]]=1\n", " except Exception:\n", " pass\n", " for cipher in ciphers:\n", " try:\n", " cipher = cipher.upper()\n", " cipher_suits_arr[ciper_suits[cipher]]=1\n", " except Exception:\n", " pass\n", " result = [c2s_bytes, c2s_pkts, s2c_bytes, s2c_pkts, duration, c2s_packets_bytes_mean, c2s_packets_bytes_median, c2s_packets_bytes_std,\\\n", " c2s_packets_bytes_max, c2s_packets_bytes_min, c2s_packets_intervals_mean, c2s_packets_intervals_median, c2s_packets_intervals_std,\\\n", " c2s_packets_intervals_max, c2s_packets_intervals_min, s2c_packets_bytes_mean, s2c_packets_bytes_median, s2c_packets_bytes_std,\\\n", " s2c_packets_bytes_max, s2c_packets_bytes_min, s2c_packets_intervals_mean, s2c_packets_intervals_median, s2c_packets_intervals_std,\\\n", " s2c_packets_intervals_max, s2c_packets_intervals_min]\n", " result += list(cipher_suits_arr)\n", " result += list(extensions_arr)\n", " result_data.append(result)\n", " i += 1\n", "extensions_head = list()\n", "for i in range(len(extensions)):\n", " extensions_head.append('extension'+str(i))\n", "cipher_head = ['cipher'+str(i) for i in range(len(ciper_suits))]\n", "base_head = ['c2s_bytes', 'c2s_pkts', 's2c_bytes', 's2c_pkts', 'duration', 'c2s_packets_bytes_mean', 'c2s_packets_bytes_median', 'c2s_packets_bytes_std',\\\n", " 'c2s_packets_bytes_max', 'c2s_packets_bytes_min', 'c2s_packets_intervals_mean', 'c2s_packets_intervals_median', 'c2s_packets_intervals_std',\\\n", " 'c2s_packets_intervals_max', 'c2s_packets_intervals_min', 's2c_packets_bytes_mean', 's2c_packets_bytes_median', 's2c_packets_bytes_std',\\\n", " 's2c_packets_bytes_max', 's2c_packets_bytes_min', 's2c_packets_intervals_mean', 's2c_packets_intervals_median', 's2c_packets_intervals_std',\\\n", " 's2c_packets_intervals_max', 's2c_packets_intervals_min']\n", "header = base_head+cipher_head+extensions_head\n", "result_df = pd.DataFrame(result_data, columns=header)\n", "result_df['label'] = np.array(result_label)\n", "example_csv_file = root_dir + 'Experiment/StatFeature/CsvFile/' + date + '/examples.csv'\n", "result_df.to_csv(example_csv_file, index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "%matplotlib inline\n", "import os\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 统计每个app的包数\n", "date = '2019-12-20_21'\n", "root_dir = '/Users/Leo/Documents/github/GradProj/'\n", "exmaples_file = root_dir + 'Experiment/StatFeature/CsvFile/' + date + '/examples.csv'\n", "app2pktsDict = dict()\n", "with open(exmaples_file) as f:\n", " lines = f.readlines()\n", " i = 0\n", " for line in lines:\n", " if i == 0:\n", " i += 1\n", " continue;\n", " line = line.split(',')\n", " pkts = int(line[1]) + int(line[3])\n", " appName = line[-1]\n", " if appName not in app2pktsDict.keys():\n", " app2pktsDict[appName] = 0\n", " app2pktsDict[appName] += pkts \n", "for appName, pkts in app2pktsDict.items():\n", " appName = appName[:-1]\n", " print(appName + ': ', pkts)\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "examples_df = pd.read_csv(exmaples_file)\n", "class_counts = examples_df['label'].value_counts()\n", "class_counts.plot.bar()\n", "class_counts" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "183\n", "RandomForest: \n", "[ 0.97770397 0.9787607 0.97951192 0.94233517 0.99710922 0.99770373\n", " 0.96005698 0.98092707 0.97320713 0.98892652]\n", "[ 0.97532838 0.98915483 0.97241536 0.95949549 0.99352041 0.99491914\n", " 0.97667391 0.96311427 0.97944698 0.97825806]\n", "[ 0.9765054 0.98391423 0.97592882 0.95075988 0.99531039 0.99630756\n", " 0.96828624 0.97192363 0.97630536 0.98355669]\n" ] } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn import tree\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import f1_score,recall_score,precision_score\n", "from sklearn.metrics import classification_report\n", "\n", "import random\n", "examples = examples_df.values.copy()\n", "print(len(examples[0]))\n", "#只取25个流统计特征\n", "examples = np.c_[examples[:,:25].copy(),examples[:,-1].copy()]\n", "#print(examples)\n", "score_df = pd.DataFrame(np.zeros((5,3)),index = ['LogisticRegression', 'SVM', 'GaussianNB', 'tree', 'RandomForest'], \\\n", " columns = ['precision', 'recall', 'f1'])\n", "#def a():\n", "\n", "\n", "f1_score_list = list()\n", "recall_score_list = list()\n", "precision_score_list = list()\n", "\n", "class_list = [\"alipay\", \"bilibili\", \"douyin\", \"ele\", \"evernote\", \"gaode\", \"jd\", \"meituan\", \"weibo\", \"zhihu\"]\n", "\n", "\"\"\"\n", "for i in range(1):\n", " np.random.shuffle(examples)\n", " examples_train = examples[:int(len(examples)*0.75)]\n", " examples_test = examples[int(len(examples)*0.75):]\n", " x_train = examples_train[:,0:-1]\n", " y_train = examples_train[:,-1]\n", " x_test = examples_test[:,0:-1]\n", " y_test = examples_test[:,-1]\n", " classifer = LogisticRegression()\n", " classifer.fit(x_train, y_train)\n", " y_pred = classifer.predict(x_test)\n", " f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n", " recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n", " precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n", "scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n", "score_df.loc['LogisticRegression'] = scores\n", "\n", "f1_score_list = list()\n", "recall_score_list = list()\n", "precision_score_list = list()\n", "for i in range(1):\n", " np.random.shuffle(examples)\n", " examples_train = examples[:int(len(examples)*0.75)]\n", " examples_test = examples[int(len(examples)*0.75):]\n", " x_train = examples_train[:,0:-1]\n", " y_train = examples_train[:,-1]\n", " x_test = examples_test[:,0:-1]\n", " y_test = examples_test[:,-1]\n", " classifer = SVC()\n", " classifer.fit(x_train, y_train)\n", " y_pred = classifer.predict(x_test)\n", " f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n", " recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n", " precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n", "scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n", "score_df.loc['SVM'] = scores\n", "\n", "f1_score_list = list()\n", "recall_score_list = list()\n", "precision_score_list = list()\n", "for i in range(1):\n", " np.random.shuffle(examples)\n", " examples_train = examples[:int(len(examples)*0.75)]\n", " examples_test = examples[int(len(examples)*0.75):]\n", " x_train = examples_train[:,0:-1]\n", " y_train = examples_train[:,-1]\n", " x_test = examples_test[:,0:-1]\n", " y_test = examples_test[:,-1]\n", " classifer = GaussianNB()\n", " classifer.fit(x_train, y_train)\n", " y_pred = classifer.predict(x_test)\n", " f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n", " recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n", " precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n", "scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n", "score_df.loc['GaussianNB'] = scores\n", "\n", "f1_score_list = list()\n", "recall_score_list = list()\n", "precision_score_list = list()\n", "for i in range(1):\n", " np.random.shuffle(examples)\n", " examples_train = examples[:int(len(examples)*0.75)]\n", " examples_test = examples[int(len(examples)*0.75):]\n", " x_train = examples_train[:,0:-1]\n", " y_train = examples_train[:,-1]\n", " x_test = examples_test[:,0:-1]\n", " y_test = examples_test[:,-1]\n", " classifer = tree.DecisionTreeClassifier()\n", " classifer.fit(x_train, y_train)\n", " y_pred = classifer.predict(x_test)\n", " f1_score_list.append(f1_score(y_test, y_pred, average='micro'))\n", " recall_score_list.append(recall_score(y_test, y_pred, average='micro'))\n", " precision_score_list.append(precision_score(y_test, y_pred, average='micro'))\n", "scores = [np.mean(precision_score_list), np.mean(recall_score_list), np.mean(f1_score_list)]\n", "score_df.loc['tree'] = scores\n", "\"\"\"\n", "\n", "\n", "f1_score_list = list()\n", "recall_score_list = list()\n", "precision_score_list = list()\n", "for i in range(10):\n", " np.random.shuffle(examples)\n", " examples_train = examples[:int(len(examples)*0.75)]\n", " examples_test = examples[int(len(examples)*0.75):]\n", " x_train = examples_train[:,0:-1]\n", " y_train = examples_train[:,-1]\n", " x_test = examples_test[:,0:-1]\n", " y_test = examples_test[:,-1]\n", " classifer = RandomForestClassifier()\n", " classifer.fit(x_train, y_train)\n", " y_pred = classifer.predict(x_test)\n", " f1_score_list.append(f1_score(y_test, y_pred, average=None))\n", " recall_score_list.append(recall_score(y_test, y_pred, average=None))\n", " precision_score_list.append(precision_score(y_test, y_pred, average=None))\n", " \n", "scores = [np.mean(precision_score_list, axis=0), np.mean(recall_score_list, axis=0), np.mean(f1_score_list, axis=0)]\n", "print(\"RandomForest: \")\n", "for score in scores:\n", " print(score)\n", " \n", "#score_df.loc['RandomForest'] = scores\n", "#print(score_df)\n", "#ax = score_df.plot.bar(title='statistics_feature')\n", "#fig = ax.get_figure()\n", "#fig.savefig('base_feature.svg')\n", "#print(score_df)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.67 1.00 0.80 2\n", " 1 0.00 0.00 0.00 1\n", " 2 1.00 1.00 1.00 2\n", "\n", "avg / total 0.67 0.80 0.72 5\n", "\n", "[ 0.66666667 0. 1. ]\n", "[ 1. 0. 1.]\n", "[ 0.8 0. 1. ]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/Leo/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n", "/Users/Leo/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n", "/Users/Leo/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] }, { "data": { "text/plain": [ "array([ 3., 4., 5., 6.])" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics import classification_report\n", "y_test = [0, 1, 2, 2, 0]\n", "y_pred = [0, 0, 2, 2, 0]\n", "\n", "y_test = [\"ali\", \"douyin\", \"zhifubao\", \"zhifubao\", \"ali\"]\n", "y_pred = [\"ali\", \"ali\", \"zhifubao\", \"zhifubao\", \"ali\"]\n", "target_names = ['0', '1', '2']\n", "print(classification_report(y_test, y_pred, target_names=target_names))\n", "\n", "print(precision_score(y_test, y_pred, average=None))\n", "print(recall_score(y_test, y_pred, average=None))\n", "print(f1_score(y_test, y_pred, average=None))\n", "\n", "z = [[1,2,3,4],[5,6,7,8]]\n", "\n", "np.mean(z, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }