import re
import os
from os.path import join, dirname, abspath, pardir, basename, normpath
import json

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display as disp

from multiprocessing import cpu_count
from joblib import Parallel, delayed
from functools import partial


BASE_DIR = abspath(join(dirname(__file__), pardir, pardir, pardir))
CODE_DIR = join(BASE_DIR, 'code')
COLLEC_DIR = join(CODE_DIR, 'collection')
# ALL_URL_LIST = join(COLLEC_DIR, 'short_list_1500')
ALL_URL_LIST = ""


# MATH FUNCTIONS:
def harmonic_mean(x, y, factor=1.0):
    r"""Return the weighted harmonic mean of x and y.

    `factor` expresses how many times we value `y` over `x`
    in the harmonic mean: it plays the role of \beta in the
    F_{\beta} score:
    https://en.wikipedia.org/wiki/F1_score

    Important: we are assuming x > 0 and y > 0 here.
    """
    assert x > 0 and y > 0
    x, y = float(x), float(y)  # cast to floats
    factor2 = factor ** 2
    return (x * y * (1 + factor2)) / ((factor2 * x) + y)


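# Worked example (doctest-style): with factor=2 the mean is pulled towards
# `y`, just as F_2 weighs recall over precision:
#   >>> harmonic_mean(2, 8)            # 2*2*8 / (2+8)
#   3.2
#   >>> harmonic_mean(2, 8, factor=2)  # 5*2*8 / (4*2+8)
#   5.0

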
def round_mult(x, base=10):
    """Round abs(x) up to the next multiple of `base`, keeping the sign of x."""
    a = ((abs(x) - 1) // base) * base  # nearest lower multiple
    b = a + base  # round up
    return b * np.sign(x)


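# Doctest-style examples (exact multiples are left unchanged):
#   >>> round_mult(14)
#   20
#   >>> round_mult(20)
#   20
#   >>> round_mult(-3, base=5)
#   -5

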
# DATASET FORMATTING FUNCTIONS
def trim(elements, n):
    """Select the first `n` elements.

    If there are `n` or more elements, mark the first `n`
    of them as `True`. Otherwise, leave them untouched,
    as we assume they are initialized to `False`.
    """
    if len(elements) >= n:   # if there are enough elements,
        elements[:n] = True  # set `n` to `True` and leave
    return elements          # the rest to `False`.


def trim_df(df, num_insts):
    """Return a dataframe with the same number of instances per class.

    The dataframe, `df`, has a field with the class id called `class_label`.
    """
    df2 = df.copy()  # the `selected` field should not appear in the original `df`
    df2['selected'] = False  # initialize all instances to not selected
    classes = df2.groupby('class_label')  # group instances by class
    trim_part = partial(trim, n=num_insts)  # partial trim to n=num_insts
    df2['selected'] = classes.selected.transform(trim_part)  # mark as selected
    selected = df[df2.selected]  # get the selected instances
    return selected


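# Hedged usage sketch: assuming `df` has one row per traffic instance and a
# `class_label` column, this balances the dataset at 10 instances per class
# (classes with fewer than 10 instances are dropped, since none get selected):
#   >>> balanced = trim_df(df, num_insts=10)
#   >>> get_num_instances(balanced).unique()
#   array([10])

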
def sample_classes(df, classes=None):
    if type(classes) is int:
        # `random.sample` requires a sequence (not an ndarray) in Python 3.11+
        sample = random.sample(list(df.class_label.unique()), classes)
    elif type(classes) is list:
        sample = [str(cl) for cl in classes]
    else:
        raise ValueError("Type of classes not recognized.")
    selected_classes = df.class_label.isin(sample)
    return df[selected_classes]


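# For example, keep 5 random classes, or an explicit list of labels
# (assuming labels are stored as strings):
#   >>> df5 = sample_classes(df, classes=5)
#   >>> df2 = sample_classes(df, classes=['3', '14'])

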
def trim_cross_comparison(df1, df2, num_insts, num_classes):
    # remove classes that have less than `num_insts`
    df1_trimmed = trim_df(df1, num_insts)
    df2_trimmed = trim_df(df2, num_insts)
    result_insts1 = get_num_instances(df1_trimmed).unique()[0]
    result_insts2 = get_num_instances(df2_trimmed).unique()[0]
    print("Num instances df1, df2:", result_insts1, result_insts2)

    # take a sample from the classes left that are in common
    # (`random.sample` needs a sequence, so sort the set first)
    df1_cl = set(df1_trimmed.class_label)
    df2_cl = set(df2_trimmed.class_label)
    intersection_cl = df1_cl.intersection(df2_cl)
    classes = random.sample(sorted(intersection_cl), num_classes)
    print("Num classes in common:", len(intersection_cl))

    # sample the instances that belong to the list of selected classes
    df1_sampled = sample_classes(df1_trimmed, classes)
    df2_sampled = sample_classes(df2_trimmed, classes)

    # sort and re-index
    df1_sampled = df1_sampled.sort_values('class_label')
    df1_sampled.index = range(len(df1_sampled.index))
    df2_sampled = df2_sampled.sort_values('class_label')
    df2_sampled.index = range(len(df2_sampled.index))

    result_classes1 = set(df1_sampled.class_label)
    result_classes2 = set(df2_sampled.class_label)
    diff_classes = result_classes1.difference(result_classes2)
    print("Difference in classes", len(diff_classes))
    assert len(diff_classes) == 0

    return df1_sampled, df2_sampled


def trim_sample_df(df, num_insts, classes):
    df = trim_df(df, num_insts)
    df = sample_classes(df, classes)

    # restart index
    df = df.sort_values('class_label')
    df.index = range(len(df.index))

    return df


def assert_dataset_size(df, num_insts, num_classes):
    df_num_insts = get_num_instances(df).unique()[0]
    df_num_classes = get_num_classes(df)
    print(df_num_insts, "==", num_insts)
    print(df_num_classes, "==", num_classes)
    assert df_num_insts == num_insts
    assert df_num_classes == num_classes


def get_num_instances(df):
    """Return number of instances per class in the dataframe."""
    non_nan = df.dropna(axis='columns')  # NaN cols would not have valid counts
    classes = non_nan.groupby('class_label')
    counts = classes.count()  # count instances in each group (class)
    first_column = counts.iloc[:, 1]  # any column would do: counts are equal
    return first_column


def get_num_classes(df):
    """Return number of classes in the dataframe."""
    classes = df.groupby('class_label')
    return classes.ngroups


def optimal_instances_per_class(df, factor=1.0, draw=False):
    """Return 'optimal' number of instances per class.

    Find the number of instances per class that maximizes both the number of
    instances and the number of classes. We use the harmonic mean to penalize
    individual extreme values.

    For that we use the histogram of the number of instances to obtain the
    number of classes that have x instances.
    """
    # `bincount` returns the number of instances we have for each website
    counts = np.bincount(df.class_label.tolist())
    hist, bin_edges = np.histogram(counts)
    if draw:
        inst_counts = get_num_instances(df)
        inst_counts.hist(cumulative=-1, bins=100)
        plt.xlabel('Num of instances')
        plt.ylabel('Num of classes with x or more insts')
        plt.show()

    # scale the y-axis
    dx = bin_edges[1] - bin_edges[0]
    cum_hist = np.cumsum(hist) * dx

    # get the inverse cumulative sum
    inv_cum_hist = max(cum_hist) - cum_hist

    # compute the harmonic mean of tuples (y=f(x), x)
    hms = [harmonic_mean(x, y, factor) if y > 0 and x > 0 else 0
           for x, y in zip(bin_edges[1:], inv_cum_hist)]
    print(hms)

    # find index for max harmonic mean
    i = np.argmax(hms)

    # this is the optimal number of instances:
    opt_num_insts = int(bin_edges[i])

    # which leaves us with this number of classes:
    opt_num_classes = len(counts[counts >= opt_num_insts])

    if draw:
        print("Optimal number of instances:", opt_num_insts)
        print("Optimal number of classes:", opt_num_classes)

    return opt_num_insts, opt_num_classes


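# Intuition, as a hedged sketch: if 100 classes have >= 20 instances but only
# 10 classes have >= 200, the harmonic mean of (instances, classes) favors the
# broader (20, 100) point over the deeper (200, 10) one:
#   >>> harmonic_mean(20, 100)   # ~33.3
#   >>> harmonic_mean(200, 10)   # ~19.0

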
def list_array(column, pad=None, pad_with=0):
    """Return an array from a dataframe column with lists of the same size."""
    if pad is not None:  # in that case it's the array of lists' lengths
        mask = np.arange(pad.max()) < pad[:, None]
        if pad_with == 0:
            arrays = np.zeros(mask.shape, dtype=column.dtype)
        else:
            arrays = np.empty(mask.shape, dtype=column.dtype)
            arrays[:] = pad_with
        arrays[mask] = np.concatenate(column.values)
    else:
        arrays = column.values.tolist()
    return np.array(arrays)


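# Hedged example of the padding path, assuming `col` is a Series of
# variable-length lists and `pad` holds their lengths:
#   >>> col = pd.Series([[1, 2, 3], [4]])
#   >>> pad = np.array([3, 1])
#   >>> list_array(col, pad=pad)
#   array([[1, 2, 3],
#          [4, 0, 0]])

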
def concat_lists(column):
    """Return list from concatenating all lists in the column."""
    arrays = list_array(column)
    return np.concatenate(arrays)


# OUTLIER REMOVAL
def min_inst(df, n=1):
    """Return only classes with more than `n` instances."""
    classes = df.groupby('class_label')
    counts = classes.inst.transform('count')
    sel_classes = df[counts > n]
    return sel_classes


def inst_class_stats(df, col='num_pkts'):
    """Get statistics about number of instances per class."""
    classes = df.groupby('class_label')
    stat = classes[col].describe()
    return stat


def std_thres(df, th=5):
    """Discard classes whose std is `th` or greater."""
    stat = inst_class_stats(df)  # num of inst/class stats
    thresholded = stat[stat['std'] < th]
    class_labels = thresholded.reset_index().class_label
    sel_classes = df[df.class_label.isin(class_labels)]
    return sel_classes


# FEATURE FUNCTIONS
def get_bursts_per_class(df):
    classes = df.groupby('class_label')
    bursts = classes.bursts.apply(concat_lists)
    return bursts


def get_lengths_per_class(df):
    classes = df.groupby('class_label')
    lengths = classes.lengths.apply(concat_lists)
    return lengths


def get_uniq_len_count(lengths, all_lengths):
    """Return histogram of lengths over all possible lengths."""
    all_lengths = np.sort(all_lengths)  # sort array of all possible lengths
    bins = np.append(all_lengths, all_lengths[-1] + 1)
    return np.histogram(lengths, bins)[0]


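# For instance, counting occurrences of each unique packet length:
#   >>> get_uniq_len_count([20, 20, 40], all_lengths=np.array([20, 33, 40]))
#   array([2, 0, 1])

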
def recover_order(sent_lengths, received_lengths, order):
    """Return sequence of lengths from snd/rcv lengths and order.

    Example:
        sent = [20, 33, 40]
        received = [33, 20, 20]
        order = [1, -1, 1, 1, -1, -1]
    Returns: [20, -33, 33, 40, -20, -20]
    """
    sequence = np.zeros(len(order))
    sequence[np.argwhere(order > 0).flatten()] = sent_lengths
    sequence[np.argwhere(order < 0).flatten()] = np.negative(received_lengths)
    return sequence


def get_bursts(len_seq):
    """Return the sequence split by bursts.

    Example:
        len_seq = [20, -33, 33, 40, -20, -20]
    Returns: [[20], [-33], [33, 40], [-20, -20]]
    """
    # len_seq = np.array(eval(len_seq))
    directions = len_seq / abs(len_seq)  # +1 outgoing, -1 incoming
    index_dir_change = np.where(directions[1:] - directions[:-1] != 0)[0] + 1
    bursts = np.split(len_seq, index_dir_change)
    return bursts


def ngrams_bursts(len_seq, round=None):
    """Return sequence of bursts from sequence of lengths.

    The sequence of bursts is represented as specified for Dyer's VNG.

    Example:
        len_seq = [20, -33, 33, 40, -20, -20]
    Returns: [20, -33, 73, -40]
    """
    bursts = get_bursts(len_seq)
    # `map` must be materialized before building the array in Python 3
    ngrams = np.array(list(map(sum, bursts)))
    if round is not None:
        ngrams = round_mult(ngrams, base=round)
    return ngrams


def join_str(lengths):
    return ' '.join(map(str, lengths))


## tsfresh format functions
def stack_lists(df_col):
    stacked = df_col.apply(pd.Series).stack()
    dropped = stacked.reset_index(level=1, drop=True).reset_index()
    return dropped.rename(columns={'index': 'id', 0: 'value'})


def stack_lists_df(df, col):
    if col == 'sent':
        df_col = df.lengths.apply(lambda x: x[np.where(x > 0)])
    else:
        df_col = df.lengths.apply(lambda x: np.abs(x[np.where(x < 0)]))
    stacked = stack_lists(df_col)
    stacked['kind'] = col
    return stacked


def convert(df):
    sent = stack_lists_df(df, 'sent')
    received = stack_lists_df(df, 'received')
    return pd.concat([sent, received])


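# Hedged sketch of the resulting long format expected by tsfresh: one row per
# length value, keyed by instance `id` with a `kind` of 'sent'/'received':
#   >>> df = pd.DataFrame({'lengths': [np.array([20, -33, 40])]})
#   >>> convert(df)
#      id  value      kind
#   0   0     20      sent
#   1   0     40      sent
#   0   0     33  received

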
# PARSING OF FILES
# regular expression used to parse files with traffic traces
PATH_REGEX = {'name': r'(?P<name>\w+)',
              'dev': r'(?:(?P<dev>[^_]+)_)?',
              'sites': r'(?:(?P<sites>[^_]+)_)?',
              'date': r'(?P<date>\d\d-\d\d-\d\d)',
              'inst': r'(?:_(?P<inst>\d+))?'}
# TRACE_PATH = os.path.join('traces', '{vm}{dev}{sites}{date}{inst}')
FNAME_REGEX = re.compile('{name}[/,\\\\]{dev}{sites}{date}{inst}'.format(**PATH_REGEX))
# FNAME_REGEX = re.compile('{name}/{dev}{sites}{date}{inst}'.format(**PATH_REGEX))


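# For instance, a hypothetical capture path 'traces/phone_home_01-02-18_3'
# yields name='traces', dev='phone', sites='home', date='01-02-18', inst='3';
# the character class accepts '/', ',' and '\' as separators.

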
# paths
BASE_DIR = abspath(join(dirname(__file__), pardir, pardir, pardir))
DATA_DIR = join(BASE_DIR, 'dataset')
DEFAULT_PICKLE_FILE = join(DATA_DIR, 'index.pickle')


def load_data(path=DEFAULT_PICKLE_FILE, pickle=True):
    """Load dataset.

    If `path` is a file that exists, it should be a pickle and we load it.
    Otherwise, it should be a directory and we parse it.
    """
    if type(path) is list:
        dfs = [load_data(p, pickle='%s.pickle' % os.path.basename(p))
               for p in path]
        return pd.concat(dfs)
    elif type(path) is str:
        print("Loading", path)
        if os.path.isfile(path):
            df = pd.read_pickle(path)
        else:
            df = parse_directory(path)
            # only pickle freshly parsed data; default to a pickle named
            # after the dataset directory
            dataset = basename(normpath(path))
            pickle_path = join(DATA_DIR, '%s.pickle' % dataset)
            if type(pickle) is str:
                pickle_path = pickle
            if pickle:
                print("Pickling to", pickle_path)
                df.to_pickle(pickle_path)
        return df


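# Typical usage (a hedged sketch; paths depend on the local layout):
#   >>> df = load_data()                        # load the default pickle
#   >>> df = load_data('/path/to/captures')     # parse a directory, pickle it
#   >>> df = load_data(['/caps/a', '/caps/b'])  # concat several datasets

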
def it_webpages(fpath):
    """Iterate over all the websites contained in a file."""
    with open(fpath) as f:
        data_dict = json.loads(f.read())
        try:
            for pcap_filename, values in data_dict.items():
                webpage_num = pcap_filename[:-5]  # strip the '.pcap' extension
                snd, rcv = values['sent'], values['received']
                order = values['order']
                lengths = recover_order(*map(np.array, [snd, rcv, order]))
                yield webpage_num, lengths
        except KeyError:
            print(fpath, "does not have a known order sequence")
            return
        except Exception as e:
            print("ERROR:", fpath, pcap_filename, e)


def sel_files(dpath):
    """Return files that satisfy conditions."""
    selected = []
    for root, _, files in os.walk(dpath):
        for fname in files:
            if not fname.endswith('.json'):  # skip non-json files
                continue
            fpath = os.path.join(root, fname)
            selected.append(fpath)
    return selected


def parse_directory(dpath):
    """Traverse the directory and parse all the captures in it.

    Returns a dataframe containing encoded lengths.
    """
    print("Starting to parse")
    selected_files = sel_files(dpath)
    print("Number of selected files", len(selected_files))

    # iterate over selected files and build dataframe
    # (collect rows in a list: `DataFrame.append` was removed in pandas 2.0)
    empties = 0
    rows = []
    for fpath in selected_files:
        m = FNAME_REGEX.search(fpath)
        if m is None:
            print("ERROR:", fpath, FNAME_REGEX.pattern)
            continue
        row_head = {k: m.group(k) for k in PATH_REGEX.keys()}
        # row_head = {k: m.group(k) for k in PATH_REGEX.iterkeys()}
        for i, (webpage_id, lengths) in enumerate(it_webpages(fpath)):
            if len(lengths) == 0:
                empties += 1
                continue
            row = dict(row_head)
            row['fname'] = os.path.basename(fpath)
            row['class_label'] = webpage_id
            row['lengths'] = lengths
            rows.append(row)
        print(i, 'sites in', fpath)
    print("Empty traces:", empties)
    idx = pd.DataFrame(rows, columns=list(PATH_REGEX.keys())
                       + ['fname', 'class_label', 'lengths'])

    # fix some naming issues:
    idx['inst'] = idx.inst.fillna(0)
    idx['date'] = pd.to_datetime(idx.date.str.replace('-18', '-2018'),
                                 format='%d-%m-%Y')
    # idx['dev'] = idx.dev.replace('browse', 'desktop')
    # idx.loc[idx.sites == 'desktop', ['dev', 'sites']] = ['desktop', None]
    return idx


# OPTIMIZATION
def apply_parallel(dfGrouped, func):
    retLst = Parallel(n_jobs=cpu_count())(delayed(func)(group)
                                          for name, group in dfGrouped)
    return pd.concat(retLst)


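# Hedged usage sketch: parallelize a per-class transformation, assuming
# `func` takes a group dataframe and returns a dataframe:
#   >>> grouped = df.groupby('class_label')
#   >>> result = apply_parallel(grouped, func)

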
# # OTHER UTILS
# def load_mapping():
#     """Return Alexa as a list."""
#     return [l.strip() for l in open(ALL_URL_LIST)]
# ALEXA_MAP = load_mapping()
#
#
# def alexa_rank(url):
#     """Return index in Alexa."""
#     return ALEXA_MAP.index(url)
#
#
# def url(index):
#     """Return URL for the index in Alexa."""
#     return ALEXA_MAP[index]


# def display(df, urls=False):
#     """Redefine display to show URLs instead of indices."""
#     dft = df
#     if urls:
#         dft = df.copy()
#         if df.index.name == 'class_label':
#             dft = dft.reset_index()
#         if 'index' in dft.columns:
#             dft = dft.drop(['index'], axis=1)
#         if 'class_label' in dft.columns:
#             dft['class_label'] = dft.class_label.apply(lambda x: url(int(x)))
#         if df.index.name == 'class_label':
#             dft.set_index('class_label')
#     disp(dft)


# identity function handle
def ident(x):
    return x