import re
import os
from os.path import join, dirname, abspath, pardir, basename, normpath
import json

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display as disp

from multiprocessing import cpu_count
from joblib import Parallel, delayed
from functools import partial


BASE_DIR = abspath(join(dirname(__file__), pardir, pardir, pardir))
CODE_DIR = join(BASE_DIR, 'code')
COLLEC_DIR = join(CODE_DIR, 'collection')
# ALL_URL_LIST = join(COLLEC_DIR, 'short_list_1500')
ALL_URL_LIST = ""


# MATH FUNCTIONS:
def harmonic_mean(x, y, factor=1.0):
    r"""Return the weighted harmonic mean of x and y.

    `factor` expresses how many times we value `y` over `x`
    in the harmonic mean: it plays the role of \beta in the
    F_{\beta} score:
    https://en.wikipedia.org/wiki/F1_score

    Important: we are assuming x > 0 and y > 0 here.
    """
    assert x > 0 and y > 0
    x, y = float(x), float(y)  # cast to floats
    factor2 = factor ** 2
    return (x * y * (1 + factor2)) / ((factor2 * x) + y)


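# Worked example (doctest-style): with factor=2 the mean is pulled towards
# `y`, just as F_2 weighs recall over precision:
#   >>> harmonic_mean(2, 8)            # 2*2*8 / (2+8)
#   3.2
#   >>> harmonic_mean(2, 8, factor=2)  # 5*2*8 / (4*2+8)
#   5.0

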
def round_mult(x, base=10):
    """Round abs(x) up to the next multiple of `base`, keeping the sign of x."""
    a = ((abs(x) - 1) // base) * base  # nearest lower multiple
    b = a + base  # round up
    return b * np.sign(x)


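# Doctest-style examples (exact multiples are left unchanged):
#   >>> round_mult(14)
#   20
#   >>> round_mult(20)
#   20
#   >>> round_mult(-3, base=5)
#   -5

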
# DATASET FORMATTING FUNCTIONS
def trim(elements, n):
    """Select the first `n` elements.

    If there are `n` or more elements, mark the first `n`
    of them as `True`. Otherwise, leave them untouched,
    as we assume they are initialized to `False`.
    """
    if len(elements) >= n:   # if there are enough elements,
        elements[:n] = True  # set `n` to `True` and leave
    return elements          # the rest to `False`.


def trim_df(df, num_insts):
    """Return a dataframe with the same number of instances per class.

    The dataframe, `df`, has a field with the class id called `class_label`.
    """
    df2 = df.copy()  # the `selected` field should not appear in the original `df`
    df2['selected'] = False  # initialize all instances to not selected
    classes = df2.groupby('class_label')  # group instances by class
    trim_part = partial(trim, n=num_insts)  # partial trim to n=num_insts
    df2['selected'] = classes.selected.transform(trim_part)  # mark as selected
    selected = df[df2.selected]  # get the selected instances
    return selected


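# Hedged usage sketch: assuming `df` has one row per traffic instance and a
# `class_label` column, this balances the dataset at 10 instances per class
# (classes with fewer than 10 instances are dropped, since none get selected):
#   >>> balanced = trim_df(df, num_insts=10)
#   >>> get_num_instances(balanced).unique()
#   array([10])

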
def sample_classes(df, classes=None):
    if type(classes) is int:
        # `random.sample` requires a sequence (not an ndarray) in Python 3.11+
        sample = random.sample(list(df.class_label.unique()), classes)
    elif type(classes) is list:
        sample = [str(cl) for cl in classes]
    else:
        raise ValueError("Type of classes not recognized.")
    selected_classes = df.class_label.isin(sample)
    return df[selected_classes]


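# For example, keep 5 random classes, or an explicit list of labels
# (assuming labels are stored as strings):
#   >>> df5 = sample_classes(df, classes=5)
#   >>> df2 = sample_classes(df, classes=['3', '14'])

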
def trim_cross_comparison(df1, df2, num_insts, num_classes):
    # remove classes that have less than `num_insts`
    df1_trimmed = trim_df(df1, num_insts)
    df2_trimmed = trim_df(df2, num_insts)
    result_insts1 = get_num_instances(df1_trimmed).unique()[0]
    result_insts2 = get_num_instances(df2_trimmed).unique()[0]
    print("Num instances df1, df2:", result_insts1, result_insts2)

    # take a sample from the classes left that are in common
    # (`random.sample` needs a sequence, so sort the set first)
    df1_cl = set(df1_trimmed.class_label)
    df2_cl = set(df2_trimmed.class_label)
    intersection_cl = df1_cl.intersection(df2_cl)
    classes = random.sample(sorted(intersection_cl), num_classes)
    print("Num classes in common:", len(intersection_cl))

    # sample the instances that belong to the list of selected classes
    df1_sampled = sample_classes(df1_trimmed, classes)
    df2_sampled = sample_classes(df2_trimmed, classes)

    # sort and re-index
    df1_sampled = df1_sampled.sort_values('class_label')
    df1_sampled.index = range(len(df1_sampled.index))
    df2_sampled = df2_sampled.sort_values('class_label')
    df2_sampled.index = range(len(df2_sampled.index))

    result_classes1 = set(df1_sampled.class_label)
    result_classes2 = set(df2_sampled.class_label)
    diff_classes = result_classes1.difference(result_classes2)
    print("Difference in classes", len(diff_classes))
    assert len(diff_classes) == 0

    return df1_sampled, df2_sampled


def trim_sample_df(df, num_insts, classes):
    df = trim_df(df, num_insts)
    df = sample_classes(df, classes)

    # restart index
    df = df.sort_values('class_label')
    df.index = range(len(df.index))

    return df


def assert_dataset_size(df, num_insts, num_classes):
    df_num_insts = get_num_instances(df).unique()[0]
    df_num_classes = get_num_classes(df)
    print(df_num_insts, "==", num_insts)
    print(df_num_classes, "==", num_classes)
    assert df_num_insts == num_insts
    assert df_num_classes == num_classes


def get_num_instances(df):
    """Return number of instances per class in the dataframe."""
    non_nan = df.dropna(axis='columns')  # NaN cols would not have valid counts
    classes = non_nan.groupby('class_label')
    counts = classes.count()  # count instances in each group (class)
    first_column = counts.iloc[:, 1]  # any column would do: counts are equal
    return first_column


def get_num_classes(df):
    """Return number of classes in the dataframe."""
    classes = df.groupby('class_label')
    return classes.ngroups


def optimal_instances_per_class(df, factor=1.0, draw=False):
    """Return 'optimal' number of instances per class.

    Find the number of instances per class that maximizes both the number of
    instances and the number of classes. We use the harmonic mean to penalize
    individual extreme values.

    For that we use the histogram of the number of instances to obtain the
    number of classes that have x instances.
    """
    # `bincount` returns the number of instances we have for each website
    counts = np.bincount(df.class_label.tolist())
    hist, bin_edges = np.histogram(counts)
    if draw:
        inst_counts = get_num_instances(df)
        inst_counts.hist(cumulative=-1, bins=100)
        plt.xlabel('Num of instances')
        plt.ylabel('Num of classes with x or more insts')
        plt.show()

    # scale the y-axis
    dx = bin_edges[1] - bin_edges[0]
    cum_hist = np.cumsum(hist) * dx

    # get the inverse cumulative sum
    inv_cum_hist = max(cum_hist) - cum_hist

    # compute the harmonic mean of tuples (y=f(x), x)
    hms = [harmonic_mean(x, y, factor) if y > 0 and x > 0 else 0
           for x, y in zip(bin_edges[1:], inv_cum_hist)]
    print(hms)

    # find index for max harmonic mean
    i = np.argmax(hms)

    # this is the optimal number of instances:
    opt_num_insts = int(bin_edges[i])

    # which leaves us with this number of classes:
    opt_num_classes = len(counts[counts >= opt_num_insts])

    if draw:
        print("Optimal number of instances:", opt_num_insts)
        print("Optimal number of classes:", opt_num_classes)

    return opt_num_insts, opt_num_classes


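# Intuition, as a hedged sketch: if 100 classes have >= 20 instances but only
# 10 classes have >= 200, the harmonic mean of (instances, classes) favors the
# broader (20, 100) point over the deeper (200, 10) one:
#   >>> harmonic_mean(20, 100)   # ~33.3
#   >>> harmonic_mean(200, 10)   # ~19.0

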
def list_array(column, pad=None, pad_with=0):
    """Return an array from a dataframe column with lists of the same size."""
    if pad is not None:  # in that case it's the array of lists' lengths
        mask = np.arange(pad.max()) < pad[:, None]
        if pad_with == 0:
            arrays = np.zeros(mask.shape, dtype=column.dtype)
        else:
            arrays = np.empty(mask.shape, dtype=column.dtype)
            arrays[:] = pad_with
        arrays[mask] = np.concatenate(column.values)
    else:
        arrays = column.values.tolist()
    return np.array(arrays)


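# Hedged example of the padding path, assuming `col` is a Series of
# variable-length lists and `pad` holds their lengths:
#   >>> col = pd.Series([[1, 2, 3], [4]])
#   >>> pad = np.array([3, 1])
#   >>> list_array(col, pad=pad)
#   array([[1, 2, 3],
#          [4, 0, 0]])

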
def concat_lists(column):
    """Return list from concatenating all lists in the column."""
    arrays = list_array(column)
    return np.concatenate(arrays)


# OUTLIER REMOVAL
def min_inst(df, n=1):
    """Return only classes with more than `n` instances."""
    classes = df.groupby('class_label')
    counts = classes.inst.transform('count')
    sel_classes = df[counts > n]
    return sel_classes


def inst_class_stats(df, col='num_pkts'):
    """Get statistics about number of instances per class."""
    classes = df.groupby('class_label')
    stat = classes[col].describe()
    return stat


def std_thres(df, th=5):
    """Discard classes whose std is `th` or greater."""
    stat = inst_class_stats(df)  # num of inst/class stats
    thresholded = stat[stat['std'] < th]
    class_labels = thresholded.reset_index().class_label
    sel_classes = df[df.class_label.isin(class_labels)]
    return sel_classes


# FEATURE FUNCTIONS
def get_bursts_per_class(df):
    classes = df.groupby('class_label')
    bursts = classes.bursts.apply(concat_lists)
    return bursts


def get_lengths_per_class(df):
    classes = df.groupby('class_label')
    lengths = classes.lengths.apply(concat_lists)
    return lengths


def get_uniq_len_count(lengths, all_lengths):
    """Return histogram of lengths over all possible lengths."""
    all_lengths = np.sort(all_lengths)  # sort array of all possible lengths
    bins = np.append(all_lengths, all_lengths[-1] + 1)
    return np.histogram(lengths, bins)[0]


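# For instance, counting occurrences of each unique packet length:
#   >>> get_uniq_len_count([20, 20, 40], all_lengths=np.array([20, 33, 40]))
#   array([2, 0, 1])

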
def recover_order(sent_lengths, received_lengths, order):
    """Return sequence of lengths from snd/rcv lengths and order.

    Example:
        sent = [20, 33, 40]
        received = [33, 20, 20]
        order = [1, -1, 1, 1, -1, -1]
    Returns: [20, -33, 33, 40, -20, -20]
    """
    sequence = np.zeros(len(order))
    sequence[np.argwhere(order > 0).flatten()] = sent_lengths
    sequence[np.argwhere(order < 0).flatten()] = np.negative(received_lengths)
    return sequence


def get_bursts(len_seq):
    """Return the sequence split by bursts.

    Example:
        len_seq = [20, -33, 33, 40, -20, -20]
    Returns: [[20], [-33], [33, 40], [-20, -20]]
    """
    # len_seq = np.array(eval(len_seq))
    directions = len_seq / abs(len_seq)  # +1 outgoing, -1 incoming
    index_dir_change = np.where(directions[1:] - directions[:-1] != 0)[0] + 1
    bursts = np.split(len_seq, index_dir_change)
    return bursts


def ngrams_bursts(len_seq, round=None):
    """Return sequence of bursts from sequence of lengths.

    The sequence of bursts is represented as specified for Dyer's VNG.

    Example:
        len_seq = [20, -33, 33, 40, -20, -20]
    Returns: [20, -33, 73, -40]
    """
    bursts = get_bursts(len_seq)
    # `map` must be materialized before building the array in Python 3
    ngrams = np.array(list(map(sum, bursts)))
    if round is not None:
        ngrams = round_mult(ngrams, base=round)
    return ngrams


def join_str(lengths):
    return ' '.join(map(str, lengths))


## tsfresh format functions
def stack_lists(df_col):
    stacked = df_col.apply(pd.Series).stack()
    dropped = stacked.reset_index(level=1, drop=True).reset_index()
    return dropped.rename(columns={'index': 'id', 0: 'value'})


def stack_lists_df(df, col):
    if col == 'sent':
        df_col = df.lengths.apply(lambda x: x[np.where(x > 0)])
    else:
        df_col = df.lengths.apply(lambda x: np.abs(x[np.where(x < 0)]))
    stacked = stack_lists(df_col)
    stacked['kind'] = col
    return stacked


def convert(df):
    sent = stack_lists_df(df, 'sent')
    received = stack_lists_df(df, 'received')
    return pd.concat([sent, received])


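# Hedged sketch of the resulting long format expected by tsfresh: one row per
# length value, keyed by instance `id` with a `kind` of 'sent'/'received':
#   >>> df = pd.DataFrame({'lengths': [np.array([20, -33, 40])]})
#   >>> convert(df)
#      id  value      kind
#   0   0     20      sent
#   1   0     40      sent
#   0   0     33  received

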
# PARSING OF FILES
# regular expression used to parse files with traffic traces
PATH_REGEX = {'name': r'(?P<name>\w+)',
              'dev': r'(?:(?P<dev>[^_]+)_)?',
              'sites': r'(?:(?P<sites>[^_]+)_)?',
              'date': r'(?P<date>\d\d-\d\d-\d\d)',
              'inst': r'(?:_(?P<inst>\d+))?'}
# TRACE_PATH = os.path.join('traces', '{vm}{dev}{sites}{date}{inst}')
FNAME_REGEX = re.compile('{name}[/,\\\\]{dev}{sites}{date}{inst}'.format(**PATH_REGEX))
# FNAME_REGEX = re.compile('{name}/{dev}{sites}{date}{inst}'.format(**PATH_REGEX))


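# For instance, a hypothetical capture path 'traces/phone_home_01-02-18_3'
# yields name='traces', dev='phone', sites='home', date='01-02-18', inst='3';
# the character class accepts '/', ',' and '\' as separators.

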
# paths
BASE_DIR = abspath(join(dirname(__file__), pardir, pardir, pardir))
DATA_DIR = join(BASE_DIR, 'dataset')
DEFAULT_PICKLE_FILE = join(DATA_DIR, 'index.pickle')


def load_data(path=DEFAULT_PICKLE_FILE, pickle=True):
    """Load dataset.

    If `path` is a file that exists, it should be a pickle and we load it.
    Otherwise, it should be a directory and we parse it.
    """
    if type(path) is list:
        dfs = [load_data(p, pickle='%s.pickle' % os.path.basename(p))
               for p in path]
        return pd.concat(dfs)
    elif type(path) is str:
        print("Loading", path)
        if os.path.isfile(path):
            df = pd.read_pickle(path)
        else:
            df = parse_directory(path)
            # only pickle freshly parsed data; default to a pickle named
            # after the dataset directory
            dataset = basename(normpath(path))
            pickle_path = join(DATA_DIR, '%s.pickle' % dataset)
            if type(pickle) is str:
                pickle_path = pickle
            if pickle:
                print("Pickling to", pickle_path)
                df.to_pickle(pickle_path)
        return df


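# Typical usage (a hedged sketch; paths depend on the local layout):
#   >>> df = load_data()                        # load the default pickle
#   >>> df = load_data('/path/to/captures')     # parse a directory, pickle it
#   >>> df = load_data(['/caps/a', '/caps/b'])  # concat several datasets

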
def it_webpages(fpath):
    """Iterate over all the websites contained in a file."""
    with open(fpath) as f:
        data_dict = json.loads(f.read())
        try:
            for pcap_filename, values in data_dict.items():
                webpage_num = pcap_filename[:-5]  # strip the '.pcap' extension
                snd, rcv = values['sent'], values['received']
                order = values['order']
                lengths = recover_order(*map(np.array, [snd, rcv, order]))
                yield webpage_num, lengths
        except KeyError:
            print(fpath, "does not have a known order sequence")
            return
        except Exception as e:
            print("ERROR:", fpath, pcap_filename, e)


def sel_files(dpath):
    """Return files that satisfy conditions."""
    selected = []
    for root, _, files in os.walk(dpath):
        for fname in files:
            if not fname.endswith('.json'):  # skip non-json files
                continue
            fpath = os.path.join(root, fname)
            selected.append(fpath)
    return selected


def parse_directory(dpath):
    """Traverse the directory and parse all the captures in it.

    Returns a dataframe containing encoded lengths.
    """
    print("Starting to parse")
    selected_files = sel_files(dpath)
    print("Number of selected files", len(selected_files))

    # iterate over selected files and build dataframe
    # (collect rows in a list: `DataFrame.append` was removed in pandas 2.0)
    empties = 0
    rows = []
    for fpath in selected_files:
        m = FNAME_REGEX.search(fpath)
        if m is None:
            print("ERROR:", fpath, FNAME_REGEX.pattern)
            continue
        row_head = {k: m.group(k) for k in PATH_REGEX.keys()}
        # row_head = {k: m.group(k) for k in PATH_REGEX.iterkeys()}
        for i, (webpage_id, lengths) in enumerate(it_webpages(fpath)):
            if len(lengths) == 0:
                empties += 1
                continue
            row = dict(row_head)
            row['fname'] = os.path.basename(fpath)
            row['class_label'] = webpage_id
            row['lengths'] = lengths
            rows.append(row)
        print(i, 'sites in', fpath)
    print("Empty traces:", empties)
    idx = pd.DataFrame(rows, columns=list(PATH_REGEX.keys())
                       + ['fname', 'class_label', 'lengths'])

    # fix some naming issues:
    idx['inst'] = idx.inst.fillna(0)
    idx['date'] = pd.to_datetime(idx.date.str.replace('-18', '-2018'),
                                 format='%d-%m-%Y')
    # idx['dev'] = idx.dev.replace('browse', 'desktop')
    # idx.loc[idx.sites == 'desktop', ['dev', 'sites']] = ['desktop', None]
    return idx


# OPTIMIZATION
def apply_parallel(dfGrouped, func):
    retLst = Parallel(n_jobs=cpu_count())(delayed(func)(group)
                                          for name, group in dfGrouped)
    return pd.concat(retLst)


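# Hedged usage sketch: parallelize a per-class transformation, assuming
# `func` takes a group dataframe and returns a dataframe:
#   >>> grouped = df.groupby('class_label')
#   >>> result = apply_parallel(grouped, func)

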
# # OTHER UTILS
# def load_mapping():
#     """Return Alexa as a list."""
#     return [l.strip() for l in open(ALL_URL_LIST)]
# ALEXA_MAP = load_mapping()
#
#
# def alexa_rank(url):
#     """Return index in Alexa."""
#     return ALEXA_MAP.index(url)
#
#
# def url(index):
#     """Return URL for the index in Alexa."""
#     return ALEXA_MAP[index]


# def display(df, urls=False):
#     """Redefine display to show URLs instead of indices."""
#     dft = df
#     if urls:
#         dft = df.copy()
#         if df.index.name == 'class_label':
#             dft = dft.reset_index()
#         if 'index' in dft.columns:
#             dft = dft.drop(['index'], axis=1)
#         if 'class_label' in dft.columns:
#             dft['class_label'] = dft.class_label.apply(lambda x: url(int(x)))
#         if df.index.name == 'class_label':
#             dft.set_index('class_label')
#     disp(dft)


# identity function handle
def ident(x):
    return x