37 lines
1.4 KiB
Python
37 lines
1.4 KiB
Python
import numpy as np
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
from utils.util import *
|
|
|
|
|
|
class NgramsExtractor:
    """Extract n-gram count features from packet-length sequences.

    Builds two bag-of-n-grams representations per trace — one over the raw
    packet-length sequence and one over its burst sequence (as produced by
    ``get_bursts`` from ``utils.util``) — and concatenates the dense count
    matrices column-wise.

    NOTE(review): ``x.lengths`` is assumed to be a pandas-Series-like object
    of per-trace length sequences (it supports ``.apply``) — confirm against
    callers.
    """

    def __init__(self, max_ngram_len=2):
        """Create count vectorizers for packets and bursts.

        Parameters
        ----------
        max_ngram_len : int, default 2
            Largest n-gram length; n-grams of length 1..max_ngram_len
            are counted.
        """
        # These are plain count (not tf-idf) vectorizers. Both use the
        # same configuration, so build them with one helper.
        self.packet_counter = self._make_counter(max_ngram_len)
        self.burst_counter = self._make_counter(max_ngram_len)

    @staticmethod
    def _make_counter(max_ngram_len):
        # Tokens are whitespace-separated strings produced by join_str,
        # so split on whitespace instead of the default word regex.
        return CountVectorizer(analyzer='word',
                               tokenizer=lambda x: x.split(),
                               stop_words=None,
                               ngram_range=(1, max_ngram_len))

    def fit(self, x, y=None):
        """Learn the packet and burst n-gram vocabularies.

        Parameters
        ----------
        x : object exposing a ``lengths`` sequence of traces.
        y : ignored; present for scikit-learn pipeline compatibility.

        Returns
        -------
        self
        """
        bursts = x.lengths.apply(get_bursts)
        self.packet_counter.fit(x.lengths.apply(join_str))
        self.burst_counter.fit(bursts.apply(join_str))
        return self

    def transform(self, data_list):
        """Map traces to concatenated dense n-gram count features.

        Returns
        -------
        numpy.ndarray of shape
        (n_samples, n_packet_ngrams + n_burst_ngrams).
        """
        bursts = data_list.lengths.apply(get_bursts)
        data_str = data_list.lengths.apply(join_str)
        bursts_str = bursts.apply(join_str)

        packet_ngrams = self.packet_counter.transform(data_str)
        burst_ngrams = self.burst_counter.transform(bursts_str)

        # .toarray() (ndarray) instead of .todense() (deprecated np.matrix);
        # np.concatenate returns a plain ndarray either way, so the output
        # is unchanged.
        return np.concatenate((packet_ngrams.toarray(),
                               burst_ngrams.toarray()), axis=1)
|