37 lines
1.4 KiB
Python
37 lines
1.4 KiB
Python
import numpy as np
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
from utils.util import *
|
|
|
|
|
|
class NgramsExtractor:
    """Extract n-gram count features from packet-length sequences.

    Builds two bag-of-n-grams representations per trace — one over the raw
    packet-length sequence and one over its burst sequence (as produced by
    ``get_bursts`` from ``utils.util``) — and concatenates the dense count
    matrices column-wise.

    NOTE(review): ``x.lengths`` is assumed to be a pandas-Series-like object
    of per-trace length sequences (it supports ``.apply``) — confirm against
    callers.
    """

    def __init__(self, max_ngram_len=2):
        """Create count vectorizers for packets and bursts.

        Parameters
        ----------
        max_ngram_len : int, default 2
            Largest n-gram length; n-grams of length 1..max_ngram_len
            are counted.
        """
        # These are plain count (not tf-idf) vectorizers. Both use the
        # same configuration, so build them with one helper.
        self.packet_counter = self._make_counter(max_ngram_len)
        self.burst_counter = self._make_counter(max_ngram_len)

    @staticmethod
    def _make_counter(max_ngram_len):
        # Tokens are whitespace-separated strings produced by join_str,
        # so split on whitespace instead of the default word regex.
        return CountVectorizer(analyzer='word',
                               tokenizer=lambda x: x.split(),
                               stop_words=None,
                               ngram_range=(1, max_ngram_len))

    def fit(self, x, y=None):
        """Learn the packet and burst n-gram vocabularies.

        Parameters
        ----------
        x : object exposing a ``lengths`` sequence of traces.
        y : ignored; present for scikit-learn pipeline compatibility.

        Returns
        -------
        self
        """
        bursts = x.lengths.apply(get_bursts)
        self.packet_counter.fit(x.lengths.apply(join_str))
        self.burst_counter.fit(bursts.apply(join_str))
        return self

    def transform(self, data_list):
        """Map traces to concatenated dense n-gram count features.

        Returns
        -------
        numpy.ndarray of shape
        (n_samples, n_packet_ngrams + n_burst_ngrams).
        """
        bursts = data_list.lengths.apply(get_bursts)
        data_str = data_list.lengths.apply(join_str)
        bursts_str = bursts.apply(join_str)

        packet_ngrams = self.packet_counter.transform(data_str)
        burst_ngrams = self.burst_counter.transform(bursts_str)

        # .toarray() (ndarray) instead of .todense() (deprecated np.matrix);
        # np.concatenate returns a plain ndarray either way, so the output
        # is unchanged.
        return np.concatenate((packet_ngrams.toarray(),
                               burst_ngrams.toarray()), axis=1)
|