abc
This commit is contained in:
36
pipeline/ngrams_classif.py
Normal file
36
pipeline/ngrams_classif.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
from utils.util import *
|
||||
|
||||
|
||||
class NgramsExtractor:
|
||||
def __init__(self, max_ngram_len=2):
|
||||
#print "Feature extraction: ngrams"
|
||||
|
||||
# initialize tf-idf vectorizer
|
||||
self.packet_counter = CountVectorizer(analyzer='word',
|
||||
tokenizer=lambda x: x.split(),
|
||||
stop_words=None,
|
||||
ngram_range=(1, max_ngram_len),)
|
||||
self.burst_counter = CountVectorizer(analyzer='word',
|
||||
tokenizer=lambda x: x.split(),
|
||||
stop_words=None,
|
||||
ngram_range=(1, max_ngram_len),)
|
||||
|
||||
def fit(self, x, y=None):
|
||||
bursts = x.lengths.apply(get_bursts)
|
||||
self.packet_counter.fit(x.lengths.apply(join_str))
|
||||
self.burst_counter.fit(bursts.apply(join_str))
|
||||
return self
|
||||
|
||||
def transform(self, data_list):
|
||||
bursts = data_list.lengths.apply(get_bursts)
|
||||
data_str = data_list.lengths.apply(join_str)
|
||||
bursts_str = bursts.apply(join_str)
|
||||
|
||||
packet_ngrams = self.packet_counter.transform(data_str)
|
||||
burst_ngrams = self.burst_counter.transform(bursts_str)
|
||||
|
||||
return np.concatenate((packet_ngrams.todense(),
|
||||
burst_ngrams.todense()), axis=1)
|
||||
Reference in New Issue
Block a user