abc

2022-05-31 21:37:21 +08:00
commit 7e716ff740
42 changed files with 194197 additions and 0 deletions
--- a/pipeline/ngrams_classif.py
+++ b/pipeline/ngrams_classif.py
@@ -0,0 +1,36 @@
+import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
+
+from utils.util import *
+
+
+class NgramsExtractor:
+    def __init__(self, max_ngram_len=2):
+        #print "Feature extraction: ngrams"
+
+        # initialize tf-idf vectorizer
+        self.packet_counter = CountVectorizer(analyzer='word',
+                                              tokenizer=lambda x: x.split(),
+                                              stop_words=None,
+                                              ngram_range=(1, max_ngram_len),)
+        self.burst_counter = CountVectorizer(analyzer='word',
+                                             tokenizer=lambda x: x.split(),
+                                             stop_words=None,
+                                             ngram_range=(1, max_ngram_len),)
+
+    def fit(self, x, y=None):
+        bursts = x.lengths.apply(get_bursts)
+        self.packet_counter.fit(x.lengths.apply(join_str))
+        self.burst_counter.fit(bursts.apply(join_str))
+        return self
+
+    def transform(self, data_list):
+        bursts = data_list.lengths.apply(get_bursts)
+        data_str = data_list.lengths.apply(join_str)
+        bursts_str = bursts.apply(join_str)
+
+        packet_ngrams = self.packet_counter.transform(data_str)
+        burst_ngrams = self.burst_counter.transform(bursts_str)
+
+        return np.concatenate((packet_ngrams.todense(),
+                               burst_ngrams.todense()), axis=1)