wujiating-fingerprinting/pipeline/ngrams_classif.py

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from utils.util import *  # provides get_bursts and join_str


class NgramsExtractor:
    """N-gram count features over packet-length sequences and their bursts."""

    def __init__(self, max_ngram_len=2):
        # Count vectorizers over whitespace-separated packet-length and
        # burst sequences, counting n-grams of size 1 up to max_ngram_len.
        self.packet_counter = CountVectorizer(analyzer='word',
                                              tokenizer=lambda x: x.split(),
                                              stop_words=None,
                                              ngram_range=(1, max_ngram_len))
        self.burst_counter = CountVectorizer(analyzer='word',
                                             tokenizer=lambda x: x.split(),
                                             stop_words=None,
                                             ngram_range=(1, max_ngram_len))

    def fit(self, x, y=None):
        # Learn the n-gram vocabularies from the packet-length sequences
        # and from their burst representation.
        bursts = x.lengths.apply(get_bursts)
        self.packet_counter.fit(x.lengths.apply(join_str))
        self.burst_counter.fit(bursts.apply(join_str))
        return self

    def transform(self, data_list):
        # Vectorize both representations and concatenate the dense count
        # matrices column-wise into a single feature matrix.
        bursts = data_list.lengths.apply(get_bursts)
        data_str = data_list.lengths.apply(join_str)
        bursts_str = bursts.apply(join_str)
        packet_ngrams = self.packet_counter.transform(data_str)
        burst_ngrams = self.burst_counter.transform(bursts_str)
        return np.concatenate((packet_ngrams.todense(),
                               burst_ngrams.todense()), axis=1)
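
For reference, a minimal usage sketch (not part of the repository): it assumes the module is importable as pipeline.ngrams_classif, that utils.util is on the path, that traces arrive as a pandas DataFrame whose lengths column holds per-trace sequences of signed packet lengths, and that join_str serializes such a sequence to a space-separated string while get_bursts aggregates it into bursts.

# Hypothetical usage sketch -- the DataFrame layout, the module path, and the
# behaviour of get_bursts/join_str (defined in utils.util, not shown) are assumptions.
import pandas as pd

from pipeline.ngrams_classif import NgramsExtractor

traces = pd.DataFrame({
    # one row per trace: a sequence of signed packet lengths (+ outgoing / - incoming)
    "lengths": [[600, -1500, -1500, 52],
                [600, -1500, 52, 52]],
})

extractor = NgramsExtractor(max_ngram_len=2)
features = extractor.fit(traces).transform(traces)
# rows: traces; columns: packet-length n-gram counts followed by burst n-gram counts
print(features.shape)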