import csv
import os

import _pickle as pkl
import numpy as np
import pandas as pd

# Half-open [start, stop) slices applied to ``row[7:]`` of every flow record;
# the selected values are concatenated in exactly this order (28 values when
# the record has enough columns).
_FEATURE_SLICES = ((0, 3), (5, 13), (37, 41), (15, 23), (24, 28), (50, 51))

# Value substituted for NaN / +-inf entries found in a selected feature vector.
_NON_FINITE_REPLACEMENT = 1e7

# Value used to fill cells that are missing in the raw CSV files.
_MISSING_CELL_FILL = 1e10


def _pickle_path(csv_path):
    """Return *csv_path* with its extension replaced by ``.pkl``.

    Only the final extension is touched, so a directory that happens to
    contain "csv" in its name is left alone.
    """
    return os.path.splitext(csv_path)[0] + ".pkl"


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file even if dumping fails."""
    with open(path, "wb") as f_pkl:
        pkl.dump(obj, f_pkl)


def _load_pickle(path):
    """Load and return the object pickled at *path*.

    NOTE(review): unpickling executes arbitrary code embedded in the file;
    only use this on pickles produced by this script.
    """
    with open(path, "rb") as f_pkl:
        return pkl.load(f_pkl)


def _keep_flow(flow_features, truncated_num):
    """Return True when a flow passes the size filter.

    A flow is dropped when ``flow_features[1] + flow_features[2]`` is below
    *truncated_num*, or when either of the two values is (almost) zero.
    NOTE(review): these look like the forward/backward packet counts of the
    CICFlowMeter output - confirm against the CSV header.
    """
    first, second = flow_features[1], flow_features[2]
    return not (first + second < truncated_num or first < 1e-5 or second < 1e-5)


def _select_features(flow_features):
    """Return the sub-vector of *flow_features* picked by ``_FEATURE_SLICES``."""
    selected = []
    for start, stop in _FEATURE_SLICES:
        selected.extend(flow_features[start:stop])
    return selected


def _sanitize(features):
    """Convert string entries to float and replace non-finite ones, in place.

    Every NaN / inf entry is reported on stdout and replaced by
    ``_NON_FINITE_REPLACEMENT``. Returns the same list for convenience.
    """
    for i, value in enumerate(features):
        if isinstance(value, str):
            value = float(value)
            features[i] = value
        # ``not isfinite`` already covers both NaN and +-inf.
        if not np.isfinite(value):
            print(np.isnan(value), np.isinf(value), not np.isfinite(value), value)
            print(features)
            features[i] = _NON_FINITE_REPLACEMENT
    return features


def merge_csv(input_dir="C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\myData\\OW\\web",
              save_filename="./result/ow_doh_features.csv", truncated_num=5, label=0):
    """Merge every ``.csv`` file of *input_dir* into one labelled feature table.

    All CSV files directly inside *input_dir* are concatenated, missing cells
    are filled with 1e10 and the last column is dropped. A record is kept only
    if column 5 equals 6 (named ``proto`` in the original code - presumably
    TCP, confirm) and it passes the size filter (see ``_keep_flow``). For each
    kept record the values selected by ``_FEATURE_SLICES`` from ``row[7:]``
    are stored as one list in the ``features`` column, next to *label* in the
    ``label`` column.

    The table is written to *save_filename* as CSV and, as a pickled
    DataFrame, to the same path with a ``.pkl`` extension.

    Args:
        input_dir: directory containing the raw CSV files.
        save_filename: path of the CSV file to write.
        truncated_num: minimum value of ``features[1] + features[2]`` for a
            record to be kept.
        label: class label stored with every kept record.

    Raises:
        ValueError: if *input_dir* contains no ``.csv`` file, or a string
            feature cannot be converted to float.
    """
    frames = [
        pd.read_csv(os.path.join(input_dir, filename))
        for filename in os.listdir(input_dir)
        if filename.endswith(".csv")
    ]
    if not frames:
        # pd.concat would raise an opaque "No objects to concatenate".
        raise ValueError("no .csv files found in {!r}".format(input_dir))
    merged = pd.concat(frames).fillna(_MISSING_CELL_FILL)

    # Single pass: filter on the raw values, then select and sanitize.
    kept = []
    for row in merged.iloc[:, :-1].values.tolist():
        if row[5] != 6:
            continue
        flow_features = row[7:]
        if not _keep_flow(flow_features, truncated_num):
            continue
        kept.append(_sanitize(_select_features(flow_features)))

    # Object dtype mirrors the frame the previous row-by-row fill produced,
    # so consumers of the pickle see the same column types.
    save_df = pd.DataFrame(
        {
            "features": pd.Series(kept, dtype=object),
            "label": pd.Series([label] * len(kept), dtype=object),
        },
        columns=["features", "label"],
    )
    save_df.to_csv(save_filename)
    _dump_pickle(save_df, _pickle_path(save_filename))


def merge_all_pkl():
    """Concatenate the per-class ``cw_*`` pickles into one dataset.

    The frames are concatenated in the order doh, web, chat, email,
    streaming, file, voip and written to ``./result/all_features.csv`` and
    ``./result/all_features.pkl``. Labels are used exactly as stored in the
    input pickles.

    NOTE(review): the script section below writes ``<name>_features.*``
    files, not ``cw_<name>_features.pkl`` - confirm where these inputs are
    produced.
    """
    pickle_paths = (
        "./result/cw_doh_features.pkl",
        "./result/cw_web_features.pkl",
        "./result/cw_chat_features.pkl",
        "./result/cw_email_features.pkl",
        "./result/cw_streaming_features.pkl",
        "./result/cw_file_features.pkl",
        "./result/cw_voip_features.pkl",
    )
    df = pd.concat([_load_pickle(path) for path in pickle_paths])

    save_filename = "./result/all_features.csv"
    df.to_csv(save_filename)
    _dump_pickle(df, _pickle_path(save_filename))


def main():
    """Run ``merge_csv`` for every traffic class, labelling them 0, 1, 2, ..."""
    input_and_output_tuple = [
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\myData\\doh",
         "./result/doh_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\myData\\web",
         "./result/web_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\File",
         "./result/file_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\Email",
         "./result/email_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\VoIP",
         "./result/voip_features.csv"),
        ("C:\\Users\\JiaTing\\Desktop\\CICFlowMeter-master\\result\\ISCX\\Chat",
         "./result/chat_features.csv"),
    ]
    # The position in the list is the class label.
    for label, (input_dir, save_filename) in enumerate(input_and_output_tuple):
        print(input_dir)
        print(save_filename)
        truncated_num = 5
        merge_csv(input_dir, save_filename, truncated_num, label=label)
    # merge_all_pkl()


if __name__ == '__main__':
    main()