From 8bf48ba1f309012877d79cc790040a1d6701389c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=83=91=E8=B6=85?= Date: Tue, 15 Jun 2021 01:32:46 +0000 Subject: [PATCH] Feature faster bool matcher on repeated item --- inc/bool_matcher.h | 18 +- src/CMakeLists.txt | 4 +- src/entry/Maat_hierarchy.cpp | 10 +- src/entry/Maat_rule.cpp | 2 +- src/entry/bool_matcher.cpp | 370 +- test/perf_test_maatframe.cpp | 135 +- test/testdata/bool-matcher-test-exprs.txt | 77893 ++++++++++++++++++++ 7 files changed, 78193 insertions(+), 239 deletions(-) create mode 100644 test/testdata/bool-matcher-test-exprs.txt diff --git a/inc/bool_matcher.h b/inc/bool_matcher.h index 195d78c..20cde6d 100644 --- a/inc/bool_matcher.h +++ b/inc/bool_matcher.h @@ -7,7 +7,7 @@ * All rights reserved * * Written by: LIU YANBING (liuyanbing@iie.ac.cn) - * Last modification: 2018-12-31 + * Last modification: 2021-06-12 * * This code is the exclusive and proprietary property of IIE-CAS and NELIST. * Usage for direct or indirect commercial advantage is not allowed without @@ -25,27 +25,35 @@ extern "C" #endif #define MAX_ITEMS_PER_BOOL_EXPR 8 - /*not_flag=0表示布尔项item_id必须出现;not_flag=1表示布尔项item_id不能出现*/ + /* not_flag=0表示布尔项item_id必须出现;not_flag=1表示布尔项item_id不能出现 */ struct bool_item { unsigned long long item_id; unsigned char not_flag; }; - /*注意:不支持布尔项全“非”的情形*/ + /* At least one item's not_flag should be 0. 
*/ struct bool_expr { + unsigned long long expr_id; void * user_tag; size_t item_num; struct bool_item items[MAX_ITEMS_PER_BOOL_EXPR]; }; + struct bool_expr_match + { + unsigned long long expr_id; + void * user_tag; + }; + struct bool_matcher; - /*注意:本函数调用会交换bool_exprs中元素的位置*/ struct bool_matcher * bool_matcher_new(struct bool_expr * exprs, size_t expr_num, unsigned int max_thread_num, size_t * mem_size); - int bool_matcher_match(struct bool_matcher * matcher, unsigned int thread_id, const unsigned long long * item_ids, size_t item_num, void ** result, size_t size); + /* Returned results are sorted by expr_id in descending order. */ + /* 本函数将对数组item_ids进行排序,会改变item_ids中元素的顺序 */ + int bool_matcher_match(struct bool_matcher * matcher, unsigned int thread_id, unsigned long long * item_ids, size_t item_num, struct bool_expr_match * results, size_t n_result); void bool_matcher_free(struct bool_matcher * matcher); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5fd0dd2..607fc74 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 3.5) set(MAAT_FRAME_MAJOR_VERSION 3) -set(MAAT_FRAME_MINOR_VERSION 1) -set(MAAT_FRAME_PATCH_VERSION 20) +set(MAAT_FRAME_MINOR_VERSION 2) +set(MAAT_FRAME_PATCH_VERSION 1) set(MAAT_FRAME_VERSION ${MAAT_FRAME_MAJOR_VERSION}.${MAAT_FRAME_MINOR_VERSION}.${MAAT_FRAME_PATCH_VERSION}) message(STATUS "Maat Frame, Version: ${MAAT_FRAME_VERSION}") diff --git a/src/entry/Maat_hierarchy.cpp b/src/entry/Maat_hierarchy.cpp index 3387a50..ede86e4 100644 --- a/src/entry/Maat_hierarchy.cpp +++ b/src/entry/Maat_hierarchy.cpp @@ -123,7 +123,7 @@ struct Maat_hierarchy int thread_num; struct Maat_garbage_bin* ref_garbage_bin; void* logger; - void **expr_match_buff; + struct bool_expr_match *expr_match_buff; }; int compare_literal_id(const void *pa, const void *pb) @@ -344,7 +344,7 @@ struct Maat_hierarchy* Maat_hierarchy_new(int thread_num, void* mesa_handle_logg hier->hash_dedup_clause_by_literals=NULL; 
hier->clause_id_generator=0; hier->ref_garbage_bin=bin; - hier->expr_match_buff=ALLOC(void*, thread_num*MAX_SCANNER_HIT_NUM); + hier->expr_match_buff=ALLOC(struct bool_expr_match, thread_num*MAX_SCANNER_HIT_NUM); ret=igraph_empty(&hier->group_graph, 0, IGRAPH_DIRECTED); assert(ret==IGRAPH_SUCCESS); @@ -923,6 +923,7 @@ static struct bool_matcher* Maat_hierarchy_build_bool_matcher(struct Maat_hierar //some compile may have zero groups, e.g. default policy. if(j==(size_t)compile->declared_clause_num&&j>0) { + bool_expr_array[expr_cnt].expr_id=compile->compile_id; bool_expr_array[expr_cnt].user_tag=compile; bool_expr_array[expr_cnt].item_num=j; expr_cnt++; @@ -1356,7 +1357,7 @@ int Maat_hierarchy_region_compile(struct Maat_hierarchy* hier, struct Maat_hiera { int bool_match_ret=0, i=0; struct Maat_hierarchy_compile* compile=NULL; - void **expr_match=hier->expr_match_buff+mid->thread_num*MAX_SCANNER_HIT_NUM; + struct bool_expr_match *expr_match=hier->expr_match_buff+mid->thread_num*MAX_SCANNER_HIT_NUM; size_t r_in_c_cnt=0, this_scan_region_hits=mid->this_scan_region_hit_cnt; size_t ud_result_cnt=0; @@ -1371,8 +1372,9 @@ int Maat_hierarchy_region_compile(struct Maat_hierarchy* hier, struct Maat_hiera expr_match, MAX_SCANNER_HIT_NUM); for(i=0; imagic==MAAT_HIER_COMPILE_MAGIC); + assert((unsigned long long)compile->compile_id==expr_match[i].expr_id); if(compile->actual_clause_num==0) { continue; diff --git a/src/entry/Maat_rule.cpp b/src/entry/Maat_rule.cpp index aa351b2..9db757e 100644 --- a/src/entry/Maat_rule.cpp +++ b/src/entry/Maat_rule.cpp @@ -57,7 +57,7 @@ extern "C" } #endif -int MAAT_FRAME_VERSION_3_1_22_20210601=1; +int MAAT_FRAME_VERSION_3_2_1_20210613=1; int is_valid_table_name(const char* str) { diff --git a/src/entry/bool_matcher.cpp b/src/entry/bool_matcher.cpp index 01ceaa1..5ca3461 100644 --- a/src/entry/bool_matcher.cpp +++ b/src/entry/bool_matcher.cpp @@ -5,274 +5,198 @@ using namespace std; #include #include +#include -static const unsigned int 
MAX_ARRAY_SIZE=65536; - -struct thread_local_data_t +struct bool_expr_item { - unsigned int mapped_ids[MAX_ARRAY_SIZE]; - unsigned int used_cells[MAX_ARRAY_SIZE]; - void * cached_results[MAX_ARRAY_SIZE]; - unsigned char * multiexpr_bitmap; - unsigned int * singlexpr_bitmap; + size_t item_num; + struct bool_item * items; }; struct bool_matcher { - unsigned int max_thread_num; - unsigned int bool_expr_num; - unsigned int multi_expr_num; - void ** bool_expr_ids; - unsigned char * multi_expr_size; - unsigned char * multi_expr_mask; - unsigned int bool_item_id_num; - unsigned long long min_item_id; - unsigned long long max_item_id; - unsigned long long * bool_item_ids; - unsigned int * mapped_ptr; - unsigned int * mapped_ids; - unsigned int theta; - unsigned int L[65537]; - thread_local_data_t * thread_data; + unsigned int bool_expr_num; + struct bool_expr_match * bool_expr_ids; + struct bool_expr_item * bool_expr_items; + unsigned int bool_item_num; + unsigned long long * bool_items; + unsigned int * mapped_ptr; + unsigned int * mapped_ids; + unsigned int bitmap_size; + unsigned char * bitmap; }; +bool operator<(const struct bool_item & lhs, const struct bool_item & rhs) +{ + return lhs.item_idMAX_ITEMS_PER_BOOL_EXPR) - { - return NULL; - } - } - - int I=-1, J=(int)expr_num; - while(I1) I++; - if(I==J) break; - J--; - while(J>I && exprs[J].item_num==1) J--; - if(J==I) break; - swap(exprs[I], exprs[J]); - } - - for(int k=0; k<(int)expr_num; k++) - { - if((k=I && exprs[k].item_num>1)) - { - printf("[%s:%d]: fatal error!\n", __FILE__, __LINE__); - return NULL; - } - } - unsigned int mem_bytes=0; struct bool_matcher * matcher=new struct bool_matcher; mem_bytes+=sizeof(bool_matcher); - matcher->max_thread_num=max_thread_num; matcher->bool_expr_num=(unsigned int)expr_num; - matcher->multi_expr_num=I; - - matcher->bool_expr_ids=new void *[expr_num]; - mem_bytes+=(unsigned int)expr_num*sizeof(void *); - - matcher->multi_expr_size=new unsigned char[matcher->multi_expr_num+1]; - 
mem_bytes+=(matcher->multi_expr_num+1)*sizeof(unsigned char); - - matcher->multi_expr_mask=new unsigned char[matcher->multi_expr_num+1]; - mem_bytes+=(matcher->multi_expr_num+1)*sizeof(unsigned char); - - matcher->thread_data=new thread_local_data_t[max_thread_num]; - mem_bytes+=max_thread_num*sizeof(thread_local_data_t); - - for(unsigned int i=0; ithread_data[i].multiexpr_bitmap=new unsigned char[matcher->multi_expr_num+1]; - mem_bytes+=(matcher->multi_expr_num+1)*sizeof(unsigned char); - - unsigned int size=(unsigned int)(expr_num-matcher->multi_expr_num); - size=(size>>5)+1; - matcher->thread_data[i].singlexpr_bitmap=new unsigned int[size]; - mem_bytes+=size*sizeof(unsigned int); - } - - map< unsigned long long, vector > M; - unsigned int count=0; + matcher->bool_expr_ids =new struct bool_expr_match[expr_num]; + matcher->bool_expr_items=new struct bool_expr_item[expr_num]; + mem_bytes+=(unsigned int)expr_num*(sizeof(struct bool_expr_match)+sizeof(struct bool_expr_item)); for(unsigned int i=0; ibool_expr_ids[i]=exprs[i].user_tag; - if(imulti_expr_num) - { - matcher->multi_expr_size[i]=(unsigned int)exprs[i].item_num; - } - count+=(unsigned int)exprs[i].item_num; - unsigned char mask=0; - for(unsigned int j=0; jmulti_expr_num) matcher->multi_expr_mask[i]=mask; + matcher->bool_expr_ids[i].expr_id =exprs[i].expr_id; + matcher->bool_expr_ids[i].user_tag =exprs[i].user_tag; + matcher->bool_expr_items[i].item_num=exprs[i].item_num; + matcher->bool_expr_items[i].items=new struct bool_item[exprs[i].item_num]; + mem_bytes+=(unsigned int)exprs[i].item_num*sizeof(struct bool_item); + copy(exprs[i].items, exprs[i].items+exprs[i].item_num, matcher->bool_expr_items[i].items); + sort(matcher->bool_expr_items[i].items, matcher->bool_expr_items[i].items+exprs[i].item_num); } - matcher->bool_item_id_num=(unsigned int)M.size(); - matcher->bool_item_ids=new unsigned long long[M.size()]; - matcher->mapped_ptr =new unsigned int[M.size()+1]; - matcher->mapped_ids =new unsigned 
int[count]; - mem_bytes+=((unsigned int)M.size()+1+count)*sizeof(unsigned int)+(unsigned int)M.size()*sizeof(unsigned long long); + map M1; + for(unsigned int i=0; i > M2; + for(unsigned int i=0; ibool_item_num=(unsigned int)M2.size(); + matcher->bool_items =new unsigned long long[M2.size()]; + matcher->mapped_ptr =new unsigned int[M2.size()+1]; + matcher->mapped_ids =new unsigned int[matcher->bool_expr_num]; + mem_bytes+=((unsigned int)M2.size()+1+matcher->bool_expr_num)*sizeof(unsigned int)+(unsigned int)M2.size()*sizeof(unsigned long long); matcher->mapped_ptr[0]=0; - map< unsigned long long, vector >::const_iterator it=M.begin(); - for(unsigned int k=0; k >::const_iterator it=M2.begin(); + for(unsigned int k=0; kbool_item_ids[k]=it->first; + matcher->bool_items[k]=it->first; copy(it->second.begin(), it->second.end(), matcher->mapped_ids+matcher->mapped_ptr[k]); matcher->mapped_ptr[k+1]=matcher->mapped_ptr[k]+(unsigned int)it->second.size(); } - matcher->min_item_id=matcher->bool_item_ids[0]; - matcher->max_item_id=matcher->bool_item_ids[M.size()-1]; - for(unsigned int k=0; kbitmap_size=(1U<<27); + matcher->bitmap=new unsigned char[(matcher->bitmap_size)>>3]; + mem_bytes+=(matcher->bitmap_size)>>3; + memset(matcher->bitmap, 0, (matcher->bitmap_size)>>3); + + for(unsigned int i=0; ibool_item_num; i++) { - matcher->bool_item_ids[k]-=matcher->min_item_id; + unsigned int j=matcher->bool_items[i]&(matcher->bitmap_size-1); + matcher->bitmap[j>>3]|=(1U<<(j&7)); } - const unsigned long long ONE=1; - unsigned int theta=0; - while((ONE<<(theta+16))<=matcher->bool_item_ids[M.size()-1]) theta++; - matcher->theta=theta; - - matcher->L[0]=0; - for(unsigned int i=1; i<65536; i++) - { - matcher->L[i]=(unsigned int)(lower_bound(matcher->bool_item_ids, matcher->bool_item_ids+M.size(), i*(ONE<bool_item_ids); - } - matcher->L[65536]=(unsigned int)M.size(); - - M.clear(); - - *mem_size=mem_bytes; + if(mem_size!=NULL) *mem_size=mem_bytes; return matcher; } -int 
bool_matcher_match(struct bool_matcher * matcher, unsigned int thread_id, const unsigned long long * item_ids, size_t item_num, void ** result, size_t size) +int res_comp(const void * lhs, const void * rhs) +{ + bool_expr_match * _lhs=(bool_expr_match *)lhs; + bool_expr_match * _rhs=(bool_expr_match *)rhs; + return (_lhs->expr_id<_rhs->expr_id) ? 1 : -1; +} + +int do_match(struct bool_expr_item * expr, unsigned long long * item_ids, size_t item_num) +{ + unsigned int i=0; + for(unsigned int j=0; jitem_num; ++j) + { + if(expr->items[j].not_flag==0) + { + while(iitems[j].item_id) ++i; + if(i==item_num || item_ids[i]>expr->items[j].item_id) return 0; + ++i; + } + else + { + while(iitems[j].item_id) ++i; + if(iitems[j].item_id) return 0; + } + } + + return 1; +} + +int bool_matcher_match(struct bool_matcher * matcher, unsigned int thread_id, unsigned long long * item_ids, size_t item_num, struct bool_expr_match * results, size_t n_result) { if(matcher==NULL) return -1; - if(thread_id>=matcher->max_thread_num) return -1; + if(item_num==0) return 0; + + sort(item_ids, item_ids+item_num); + size_t J=0; + for(unsigned int i=1; ithread_data[thread_id].mapped_ids; - unsigned int ids_num=0; for(unsigned int i=0; imin_item_id || item_ids[i]>matcher->max_item_id) continue; + unsigned int t=item_ids[i]&(matcher->bitmap_size-1); + if((matcher->bitmap[t>>3]&(1U<<(t&7)))==0) continue; - unsigned long long id=item_ids[i]-matcher->min_item_id; - unsigned int k=(unsigned int)(id>>matcher->theta); - - int l=matcher->L[k], h=(int)matcher->L[k+1]-1; - if(hbool_item_num-1; while(l<=h) { int m=(l+h)/2; - if(idbool_item_ids[m]) h=m-1; - else l=m+1; - } - if(h<(int)matcher->L[k] || matcher->bool_item_ids[h]!=id) continue; - - for(unsigned int j=matcher->mapped_ptr[h]; jmapped_ptr[h+1]; j++) - { - if(ids_num==MAX_ARRAY_SIZE) return -1; - mapped_ids[ids_num++]=matcher->mapped_ids[j]; - } - } - - unsigned int * used_cells=matcher->thread_data[thread_id].used_cells; - unsigned int used_num=0; - 
for(unsigned int i=0; i>3); - } - - unsigned char * m_bitmap=matcher->thread_data[thread_id].multiexpr_bitmap; - unsigned int * s_bitmap=matcher->thread_data[thread_id].singlexpr_bitmap; - unsigned char * m_mask=matcher->multi_expr_mask; - for(unsigned int i=0; imulti_expr_num) - { - m_bitmap[used_cells[i]]=m_mask[used_cells[i]]; - } - else - { - unsigned int j=used_cells[i]-matcher->multi_expr_num; - s_bitmap[j>>5]&=~(1U<<(j&31)); - } - } - - for(unsigned int i=0; i>3); - if(xmulti_expr_num) - { - unsigned int y=(mapped_ids[i]&7); - if(m_mask[x]&(1U<bool_items[m]) { - m_bitmap[x]&=~(1U<mapped_ptr[m]; jmapped_ptr[m+1]; j++) + { + unsigned int idx=matcher->mapped_ids[j]; + int ret=do_match(matcher->bool_expr_items+idx, item_ids, item_num); + if(ret==1) + { + if(r==n_result) goto END; + results[r++]=matcher->bool_expr_ids[idx]; + } + } + break; + } + else if(item_ids[i]bool_items[m]) + { + h=m-1; } else { - m_bitmap[x]|=(1U<multi_expr_num; - s_bitmap[j>>5]|=(1U<<(j&31)); - } - } - - unsigned int r=0; - void ** cached_results=matcher->thread_data[thread_id].cached_results; - - for(unsigned int i=0; imulti_expr_num) - { - if(m_bitmap[x]==(1U<multi_expr_size[x])-1) - { - if(rbool_expr_ids[x]; - } - } - else - { - unsigned int j=used_cells[i]-matcher->multi_expr_num; - if((s_bitmap[j>>5]&(1U<<(j&31)))!=0) - { - if(rbool_expr_ids[x]; + l=m+1; } } } - sort(cached_results, cached_results+r); - - int I=0; - for(unsigned int J=0; Jbool_expr_ids; - delete [] matcher->multi_expr_size; - delete [] matcher->multi_expr_mask; - delete [] matcher->bool_item_ids; + for(unsigned int i=0; ibool_expr_num; i++) delete [] matcher->bool_expr_items[i].items; + delete [] matcher->bool_expr_items; + + delete [] matcher->bool_items; delete [] matcher->mapped_ptr; delete [] matcher->mapped_ids; - for(unsigned int i=0; imax_thread_num; i++) - { - delete [] matcher->thread_data[i].multiexpr_bitmap; - delete [] matcher->thread_data[i].singlexpr_bitmap; - } - delete [] matcher->thread_data; + 
delete [] matcher->bitmap; delete matcher; return; } diff --git a/test/perf_test_maatframe.cpp b/test/perf_test_maatframe.cpp index fe7395c..8789215 100644 --- a/test/perf_test_maatframe.cpp +++ b/test/perf_test_maatframe.cpp @@ -1,10 +1,141 @@ #include "Maat_rule.h" +#include "bool_matcher.h" #include "stream_fuzzy_hash.h" #include "Maat_command.h" #include #include #include #include +#include +struct bool_expr_wrapper +{ + struct bool_expr expr; + TAILQ_ENTRY(bool_expr_wrapper) entries; +}; +TAILQ_HEAD(bool_expr_q, bool_expr_wrapper); + +TEST(BoolMatcher, Match) +{ + struct bool_matcher * bm=NULL; + struct bool_expr *expr_array=NULL; + struct bool_expr_wrapper *p=NULL; + bool_expr_q expr_queue; + unsigned long long i=0; + TAILQ_INIT(&expr_queue); + const char* bool_expr_filename="./testdata/bool-matcher-test-exprs.txt"; + char line[512]={0}; + int ret=0, expr_num=0; + FILE* fp=fopen(bool_expr_filename, "r"); + memset(line, 0, sizeof(line)); + while(NULL!=fgets(line,sizeof(line),fp)) + { + if(line[0]=='#'||line[0]==' '||line[0]=='\t'||strlen(line)<4) + { + continue; + } + p=(struct bool_expr_wrapper*)calloc(sizeof(struct bool_expr_wrapper), 1); + ret=sscanf(line, "%lld %lld %lld %lld %lld %lld %lld %lld %lld", + &p->expr.expr_id, + &p->expr.items[0].item_id, + &p->expr.items[1].item_id, + &p->expr.items[2].item_id, + &p->expr.items[3].item_id, + &p->expr.items[4].item_id, + &p->expr.items[5].item_id, + &p->expr.items[6].item_id, + &p->expr.items[7].item_id); + if(ret<2) + { + free(p); + continue; + } + p->expr.item_num=ret-1; + p->expr.user_tag=NULL; + TAILQ_INSERT_TAIL(&expr_queue, p, entries); + expr_num++; + memset(line, 0, sizeof(line)); + } + fclose(fp); + expr_array=(struct bool_expr*)malloc(sizeof(struct bool_expr)*expr_num); + p=TAILQ_FIRST(&expr_queue); + while(p != NULL) + { + TAILQ_REMOVE(&expr_queue, p, entries); + memcpy(expr_array+i, &(p->expr), sizeof(p->expr)); + free(p); + p = TAILQ_FIRST(&expr_queue); + i++; + } + size_t mem_size=0; + 
bm=bool_matcher_new(expr_array, expr_num, 4, &mem_size); + unsigned long long test_count=2*1000*1000, match_count=0, unmatch_count=0; + long int j=0; + size_t k=0; + unsigned long long input_item_ids[256], time_elapse_ms=0, scan_per_second=0; + size_t input_item_num=0; + struct bool_expr_match result_array[1024]; + srand(19); + struct timespec start,end; + clock_gettime(CLOCK_MONOTONIC, &start); + + for(i=0; i0) + { + match_count++; + } + } + EXPECT_EQ(match_count, test_count); + input_item_ids[0]=123; + input_item_ids[1]=124; + input_item_ids[2]=125; + input_item_ids[3]=7; + input_item_ids[4]=3; + input_item_ids[5]=128; + input_item_ids[6]=129; + input_item_ids[7]=130; + input_item_ids[8]=131; + input_item_ids[9]=132; + input_item_ids[10]=133; + input_item_ids[11]=777; + input_item_ids[12]=999; + input_item_ids[13]=788; + input_item_ids[14]=222; + input_item_ids[15]=333; + input_item_num=8; + for(i=0; i