diff --git a/inc/Maat_rule.h b/inc/Maat_rule.h index a6e7be4..f7399c1 100644 --- a/inc/Maat_rule.h +++ b/inc/Maat_rule.h @@ -24,7 +24,13 @@ enum MAAT_CHARSET CHARSET_BIG5, CHARSET_UNICODE, CHARSET_UTF8, // 4 - CHARSET_BIN //5 + CHARSET_BIN, //5 + CHARSET_UNICODE_ASCII_ESC, // Unicode Escape format, prefix backslash-u hex, e.g. "\u627;" + CHARSET_UNICODE_ASCII_ALIGNED,//Unicode Escape format, prefix backslash-u with 4 bytes aligned, e.g. "\u0627" + CHARSET_UNICODE_NCR_DEC, //SGML Numeric character reference,decimal base, e.g. "ا" + CHARSET_UNICODE_NCR_HEX, //SGML Numeric character reference,hexdecimal base, e.g. "ا" + CHARSET_URL_ENCODE_GB2312, //URL encode with GB2312, e.g. the chinese word "china" was encoded to %D6%D0%B9%FA + CHARSET_URL_ENCODE_UTF8 //11, URL encode with UTF8,e.g. the chinese word "china" was encoded to %E4%B8%AD%E5%9B%BD }; enum MAAT_ACTION { diff --git a/src/entry/Maat_rule.cpp b/src/entry/Maat_rule.cpp index d24b010..7d0b605 100644 --- a/src/entry/Maat_rule.cpp +++ b/src/entry/Maat_rule.cpp @@ -29,7 +29,8 @@ int MAAT_FRAME_VERSION_1_5_20160311=1; const char *maat_module="MAAT Frame"; -const char* CHARSET_STRING[]={"CHARSET_NONE","GBK","BIG5","UNICODE","UTF-8"}; +const char* CHARSET_STRING[]={"NONE","gbk","big5","unicode","utf8","bin", + "unicode_ascii_esc","unicode_ascii_aligned","unicode_ncr_dec","unicode_ncr_hex","url_encode_gb2312","url_encode_utf8",""}; int converHextoint(char srctmp) { if(isdigit(srctmp)) @@ -79,6 +80,7 @@ iconv_t maat_iconv_open(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET to,enu cd=scanner->iconv_handle[to][from]; return cd; } + int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MAAT_CHARSET to,char *src,int srclen,char *dst,int *dstlen) { size_t ret; @@ -104,7 +106,8 @@ int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MA if(ret!=(size_t)(-1)) { - if(to==CHARSET_UNICODE)//jump unicode 2 bytes head 0xFF 0xFE + if(to==CHARSET_UNICODE&& + (*(unsigned short*)pOutBuff==0xFFFE||*(unsigned short*)pOutBuff==0XFEFF))//jump unicode 2 bytes BOM, 0xFF 0xFE { copy_len=iOutBuffLen-iLeftLen-2; copy_buf=pOutBuff+2; @@ -133,7 +136,137 @@ int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MA } } +int URLEncode(const char* str, const int strSize, char* result, const int resultSize) +{ + int i; + int j = 0;//for result index + char ch; + if ((str==NULL) || (result==NULL) || (strSize<=0) || (resultSize<=0)) + { + return -1; + } + + for ( i=0; (i='A') && (ch<'Z')) || + ((ch>='a') && (ch<'z')) || + ((ch>='0') && (ch<'9'))) + { + result[j++] = ch; + } + else if (ch == ' ') + { + result[j++] = '+'; + } + else if (ch == '.' || ch == '-' || ch == '_' || ch == '*') + { + result[j++] = ch; + } + else + { + if (j+3 < resultSize) + { + sprintf(result+j, "%%%02X", (unsigned char)ch); + j += 3; + } + else + { + return -1; + } + } + } + + result[j] = '\0'; + return j; +} +int uni2ascii(const char* fmt,const char* src, const int srclen, char* dst, const int dstsize) +{ + int i=0,j=0; + assert(srclen%2==0);//unicode must be 2 bytes aligned. + while(i0) + { + map_register(string2int_map,CHARSET_STRING[i], i); + } + else + { + break; + } + } + +/* map_register(string2int_map,"gbk", CHARSET_GBK); map_register(string2int_map,"big5", CHARSET_BIG5); map_register(string2int_map,"unicode", CHARSET_UNICODE); map_register(string2int_map,"utf8", CHARSET_UTF8); + map_register(string2int_map,"unicode_hex", CHARSET_UNICODE_ASCII_ESC); + map_register(string2int_map,"unicode_hex", CHARSET_UNICODE_ASCII_ESC); +*/ map_register(string2int_map,"yes", 1); map_register(string2int_map,"no", 0); @@ -277,6 +425,7 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char* MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module, "Maat read table info %s error.\n",table_info_path); } + i=0; while(NULL!=fgets(line,sizeof(line),fp)) { i++; @@ -301,9 +450,9 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char* { if(ret[j]<0) { - fprintf(stderr,"Maat read table info %s line %d error.\n",table_info_path,i); + fprintf(stderr,"Maat read table info %s line %d error:unknown column.\n",table_info_path,i); MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module, - "Maat read table info %s line %d error.\n",table_info_path,i); + "Maat read table info %s line %d error:unknown column.\n",table_info_path,i); goto error_jump; } } @@ -324,9 +473,9 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char* } else { - fprintf(stderr,"Maat read table info %s line %d error.\n",table_info_path,i); + fprintf(stderr,"Maat read table info %s line %d error:unknown dest charset %s.\n",table_info_path,i,sub_token); MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module, - "Maat read table info %s line %d error.\n",table_info_path,i); + "Maat read table info %s line %d error: unknown dest charset %s.\n",table_info_path,i,sub_token); goto error_jump; } @@ -1261,12 +1410,12 @@ int add_expr_rule(struct _Maat_table_info_t* table,struct db_str_rule_t* db_rule { continue; } - region_str_len=strlen(sub_key_array[k])*2+1; + region_str_len=strlen(sub_key_array[k])*8+1; // 1 byte map to 8 bytes maximum, e.g. "ا" or "\u63221;" region_string=(char*)calloc(sizeof(char),region_str_len); if(table->src_charset!=dst_charset)//need convert { - ret=iconv_convert(scanner,table->src_charset, dst_charset, + ret=universal_charset_convert(scanner,table->src_charset, dst_charset, sub_key_array[k],strlen(sub_key_array[k]), region_string, ®ion_str_len); if(ret<0) @@ -1279,7 +1428,7 @@ int add_expr_rule(struct _Maat_table_info_t* table,struct db_str_rule_t* db_rule free(region_string); continue; } - //if convert take no effect + //if convert take no effect and src charset is one of the dst. if(region_str_len==(int)strlen(sub_key_array[k])&& 0==memcmp(sub_key_array[k],region_string,region_str_len)&& TRUE==table->src_charset_in_dst) diff --git a/src/entry/Maat_rule_internal.h b/src/entry/Maat_rule_internal.h index d79ec31..f7f9ef0 100644 --- a/src/entry/Maat_rule_internal.h +++ b/src/entry/Maat_rule_internal.h @@ -36,7 +36,7 @@ typedef int atomic_t; #define FALSE 0 #define MAX_TABLE_NUM 256 -#define MAX_CHARSET_NUM 6 +#define MAX_CHARSET_NUM 16 #define MAX_TABLE_NAME_LEN 256 #define MAX_TABLE_LINE_SIZE (1024*4) #define MAX_EXPR_KEYLEN 1024 diff --git a/src/entry/great_index_engine.c b/src/entry/great_index_engine.c index 683085f..c907384 100644 --- a/src/entry/great_index_engine.c +++ b/src/entry/great_index_engine.c @@ -97,7 +97,7 @@ GIE_handle_t * GIE_create(const GIE_create_para_t * para) idtable_args.hash_slot_size = HTABLE_SIZE; idtable_args.max_elem_num = 4 * HTABLE_SIZE; idtable_args.expire_time = 0; - idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU; + idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO; idtable_args.key_comp = NULL; idtable_args.key2index = NULL; idtable_args.data_free = idtable_free; @@ -108,7 +108,7 @@ GIE_handle_t * GIE_create(const GIE_create_para_t * para) indextable_args.hash_slot_size = HTABLE_SIZE; indextable_args.max_elem_num = 4 * HTABLE_SIZE; indextable_args.expire_time = 0; - indextable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU; + indextable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO; indextable_args.key_comp = NULL; indextable_args.key2index = NULL; indextable_args.data_free = indextable_free; diff --git a/src/entry/json2iris.cpp b/src/entry/json2iris.cpp index 397aa85..d515995 100644 --- a/src/entry/json2iris.cpp +++ b/src/entry/json2iris.cpp @@ -89,7 +89,7 @@ int set_iris_descriptor(const char* json_file,cJSON *json,struct iris_descriptio hargs.thread_safe=1; hargs.hash_slot_size = 1024; hargs.max_elem_num = 0; - hargs.eliminate_type = HASH_ELIMINATE_ALGO_LRU; + hargs.eliminate_type = HASH_ELIMINATE_ALGO_FIFO; hargs.expire_time = 0; hargs.key_comp = NULL; hargs.key2index = NULL; diff --git a/src/entry/map_str2int.cpp b/src/entry/map_str2int.cpp index 66ec288..2e8752e 100644 --- a/src/entry/map_str2int.cpp +++ b/src/entry/map_str2int.cpp @@ -25,7 +25,7 @@ MESA_htable_handle map_create(void) hargs.thread_safe=8; hargs.hash_slot_size = 4*1024; hargs.max_elem_num = 0; - hargs.eliminate_type = HASH_ELIMINATE_ALGO_LRU; + hargs.eliminate_type = HASH_ELIMINATE_ALGO_FIFO; hargs.expire_time = 0; hargs.key_comp = NULL; hargs.key2index = NULL; diff --git a/test/maat_json.json b/test/maat_json.json index 4ee7c21..9895d59 100644 --- a/test/maat_json.json +++ b/test/maat_json.json @@ -216,6 +216,60 @@ ] } ] + }, + { + "compile_id": 129, + "service": 1, + "action": 1, + "do_blacklist": 1, + "do_log": 1, + "effective_rage": 0, + "user_region": "utf8_中文", + "is_valid": "yes", + "groups": [ + { + "group_name": "group_9", + "regions": [ + { + "table_name": "HTTP_URL", + "table_type": "none", + "table_content": { + "keywords": "C#中国", + "expr_type": "and", + "match_method": "sub", + "format": "uncase plain" + } + } + ] + } + ] + }, + { + "compile_id": 130, + "service": 1, + "action": 1, + "do_blacklist": 1, + "do_log": 1, + "effective_rage": 0, + "user_region": "utf8_维语", + "is_valid": "yes", + "groups": [ + { + "group_name": "group_10", + "regions": [ + { + "table_name": "KEYWORDS_TABLE", + "table_type": "none", + "table_content": { + "keywords": "2010–يىلىدىكى", + "expr_type": "and", + "match_method": "sub", + "format": "uncase plain" + } + } + ] + } + ] } ], "plugin_table": [ diff --git a/test/table_info.conf b/test/table_info.conf index 0facdd5..a4cb5b9 100644 --- a/test/table_info.conf +++ b/test/table_info.conf @@ -8,8 +8,8 @@ #id name type src_charset dst_charset do_merge 0 COMPILE compile GBK GBK no 0 1 GROUP group GBK GBK no 0 -2 HTTP_URL expr GBK GBK/BIG5/UNICODE/UTF8 yes 128 -3 KEYWORDS_TABLE expr GBK GBK/BIG5/UNICODE/UTF8 yes 0 +2 HTTP_URL expr UTF8 GBK/BIG5/UNICODE/UTF8/url_encode_gb2312/url_encode_utf8 yes 128 +3 KEYWORDS_TABLE expr UTF8 GBK/BIG5/UNICODE/UTF8/unicode_ascii_esc/unicode_ascii_aligned/unicode_ncr_dec/unicode_ncr_hex yes 0 4 IP_CONFIG ip GBK GBK no 0 5 CONTENT_SIZE intval GBK GBK no 0 6 QD_ENTRY_INFO plugin GBK GBK no 0