升级关键字配置转码功能,增加"unicode_ascii_esc","unicode_ascii_aligned","unicode_ncr_dec","unicode_ncr_hex"共4种unicode网页编码,增加"url_encode_gb2312","url_encode_utf8"共2种URL编码。

This commit is contained in:
zhengchao
2016-04-03 12:29:41 +08:00
parent 422899cc81
commit f3f43fd499
8 changed files with 228 additions and 19 deletions

View File

@@ -24,7 +24,13 @@ enum MAAT_CHARSET
CHARSET_BIG5, CHARSET_BIG5,
CHARSET_UNICODE, CHARSET_UNICODE,
CHARSET_UTF8, // 4 CHARSET_UTF8, // 4
CHARSET_BIN //5 CHARSET_BIN, //5
CHARSET_UNICODE_ASCII_ESC, // Unicode Escape format, prefix backslash-u hex, e.g. "\u627;"
CHARSET_UNICODE_ASCII_ALIGNED,//Unicode Escape format, prefix backslash-u with 4 bytes aligned, e.g. "\u0627"
CHARSET_UNICODE_NCR_DEC, //SGML Numeric character reference,decimal base, e.g. "&#1575;"
CHARSET_UNICODE_NCR_HEX, //SGML Numeric character reference,hexadecimal base, e.g. "&#x627;"
CHARSET_URL_ENCODE_GB2312, //URL encode with GB2312, e.g. the chinese word "china" was encoded to %D6%D0%B9%FA
CHARSET_URL_ENCODE_UTF8 //11, URL encode with UTF8,e.g. the chinese word "china" was encoded to %E4%B8%AD%E5%9B%BD
}; };
enum MAAT_ACTION enum MAAT_ACTION
{ {

View File

@@ -29,7 +29,8 @@
int MAAT_FRAME_VERSION_1_5_20160311=1; int MAAT_FRAME_VERSION_1_5_20160311=1;
const char *maat_module="MAAT Frame"; const char *maat_module="MAAT Frame";
const char* CHARSET_STRING[]={"CHARSET_NONE","GBK","BIG5","UNICODE","UTF-8"}; const char* CHARSET_STRING[]={"NONE","gbk","big5","unicode","utf8","bin",
"unicode_ascii_esc","unicode_ascii_aligned","unicode_ncr_dec","unicode_ncr_hex","url_encode_gb2312","url_encode_utf8",""};
int converHextoint(char srctmp) int converHextoint(char srctmp)
{ {
if(isdigit(srctmp)) if(isdigit(srctmp))
@@ -79,6 +80,7 @@ iconv_t maat_iconv_open(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET to,enu
cd=scanner->iconv_handle[to][from]; cd=scanner->iconv_handle[to][from];
return cd; return cd;
} }
int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MAAT_CHARSET to,char *src,int srclen,char *dst,int *dstlen) int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MAAT_CHARSET to,char *src,int srclen,char *dst,int *dstlen)
{ {
size_t ret; size_t ret;
@@ -104,7 +106,8 @@ int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MA
if(ret!=(size_t)(-1)) if(ret!=(size_t)(-1))
{ {
if(to==CHARSET_UNICODE)//jump unicode 2 bytes head 0xFF 0xFE if(to==CHARSET_UNICODE&&
(*(unsigned short*)pOutBuff==0xFFFE||*(unsigned short*)pOutBuff==0XFEFF))//jump unicode 2 bytes BOM, 0xFF 0xFE
{ {
copy_len=iOutBuffLen-iLeftLen-2; copy_len=iOutBuffLen-iLeftLen-2;
copy_buf=pOutBuff+2; copy_buf=pOutBuff+2;
@@ -133,7 +136,137 @@ int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MA
} }
} }
/*
 * URL-encode the first strSize bytes of str into result.
 *
 * ASCII letters, digits and the characters '.', '-', '_', '*' are copied
 * unchanged, a space becomes '+', and every other byte is written as "%XX"
 * with upper-case hexadecimal digits.
 *
 * str        : input bytes (need not be NUL terminated).
 * strSize    : number of input bytes, must be > 0.
 * result     : output buffer, always NUL terminated on success.
 * resultSize : capacity of result in bytes, including room for the NUL.
 *
 * Returns the number of bytes written (excluding the terminating NUL), or -1
 * when an argument is invalid or the encoded text does not fit in result.
 */
int URLEncode(const char* str, const int strSize, char* result, const int resultSize)
{
	static const char hex_digits[] = "0123456789ABCDEF";
	int i;
	int j = 0;//for result index
	unsigned char ch;
	if ((str==NULL) || (result==NULL) || (strSize<=0) || (resultSize<=0))
	{
		return -1;
	}
	for (i=0; i<strSize; ++i)
	{
		ch = (unsigned char)str[i];
		// Range checks are inclusive: 'Z', 'z' and '9' are unreserved too.
		if (((ch>='A') && (ch<='Z')) ||
			((ch>='a') && (ch<='z')) ||
			((ch>='0') && (ch<='9')) ||
			ch == '.' || ch == '-' || ch == '_' || ch == '*')
		{
			// Need one byte for the character plus one for the final NUL.
			if (resultSize-j <= 1)
			{
				return -1;
			}
			result[j++] = (char)ch;
		}
		else if (ch == ' ')
		{
			if (resultSize-j <= 1)
			{
				return -1;
			}
			result[j++] = '+';
		}
		else
		{
			// Need three bytes for "%XX" plus one for the final NUL.
			if (resultSize-j <= 3)
			{
				return -1;
			}
			result[j++] = '%';
			result[j++] = hex_digits[ch>>4];
			result[j++] = hex_digits[ch&0x0F];
		}
	}
	// j < resultSize is guaranteed by the space checks above.
	result[j] = '\0';
	return j;
}
/*
 * Render a buffer of 2-byte unicode code units as ASCII text.
 *
 * Each code unit is read from src in host byte order. Units below 0x7f are
 * copied to dst as a single byte; every other unit is formatted with fmt,
 * which must contain exactly one unsigned integer conversion
 * (e.g. "\\u%04x" or "&#%u;").
 *
 * fmt     : printf-style format applied to each non-ASCII code unit.
 * src     : input code units, srclen bytes long.
 * srclen  : input length in bytes, must be even.
 * dst     : output buffer; NOT guaranteed to be NUL terminated.
 * dstsize : capacity of dst in bytes.
 *
 * Returns the number of bytes written to dst, or -1 when the output does not
 * fit or formatting fails. A formatted escape needs one spare byte beyond its
 * own length because snprintf always writes a terminating NUL.
 */
int uni2ascii(const char* fmt,const char* src, const int srclen, char* dst, const int dstsize)
{
	int i=0,j=0;
	int written=0;
	unsigned short code_unit=0;
	assert(srclen%2==0);//unicode must be 2 bytes aligned.
	for(i=0;i+1<srclen;i+=2)
	{
		if(j>=dstsize)
		{
			return -1;//input remains but the output buffer is full.
		}
		//memcpy avoids a misaligned / aliasing-unsafe read through a cast pointer.
		memcpy(&code_unit,src+i,sizeof(code_unit));
		if(code_unit<0x7f)
		{
			dst[j]=(char)code_unit;
			j++;
			continue;
		}
		written=snprintf(dst+j,(size_t)(dstsize-j),fmt,(unsigned int)code_unit);
		//snprintf returns the length it WOULD have written; reject truncation
		//so the reported length never exceeds what is really in dst.
		if(written<0||written>=dstsize-j)
		{
			return -1;
		}
		j+=written;
	}
	return j;
}
int universal_charset_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MAAT_CHARSET to,char *src,int srclen,char *dst,int *dstlen)
{
int ret=0;
char* tmp_buff=NULL;
int tmp_buff_size=0;
MAAT_CHARSET tmp_dst_code=CHARSET_NONE;
const char* fmt=NULL;
switch(to)
{
case CHARSET_GBK:
case CHARSET_BIG5:
case CHARSET_UNICODE:
case CHARSET_UTF8:
ret=iconv_convert(scanner,from,to,src,srclen,dst,dstlen);
return ret;
break;
case CHARSET_UNICODE_ASCII_ESC:
tmp_dst_code=CHARSET_UNICODE;
fmt="\\u%x;";
break;
case CHARSET_UNICODE_ASCII_ALIGNED:
tmp_dst_code=CHARSET_UNICODE;
fmt="\\u%04x";
break;
case CHARSET_UNICODE_NCR_DEC:
tmp_dst_code=CHARSET_UNICODE;
fmt="&#%u;";
break;
case CHARSET_UNICODE_NCR_HEX:
tmp_dst_code=CHARSET_UNICODE;
fmt="&#x%x;";
break;
case CHARSET_URL_ENCODE_GB2312:
tmp_dst_code=CHARSET_GBK;
fmt=NULL;
break;
case CHARSET_URL_ENCODE_UTF8:
tmp_dst_code=CHARSET_UTF8;
fmt=NULL;
break;
default:
return -1;
break;
}
tmp_buff_size=*dstlen;
tmp_buff=(char*)malloc(tmp_buff_size);
ret=iconv_convert(scanner,from,tmp_dst_code,src,srclen,tmp_buff,&tmp_buff_size);
if(ret<0)
{
goto error_out;
}
if(fmt!=NULL)
{
ret=uni2ascii(fmt, tmp_buff, tmp_buff_size, dst,*dstlen);
}
else
{
ret=URLEncode(tmp_buff,tmp_buff_size,dst,*dstlen);
}
*dstlen=ret;
error_out:
free(tmp_buff);
tmp_buff=NULL;
return ret;
}
char* strlwr(char* string) char* strlwr(char* string)
{ {
int i=0; int i=0;
@@ -249,7 +382,7 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
FILE*fp=NULL; FILE*fp=NULL;
char line[MAX_TABLE_LINE_SIZE]; char line[MAX_TABLE_LINE_SIZE];
int i=0,j=0,ret[4]={0},table_cnt=0; int i=0,j=0,ret[4]={0},table_cnt=0;
char table_type[16],src_charset[16],dst_charset[64],merge[4]; char table_type[16],src_charset[256],dst_charset[256],merge[4];
MESA_htable_handle string2int_map=map_create(); MESA_htable_handle string2int_map=map_create();
char *token=NULL,*sub_token=NULL,*saveptr; char *token=NULL,*sub_token=NULL,*saveptr;
struct _Maat_table_info_t*p=NULL; struct _Maat_table_info_t*p=NULL;
@@ -262,11 +395,26 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
map_register(string2int_map,"digest", TABLE_TYPE_DIGEST); map_register(string2int_map,"digest", TABLE_TYPE_DIGEST);
map_register(string2int_map,"expr_plus", TABLE_TYPE_EXPR_PLUS); map_register(string2int_map,"expr_plus", TABLE_TYPE_EXPR_PLUS);
map_register(string2int_map,"group", TABLE_TYPE_GROUP); map_register(string2int_map,"group", TABLE_TYPE_GROUP);
map_register(string2int_map,"bin", CHARSET_NONE); for(i=0;i<MAX_CHARSET_NUM;i++)
{
if(strlen(CHARSET_STRING[i])>0)
{
map_register(string2int_map,CHARSET_STRING[i], i);
}
else
{
break;
}
}
/*
map_register(string2int_map,"gbk", CHARSET_GBK); map_register(string2int_map,"gbk", CHARSET_GBK);
map_register(string2int_map,"big5", CHARSET_BIG5); map_register(string2int_map,"big5", CHARSET_BIG5);
map_register(string2int_map,"unicode", CHARSET_UNICODE); map_register(string2int_map,"unicode", CHARSET_UNICODE);
map_register(string2int_map,"utf8", CHARSET_UTF8); map_register(string2int_map,"utf8", CHARSET_UTF8);
map_register(string2int_map,"unicode_hex", CHARSET_UNICODE_ASCII_ESC);
map_register(string2int_map,"unicode_hex", CHARSET_UNICODE_ASCII_ESC);
*/
map_register(string2int_map,"yes", 1); map_register(string2int_map,"yes", 1);
map_register(string2int_map,"no", 0); map_register(string2int_map,"no", 0);
@@ -277,6 +425,7 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module, MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module,
"Maat read table info %s error.\n",table_info_path); "Maat read table info %s error.\n",table_info_path);
} }
i=0;
while(NULL!=fgets(line,sizeof(line),fp)) while(NULL!=fgets(line,sizeof(line),fp))
{ {
i++; i++;
@@ -301,9 +450,9 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
{ {
if(ret[j]<0) if(ret[j]<0)
{ {
fprintf(stderr,"Maat read table info %s line %d error.\n",table_info_path,i); fprintf(stderr,"Maat read table info %s line %d error:unknown column.\n",table_info_path,i);
MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module, MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module,
"Maat read table info %s line %d error.\n",table_info_path,i); "Maat read table info %s line %d error:unknown column.\n",table_info_path,i);
goto error_jump; goto error_jump;
} }
} }
@@ -324,9 +473,9 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
} }
else else
{ {
fprintf(stderr,"Maat read table info %s line %d error.\n",table_info_path,i); fprintf(stderr,"Maat read table info %s line %d error:unknown dest charset %s.\n",table_info_path,i,sub_token);
MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module, MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module,
"Maat read table info %s line %d error.\n",table_info_path,i); "Maat read table info %s line %d error: unknown dest charset %s.\n",table_info_path,i,sub_token);
goto error_jump; goto error_jump;
} }
@@ -1261,12 +1410,12 @@ int add_expr_rule(struct _Maat_table_info_t* table,struct db_str_rule_t* db_rule
{ {
continue; continue;
} }
region_str_len=strlen(sub_key_array[k])*2+1; region_str_len=strlen(sub_key_array[k])*8+1; // 1 byte map to 8 bytes maximum, e.g. "&#x0627;" or "\u63221;"
region_string=(char*)calloc(sizeof(char),region_str_len); region_string=(char*)calloc(sizeof(char),region_str_len);
if(table->src_charset!=dst_charset)//need convert if(table->src_charset!=dst_charset)//need convert
{ {
ret=iconv_convert(scanner,table->src_charset, dst_charset, ret=universal_charset_convert(scanner,table->src_charset, dst_charset,
sub_key_array[k],strlen(sub_key_array[k]), sub_key_array[k],strlen(sub_key_array[k]),
region_string, &region_str_len); region_string, &region_str_len);
if(ret<0) if(ret<0)
@@ -1279,7 +1428,7 @@ int add_expr_rule(struct _Maat_table_info_t* table,struct db_str_rule_t* db_rule
free(region_string); free(region_string);
continue; continue;
} }
//if convert take no effect //if convert take no effect and src charset is one of the dst.
if(region_str_len==(int)strlen(sub_key_array[k])&& if(region_str_len==(int)strlen(sub_key_array[k])&&
0==memcmp(sub_key_array[k],region_string,region_str_len)&& 0==memcmp(sub_key_array[k],region_string,region_str_len)&&
TRUE==table->src_charset_in_dst) TRUE==table->src_charset_in_dst)

View File

@@ -36,7 +36,7 @@ typedef int atomic_t;
#define FALSE 0 #define FALSE 0
#define MAX_TABLE_NUM 256 #define MAX_TABLE_NUM 256
#define MAX_CHARSET_NUM 6 #define MAX_CHARSET_NUM 16
#define MAX_TABLE_NAME_LEN 256 #define MAX_TABLE_NAME_LEN 256
#define MAX_TABLE_LINE_SIZE (1024*4) #define MAX_TABLE_LINE_SIZE (1024*4)
#define MAX_EXPR_KEYLEN 1024 #define MAX_EXPR_KEYLEN 1024

View File

@@ -97,7 +97,7 @@ GIE_handle_t * GIE_create(const GIE_create_para_t * para)
idtable_args.hash_slot_size = HTABLE_SIZE; idtable_args.hash_slot_size = HTABLE_SIZE;
idtable_args.max_elem_num = 4 * HTABLE_SIZE; idtable_args.max_elem_num = 4 * HTABLE_SIZE;
idtable_args.expire_time = 0; idtable_args.expire_time = 0;
idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU; idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
idtable_args.key_comp = NULL; idtable_args.key_comp = NULL;
idtable_args.key2index = NULL; idtable_args.key2index = NULL;
idtable_args.data_free = idtable_free; idtable_args.data_free = idtable_free;
@@ -108,7 +108,7 @@ GIE_handle_t * GIE_create(const GIE_create_para_t * para)
indextable_args.hash_slot_size = HTABLE_SIZE; indextable_args.hash_slot_size = HTABLE_SIZE;
indextable_args.max_elem_num = 4 * HTABLE_SIZE; indextable_args.max_elem_num = 4 * HTABLE_SIZE;
indextable_args.expire_time = 0; indextable_args.expire_time = 0;
indextable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU; indextable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
indextable_args.key_comp = NULL; indextable_args.key_comp = NULL;
indextable_args.key2index = NULL; indextable_args.key2index = NULL;
indextable_args.data_free = indextable_free; indextable_args.data_free = indextable_free;

View File

@@ -89,7 +89,7 @@ int set_iris_descriptor(const char* json_file,cJSON *json,struct iris_descriptio
hargs.thread_safe=1; hargs.thread_safe=1;
hargs.hash_slot_size = 1024; hargs.hash_slot_size = 1024;
hargs.max_elem_num = 0; hargs.max_elem_num = 0;
hargs.eliminate_type = HASH_ELIMINATE_ALGO_LRU; hargs.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
hargs.expire_time = 0; hargs.expire_time = 0;
hargs.key_comp = NULL; hargs.key_comp = NULL;
hargs.key2index = NULL; hargs.key2index = NULL;

View File

@@ -25,7 +25,7 @@ MESA_htable_handle map_create(void)
hargs.thread_safe=8; hargs.thread_safe=8;
hargs.hash_slot_size = 4*1024; hargs.hash_slot_size = 4*1024;
hargs.max_elem_num = 0; hargs.max_elem_num = 0;
hargs.eliminate_type = HASH_ELIMINATE_ALGO_LRU; hargs.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
hargs.expire_time = 0; hargs.expire_time = 0;
hargs.key_comp = NULL; hargs.key_comp = NULL;
hargs.key2index = NULL; hargs.key2index = NULL;

View File

@@ -216,6 +216,60 @@
] ]
} }
] ]
},
{
"compile_id": 129,
"service": 1,
"action": 1,
"do_blacklist": 1,
"do_log": 1,
"effective_rage": 0,
"user_region": "utf8_中文",
"is_valid": "yes",
"groups": [
{
"group_name": "group_9",
"regions": [
{
"table_name": "HTTP_URL",
"table_type": "none",
"table_content": {
"keywords": "C#中国",
"expr_type": "and",
"match_method": "sub",
"format": "uncase plain"
}
}
]
}
]
},
{
"compile_id": 130,
"service": 1,
"action": 1,
"do_blacklist": 1,
"do_log": 1,
"effective_rage": 0,
"user_region": "utf8_维语",
"is_valid": "yes",
"groups": [
{
"group_name": "group_10",
"regions": [
{
"table_name": "KEYWORDS_TABLE",
"table_type": "none",
"table_content": {
"keywords": "2010يىلىدىكى",
"expr_type": "and",
"match_method": "sub",
"format": "uncase plain"
}
}
]
}
]
} }
], ],
"plugin_table": [ "plugin_table": [

View File

@@ -8,8 +8,8 @@
#id name type src_charset dst_charset do_merge #id name type src_charset dst_charset do_merge
0 COMPILE compile GBK GBK no 0 0 COMPILE compile GBK GBK no 0
1 GROUP group GBK GBK no 0 1 GROUP group GBK GBK no 0
2 HTTP_URL expr GBK GBK/BIG5/UNICODE/UTF8 yes 128 2 HTTP_URL expr UTF8 GBK/BIG5/UNICODE/UTF8/url_encode_gb2312/url_encode_utf8 yes 128
3 KEYWORDS_TABLE expr GBK GBK/BIG5/UNICODE/UTF8 yes 0 3 KEYWORDS_TABLE expr UTF8 GBK/BIG5/UNICODE/UTF8/unicode_ascii_esc/unicode_ascii_aligned/unicode_ncr_dec/unicode_ncr_hex yes 0
4 IP_CONFIG ip GBK GBK no 0 4 IP_CONFIG ip GBK GBK no 0
5 CONTENT_SIZE intval GBK GBK no 0 5 CONTENT_SIZE intval GBK GBK no 0
6 QD_ENTRY_INFO plugin GBK GBK no 0 6 QD_ENTRY_INFO plugin GBK GBK no 0