升级关键字配置转码功能,增加"unicode_ascii_esc","unicode_ascii_aligned","unicode_ncr_dec","unicode_ncr_hex"共4种unicode网页编码,增加"url_encode_gb2312","url_encode_utf8"共2种URL编码。

This commit is contained in:
zhengchao
2016-04-03 12:29:41 +08:00
parent 422899cc81
commit f3f43fd499
8 changed files with 228 additions and 19 deletions

View File

@@ -29,7 +29,8 @@
int MAAT_FRAME_VERSION_1_5_20160311=1;
const char *maat_module="MAAT Frame";
const char* CHARSET_STRING[]={"CHARSET_NONE","GBK","BIG5","UNICODE","UTF-8"};
const char* CHARSET_STRING[]={"NONE","gbk","big5","unicode","utf8","bin",
"unicode_ascii_esc","unicode_ascii_aligned","unicode_ncr_dec","unicode_ncr_hex","url_encode_gb2312","url_encode_utf8",""};
int converHextoint(char srctmp)
{
if(isdigit(srctmp))
@@ -79,6 +80,7 @@ iconv_t maat_iconv_open(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET to,enu
cd=scanner->iconv_handle[to][from];
return cd;
}
int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MAAT_CHARSET to,char *src,int srclen,char *dst,int *dstlen)
{
size_t ret;
@@ -104,7 +106,8 @@ int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MA
if(ret!=(size_t)(-1))
{
if(to==CHARSET_UNICODE)//jump unicode 2 bytes head 0xFF 0xFE
if(to==CHARSET_UNICODE&&
(*(unsigned short*)pOutBuff==0xFFFE||*(unsigned short*)pOutBuff==0XFEFF))//jump unicode 2 bytes BOM, 0xFF 0xFE
{
copy_len=iOutBuffLen-iLeftLen-2;
copy_buf=pOutBuff+2;
@@ -133,7 +136,137 @@ int iconv_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MA
}
}
/*
 * URLEncode - percent-encode a byte buffer in form-urlencoded style.
 *
 * Letters, digits and the characters '.', '-', '_', '*' are copied through
 * unchanged, a space becomes '+', and every other byte is written as "%XX"
 * with upper-case hexadecimal digits.
 *
 * str        : input bytes (need not be NUL-terminated).
 * strSize    : number of input bytes; must be > 0.
 * result     : output buffer; receives a NUL-terminated string on success.
 * resultSize : capacity of result in bytes, including room for the NUL.
 *
 * Returns the number of characters written (excluding the NUL), or -1 when
 * an argument is invalid or the encoded form does not fit in result.  On
 * failure the content of result is unspecified.
 */
int URLEncode(const char* str, const int strSize, char* result, const int resultSize)
{
    static const char hex_digits[] = "0123456789ABCDEF";
    int i;
    int j = 0; /* next write position in result */
    unsigned char ch;

    if ((str == NULL) || (result == NULL) || (strSize <= 0) || (resultSize <= 0))
    {
        return -1;
    }
    for (i = 0; i < strSize; ++i)
    {
        ch = (unsigned char)str[i];
        if (((ch >= 'A') && (ch <= 'Z')) ||
            ((ch >= 'a') && (ch <= 'z')) ||
            ((ch >= '0') && (ch <= '9')) ||
            ch == '.' || ch == '-' || ch == '_' || ch == '*')
        {
            /* Need one byte for the character plus one for the final NUL. */
            if (j + 1 >= resultSize)
            {
                return -1;
            }
            result[j++] = (char)ch;
        }
        else if (ch == ' ')
        {
            if (j + 1 >= resultSize)
            {
                return -1;
            }
            result[j++] = '+';
        }
        else
        {
            /* Need three bytes for "%XX" plus one for the final NUL. */
            if (j + 3 >= resultSize)
            {
                return -1;
            }
            result[j++] = '%';
            result[j++] = hex_digits[ch >> 4];
            result[j++] = hex_digits[ch & 0x0F];
        }
    }
    /* Every branch above kept j < resultSize, so this write is in bounds. */
    result[j] = '\0';
    return j;
}
/*
 * uni2ascii - render a buffer of 16-bit code units as ASCII text.
 *
 * The input is read two bytes at a time in host byte order.  A code unit
 * below 0x7f is emitted as a single byte; any other code unit is formatted
 * with fmt, which must contain exactly one unsigned-integer conversion
 * (for example "\\u%04x" or "&#%u;").
 *
 * fmt     : printf-style format for one non-ASCII code unit.
 * src     : input bytes; srclen must be even.
 * srclen  : number of input bytes.
 * dst     : output buffer; it is NOT NUL-terminated by this function.
 * dstsize : capacity of dst in bytes.
 *
 * Returns the number of bytes written to dst, or -1 if an argument is
 * NULL, formatting fails, or the output does not fit in dst.
 */
int uni2ascii(const char* fmt,const char* src, const int srclen, char* dst, const int dstsize)
{
    int i = 0, j = 0;
    int n = 0;
    unsigned short code_unit = 0;
    char escaped[32]; /* scratch for one formatted code unit */

    if (fmt == NULL || src == NULL || dst == NULL)
    {
        return -1;
    }
    assert(srclen%2==0);//unicode must be 2 bytes aligned.
    /* i+1<srclen keeps the 2-byte read in bounds even if the assert is compiled out. */
    while (i + 1 < srclen)
    {
        /* memcpy avoids a misaligned / aliasing-violating load from a char buffer. */
        memcpy(&code_unit, src + i, sizeof(code_unit));
        if (code_unit < 0x7f)
        {
            if (j >= dstsize)
            {
                return -1;
            }
            dst[j++] = (char)code_unit;
        }
        else
        {
            n = snprintf(escaped, sizeof(escaped), fmt, (unsigned int)code_unit);
            if (n < 0 || n >= (int)sizeof(escaped))
            {
                return -1;
            }
            /* Refuse to truncate: a partial escape sequence is useless. */
            if (n > dstsize - j)
            {
                return -1;
            }
            memcpy(dst + j, escaped, (size_t)n);
            j += n;
        }
        i += 2;
    }
    return j;
}
/*
 * universal_charset_convert - convert src from charset `from` to charset `to`.
 *
 * For CHARSET_GBK, CHARSET_BIG5, CHARSET_UNICODE and CHARSET_UTF8 the call is
 * forwarded unchanged to iconv_convert().  The remaining targets are produced
 * in two stages: src is first converted with iconv_convert() into a
 * temporary buffer in an intermediate charset, then re-encoded into dst:
 *
 *   CHARSET_UNICODE_ASCII_ESC      unicode -> uni2ascii with "\u%x;"
 *   CHARSET_UNICODE_ASCII_ALIGNED  unicode -> uni2ascii with "\u%04x"
 *   CHARSET_UNICODE_NCR_DEC        unicode -> uni2ascii with "&#%u;"
 *   CHARSET_UNICODE_NCR_HEX        unicode -> uni2ascii with "&#x%x;"
 *   CHARSET_URL_ENCODE_GB2312      gbk     -> URLEncode
 *   CHARSET_URL_ENCODE_UTF8        utf8    -> URLEncode
 *
 * dstlen is in/out: on entry the capacity of dst, on success the number of
 * bytes produced.  In the two-stage path it is left untouched on failure.
 *
 * Returns a negative value on failure (unsupported target, bad arguments,
 * allocation failure, or a failing conversion stage).  In the two-stage path
 * the success value is the output length; in the forwarded path it is
 * whatever iconv_convert() returns.
 */
int universal_charset_convert(struct _Maat_scanner_t* scanner,enum MAAT_CHARSET from,enum MAAT_CHARSET to,char *src,int srclen,char *dst,int *dstlen)
{
    int ret=0;
    char* tmp_buff=NULL;
    int tmp_buff_size=0;
    MAAT_CHARSET tmp_dst_code=CHARSET_NONE;
    const char* fmt=NULL;

    switch(to)
    {
        case CHARSET_GBK:
        case CHARSET_BIG5:
        case CHARSET_UNICODE:
        case CHARSET_UTF8:
            /* Plain charsets need no post-processing. */
            return iconv_convert(scanner,from,to,src,srclen,dst,dstlen);
        case CHARSET_UNICODE_ASCII_ESC:
            tmp_dst_code=CHARSET_UNICODE;
            fmt="\\u%x;";
            break;
        case CHARSET_UNICODE_ASCII_ALIGNED:
            tmp_dst_code=CHARSET_UNICODE;
            fmt="\\u%04x";
            break;
        case CHARSET_UNICODE_NCR_DEC:
            tmp_dst_code=CHARSET_UNICODE;
            fmt="&#%u;";
            break;
        case CHARSET_UNICODE_NCR_HEX:
            tmp_dst_code=CHARSET_UNICODE;
            fmt="&#x%x;";
            break;
        case CHARSET_URL_ENCODE_GB2312:
            tmp_dst_code=CHARSET_GBK;
            fmt=NULL; /* NULL fmt selects the URL-encoding stage below */
            break;
        case CHARSET_URL_ENCODE_UTF8:
            tmp_dst_code=CHARSET_UTF8;
            fmt=NULL;
            break;
        default:
            return -1;
    }

    if(dst==NULL||dstlen==NULL||*dstlen<=0)
    {
        return -1;
    }
    /* The intermediate buffer is sized like the caller's output buffer. */
    tmp_buff_size=*dstlen;
    tmp_buff=(char*)malloc(tmp_buff_size);
    if(tmp_buff==NULL)
    {
        return -1;
    }
    ret=iconv_convert(scanner,from,tmp_dst_code,src,srclen,tmp_buff,&tmp_buff_size);
    if(ret<0)
    {
        goto error_out;
    }
    if(fmt!=NULL)
    {
        ret=uni2ascii(fmt, tmp_buff, tmp_buff_size, dst,*dstlen);
    }
    else
    {
        ret=URLEncode(tmp_buff,tmp_buff_size,dst,*dstlen);
    }
    /* Only publish a length when the second stage succeeded. */
    if(ret>=0)
    {
        *dstlen=ret;
    }
error_out:
    free(tmp_buff);
    tmp_buff=NULL;
    return ret;
}
char* strlwr(char* string)
{
int i=0;
@@ -249,7 +382,7 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
FILE*fp=NULL;
char line[MAX_TABLE_LINE_SIZE];
int i=0,j=0,ret[4]={0},table_cnt=0;
char table_type[16],src_charset[16],dst_charset[64],merge[4];
char table_type[16],src_charset[256],dst_charset[256],merge[4];
MESA_htable_handle string2int_map=map_create();
char *token=NULL,*sub_token=NULL,*saveptr;
struct _Maat_table_info_t*p=NULL;
@@ -262,11 +395,26 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
map_register(string2int_map,"digest", TABLE_TYPE_DIGEST);
map_register(string2int_map,"expr_plus", TABLE_TYPE_EXPR_PLUS);
map_register(string2int_map,"group", TABLE_TYPE_GROUP);
map_register(string2int_map,"bin", CHARSET_NONE);
for(i=0;i<MAX_CHARSET_NUM;i++)
{
if(strlen(CHARSET_STRING[i])>0)
{
map_register(string2int_map,CHARSET_STRING[i], i);
}
else
{
break;
}
}
/*
map_register(string2int_map,"gbk", CHARSET_GBK);
map_register(string2int_map,"big5", CHARSET_BIG5);
map_register(string2int_map,"unicode", CHARSET_UNICODE);
map_register(string2int_map,"utf8", CHARSET_UTF8);
map_register(string2int_map,"unicode_hex", CHARSET_UNICODE_ASCII_ESC);
map_register(string2int_map,"unicode_hex", CHARSET_UNICODE_ASCII_ESC);
*/
map_register(string2int_map,"yes", 1);
map_register(string2int_map,"no", 0);
@@ -277,6 +425,7 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module,
"Maat read table info %s error.\n",table_info_path);
}
i=0;
while(NULL!=fgets(line,sizeof(line),fp))
{
i++;
@@ -301,9 +450,9 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
{
if(ret[j]<0)
{
fprintf(stderr,"Maat read table info %s line %d error.\n",table_info_path,i);
fprintf(stderr,"Maat read table info %s line %d error:unknown column.\n",table_info_path,i);
MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module,
"Maat read table info %s line %d error.\n",table_info_path,i);
"Maat read table info %s line %d error:unknown column.\n",table_info_path,i);
goto error_jump;
}
}
@@ -324,9 +473,9 @@ int read_table_info(struct _Maat_table_info_t** p_table_info,int num,const char*
}
else
{
fprintf(stderr,"Maat read table info %s line %d error.\n",table_info_path,i);
fprintf(stderr,"Maat read table info %s line %d error:unknown dest charset %s.\n",table_info_path,i,sub_token);
MESA_handle_runtime_log(logger, RLOG_LV_FATAL,maat_module,
"Maat read table info %s line %d error.\n",table_info_path,i);
"Maat read table info %s line %d error: unknown dest charset %s.\n",table_info_path,i,sub_token);
goto error_jump;
}
@@ -1261,12 +1410,12 @@ int add_expr_rule(struct _Maat_table_info_t* table,struct db_str_rule_t* db_rule
{
continue;
}
region_str_len=strlen(sub_key_array[k])*2+1;
region_str_len=strlen(sub_key_array[k])*8+1; // 1 byte map to 8 bytes maximum, e.g. "&#x0627;" or "\u63221;"
region_string=(char*)calloc(sizeof(char),region_str_len);
if(table->src_charset!=dst_charset)//need convert
{
ret=iconv_convert(scanner,table->src_charset, dst_charset,
ret=universal_charset_convert(scanner,table->src_charset, dst_charset,
sub_key_array[k],strlen(sub_key_array[k]),
region_string, &region_str_len);
if(ret<0)
@@ -1279,7 +1428,7 @@ int add_expr_rule(struct _Maat_table_info_t* table,struct db_str_rule_t* db_rule
free(region_string);
continue;
}
//if convert take no effect
//if convert take no effect and src charset is one of the dst.
if(region_str_len==(int)strlen(sub_key_array[k])&&
0==memcmp(sub_key_array[k],region_string,region_str_len)&&
TRUE==table->src_charset_in_dst)