From 2faa589628c9740850490d1a5ab4c5bf4e00ef0d Mon Sep 17 00:00:00 2001
From: zhengchao
Date: Fri, 11 Aug 2017 11:09:06 +0800
Subject: [PATCH] =?UTF-8?q?digest=5Fgen=E5=B7=A5=E5=85=B7=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81=E7=9B=B8=E4=BC=BC=E5=BA=A6=E8=AE=A1=E7=AE=97=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 {src/entry => inc}/gram_index_engine.h |  3 +-
 src/entry/gram_index_engine.c          | 22 +++++----
 test/digest_gen.c                      | 64 ++++++++++++++++++++++----
 3 files changed, 71 insertions(+), 18 deletions(-)
 rename {src/entry => inc}/gram_index_engine.h (89%)

diff --git a/src/entry/gram_index_engine.h b/inc/gram_index_engine.h
similarity index 89%
rename from src/entry/gram_index_engine.h
rename to inc/gram_index_engine.h
index 09420d8..a69e924 100644
--- a/src/entry/gram_index_engine.h
+++ b/inc/gram_index_engine.h
@@ -59,7 +59,8 @@ int GIE_update(GIE_handle_t * handle, GIE_digest_t ** digests, int size);
 
 int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size);
 void GIE_destory(GIE_handle_t * handle);
-
+int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2);
+int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2);
 
 #ifdef __cplusplus
 }
diff --git a/src/entry/gram_index_engine.c b/src/entry/gram_index_engine.c
index cd0a086..8a4576f 100644
--- a/src/entry/gram_index_engine.c
+++ b/src/entry/gram_index_engine.c
@@ -1092,14 +1092,22 @@ inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, uns
 	blocksize = (unsigned long long)atoi(blk);
 	return blocksize;
 }
+/* Score = 100 - 100*edit_distn(str1,str2)/(len1+len2); two empty inputs score 100. */
+int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2)
+{
+	int total_len = len1 + len2;
+	if(total_len == 0)
+		return 100;	/* nothing to compare, and the divisor below would be zero */
+	return 100-((int)edit_distn(str1, len1,str2,len2)*100)/total_len;
+}
 
-int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_str, int len2)
+int 
GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2) { int j = 0, t = 0; unsigned long long query_blocksize = 0, index_blocksize = 0; unsigned int query_real_length = 0, index_real_length = 0; - const char *query_gram_begin = query_str; - const char *index_gram_begin = index_str; + const char *query_gram_begin = sfh1; + const char *index_gram_begin = sfh2; char *splice_str = (char *)malloc(sizeof(char)*len1); memset(splice_str,'\0',len1); char *spli_str_begin = splice_str; @@ -1121,7 +1129,7 @@ int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_st index_gram_begin++; } index_real_length = get_real_length(index_gram_begin, len2); - query_gram_begin = query_str; + query_gram_begin = sfh1; for(t = 0; t < 2; t++) { query_blocksize = get_blocksize_from_head(query_gram_begin, len1); @@ -1259,7 +1267,6 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result struct id_table_data * ret_tmp = NULL; short conf = 0; int ret_size = 0; - int edit_distance = 0; for(i = 0; i <= union_index; i++) { if( i == union_index || *tmp_id != current_id ) @@ -1286,12 +1293,11 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result { if(_handle->input_format == GIE_INPUT_FORMAT_SFH) { - conf = GIE_comp_edit_distance(data, fuzzy_string_len, tmp_gram, length); + conf = GIE_sfh_similiarity(data, fuzzy_string_len, tmp_gram, length); } else { - edit_distance = edit_distn(data, fuzzy_string_len,tmp_gram,length); - conf = 100-(edit_distance*100)/(fuzzy_string_len + length); + conf=GIE_string_similiarity(data, fuzzy_string_len, tmp_gram, length); } } diff --git a/test/digest_gen.c b/test/digest_gen.c index 96232a9..fbcf687 100644 --- a/test/digest_gen.c +++ b/test/digest_gen.c @@ -6,7 +6,9 @@ #include #include #include +#include #include "stream_fuzzy_hash.h" +#include "gram_index_engine.h" void* entropy_start(void) { @@ -77,23 +79,67 @@ void hash_file(const char* path) free(digest_result_buff); 
fclose(fp); } - +void digest_gen_print_usage(void) +{ + printf("digest_gen uasge:\n\t-f [FILE], caculate a file's SFH digest.\n"); + printf("\t-s specify first string for comparing.\n"); + printf("\t-d specify second string for comparing.\n"); + printf("\t-c compare two simple string with similairity.\n"); + printf("\t-m compare two SFH signature.\n"); + return; +} int main(int argc, char * argv[]) { char path[256]; - if(argc == 2) + char str1[4096],str2[4096]; + int oc=0; + int confidence=0; + int model=0; + const char* b_opt_arg=NULL; + if(argc<2) { - hash_file(argv[1]); + digest_gen_print_usage(); + return 0; } - else if(NULL!=fgets(path,sizeof(path),stdin)) + while((oc=getopt(argc,argv,"f:cms:d:"))!=-1) { - hash_file(path); + switch(oc) + { + case 'f': + strncpy(path,optarg,sizeof(path)); + break; + case 'c': + case 'm': + model=oc; + break; + case 's': + strncpy(str1,optarg,sizeof(str1)); + break; + case 'd': + strncpy(str2,optarg,sizeof(str2)); + break; + case '?': + default: + digest_gen_print_usage(); + break; + } } - else + switch(model) { - printf("SFH uasge: ./digest_gen [Dir]\n"); - exit(-1); + case 'f': + hash_file(path); + break; + case 'c': + confidence=GIE_string_similiarity(str1, strlen(str1), str2, strlen(str2)); + printf("%d\n",confidence); + break; + case 'm': + sscanf(optarg,"%s,%s",str1,str2); + confidence=GIE_sfh_similiarity(str1, strlen(str1), str2, strlen(str2)); + printf("%d\n",confidence); + break; + default: + assert(0); } - return 0; }