digest_gen工具支持相似度计算。

This commit is contained in:
zhengchao
2017-08-11 11:09:06 +08:00
parent 0a64602d2a
commit 2faa589628
3 changed files with 71 additions and 18 deletions

View File

@@ -59,7 +59,8 @@ int GIE_update(GIE_handle_t * handle, GIE_digest_t ** digests, int size);
/*
 * Query the index behind `handle` with the buffer `data` (`data_len` bytes),
 * filling `results` (capacity `result_size`).
 * NOTE(review): return value semantics are not visible in this header
 * fragment — confirm against the implementation.
 */
int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size);
/* NOTE(review): presumably releases `handle`; the implementation is not visible here — confirm. */
void GIE_destory(GIE_handle_t * handle);
/*
 * Similarity score of two plain strings given as (pointer, length) pairs.
 * The implementation computes 100 - (edit_distn(...) * 100) / (len1 + len2).
 */
int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2);
/*
 * Similarity score of two SFH signatures given as (pointer, length) pairs.
 * NOTE(review): presumably on the same 0..100 scale as
 * GIE_string_similiarity — confirm against the implementation.
 */
int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2);
#ifdef __cplusplus
}

View File

@@ -1092,14 +1092,22 @@ inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, uns
blocksize = (unsigned long long)atoi(blk);
return blocksize;
}
/*
 * Similarity score of two plain strings.
 *
 * str1/len1 and str2/len2 are the two buffers and their lengths.
 *
 * Returns 100 - (d * 100) / (len1 + len2), where d is the value returned by
 * edit_distn() for the two buffers (defined elsewhere; presumably an edit
 * distance). Two empty inputs are reported as identical (100). A negative
 * length is treated as invalid input and yields 0.
 */
int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2)
{
	/* Widen before adding/multiplying so large lengths cannot overflow int. */
	long long total_len = (long long)len1 + (long long)len2;
	int edit_distance = 0;

	if(len1 < 0 || len2 < 0)
	{
		return 0;
	}
	/* Both inputs empty: the division below would be a division by zero. */
	if(total_len == 0)
	{
		return 100;
	}
	edit_distance = edit_distn(str1, len1, str2, len2);
	return (int)(100 - ((long long)edit_distance * 100) / total_len);
}
int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_str, int len2)
int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2)
{
int j = 0, t = 0;
unsigned long long query_blocksize = 0, index_blocksize = 0;
unsigned int query_real_length = 0, index_real_length = 0;
const char *query_gram_begin = query_str;
const char *index_gram_begin = index_str;
const char *query_gram_begin = sfh1;
const char *index_gram_begin = sfh2;
char *splice_str = (char *)malloc(sizeof(char)*len1);
memset(splice_str,'\0',len1);
char *spli_str_begin = splice_str;
@@ -1121,7 +1129,7 @@ int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_st
index_gram_begin++;
}
index_real_length = get_real_length(index_gram_begin, len2);
query_gram_begin = query_str;
query_gram_begin = sfh1;
for(t = 0; t < 2; t++)
{
query_blocksize = get_blocksize_from_head(query_gram_begin, len1);
@@ -1259,7 +1267,6 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result
struct id_table_data * ret_tmp = NULL;
short conf = 0;
int ret_size = 0;
int edit_distance = 0;
for(i = 0; i <= union_index; i++)
{
if( i == union_index || *tmp_id != current_id )
@@ -1286,12 +1293,11 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result
{
if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
{
conf = GIE_comp_edit_distance(data, fuzzy_string_len, tmp_gram, length);
conf = GIE_sfh_similiarity(data, fuzzy_string_len, tmp_gram, length);
}
else
{
edit_distance = edit_distn(data, fuzzy_string_len,tmp_gram,length);
conf = 100-(edit_distance*100)/(fuzzy_string_len + length);
conf=GIE_string_similiarity(data, fuzzy_string_len, tmp_gram, length);
}
}

View File

@@ -6,7 +6,9 @@
#include <sys/stat.h>
#include <time.h>
#include <math.h>
#include <assert.h>
#include "stream_fuzzy_hash.h"
#include "gram_index_engine.h"
void* entropy_start(void)
{
@@ -77,23 +79,67 @@ void hash_file(const char* path)
free(digest_result_buff);
fclose(fp);
}
/* Print the digest_gen command-line help text to standard output. */
void digest_gen_print_usage(void)
{
	/* One entry per original output call; text is emitted verbatim, in order. */
	static const char *const usage_lines[] = {
		"digest_gen uasge:\n\t-f [FILE], caculate a file's SFH digest.\n",
		"\t-s specify first string for comparing.\n",
		"\t-d specify second string for comparing.\n",
		"\t-c compare two simple string with similairity.\n",
		"\t-m compare two SFH signature.\n",
	};
	size_t idx;

	for(idx = 0; idx < sizeof(usage_lines) / sizeof(usage_lines[0]); idx++)
	{
		fputs(usage_lines[idx], stdout);
	}
}
int main(int argc, char * argv[])
{
char path[256];
if(argc == 2)
char str1[4096],str2[4096];
int oc=0;
int confidence=0;
int model=0;
const char* b_opt_arg=NULL;
if(argc<2)
{
hash_file(argv[1]);
digest_gen_print_usage();
return 0;
}
else if(NULL!=fgets(path,sizeof(path),stdin))
while((oc=getopt(argc,argv,"f:cms:d:"))!=-1)
{
hash_file(path);
switch(oc)
{
case 'f':
strncpy(path,optarg,sizeof(path));
break;
case 'c':
case 'm':
model=oc;
break;
case 's':
strncpy(str1,optarg,sizeof(str1));
break;
case 'd':
strncpy(str2,optarg,sizeof(str2));
break;
case '?':
default:
digest_gen_print_usage();
break;
}
}
else
switch(model)
{
printf("SFH uasge: ./digest_gen [Dir]\n");
exit(-1);
case 'f':
hash_file(path);
break;
case 'c':
confidence=GIE_string_similiarity(str1, strlen(str1), str2, strlen(str2));
printf("%d\n",confidence);
break;
case 'm':
sscanf(optarg,"%s,%s",str1,str2);
confidence=GIE_sfh_similiarity(str1, strlen(str1), str2, strlen(str2));
printf("%d\n",confidence);
break;
default:
assert(0);
}
return 0;
}