digest_gen工具支持相似度计算。
This commit is contained in:
@@ -59,7 +59,8 @@ int GIE_update(GIE_handle_t * handle, GIE_digest_t ** digests, int size);
|
||||
int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size);
|
||||
|
||||
void GIE_destory(GIE_handle_t * handle);
|
||||
|
||||
int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2);
|
||||
int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
@@ -1092,14 +1092,22 @@ inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, uns
|
||||
blocksize = (unsigned long long)atoi(blk);
|
||||
return blocksize;
|
||||
}
|
||||
/**
 * Score how similar two plain strings are, based on their edit distance.
 *
 * The score is 100 - (distance * 100) / (len1 + len2), where distance is
 * whatever edit_distn() reports for the two inputs. A distance of zero
 * therefore yields 100, and larger distances yield smaller scores.
 *
 * @param str1 first string
 * @param len1 number of bytes of str1 to compare
 * @param str2 second string
 * @param len2 number of bytes of str2 to compare
 * @return the similarity score; 100 when len1 + len2 is zero, because there
 *         is nothing to differ and the formula would otherwise divide by zero
 */
int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2)
{
	int total_len = len1 + len2;
	int edit_distance = 0;

	/* Guard the divisor: two empty inputs used to trigger a division by zero. */
	if (total_len == 0)
	{
		return 100;
	}

	edit_distance = edit_distn(str1, len1, str2, len2);

	/* Widen before scaling so distance * 100 cannot overflow int on long inputs. */
	return (int)(100 - ((long long)edit_distance * 100) / total_len);
}
|
||||
|
||||
int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_str, int len2)
|
||||
int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2)
|
||||
{
|
||||
int j = 0, t = 0;
|
||||
unsigned long long query_blocksize = 0, index_blocksize = 0;
|
||||
unsigned int query_real_length = 0, index_real_length = 0;
|
||||
const char *query_gram_begin = query_str;
|
||||
const char *index_gram_begin = index_str;
|
||||
const char *query_gram_begin = sfh1;
|
||||
const char *index_gram_begin = sfh2;
|
||||
char *splice_str = (char *)malloc(sizeof(char)*len1);
|
||||
memset(splice_str,'\0',len1);
|
||||
char *spli_str_begin = splice_str;
|
||||
@@ -1121,7 +1129,7 @@ int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_st
|
||||
index_gram_begin++;
|
||||
}
|
||||
index_real_length = get_real_length(index_gram_begin, len2);
|
||||
query_gram_begin = query_str;
|
||||
query_gram_begin = sfh1;
|
||||
for(t = 0; t < 2; t++)
|
||||
{
|
||||
query_blocksize = get_blocksize_from_head(query_gram_begin, len1);
|
||||
@@ -1259,7 +1267,6 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result
|
||||
struct id_table_data * ret_tmp = NULL;
|
||||
short conf = 0;
|
||||
int ret_size = 0;
|
||||
int edit_distance = 0;
|
||||
for(i = 0; i <= union_index; i++)
|
||||
{
|
||||
if( i == union_index || *tmp_id != current_id )
|
||||
@@ -1286,12 +1293,11 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result
|
||||
{
|
||||
if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
|
||||
{
|
||||
conf = GIE_comp_edit_distance(data, fuzzy_string_len, tmp_gram, length);
|
||||
conf = GIE_sfh_similiarity(data, fuzzy_string_len, tmp_gram, length);
|
||||
}
|
||||
else
|
||||
{
|
||||
edit_distance = edit_distn(data, fuzzy_string_len,tmp_gram,length);
|
||||
conf = 100-(edit_distance*100)/(fuzzy_string_len + length);
|
||||
conf=GIE_string_similiarity(data, fuzzy_string_len, tmp_gram, length);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,9 @@
|
||||
#include <sys/stat.h>
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include "stream_fuzzy_hash.h"
|
||||
#include "gram_index_engine.h"
|
||||
|
||||
void* entropy_start(void)
|
||||
{
|
||||
@@ -77,23 +79,67 @@ void hash_file(const char* path)
|
||||
free(digest_result_buff);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
/* Print the digest_gen command-line help text to standard output. */
void digest_gen_print_usage(void)
{
	/* One entry per help line, emitted in order. */
	static const char *const usage_lines[] = {
		"digest_gen uasge:\n\t-f [FILE], caculate a file's SFH digest.\n",
		"\t-s specify first string for comparing.\n",
		"\t-d specify second string for comparing.\n",
		"\t-c compare two simple string with similairity.\n",
		"\t-m compare two SFH signature.\n"
	};
	unsigned int idx = 0;

	for (idx = 0; idx < sizeof(usage_lines) / sizeof(usage_lines[0]); idx++)
	{
		fputs(usage_lines[idx], stdout);
	}
}
|
||||
/*
 * Copy a command-line argument into a fixed-size buffer.
 * Always NUL-terminates dst; returns 0 on success and -1 if src did not fit.
 */
static int digest_gen_copy_arg(char *dst, size_t dst_size, const char *src)
{
	int written = snprintf(dst, dst_size, "%s", src);

	if (written < 0 || (size_t)written >= dst_size)
	{
		return -1;
	}
	return 0;
}

/**
 * digest_gen entry point.
 *
 * Options (parsed with getopt, option string "f:cms:d:"):
 *   -f FILE  hash FILE with hash_file()
 *   -s STR   first string for a comparison
 *   -d STR   second string for a comparison
 *   -c       compare the -s and -d strings with GIE_string_similiarity()
 *   -m       compare the -s and -d strings with GIE_sfh_similiarity()
 *
 * When several of -f / -c / -m are given, the last one selects the mode.
 *
 * @return 0 on success (or when called without arguments, after printing the
 *         usage text); -1 on an unknown option, an over-long argument, or
 *         when no mode was selected.
 */
int main(int argc, char * argv[])
{
	/* Start empty so a missing -f / -s / -d never exposes uninitialised data. */
	char path[256] = "";
	char str1[4096] = "";
	char str2[4096] = "";
	int oc = 0;
	int confidence = 0;
	int model = 0;

	if (argc < 2)
	{
		digest_gen_print_usage();
		return 0;
	}

	while ((oc = getopt(argc, argv, "f:cms:d:")) != -1)
	{
		switch (oc)
		{
			case 'f':
				if (digest_gen_copy_arg(path, sizeof(path), optarg) != 0)
				{
					fprintf(stderr, "digest_gen: file path too long\n");
					return -1;
				}
				/* Select file-hash mode; without this -f was silently ignored. */
				model = oc;
				break;
			case 'c':
			case 'm':
				model = oc;
				break;
			case 's':
				if (digest_gen_copy_arg(str1, sizeof(str1), optarg) != 0)
				{
					fprintf(stderr, "digest_gen: first string too long\n");
					return -1;
				}
				break;
			case 'd':
				if (digest_gen_copy_arg(str2, sizeof(str2), optarg) != 0)
				{
					fprintf(stderr, "digest_gen: second string too long\n");
					return -1;
				}
				break;
			case '?':
			default:
				digest_gen_print_usage();
				return -1;
		}
	}

	switch (model)
	{
		case 'f':
			hash_file(path);
			break;
		case 'c':
			confidence = GIE_string_similiarity(str1, strlen(str1), str2, strlen(str2));
			printf("%d\n", confidence);
			break;
		case 'm':
			/*
			 * The signatures come from -s and -d. optarg is not meaningful
			 * once option parsing has finished, so it is not read here.
			 */
			confidence = GIE_sfh_similiarity(str1, strlen(str1), str2, strlen(str2));
			printf("%d\n", confidence);
			break;
		default:
			/* No mode option was supplied: report usage instead of aborting. */
			digest_gen_print_usage();
			return -1;
	}

	return 0;
}
|
||||
|
||||
Reference in New Issue
Block a user