digest_gen工具支持相似度计算。

This commit is contained in:
zhengchao
2017-08-11 11:09:06 +08:00
parent 0a64602d2a
commit 2faa589628
3 changed files with 71 additions and 18 deletions

View File

@@ -1092,14 +1092,22 @@ inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, uns
blocksize = (unsigned long long)atoi(blk);
return blocksize;
}
/*
 * Similarity score between two plain strings, derived from their edit distance.
 *
 * str1/len1 and str2/len2 are the two buffers and their lengths; they are
 * passed unchanged to edit_distn().
 *
 * Returns 100 - (edit_distance * 100) / (len1 + len2), using integer division.
 * When len1 + len2 is zero the function returns 100 without calling
 * edit_distn(): two empty inputs are treated as identical, and the formula
 * would otherwise divide by zero.
 */
int GIE_string_similiarity(const char *str1, int len1, const char *str2, int len2)
{
    /* Widened so neither the length sum nor the scaled distance overflows int. */
    long long total_len = (long long)len1 + (long long)len2;
    long long edit_distance = 0;

    if (total_len == 0)
    {
        return 100;
    }
    edit_distance = edit_distn(str1, len1, str2, len2);
    return (int)(100 - (edit_distance * 100) / total_len);
}
int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_str, int len2)
int GIE_sfh_similiarity(const char *sfh1, int len1, const char *sfh2, int len2)
{
int j = 0, t = 0;
unsigned long long query_blocksize = 0, index_blocksize = 0;
unsigned int query_real_length = 0, index_real_length = 0;
const char *query_gram_begin = query_str;
const char *index_gram_begin = index_str;
const char *query_gram_begin = sfh1;
const char *index_gram_begin = sfh2;
char *splice_str = (char *)malloc(sizeof(char)*len1);
memset(splice_str,'\0',len1);
char *spli_str_begin = splice_str;
@@ -1121,7 +1129,7 @@ int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_st
index_gram_begin++;
}
index_real_length = get_real_length(index_gram_begin, len2);
query_gram_begin = query_str;
query_gram_begin = sfh1;
for(t = 0; t < 2; t++)
{
query_blocksize = get_blocksize_from_head(query_gram_begin, len1);
@@ -1259,7 +1267,6 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result
struct id_table_data * ret_tmp = NULL;
short conf = 0;
int ret_size = 0;
int edit_distance = 0;
for(i = 0; i <= union_index; i++)
{
if( i == union_index || *tmp_id != current_id )
@@ -1286,12 +1293,11 @@ int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result
{
if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
{
conf = GIE_comp_edit_distance(data, fuzzy_string_len, tmp_gram, length);
conf = GIE_sfh_similiarity(data, fuzzy_string_len, tmp_gram, length);
}
else
{
edit_distance = edit_distn(data, fuzzy_string_len,tmp_gram,length);
conf = 100-(edit_distance*100)/(fuzzy_string_len + length);
conf=GIE_string_similiarity(data, fuzzy_string_len, tmp_gram, length);
}
}

View File

@@ -1,67 +0,0 @@
#ifndef _GRAM_INDEX_ENGINE_
#define _GRAM_INDEX_ENGINE_
#ifdef __cplusplus
extern "C" {
#endif
/* Values for GIE_digest_t.operation. */
#define GIE_INSERT_OPT 0
#define GIE_DELETE_OPT 1
/* Values for GIE_create_para_t.format. */
#define GIE_INPUT_FORMAT_SFH 1
#define GIE_INPUT_FORMAT_PLAIN 0
/* Handle type returned by GIE_create() and passed to the other GIE_* calls.
 * No members are declared in this header. */
typedef struct
{
/* data */
}GIE_handle_t;
/* One digest record; GIE_update() takes an array of pointers to these. */
typedef struct
{
unsigned int id;
unsigned int sfh_length;//size of the fuzzy hash
short operation;//GIE_INSERT_OPT or GIE_DELETE_OPT; for GIE_DELETE_OPT only id is needed
short cfds_lvl;//NOTE(review): presumably a confidence level -- confirm against the implementation
char * sfh;//presumably the fuzzy hash whose size is sfh_length -- TODO confirm
void * tag;//NOTE(review): looks like an opaque caller pointer; GIE_result_t carries a tag too -- confirm
}GIE_digest_t;
/* One result record; GIE_query() receives an array of these as its `results` argument. */
typedef struct
{
unsigned int id;
short cfds_lvl;//NOTE(review): presumably the confidence of this match -- confirm
void * tag;//NOTE(review): presumably the tag of the matched digest -- confirm
}GIE_result_t;
/* Creation parameters passed to GIE_create(). */
typedef struct
{
unsigned int gram_value;
//unsigned int htable_num;
unsigned int position_accuracy;
short format; //if format==GIE_INPUT_FORMAT_SFH, the input string is an SFH string;
//otherwise (format==GIE_INPUT_FORMAT_PLAIN) the input string is a common string
short ED_reexamine;//if ED_reexamine==1, calculate the edit distance to verify the final result
}GIE_create_para_t;
//Create a handle from the given parameters.
GIE_handle_t * GIE_create(const GIE_create_para_t * para);
//Apply `size` digest records to the handle; each record's `operation` field
//selects GIE_INSERT_OPT or GIE_DELETE_OPT.
//NOTE(review): the meaning of the return value is not documented here -- confirm.
int GIE_update(GIE_handle_t * handle, GIE_digest_t ** digests, int size);
//Query the handle with `data` (`data_len` bytes); `results` has room for `result_size` entries.
//returns the actual matched result count;
//returns 0 when nothing matched;
//returns -1 when an error occurs.
int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size);
//Destroy a handle. The misspelt name ("destory") is the public symbol.
void GIE_destory(GIE_handle_t * handle);
#ifdef __cplusplus
}
#endif
#endif