This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
tango-maat/src/entry/gram_index_engine.c

1360 lines
37 KiB
C
Raw Normal View History

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<math.h>
#include<assert.h>
#include<MESA/MESA_htable.h>
#include<unistd.h>
#include "gram_index_engine.h"
#include "queue.h"
#define HTABLE_SIZE 32*1024
#define GRAM_CNT_MAX 2
#define GRAM_MAX 128
#define TOLERENCE_SIZE 0
#define UNION_INIT_SIZE 1000
#define BLOCKSIZE_MIN 3
#define MEM_OCCUPY 1
#define CNT_MAX 10
#define GRAM_CNT_THRESHOLD 10
#define QUERY_LEN_ACCURACY 0.1
#define HTABLE_NUM 8
//#define GIE_INPUT_FORMAT_SFH 1
//#define GIE_INPUT_FORMAT_PLAIN 0
#define MAX_LENGTH 10000
#define KEY_MAX_LENGTH 10
#define EDIT_DISTN_INSERT_COST 1
#define EDIT_DISTN_REMOVE_COST 1
#define EDIT_DISTN_REPLACE_COST 2
#define MIN(x,y) ((x)<(y)?(x):(y))
/*
 * Wrap-around aware ordering test for 32-bit values.
 * Returns 1 when the unsigned difference off1 - off2, reinterpreted as a
 * signed int, is negative; returns 0 otherwise.
 */
int before(unsigned int off1, unsigned int off2)
{
    signed int delta = (signed int)(off1 - off2);

    if (delta < 0)
    {
        return 1;
    }
    return 0;
}
/* after(a, b) is before() with its operands swapped. */
#define after(off2,off1) before(off1,off2)
/*
 * Engine state behind the opaque GIE_handle_t pointer: GIE_create allocates
 * this struct and returns it cast to GIE_handle_t *.
 */
typedef struct
{
/* Gram (key) length in characters, copied from para->gram_value. */
unsigned int user_gram_value;
/* Position tolerance applied by GIE_part_query, copied from para->position_accuracy. */
unsigned int user_position_accuracy;
/* When 1, GIE_query recomputes the confidence from an edit distance. */
short ED_reexamine;
/* Compared against GIE_INPUT_FORMAT_SFH / GIE_INPUT_FORMAT_PLAIN. */
short input_format;
/* Digest id -> struct id_table_data. */
MESA_htable_handle id_table;
/* Gram key -> struct index_table_data; the table is chosen by the sum of the key bytes modulo HTABLE_NUM. */
MESA_htable_handle index_table[HTABLE_NUM];
/* Running byte count returned by GIE_status(MEM_OCCUPY). */
unsigned long long mem_occupy;
/* Set to 0 in GIE_create; the only other references in view are commented out. */
unsigned long long hash_cnt;
}GIE_handle_inner_t;
/*
 * One posting in a gram's list: the positions at which the gram occurs in
 * one digest (for one block size).
 */
struct linklist_node
{
/* Heap array of gram positions; `size` is its capacity, `index` the number of used slots. */
short * position;
/* Entry of the id table describing the digest; not freed together with the node. */
struct id_table_data * basicinfo;
/* Capacity of `position` (starts at GRAM_CNT_MAX and is doubled when full). */
short size;
/* Number of positions currently stored. */
short index;
/* Block size of the digest part this gram was taken from (0 for plain input). */
unsigned long long blocksize;
TAILQ_ENTRY(linklist_node) listentry;
};
/* Value stored in an index table: the posting list of one gram key. */
struct index_table_data
{
/* Heap-allocated TAILQ head of linklist_node entries. */
struct TQ * listhead;
/* Node counter: incremented on insertion, decremented on removal. */
int cnt;
};
/* Value stored in the id table: everything kept about one inserted digest. */
struct id_table_data
{
/* Digest id; also used as the hash key. */
unsigned int id;
/* Length of `sfh` as supplied by the caller. */
short sfh_length;
/* Gram count reported by the key-grabbing step; decremented by threshold pruning. */
short gram_cnt;
/* Block size parsed from the head of the digest (0 for plain input). */
unsigned long long blocksize;
/* Private copy of the digest text; released by idtable_free. */
char * sfh;
/* Caller-supplied pointer handed back in query results; idtable_free does not free it. */
void * tag;
/* Minimum confidence a query must reach for this digest to be reported. */
char cfds_lvl;
};
/*
 * Pair of tables passed through copy_htable and its iterate callbacks.
 * As input to copy_htable: runtime_table is the table to copy from.
 * Inside the callbacks: runtime_table is the destination table.
 * para is forwarded unchanged; the index-table callback uses it as the id
 * table in which node owners are looked up.
 */
struct htable_handle
{
MESA_htable_handle runtime_table;
MESA_htable_handle para;
};
/* One pending gram produced by grab_key_set and consumed by GIE_update. */
struct key_list_node
{
/* Heap copy of the gram, NUL-terminated. */
char * key;
/* Index of the originating entry in the digests array given to GIE_update. */
int digest_id;
/* Offset of the gram inside the scanned string. */
int pos;
/* Block size of the digest part the gram belongs to. */
unsigned long long blocksize;
TAILQ_ENTRY(key_list_node) keylistentry;
};
/* File-scope statistics; in the code in view they are only updated by indextable_free_cnt. */
unsigned long long hash_cnt;
unsigned long long cnt_sum;
/* List head types: TQ for posting lists, KL for pending-key queues. */
TAILQ_HEAD(TQ, linklist_node);
TAILQ_HEAD(KL, key_list_node);
/* Forward declarations of helpers defined later in this file. */
void idtable_free(void * data);
void indextable_free(void * data);
int key_compare(const uchar * key1, uint size1, const uchar * key2, uint size2);
int GIE_insert_indextable(MESA_htable_handle handle, struct id_table_data * info, char * key, unsigned int index,unsigned long long blocksize);
int GIE_delete_from_indextable_by_key(MESA_htable_handle handle, char * key, unsigned int id);
int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t * digest);
int GIE_cmp(const void * a, const void * b);
inline unsigned int get_real_length(const char * string, unsigned int length);
void print_item_iterate(const uchar * key, unsigned int size, void * data, void * user);
inline unsigned long long calc_fh_blocksize(unsigned long long orilen);
inline unsigned long long get_blocksize_from_head(const char * fuzzy_string, unsigned int str_len);
MESA_htable_handle copy_htable(void * htable_para,void (* func)(const uchar * key, uint size, void * data, void *user),void (*free_fuc)(void * data));
void copy_idtable_item_iterate(const uchar * key, uint size, void * data, void * user);
void copy_indextable_item_iterate(const uchar * key, uint size, void * data, void * user);
GIE_handle_t * GIE_create(const GIE_create_para_t * para)
{
int i = 0;
GIE_handle_inner_t * handle = (GIE_handle_inner_t *)calloc(1, sizeof(GIE_handle_inner_t));
handle->mem_occupy = 0;
handle->mem_occupy += sizeof(GIE_handle_inner_t);
handle->user_gram_value = para->gram_value;
handle->user_position_accuracy = para->position_accuracy;
handle->input_format = para->format;
//handle->user_cmp = GIE_INPUT_FORMAT_PLAIN;
handle->ED_reexamine = para->ED_reexamine;
handle->hash_cnt = 0;
MESA_htable_create_args_t idtable_args,indextable_args[HTABLE_NUM];
memset(&idtable_args, 0, sizeof(idtable_args));
idtable_args.thread_safe = 0;
idtable_args.hash_slot_size = HTABLE_SIZE;
idtable_args.max_elem_num = 0;
idtable_args.expire_time = 0;
idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
idtable_args.key_comp = NULL;
idtable_args.key2index = NULL;
idtable_args.data_free = idtable_free;
idtable_args.data_expire_with_condition = NULL;
idtable_args.recursive = 0;
handle->id_table = MESA_htable_create(&idtable_args, sizeof(idtable_args));
for(i = 0;i < HTABLE_NUM;i++)
{
memset(&indextable_args[i], 0, sizeof(indextable_args[i]));
indextable_args[i].thread_safe = 0;
indextable_args[i].hash_slot_size = HTABLE_SIZE;
indextable_args[i].max_elem_num = 0;
indextable_args[i].expire_time = 0;
indextable_args[i].eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
indextable_args[i].key_comp = key_compare;
indextable_args[i].key2index = NULL;
indextable_args[i].data_free = indextable_free;
indextable_args[i].data_expire_with_condition = NULL;
indextable_args[i].recursive = 0;
handle->index_table[i] = MESA_htable_create(&indextable_args[i], sizeof(indextable_args[i]));
}
return (GIE_handle_t *)(handle);
}
/*
 * Compare two hash keys byte by byte.
 *
 * Keys of different size are never equal; keys of equal size are ordered by
 * memcmp. Only the `size` bytes that belong to each key are read (the
 * previous version read sizeof(long) bytes regardless of the key size and
 * truncated the difference to int).
 *
 * Returns 0 when the keys are identical, non-zero otherwise.
 * NOTE(review): assumes MESA_htable treats 0 as "equal", as the previous
 * subtraction-based version implied - confirm against the library docs.
 */
int key_compare(const uchar * key1, uint size1, const uchar * key2, uint size2)
{
    if(size1 != size2)
    {
        return (size1 < size2) ? -1 : 1;
    }
    if(size1 == 0)
    {
        return 0;
    }
    return memcmp(key1, key2, size1);
}
/*
 * Release one id-table entry: frees the digest copy and the entry itself.
 * The tag pointer is only cleared, never freed.
 */
void idtable_free(void * data)
{
    struct id_table_data * entry = (struct id_table_data *)data;

    free(entry->sfh);
    entry->sfh = NULL;
    entry->tag = NULL;
    free(entry);
}
/*
 * Prune one posting list: every node whose digest still has more than
 * GRAM_CNT_THRESHOLD grams is unlinked and freed, and that digest's
 * gram_cnt is decremented. If the list becomes empty, the entry stored
 * under `key` is deleted from the hash table as well.
 */
void indextable_delete_with_threshold(MESA_htable_handle * htable_handle, struct index_table_data * tmp, char * key)
{
    int key_length = strnlen(key,KEY_MAX_LENGTH);
    struct linklist_node * cur = NULL;
    struct linklist_node * following = NULL;

    for(cur = TAILQ_FIRST(tmp->listhead); cur != NULL; cur = following)
    {
        /* Remember the successor before the current node may be freed. */
        following = TAILQ_NEXT(cur,listentry);
        if(cur->basicinfo->gram_cnt > GRAM_CNT_THRESHOLD)
        {
            TAILQ_REMOVE(tmp->listhead, cur, listentry);
            cur->basicinfo->gram_cnt--;
            tmp->cnt--;
            if(TAILQ_EMPTY(tmp->listhead)
                && MESA_htable_del(htable_handle, (const uchar *)(key), key_length, indextable_free) < 0)
            {
                printf("indextable backtrack delete error!\n");
                assert(0);
                return;
            }
            free(cur->position);
            free(cur);
        }
    }
}
/*
 * Release one index-table entry: every node of the posting list (with its
 * position array), the list head, and the entry itself.
 */
void indextable_free(void * data)
{
    struct index_table_data * entry = (struct index_table_data *)data;
    struct linklist_node * node = NULL;

    /* Pop from the front until the list is empty. */
    while((node = TAILQ_FIRST(entry->listhead)) != NULL)
    {
        TAILQ_REMOVE(entry->listhead, node, listentry);
        entry->cnt--;
        free(node->position);
        free(node);
    }
    free(entry->listhead);
    entry->listhead = NULL;
    free(entry);
}
/*
 * Same teardown as indextable_free, but first records the entry in the
 * file-scope statistics: hash_cnt counts entries, cnt_sum accumulates their
 * node counters.
 */
void indextable_free_cnt(void * data)
{
    struct index_table_data * entry = (struct index_table_data *)data;

    hash_cnt++;
    cnt_sum += entry->cnt;
    indextable_free(data);
}
/* Iterate callback for the id table: prints the id of one entry. */
void print_item_iterate_idtable(const uchar * key, uint size, void * data, void * user)
{
    const struct id_table_data * entry = (const struct id_table_data *)data;

    printf("id:%u\n", entry->id);
}
/*
 * Iterate callback for an index table: prints the gram key, its node
 * counter, and for every node the digest id and the stored positions.
 *
 * The key is printed with an explicit length because keys are stored with a
 * byte count (see the MESA_htable_add calls) and are not known to be
 * NUL-terminated inside the table.
 */
void print_item_iterate(const uchar * key, uint size, void * data, void * user)
{
    struct index_table_data * index_data = (struct index_table_data *)data;
    struct linklist_node * tmp_node = NULL;
    int i = 0;

    printf("%.*s %d\n", (int)size, (const char *)key, index_data->cnt);
    TAILQ_FOREACH(tmp_node, index_data->listhead, listentry)
    {
        printf("id = %u\n",tmp_node->basicinfo->id);
        printf("position is :\n");
        for(i = 0;i < tmp_node->index;i++)
        {
            printf("%d ",tmp_node->position[i]);
        }
        printf("\n");
    }
    printf("\n");
}
/*
 * Weighted edit distance between s1[0..s1len) and s2[0..s2len), using the
 * EDIT_DISTN_INSERT_COST / EDIT_DISTN_REMOVE_COST / EDIT_DISTN_REPLACE_COST
 * weights and two rolling rows of the dynamic-programming table.
 *
 * Negative lengths are treated as 0. If the work buffer cannot be
 * allocated, the cost of the trivial edit script (one INSERT_COST per s1
 * character plus one REMOVE_COST per s2 character) is returned, which is an
 * upper bound of the real distance.
 */
int edit_distn(const char *s1, int s1len, const char *s2, int s2len)
{
    int * rows = NULL;
    int * prev = NULL;
    int * curr = NULL;
    int * swap = NULL;
    int i1 = 0, i2 = 0;
    int ret = 0;
    size_t row_len = 0;

    if(s1len < 0)
    {
        s1len = 0;
    }
    if(s2len < 0)
    {
        s2len = 0;
    }
    /* Both rows live in one allocation; each needs s2len + 1 cells. */
    row_len = (size_t)s2len + 1;
    rows = (int *)malloc(2 * row_len * sizeof(int));
    if(rows == NULL)
    {
        return s1len * EDIT_DISTN_INSERT_COST + s2len * EDIT_DISTN_REMOVE_COST;
    }
    prev = rows;
    curr = rows + row_len;
    for(i2 = 0; i2 <= s2len; i2++)
    {
        prev[i2] = i2 * EDIT_DISTN_REMOVE_COST;
    }
    for(i1 = 0; i1 < s1len; i1++)
    {
        curr[0] = (i1 + 1) * EDIT_DISTN_INSERT_COST;
        for(i2 = 0; i2 < s2len; i2++)
        {
            int cost_a = prev[i2+1] + EDIT_DISTN_INSERT_COST;
            int cost_d = curr[i2] + EDIT_DISTN_REMOVE_COST;
            int cost_r = prev[i2] + (s1[i1] == s2[i2] ? 0 : EDIT_DISTN_REPLACE_COST);
            curr[i2+1] = MIN(MIN(cost_a, cost_d), cost_r);
        }
        /* The row just filled becomes the "previous" row of the next pass. */
        swap = prev;
        prev = curr;
        curr = swap;
    }
    ret = prev[s2len];
    free(rows);
    return ret;
}
/*
 * Tear down an engine: destroys every index table (through
 * indextable_free_cnt), then the id table, then frees the handle itself.
 */
void GIE_destory(GIE_handle_t * handle)
{
    GIE_handle_inner_t * engine = (GIE_handle_inner_t *)(handle);
    int shard;

    for(shard = 0; shard < HTABLE_NUM; shard++)
    {
        MESA_htable_destroy(engine->index_table[shard], indextable_free_cnt);
    }
    MESA_htable_destroy(engine->id_table, idtable_free);
    free(engine);
}
/*
 * Cut a string into overlapping grams and queue them for indexing.
 *
 * str_begin / str_length : text to cut; at most str_length characters are
 *                          used and scanning stops at the first NUL.
 * i                      : index of the digest in the caller's array,
 *                          stored in every queued node.
 * gram_value             : gram length in characters.
 * gram_cnt               : receives the number of grams queued.
 * to_process_list        : HTABLE_NUM queues; a gram goes to the queue
 *                          selected by the sum of its bytes modulo
 *                          HTABLE_NUM.
 * blocksize              : stored in every queued node.
 *
 * Returns 1 when at least one gram was queued, 0 otherwise (text shorter
 * than one gram, invalid arguments, or out of memory before the first gram).
 *
 * The length is clamped with strnlen BEFORE it is compared with gram_value,
 * so a string that ends early can no longer make the unsigned gram count
 * wrap around. Bytes are summed as unsigned char so the queue index cannot
 * become negative.
 */
int grab_key_set(char * str_begin,short str_length,int i,unsigned int gram_value,short * gram_cnt,struct KL** to_process_list, unsigned long long blocksize)
{
    size_t usable_len = 0;
    size_t gram_total = 0;
    size_t k = 0;
    unsigned int j = 0;

    if(str_begin == NULL || str_length <= 0 || gram_value == 0)
    {
        return 0;
    }
    usable_len = strnlen(str_begin, (size_t)str_length);
    if(usable_len < gram_value)
    {
        return 0;
    }
    gram_total = usable_len - gram_value + 1;
    for(k = 0; k < gram_total; k++)
    {
        const char * gram = str_begin + k;
        unsigned int sum = 0;
        struct key_list_node * tmp_node = (struct key_list_node *)calloc(1,sizeof(struct key_list_node));

        if(tmp_node == NULL)
        {
            break;
        }
        tmp_node->key = (char *)calloc(gram_value+1,sizeof(char));
        if(tmp_node->key == NULL)
        {
            free(tmp_node);
            break;
        }
        memcpy(tmp_node->key, gram, gram_value);
        for(j = 0; j < gram_value; j++)
        {
            sum += (unsigned char)gram[j];
        }
        tmp_node->digest_id = i;
        tmp_node->pos = (int)k;
        tmp_node->blocksize = blocksize;
        TAILQ_INSERT_TAIL(to_process_list[sum%HTABLE_NUM], tmp_node, keylistentry);
    }
    /* k is the number of grams actually queued (smaller than gram_total only on allocation failure). */
    *gram_cnt = (short)k;
    return (k > 0) ? 1 : 0;
}
/*
 * Queue the grams of a two-part digest.
 *
 * For each of the (at most) two parts: the block size is read from the head
 * of the part, the cursor moves past the ':' that follows it, the text up
 * to the first '[' (or NUL) is handed to grab_key_set, and the cursor moves
 * past the next '#'.
 *
 * Returns 0 when the first part is shorter than one gram, 1 otherwise.
 * A too-short second part is skipped silently.
 */
int sfh_grab_key_set(char *sfh,short sfh_length,int i,unsigned int gram_value,short * gram_cnt,struct KL** to_process_list)
{
    char * cursor = sfh;
    int part;

    for(part = 0; part < 2; part++)
    {
        unsigned long long part_blocksize = get_blocksize_from_head(cursor, sfh_length);
        unsigned int text_len = 0;

        /* Step over the block-size header, including its ':' terminator. */
        while(*cursor != '\0')
        {
            char seen = *cursor++;
            if(seen == ':')
            {
                break;
            }
        }
        text_len = get_real_length(cursor, sfh_length);
        if(text_len < gram_value)
        {
            if(part == 0)
            {
                return 0;
            }
            continue;
        }
        grab_key_set(cursor, text_len, i, gram_value, gram_cnt, to_process_list, part_blocksize);
        /* Step over the rest of this part, including the '#' separator. */
        while(*cursor != '\0')
        {
            char seen = *cursor++;
            if(seen == '#')
            {
                break;
            }
        }
    }
    return 1;
}
/*
 * Release `size` pending-key queues: every node with its key string, then
 * the queue head itself. Each array slot is reset to NULL.
 */
void free_key_set(struct KL ** to_process_list,int size)
{
    int slot;

    for(slot = 0; slot < size; slot++)
    {
        struct KL * queue = to_process_list[slot];
        struct key_list_node * node = NULL;

        while((node = TAILQ_FIRST(queue)) != NULL)
        {
            TAILQ_REMOVE(queue, node, keylistentry);
            free(node->key);
            free(node);
        }
        free(queue);
        to_process_list[slot] = NULL;
    }
}
/*
 * Apply a batch of insert / delete operations.
 *
 * The update never modifies the live tables. It works in three phases:
 *   1. The id table is copied; inserts and deletes are applied to the copy
 *      and the grams of every affected digest are queued per index table.
 *   2. Each index table is copied (nodes whose digest is no longer in the
 *      id copy are dropped by the copy callback) and the queued grams are
 *      applied to the copy.
 *   3. The handle is switched to the copies and the old tables are
 *      destroyed.
 *
 * handle  : engine created by GIE_create.
 * digests : array of `size` operations.
 *
 * Returns the number of operations counted as successful: 1 per insert that
 * reached the id table, plus the key-grab result (0 or 1) per delete.
 *
 * Differences from the previous version: a queued gram whose digest cannot
 * be found in the id copy is skipped instead of being inserted with a NULL
 * owner; the per-digest grab result is reset for every digest; the two
 * parameter structs live on the stack instead of in unchecked heap blocks.
 */
int GIE_update(GIE_handle_t * handle,GIE_digest_t * * digests,int size)
{
    GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle);
    struct id_table_data * info = NULL;
    int success_cnt = 0;
    int m = 0, i = 0, grab_ret = 0;
    short gram_cnt = 0;
    unsigned int input_fh_len = 0;
    unsigned int gram_value = _handle->user_gram_value;
    struct KL* to_process_list[HTABLE_NUM];
    MESA_htable_handle htable_index_copy;
    MESA_htable_handle htable_id_copy;
    MESA_htable_handle htable_tmp_id = NULL;
    MESA_htable_handle garbage_htable[HTABLE_NUM];
    struct htable_handle copied_id_para;
    struct htable_handle copied_index_para;
    unsigned int digest_id = 0;
    struct id_table_data * tmp_info = NULL;

    /* Phase 1: work on a private copy of the id table. */
    copied_id_para.runtime_table = _handle->id_table;
    copied_id_para.para = NULL;
    htable_id_copy = copy_htable((void *)&copied_id_para, copy_idtable_item_iterate,idtable_free);
    for(m = 0;m < HTABLE_NUM;m++)
    {
        to_process_list[m]=(struct KL*)calloc(1,sizeof(struct KL));
        TAILQ_INIT(to_process_list[m]);
    }
    for(i = 0; i < size; i++)
    {
        switch(digests[i]->operation)
        {
            case GIE_INSERT_OPT:
            {
                /* Do not let the result of a previous digest leak into this one. */
                grab_ret = 0;
                gram_cnt = 0;
                if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
                {
                    grab_ret = sfh_grab_key_set(digests[i]->sfh,digests[i]->sfh_length,i,gram_value,&gram_cnt,to_process_list);
                }
                else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
                {
                    grab_ret = grab_key_set(digests[i]->sfh,digests[i]->sfh_length,i,gram_value,&gram_cnt,to_process_list,0);
                }
                if(grab_ret == 0)
                {
                    continue;
                }
                info = (struct id_table_data *)calloc(1,sizeof(struct id_table_data));
                input_fh_len = digests[i]->sfh_length;
                info->sfh = (char *)calloc(input_fh_len + 1,sizeof(char));
                memcpy(info->sfh, digests[i]->sfh, input_fh_len);
                _handle->mem_occupy += sizeof(struct id_table_data) + sizeof(char)*(input_fh_len+1);
                info->sfh_length = digests[i]->sfh_length;
                info->gram_cnt = gram_cnt;
                info->tag = digests[i]->tag;
                info->id = digests[i]->id;
                info->cfds_lvl = digests[i]->cfds_lvl;
                if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
                {
                    info->blocksize = get_blocksize_from_head(digests[i]->sfh, digests[i]->sfh_length);
                }
                else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
                {
                    info->blocksize = 0;
                }
                if(MESA_htable_add(htable_id_copy, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), (const void *)info) < 0)
                {
                    /*
                     * NOTE(review): the grams of this digest are already
                     * queued; if the add failed because the id exists, they
                     * will be attached to the existing entry in phase 2.
                     */
                    _handle->mem_occupy -= (sizeof(struct id_table_data) + sizeof(char)*(input_fh_len+1));
                    free(info->sfh);
                    info->sfh = NULL;
                    free(info);
                    info = NULL;
                    continue;
                }
                success_cnt ++;
                break;
            }
            case GIE_DELETE_OPT:
            {
                struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(htable_id_copy, \
                        (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id));
                if(ret == NULL)
                {
                    break;
                }
                /* Queue the grams of the stored digest so phase 2 can unlink them. */
                if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
                {
                    success_cnt += sfh_grab_key_set(ret->sfh,ret->sfh_length,i,gram_value,&gram_cnt,to_process_list);
                }
                else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
                {
                    success_cnt += grab_key_set(ret->sfh,ret->sfh_length,i,gram_value,&gram_cnt,to_process_list,0);
                }
                if(MESA_htable_del(htable_id_copy, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free) < 0)
                {
                    printf("delete id failed!");
                    assert(0);
                }
                break;
            }
            default:
                break;
        }
    }

    /* Phase 2: rebuild every index table from its copy plus the queued grams. */
    for(i = 0;i < HTABLE_NUM;i++)
    {
        struct key_list_node * tmp_node;

        copied_index_para.runtime_table = _handle->index_table[i];
        copied_index_para.para = htable_id_copy;
        htable_index_copy = copy_htable((void *)&copied_index_para,copy_indextable_item_iterate,indextable_free);
        TAILQ_FOREACH(tmp_node, to_process_list[i], keylistentry)
        {
            digest_id = tmp_node->digest_id;
            if(digests[digest_id]->operation == GIE_INSERT_OPT)
            {
                tmp_info =(struct id_table_data *)MESA_htable_search(htable_id_copy, (const uchar *)(&(digests[digest_id])->id), \
                        sizeof((digests[digest_id])->id));
                if(tmp_info == NULL)
                {
                    printf("id %u not insert\n",digests[digest_id]->id);
                    /* Without an owner entry the gram must not be indexed. */
                    continue;
                }
                if(GIE_insert_indextable(htable_index_copy, tmp_info, tmp_node->key, tmp_node->pos,tmp_node->blocksize) < 0)
                {
                    printf("insert %d indextable failed!\n",digests[digest_id]->id);
                    continue;
                }
            }
            else if(digests[digest_id]->operation == GIE_DELETE_OPT)
            {
                if(GIE_delete_from_indextable_by_key(htable_index_copy, tmp_node->key, (digests[digest_id])->id) < 0)
                {
                    printf("delete %d indextable failed!\n",digests[digest_id]->id);
                    continue;
                }
            }
        }
        garbage_htable[i] = _handle->index_table[i];
        _handle->index_table[i] = htable_index_copy;
    }

    /* Phase 3: publish the new id table and dispose of the old tables. */
    htable_tmp_id = _handle->id_table;
    _handle->id_table = htable_id_copy;
    /* NOTE(review): presumably a grace period for readers still using the old tables - confirm; it is not a synchronization primitive. */
    usleep(200);
    MESA_htable_destroy(htable_tmp_id, idtable_free);
    for(i=0;i<HTABLE_NUM;i++)
    {
        MESA_htable_destroy(garbage_htable[i], indextable_free_cnt);
    }
    free_key_set(to_process_list,HTABLE_NUM);
    return success_cnt;
}
/*
 * Build a new hash table and fill it by iterating over an existing one.
 *
 * htable_para : struct htable_handle *; runtime_table is the table to copy
 *               from, para is forwarded untouched to the callback.
 * func        : iterate callback; it receives a struct htable_handle whose
 *               runtime_table is the NEW table and whose para is the
 *               caller's para, and is responsible for adding the copies.
 * free_fuc    : data destructor installed on the new table.
 *
 * Returns the new table. An iteration error is only reported on stdout.
 *
 * The callback argument lives on the stack: it is only needed while
 * MESA_htable_iterate runs, so the previous unchecked heap allocation was
 * unnecessary.
 */
MESA_htable_handle copy_htable(void * htable_para,void (* func)(const uchar * key, uint size, void * data, void *user),void (*free_fuc)(void * data))
{
    struct htable_handle * htable_copied_para = (struct htable_handle *)htable_para;
    struct htable_handle iterate_para;
    MESA_htable_handle copy_htable_handle;
    MESA_htable_create_args_t copy_table_args;

    memset(&copy_table_args, 0, sizeof(copy_table_args));
    copy_table_args.thread_safe = 0;
    copy_table_args.hash_slot_size = HTABLE_SIZE;
    copy_table_args.max_elem_num = 0;
    copy_table_args.expire_time = 0;
    copy_table_args.eliminate_type = HASH_ELIMINATE_ALGO_FIFO;
    copy_table_args.key_comp = NULL;
    copy_table_args.key2index = NULL;
    copy_table_args.data_free = free_fuc;
    copy_table_args.data_expire_with_condition = NULL;
    copy_table_args.recursive = 0;
    copy_htable_handle = MESA_htable_create(&copy_table_args, sizeof(copy_table_args));

    iterate_para.runtime_table = copy_htable_handle;
    iterate_para.para = htable_copied_para->para;
    if(MESA_htable_iterate(htable_copied_para->runtime_table, func, &iterate_para) == -1)
    {
        printf("iterate error!\n");
    }
    return copy_htable_handle;
}
/*
 * Iterate callback used by copy_htable for index tables: deep-copies one
 * posting list into the destination table.
 *
 * user is a struct htable_handle: runtime_table is the destination index
 * table, para is the id table in which every node's digest id is looked up.
 * Nodes whose digest is not found there are not copied, and copied nodes
 * point at the entry found in that id table.
 *
 * Differences from the previous version:
 *  - cnt of the copy is the number of nodes actually copied, not the
 *    counter of the source list;
 *  - a list that ends up empty is not inserted at all, matching the rest of
 *    the file, which deletes an entry as soon as its list becomes empty;
 *  - allocation failures skip the affected node / entry instead of
 *    dereferencing NULL, and a failed MESA_htable_add frees the copy.
 */
void copy_indextable_item_iterate(const uchar * key, uint size, void * data, void * user)
{
    struct index_table_data * index_data = (struct index_table_data *)data;
    struct htable_handle * htable_copied_para = (struct htable_handle *)user;
    struct index_table_data * index_data_copy = NULL;
    struct TQ * head = NULL;
    struct linklist_node * tmp_node = NULL;
    int i = 0;

    index_data_copy = (struct index_table_data *)calloc(1, sizeof(struct index_table_data));
    head = (struct TQ *)calloc(1, sizeof(struct TQ));
    if(index_data_copy == NULL || head == NULL)
    {
        free(index_data_copy);
        free(head);
        return;
    }
    TAILQ_INIT(head);
    index_data_copy->listhead = head;
    index_data_copy->cnt = 0;
    TAILQ_FOREACH(tmp_node, index_data->listhead, listentry)
    {
        struct linklist_node * node_data = NULL;
        struct id_table_data * owner = (struct id_table_data *)MESA_htable_search(htable_copied_para->para,
                (const uchar *)(&(tmp_node->basicinfo->id)), sizeof(tmp_node->basicinfo->id));

        if(owner == NULL)
        {
            /* The digest is gone from the id table: drop its node. */
            continue;
        }
        node_data = (struct linklist_node *)calloc(1,sizeof(struct linklist_node));
        if(node_data == NULL)
        {
            continue;
        }
        node_data->size = tmp_node->size;
        node_data->position = (short *)calloc(node_data->size, sizeof(short));
        if(node_data->position == NULL)
        {
            free(node_data);
            continue;
        }
        for(i = 0;i < tmp_node->index;i++)
        {
            node_data->position[i] = tmp_node->position[i];
        }
        node_data->basicinfo = owner;
        node_data->index = tmp_node->index;
        node_data->blocksize = tmp_node->blocksize;
        TAILQ_INSERT_TAIL(head, node_data, listentry);
        index_data_copy->cnt++;
    }
    if(TAILQ_EMPTY(head)
        || MESA_htable_add(htable_copied_para->runtime_table, key, size, (const void *)index_data_copy) < 0)
    {
        indextable_free(index_data_copy);
    }
}
/*
 * Iterate callback used by copy_htable for the id table: duplicates one
 * entry and adds the duplicate to the destination table
 * (user->runtime_table) under the same id.
 *
 * Differences from the previous version:
 *  - the digest copy gets sfh_length + 1 bytes so it stays NUL-terminated,
 *    like the buffer created on insert; the string scanners in this file
 *    rely on the terminator;
 *  - the tag pointer is carried over (it was left NULL, so every entry lost
 *    its tag after the first update); the tag is never freed by
 *    idtable_free, so sharing the pointer is safe;
 *  - allocation failures and a failed MESA_htable_add no longer crash or
 *    leak.
 */
void copy_idtable_item_iterate(const uchar * key, uint size, void * data, void * user)
{
    struct id_table_data * id_data = (struct id_table_data *)data;
    struct htable_handle * htable_para = (struct htable_handle *)user;
    struct id_table_data * id_data_copy = (struct id_table_data *)calloc(1, sizeof(struct id_table_data));
    size_t sfh_bytes = 0;

    if(id_data_copy == NULL)
    {
        return;
    }
    id_data_copy->blocksize = id_data->blocksize;
    id_data_copy->cfds_lvl = id_data->cfds_lvl;
    id_data_copy->gram_cnt = id_data->gram_cnt;
    id_data_copy->id = id_data->id;
    id_data_copy->sfh_length = id_data->sfh_length;
    id_data_copy->tag = id_data->tag;
    if(id_data->sfh_length > 0)
    {
        sfh_bytes = (size_t)id_data->sfh_length;
    }
    id_data_copy->sfh = (char *)calloc(sfh_bytes + 1,sizeof(char));
    if(id_data_copy->sfh == NULL)
    {
        free(id_data_copy);
        return;
    }
    if(id_data->sfh != NULL)
    {
        memcpy(id_data_copy->sfh,id_data->sfh,sfh_bytes);
    }
    if(MESA_htable_add(htable_para->runtime_table, (const uchar *)(&(id_data_copy->id)), sizeof(id_data_copy->id), (const void *)id_data_copy) < 0)
    {
        idtable_free(id_data_copy);
    }
}
/*
 * Record that gram `key` occurs at position `index` in the digest described
 * by `info`.
 *
 * If the key already has a posting list, the list is kept sorted by
 * ascending digest id:
 *  - a node with the same id and block size just gets the position appended
 *    (its array is doubled when full);
 *  - otherwise a new node is linked in front of the first node with a larger
 *    id, or at the tail. When the list counter reaches CNT_MAX the list is
 *    pruned with indextable_delete_with_threshold.
 * If the key is new, a fresh list holding one node is added to the table.
 *
 * Returns 0 on success, -1 when `info` or `key` is NULL, memory runs out, or
 * the table refuses the new entry.
 *
 * Differences from the previous version: a NULL `info` is rejected instead
 * of dereferenced; realloc no longer overwrites the only pointer to the
 * position array; a failed MESA_htable_add frees what was allocated; the
 * node is only allocated when it is actually needed.
 */
int GIE_insert_indextable(MESA_htable_handle htable_copy, struct id_table_data * info, char * key, unsigned int index, unsigned long long blocksize)
{
    int key_length = 0;
    struct linklist_node * node_data = NULL;
    struct index_table_data * ret = NULL;
    struct linklist_node * tmp = NULL;

    if(info == NULL || key == NULL)
    {
        return -1;
    }
    key_length = strnlen(key,KEY_MAX_LENGTH);
    ret = (struct index_table_data *)(MESA_htable_search(htable_copy, \
                (const uchar *)(key), key_length));
    if(ret != NULL)
    {
        TAILQ_FOREACH(tmp, ret->listhead, listentry)
        {
            if(tmp->basicinfo->id > info->id)
            {
                /* First node with a larger id: the new node goes in front of it. */
                break;
            }
            if(tmp->basicinfo->id == info->id && tmp->blocksize == blocksize)
            {
                if(tmp->index >= tmp->size)
                {
                    short * grown = (short *)realloc(tmp->position, (size_t)tmp->size * 2 * sizeof(short));
                    if(grown == NULL)
                    {
                        return -1;
                    }
                    tmp->position = grown;
                    tmp->size *= 2;
                }
                tmp->position[(tmp->index)++] = index;
                return 0;
            }
        }
    }

    /* A new node is required from here on. */
    node_data = (struct linklist_node *)calloc(1,sizeof(struct linklist_node));
    if(node_data == NULL)
    {
        return -1;
    }
    node_data->size = GRAM_CNT_MAX;
    node_data->position = (short *)calloc(node_data->size, sizeof(short));
    if(node_data->position == NULL)
    {
        free(node_data);
        return -1;
    }
    node_data->basicinfo = info;
    node_data->index = 0;
    node_data->position[(node_data->index)++] = index;
    node_data->blocksize = blocksize;

    if(ret != NULL)
    {
        /* tmp is NULL when the loop above ran to the end of the list. */
        if(tmp != NULL)
        {
            TAILQ_INSERT_BEFORE(tmp, node_data, listentry);
        }
        else
        {
            TAILQ_INSERT_TAIL(ret->listhead, node_data, listentry);
        }
        ret->cnt ++;
        if(ret->cnt >= CNT_MAX)
        {
            /* May free nodes (including the new one) and even `ret`; nothing is touched afterwards. */
            indextable_delete_with_threshold(htable_copy,ret,key);
        }
        return 0;
    }
    else
    {
        struct index_table_data * index_data = (struct index_table_data *)calloc(1, sizeof(struct index_table_data));
        struct TQ * head = (struct TQ *)calloc(1, sizeof(struct TQ));

        if(index_data == NULL || head == NULL)
        {
            free(index_data);
            free(head);
            free(node_data->position);
            free(node_data);
            return -1;
        }
        index_data->listhead = head;
        TAILQ_INIT(head);
        TAILQ_INSERT_TAIL(head, node_data, listentry);
        index_data->cnt = 1;
        if(MESA_htable_add(htable_copy, (const uchar *)(key), key_length, (const void *)index_data) < 0)
        {
            printf("add index_table failed!\n");
            /* Releases the node, the list head and the entry. */
            indextable_free(index_data);
            assert(0);
            return -1;
        }
    }
    return 0;
}
/*
 * Remove the grams of one stored digest from the live index tables.
 *
 * The digest is looked up in the id table by digest->id; the text after
 * the first ':' and up to the first '[' (or NUL) is cut into grams, and
 * each gram's node for this id is removed from the index table the gram
 * hashes to. The id-table entry itself is left in place.
 *
 * Returns 1 when the digest was found, -1 when it does not exist.
 *
 * The previous version passed the engine handle itself to
 * GIE_delete_from_indextable_by_key, which expects a hash table. The table
 * is now chosen the same way grab_key_set chooses a queue: sum of the gram
 * bytes modulo HTABLE_NUM.
 */
int GIE_delete(GIE_handle_inner_t * _handle, GIE_digest_t * digest)
{
    struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(_handle->id_table, \
            (const uchar *)(&(digest->id)), sizeof(digest->id));
    if(ret == NULL)
    {
        printf("del %d doesn't exist!\n",digest->id);
        return -1;
    }

    int gram_value = _handle->user_gram_value;
    char key[gram_value+1];
    char * tmp_gram = ret->sfh;
    int gram_cnt = 0;
    int k = 0, j = 0;

    /* Skip the block-size header, including its ':' terminator. */
    while(*tmp_gram != '\0')
    {
        if(*tmp_gram == ':')
        {
            tmp_gram++;
            break;
        }
        tmp_gram++;
    }
    gram_cnt = (int)get_real_length(tmp_gram, ret->sfh_length) - gram_value + 1;
    for(k = 0; k < gram_cnt; k++)
    {
        unsigned int sum = 0;

        memset(key, '\0', gram_value+1);
        memcpy(key, tmp_gram++, gram_value);
        for(j = 0; j < gram_value; j++)
        {
            sum += (unsigned char)key[j];
        }
        if(GIE_delete_from_indextable_by_key(_handle->index_table[sum%HTABLE_NUM], key, digest->id) < 0)
        {
            printf("delete %d indextable failed!\n",digest->id);
            continue;
        }
    }
    return 1;
}
/*
 * Remove every node belonging to digest `id` from the posting list stored
 * under `key`. If that empties the list, the entry is deleted from the hash
 * table.
 *
 * Returns 0 on success (including "key not present" and "no node for this
 * id"), -1 when the emptied entry cannot be deleted from the table.
 *
 * The previous version reset its cursor to NULL after freeing the first
 * matching node, so the walk stopped there and later nodes with the same id
 * (for example one per block size) were left behind.
 */
int GIE_delete_from_indextable_by_key(MESA_htable_handle htable, char * key, unsigned int id)
{
    int key_length = strnlen(key,KEY_MAX_LENGTH);
    int removed = 0;
    struct linklist_node * node = NULL;
    struct linklist_node * next = NULL;
    struct index_table_data * entry = (struct index_table_data *)(MESA_htable_search(htable, \
                (const uchar *)(key), key_length));

    if(entry == NULL)
    {
        return 0;
    }
    for(node = TAILQ_FIRST(entry->listhead); node != NULL; node = next)
    {
        /* Remember the successor before the current node may be freed. */
        next = TAILQ_NEXT(node, listentry);
        if(node->basicinfo->id != id)
        {
            continue;
        }
        TAILQ_REMOVE(entry->listhead, node, listentry);
        entry->cnt--;
        free(node->position);
        free(node);
        removed = 1;
    }
    /* Only drop the entry when this call emptied it; `entry` is not used after the delete. */
    if(removed && TAILQ_EMPTY(entry->listhead))
    {
        if(MESA_htable_del(htable, (const uchar *)(key), key_length, indextable_free) < 0)
        {
            printf("indextable backtrack delete error!\n");
            assert(0);
            return -1;
        }
    }
    return 0;
}
/*
 * qsort comparator for unsigned int ids: plain numeric ascending order.
 *
 * Returns -1, 0 or 1.
 *
 * The previous wrap-around comparison (before/after) is not transitive
 * once ids are spread over more than half of the 32-bit range, and qsort
 * requires a consistent total order. Numeric order also matches the order
 * in which GIE_insert_indextable keeps its posting lists.
 */
int GIE_cmp(const void * a, const void * b)
{
    unsigned int lhs = *(const unsigned int *)a;
    unsigned int rhs = *(const unsigned int *)b;

    return (lhs > rhs) - (lhs < rhs);
}
inline unsigned int get_real_length(const char * string, unsigned int length)
{
unsigned int ret = 0;
const char * tmp_str = string;
while(*tmp_str != '\0')
{
if(*tmp_str == '[')
{
break;
}
tmp_str++;
ret ++;
}
return ret;
}
/*
 * Look up the non-overlapping grams of one query segment and collect the
 * ids of the digests that contain them at a compatible position.
 *
 * query_string / part_query_len : segment to cut into part_query_len /
 *                                 gram_value consecutive grams.
 * index_begin                   : gram index of the first gram; gram n is
 *                                 expected at position (index_begin + n) *
 *                                 gram_value, +/- the configured position
 *                                 accuracy.
 * id_union / union_index / union_size : growable output array, number of
 *                                 used slots and capacity; an id is
 *                                 appended once per matching gram.
 * blocksize                     : only digests with this block size match.
 *
 * Returns the number of grams the segment was cut into (0 when the segment
 * is shorter than one gram). Lookups stop at the first gram that is not in
 * the index.
 *
 * Differences from the previous version: bytes are summed as unsigned char
 * so the table index cannot be negative; a failed realloc stops collecting
 * instead of losing the array; a zero gram size or negative length returns
 * 0; the function has internal linkage because it takes the file-private
 * handle type.
 */
static inline int GIE_part_query(GIE_handle_inner_t * _handle, const char * query_string, int index_begin, int part_query_len,unsigned int ** id_union, unsigned int * union_index, unsigned int * union_size, unsigned long long blocksize)
{
    unsigned int gram_value = _handle->user_gram_value;
    unsigned int position_accuracy = _handle->user_position_accuracy;
    unsigned int chunk_count_max = 0;
    unsigned int chunk = 0;
    unsigned int k = 0;
    int j = 0;

    if(gram_value == 0 || part_query_len < 0 || (unsigned int)part_query_len < gram_value)
    {
        return 0;
    }
    chunk_count_max = (unsigned int)part_query_len/gram_value;
    for(chunk = 0; chunk < chunk_count_max; chunk++)
    {
        /* Position at which this gram is expected inside an indexed digest. */
        unsigned int expected = ((unsigned int)index_begin + chunk) * gram_value;
        unsigned int pos_min = (expected >= position_accuracy) ? (expected - position_accuracy) : 0;
        unsigned int pos_max = expected + position_accuracy;
        unsigned int sum = 0;
        struct index_table_data * ret = NULL;
        struct linklist_node * tmp_node_t = NULL;

        for(k = 0; k < gram_value; k++)
        {
            sum += (unsigned char)query_string[k];
        }
        ret = (struct index_table_data *) MESA_htable_search(_handle->index_table[sum%HTABLE_NUM], \
                (const uchar *)(query_string), strnlen(query_string,gram_value));
        query_string = query_string + gram_value;
        if(ret == NULL)
        {
            break;
        }
        TAILQ_FOREACH(tmp_node_t, ret->listhead, listentry)
        {
            if(blocksize != tmp_node_t->basicinfo->blocksize)
            {
                continue;
            }
            for(j = 0; j < tmp_node_t->index; j++)
            {
                unsigned int pos = (unsigned int)tmp_node_t->position[j];

                if(pos < pos_min || pos > pos_max)
                {
                    continue;
                }
                if((*union_index) >= (*union_size))
                {
                    unsigned int grown_size = (*union_size) * 2;
                    unsigned int * grown = (unsigned int *)realloc(*id_union, (size_t)grown_size*sizeof(unsigned int));
                    if(grown == NULL)
                    {
                        /* Keep what has been collected so far. */
                        return chunk_count_max;
                    }
                    *id_union = grown;
                    *union_size = grown_size;
                }
                (*id_union)[(*union_index)] = tmp_node_t->basicinfo->id;
                (*union_index)++;
                /* One hit per digest and gram is enough. */
                break;
            }
        }
    }
    return chunk_count_max;
}
/*
 * Run GIE_part_query over every "text[left:right]" segment of ONE part of a
 * query digest.
 *
 * fuzzy_string points at the part ("blocksize:text[l:r]text[l:r]...").
 * The header up to the first ':' is skipped; for each segment the first
 * gram index is left / blocksize (minus TOLERENCE_SIZE, not below 0).
 * chunk_cnt is increased by the number of grams queried.
 *
 * Returns the summed length of the segment texts, or 0 when the part has no
 * ':' or the block size is 0.
 *
 * Differences from the previous version: scanning stops at the '#' that
 * ends the part (it used to run on into the next part, whose header text was
 * then queried with this part's block size - GIE_query calls this function
 * once per part); a zero block size no longer divides by zero; a negative
 * `left` is treated as 0; the function has internal linkage because it
 * takes the file-private handle type.
 */
static inline int GIE_gram_with_position(GIE_handle_inner_t * _handle, unsigned long long query_blocksize, const char * fuzzy_string, unsigned int ** id_union,
        unsigned int * union_index,unsigned int * union_size, unsigned int * chunk_cnt)
{
    const char * segment = NULL;
    const char * part_end = NULL;
    int query_actual_len = 0;

    if(query_blocksize == 0)
    {
        return 0;
    }
    segment = strchr(fuzzy_string, ':');
    if(segment == NULL)
    {
        return 0;
    }
    segment++;
    /* NULL when this is the last part. */
    part_end = strchr(segment, '#');
    while(*segment != '\0' && *segment != '#')
    {
        int left = 0;
        int right = 0;
        int part_query_len = 0;
        int index_begin = 0;
        unsigned long long first_gram = 0;
        const char * open = strchr(segment, '[');
        const char * close = NULL;

        if(open == NULL || (part_end != NULL && open > part_end))
        {
            break;
        }
        if(sscanf(open,"[%d:%d]",&left,&right) != 2)
        {
            break;
        }
        close = strchr(open, ']');
        if(close == NULL)
        {
            break;
        }
        part_query_len = (int)(open - segment);
        if(left > 0)
        {
            first_gram = (unsigned long long)left / query_blocksize;
        }
        if(first_gram > TOLERENCE_SIZE)
        {
            index_begin = (int)(first_gram - TOLERENCE_SIZE);
        }
        (*chunk_cnt) += GIE_part_query(_handle, segment, index_begin, part_query_len,
                id_union, union_index, union_size, query_blocksize);
        query_actual_len += part_query_len;
        segment = close + 1;
    }
    return query_actual_len;
}
/*
 * Derive a block size from an original length: BLOCKSIZE_MIN times two to
 * the power floor(log2(orilen / (64 * BLOCKSIZE_MIN))). The division is an
 * integer division, performed before the conversion to double.
 */
inline unsigned long long calc_fh_blocksize(unsigned long long orilen)
{
    const double ratio = orilen/(64 * BLOCKSIZE_MIN);
    const double exponent = floor(log(ratio)/log(2));
    const double scale = pow(2,exponent);

    return (unsigned long long)(scale * BLOCKSIZE_MIN);
}
/*
 * Parse the decimal block size at the start of a digest part.
 *
 * Characters are taken up to the first ':' or NUL, at most str_len of them
 * and at most 99 (the local buffer keeps room for its terminator).
 *
 * Returns the parsed value, or 0 when the head does not start with a number.
 *
 * Differences from the previous version: a 100-character head can no longer
 * fill the buffer without a terminator; strtoull replaces atoi so values
 * above INT_MAX are parsed instead of overflowing; str_len is checked before
 * the first character is read. The definition is deliberately not marked
 * `inline`: with only inline declarations, C99/C11 emits no external
 * definition and non-inlined calls fail to link.
 */
unsigned long long get_blocksize_from_head(const char * fuzzy_string, unsigned int str_len)
{
    char blk[100];
    size_t used = 0;

    if(fuzzy_string == NULL)
    {
        return 0;
    }
    while(used < sizeof(blk) - 1 && used < str_len
            && fuzzy_string[used] != '\0' && fuzzy_string[used] != ':')
    {
        blk[used] = fuzzy_string[used];
        used++;
    }
    blk[used] = '\0';
    return strtoull(blk, NULL, 10);
}
/*
 * Similarity (0..100) between a query digest and an indexed digest, based
 * on the edit distance of the parts that share a block size.
 *
 * Both strings hold up to two parts separated by '#', each starting with
 * "blocksize:". For the first pair of parts whose block sizes are equal,
 * the query's "text[l:r]" segment texts are concatenated and compared with
 * the indexed part's text (up to its first '['):
 *     result = 100 - distance * 100 / (index_len + spliced_query_len)
 *
 * Returns that value, or 0 when no block sizes match, an argument is
 * invalid, memory runs out, or both texts are empty.
 *
 * Differences from the previous version: the division is guarded against a
 * zero denominator; the splice buffer is checked after allocation and never
 * written beyond len1 bytes; the spliced length is tracked directly instead
 * of being recomputed with strnlen.
 */
int GIE_comp_edit_distance(const char *query_str, int len1, const char *index_str, int len2)
{
    int j = 0, t = 0;
    unsigned long long query_blocksize = 0, index_blocksize = 0;
    unsigned int query_real_length = 0, index_real_length = 0;
    const char *query_gram_begin = query_str;
    const char *index_gram_begin = index_str;
    char *splice_str = NULL;
    unsigned int splice_len = 0;
    int edit_distance = 0;
    int ret = 0;
    const char *p = NULL;

    if(query_str == NULL || index_str == NULL || len1 <= 0)
    {
        return 0;
    }
    splice_str = (char *)malloc(sizeof(char)*len1);
    if(splice_str == NULL)
    {
        return 0;
    }
    memset(splice_str,'\0',len1);
    for(j = 0; j < 2; j++)
    {
        index_blocksize = get_blocksize_from_head(index_gram_begin, len2);
        /* Skip the header of the indexed part, including its ':'. */
        while((*index_gram_begin) != '\0')
        {
            if((*index_gram_begin) == ':')
            {
                index_gram_begin++;
                break;
            }
            index_gram_begin++;
        }
        index_real_length = get_real_length(index_gram_begin, len2);
        query_gram_begin = query_str;
        for(t = 0; t < 2; t++)
        {
            query_blocksize = get_blocksize_from_head(query_gram_begin, len1);
            /* Skip the header of the query part, including its ':'. */
            while((*query_gram_begin) != '\0')
            {
                if((*query_gram_begin) == ':')
                {
                    query_gram_begin++;
                    break;
                }
                query_gram_begin++;
            }
            if(query_blocksize == index_blocksize)
            {
                /* Concatenate the segment texts of this query part. */
                while((*query_gram_begin) != '#' && (*query_gram_begin) != '\0')
                {
                    p=strchr(query_gram_begin,'[');
                    if(p == NULL)
                    {
                        break;
                    }
                    query_real_length = p-query_gram_begin;
                    p=strchr(p,']');
                    if(p == NULL)
                    {
                        break;
                    }
                    if(query_real_length > (unsigned int)len1 - splice_len)
                    {
                        /* Would not fit: the query is longer than len1 says. */
                        break;
                    }
                    memcpy(splice_str + splice_len,query_gram_begin,query_real_length);
                    splice_len += query_real_length;
                    query_gram_begin = p+1;
                }
                if(index_real_length + splice_len == 0)
                {
                    /* Nothing to compare on either side. */
                    ret = 0;
                }
                else
                {
                    edit_distance = edit_distn(index_gram_begin, index_real_length, splice_str, splice_len);
                    ret = 100-(edit_distance*100)/(index_real_length + splice_len);
                }
                free(splice_str);
                return ret;
            }
            /* Move on to the next query part (past '#'). */
            while(*query_gram_begin != '\0')
            {
                if(*query_gram_begin == '#')
                {
                    query_gram_begin++;
                    break;
                }
                query_gram_begin++;
            }
        }
        /* Move on to the next indexed part (past '#'). */
        while(*index_gram_begin != '\0')
        {
            if(*index_gram_begin == '#')
            {
                index_gram_begin++;
                break;
            }
            index_gram_begin++;
        }
    }
    free(splice_str);
    return 0;
}
/*
 * Find indexed digests similar to `data`.
 *
 * handle      : engine created by GIE_create.
 * data        : query text, data_len characters. In SFH mode it holds up to
 *               two '#'-separated parts, each "blocksize:text[l:r]...";
 *               in plain mode it is cut into grams directly.
 * results     : output array with room for result_size entries; each hit
 *               receives the confidence, the digest id and the digest's tag.
 * Returns the number of entries written to `results`.
 *
 * Steps: the ids of all digests sharing grams with the query are collected
 * (one occurrence per matching gram), sorted, and grouped. For every group
 * the confidence is count * (query_len - gram + 1) * 10 /
 * (chunk_cnt * digest gram_cnt); when ED_reexamine is 1 it is replaced by an
 * edit-distance based score. A digest is reported when the confidence
 * reaches its own cfds_lvl.
 *
 * Differences from the previous version: the id array is freed on the
 * "block size is 0" early return; the group loop no longer reads the
 * element one past the collected ids; invalid arguments (NULL pointers,
 * non-positive data_len or result_size) return 0 before anything is
 * written.
 */
int GIE_query(GIE_handle_t * handle, const char * data, int data_len, GIE_result_t * results, int result_size)
{
    GIE_handle_inner_t * _handle = (GIE_handle_inner_t *) handle;
    unsigned int i = 0;
    int j = 0;
    unsigned int union_index = 0;
    unsigned int gram_value = 0;
    unsigned int query_actual_len = 0;
    unsigned int union_size = UNION_INIT_SIZE;
    unsigned int chunk_cnt = 0;
    const char *fuzzy_string_begin = data;
    unsigned int * id_union = NULL;
    unsigned long long query_blocksize = 0;
    unsigned int fuzzy_string_len = 0;
    unsigned int current_id = 0;
    unsigned int count = 0;
    struct id_table_data * ret_tmp = NULL;
    short conf = 0;
    int ret_size = 0;
    int edit_distance = 0;

    if(_handle == NULL || data == NULL || data_len <= 0 || results == NULL || result_size <= 0)
    {
        return 0;
    }
    gram_value = _handle->user_gram_value;
    fuzzy_string_len = (unsigned int)data_len;
    id_union = (unsigned int *)calloc(union_size, sizeof(unsigned int));
    if(id_union == NULL)
    {
        return 0;
    }

    /* Collect candidate ids. */
    if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
    {
        for(j = 0;j < 2;j++)
        {
            query_blocksize = get_blocksize_from_head(fuzzy_string_begin, fuzzy_string_len);
            if(query_blocksize == 0)
            {
                free(id_union);
                return 0;
            }
            query_actual_len += GIE_gram_with_position(_handle, query_blocksize, fuzzy_string_begin, &id_union, &union_index, &union_size, &chunk_cnt);
            /* Advance to the next part (past '#'). */
            while(*fuzzy_string_begin != '#' && *fuzzy_string_begin != '\0')
            {
                fuzzy_string_begin++;
            }
            if(*fuzzy_string_begin == '#')
            {
                fuzzy_string_begin++;
            }
        }
    }
    else if(_handle->input_format == GIE_INPUT_FORMAT_PLAIN)
    {
        query_actual_len = fuzzy_string_len;
        chunk_cnt = GIE_part_query(_handle, fuzzy_string_begin, 0, query_actual_len, &id_union, &union_index, &union_size, 0);
    }
    if(union_index == 0)
    {
        free(id_union);
        return 0;
    }

    /* Sort so that equal ids are adjacent, then evaluate group by group. */
    qsort(id_union, union_index, sizeof(id_union[0]), GIE_cmp);
    current_id = id_union[0];
    for(i = 0; i <= union_index; i++)
    {
        if(i < union_index && id_union[i] == current_id)
        {
            count++;
            continue;
        }
        /* End of a group (or of the array): score `current_id`. */
        ret_tmp = (struct id_table_data *) MESA_htable_search(_handle->id_table, \
                (const uchar *)(&(current_id)), sizeof(current_id));
        if(ret_tmp == NULL)
        {
            break;
        }
        if(ret_tmp->gram_cnt == 0||chunk_cnt == 0)
        {
            conf = 0;
        }
        else
        {
            conf = (count*(query_actual_len-gram_value+1)*10)/(chunk_cnt*(ret_tmp->gram_cnt));
        }
        if(_handle->ED_reexamine == 1)
        {
            if(_handle->input_format == GIE_INPUT_FORMAT_SFH)
            {
                conf = GIE_comp_edit_distance(data, fuzzy_string_len, ret_tmp->sfh, ret_tmp->sfh_length);
            }
            else
            {
                int length = ret_tmp->sfh_length;
                edit_distance = edit_distn(data, fuzzy_string_len,ret_tmp->sfh,length);
                conf = 100-(edit_distance*100)/(fuzzy_string_len + length);
            }
        }
        if(conf >= ret_tmp->cfds_lvl)
        {
            results[ret_size].cfds_lvl = conf;
            results[ret_size].id = current_id;
            results[ret_size].tag = ret_tmp->tag;
            ret_size++;
        }
        if(ret_size == result_size)
        {
            break;
        }
        if(i < union_index)
        {
            /* Start the next group with the element that ended this one. */
            current_id = id_union[i];
            count = 1;
        }
    }
    free(id_union);
    return ret_size;
}
/*
 * Report an engine statistic. MEM_OCCUPY returns the handle's mem_occupy
 * counter; every other type returns 0.
 */
unsigned long long GIE_status(GIE_handle_t * handle, int type)
{
    GIE_handle_inner_t * engine = (GIE_handle_inner_t *)handle;

    if(type == MEM_OCCUPY)
    {
        return engine->mem_occupy;
    }
    return 0;
}