将mesafuzzy和GIE的源代码集成进入Maat,并修改Makefile。

This commit is contained in:
zhengchao
2015-11-13 11:41:52 +08:00
parent 0c9449fd3f
commit f1da6a2b81
7 changed files with 2845 additions and 4 deletions

View File

@@ -1,11 +1,11 @@
#opt: OPTFLAGS = -O2
#export OPTFLAGS
CC = g++
CC = gcc
CCC = g++
CFLAGS = -Wall -g -fPIC
CFLAGS += $(OPTFLAGS)
LDFLAGS = -lMESA_handle_logger -lMESA_htable -lpthread
LDFLAGS = -lMESA_handle_logger -lMESA_htable -lpthread -lm
MAILLIB = ../lib
G_H_DIR =../inc_internal
@@ -13,7 +13,8 @@ H_DIR =-I$(G_H_DIR) -I../../inc
LIBMAAT = libmaatframe.a
LIBMAAT_SO = libmaatframe.so
OBJS=config_monitor.o Maat_rule.o Maat_api.o UniversalBoolMatch.o dynamic_array.o cJSON.o json2iris.o map_str2int.o
OBJS=config_monitor.o Maat_rule.o Maat_api.o UniversalBoolMatch.o dynamic_array.o cJSON.o json2iris.o map_str2int.o\
interval_index.o great_index_engine.o mesa_fuzzy.o
.c.o:
$(CC) -c $(CFLAGS) -I. $(H_DIR) $<

View File

@@ -0,0 +1,864 @@
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<math.h>
#include<assert.h>
#include<MESA/MESA_htable.h>
#include "great_index_engine.h"
#include "queue.h"
/* Version marker symbol exported by this file; its value is always 1. */
int GIE_VERSION_1_0_20151109=1;
/*
 * Slot count used for both hash tables in GIE_create().
 * Parenthesized so the macro stays a single operand inside larger
 * expressions (e.g. `x / HTABLE_SIZE`).
 */
#define HTABLE_SIZE (1024*1024)
/* Not referenced by the code visible in this file. */
#define MAX 10000
/* Flag values for GIE_insert_indextable(): which backtrack pointer to record. */
#define FIRST_INSERT 1
#define SECOND_INSERT 0
/* Extra blocks of slack applied around a position range when slicing a stored hash. */
#define TOLERENCE_SIZE 0
/* Scale factor of the confidence value computed by GIE_query(). */
#define CONF_MAX 10
/* Multiplier used by calc_fh_blocksize(). */
#define BLOCKSIZE_MIN 3
/* Sentinel stored in prev_value/next_value when an index key has no neighbour. */
#define MAX_UINT64 (0xFFFFFFFFFFFFFFFF)
/* Private state behind the opaque GIE_handle_t pointer returned by GIE_create(). */
typedef struct
{
/* Index interval copied from the create parameters; index keys are multiples of it. */
unsigned long long user_precision;
//int user_confidence_level_threshold;
/* Relative tolerance GIE_query() applies to origin_len to build its length window. */
double user_query_accuracy;
/* Hash table keyed by digest id; values are struct id_table_data. */
MESA_htable_handle id_table;
/* Hash table keyed by index key; values are struct index_table_data. */
MESA_htable_handle index_table;
/* List of the index keys present in index_table, kept in ascending order on insert. */
struct VL * valuelist;
}GIE_handle_inner_t;
/* One element of the ordered list of index keys (handle->valuelist). */
struct valuelist_node
{
/* The index key this element stands for. */
unsigned long long value;
/* List the element belongs to; set to handle->valuelist when created. */
struct VL * valuelist_name;
/* Tail-queue linkage. */
TAILQ_ENTRY(valuelist_node) vlistentry;
};
/* One element of the per-index-key list of digests. */
struct linklist_node
{
/* Index key under which this node is filed. */
unsigned long long index_key;
/* Head of the list this node is linked on. */
struct TQ * listname;
/* The digest record this node refers to (not owned by the node). */
struct id_table_data * basicinfo;
/* Tail-queue linkage. */
TAILQ_ENTRY(linklist_node) listentry;
};
/* Value stored in index_table for one index key. */
struct index_table_data
{
/* Heap-allocated list of linklist_node, kept sorted by ascending id on insert. */
struct TQ * listhead;
/* Incremented each time a node is added to listhead. */
int cnt;
/* Neighbouring index keys in ascending key order; MAX_UINT64 when there is none. */
unsigned long long prev_value;
unsigned long long next_value;
};
/* Value stored in id_table for one inserted digest. */
struct id_table_data
{
/* Digest id; also the key under which this record is stored in id_table. */
unsigned int id;
/* Original length copied from the digest. */
unsigned long long origin_len;
/* calc_fh_blocksize(origin_len), computed at insert time. */
unsigned long long blocksize;
/* Heap copy of the digest's fuzzy hash string; released by idtable_free(). */
char * fh;
/* Minimum confidence copied from the digest; GIE_query() reports the record only at or above it. */
short cfds_lvl;
/* Opaque pointer copied from the digest and handed back in query results. */
void * tag;
/* Nodes that file this record under its two index keys (see GIE_insert_indextable). */
struct linklist_node * first_backtrack;
struct linklist_node * second_backtrack;
};
/* List-head types: TQ holds linklist_node elements, VL holds valuelist_node elements. */
TAILQ_HEAD(TQ, linklist_node);
TAILQ_HEAD(VL, valuelist_node);
/* Forward declarations of the helpers defined later in this file. */
/* Value destructors for id_table and index_table entries respectively. */
void idtable_free(void * data);
void indextable_free(void * data);
/* Files info under index_key; flag is FIRST_INSERT or SECOND_INSERT. */
int GIE_insert_indextable(GIE_handle_inner_t * handle, struct id_table_data * info, unsigned long long index_key, int flag);
/* Unlinks and frees one index node reached through a backtrack pointer. */
int GIE_delete_from_indextable_by_key(GIE_handle_inner_t * handle, struct linklist_node * backtrack);
/* NOTE: the definition names the last parameter `i` and uses it as an index into digests, not as a count. */
int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t ** digests, int size);
/* Merges list_num index lists into result, filtered by length window and block size. */
int GIE_union(struct TQ ** union_list, int list_num, struct id_table_data ** result,\
unsigned long long min, unsigned long long max, unsigned long long query_blocksize);
/* Merges two id-sorted lists into a new list with the same filter. */
struct TQ * linklist_union(struct TQ * list_first, struct TQ * list_second, unsigned long long min, unsigned long long max,\
unsigned long long query_blocksize);
/* Smallest of three ints. */
int minof3(int x, int y, int z);
/* Edit distance between the first l1 chars of w1 and the first l2 chars of w2. */
int GIE_edit_distance(char* w1, int l1, const char* w2, int l2);
/* Edit distance of a position-annotated fuzzy string against a stored hash. */
int GIE_edit_distance_with_position(char * fh, const char * fuzzy_string, unsigned long long orilen, int * fuzzy_actual_size,\
unsigned long long * calculate_len);
/**
 * Create an empty GIE index.
 *
 * Copies index_interval and query_accuracy out of para, creates the empty
 * ordered list of index keys, and creates the id table and the index table
 * with identical sizing and LRU settings.
 *
 * Returns the new handle cast to GIE_handle_t *, or NULL when para is NULL
 * or a heap allocation fails.
 * NOTE(review): the results of MESA_htable_create() are stored unchecked
 * because its failure convention is not visible from this file.
 */
GIE_handle_t * GIE_create(const GIE_create_para_t * para)
{
    GIE_handle_inner_t * handle = NULL;
    struct VL * head = NULL;
    MESA_htable_create_args_t idtable_args, indextable_args;

    if(para == NULL)
    {
        return NULL;
    }
    handle = (GIE_handle_inner_t *)malloc(sizeof(GIE_handle_inner_t));
    if(handle == NULL)
    {
        return NULL;
    }
    head = (struct VL *)malloc(sizeof(struct VL));
    if(head == NULL)
    {
        free(handle);
        return NULL;
    }
    TAILQ_INIT(head);

    handle->user_precision = para->index_interval;
    handle->user_query_accuracy = para->query_accuracy;
    handle->valuelist = head;

    memset(&idtable_args, 0, sizeof(idtable_args));
    memset(&indextable_args, 0, sizeof(indextable_args));

    /* id table: digest id -> struct id_table_data */
    idtable_args.thread_safe = 0;
    idtable_args.hash_slot_size = HTABLE_SIZE;
    idtable_args.max_elem_num = 4 * HTABLE_SIZE;
    idtable_args.expire_time = 0;
    idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU;
    idtable_args.key_comp = NULL;
    idtable_args.key2index = NULL;
    idtable_args.data_free = idtable_free;
    idtable_args.data_expire_with_condition = NULL;
    idtable_args.recursive = 1;

    /* index table: index key -> struct index_table_data */
    indextable_args.thread_safe = 0;
    indextable_args.hash_slot_size = HTABLE_SIZE;
    indextable_args.max_elem_num = 4 * HTABLE_SIZE;
    indextable_args.expire_time = 0;
    indextable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU;
    indextable_args.key_comp = NULL;
    indextable_args.key2index = NULL;
    indextable_args.data_free = indextable_free;
    indextable_args.data_expire_with_condition = NULL;
    indextable_args.recursive = 1;

    handle->id_table = MESA_htable_create(&idtable_args, sizeof(idtable_args));
    handle->index_table = MESA_htable_create(&indextable_args, sizeof(indextable_args));
    return (GIE_handle_t *)(handle);
}
/* Value destructor for id_table: releases the hash string copy, then the record itself. */
void idtable_free(void * data)
{
    struct id_table_data * record = (struct id_table_data *)data;

    free(record->fh);
    free(record);
}
/*
 * Value destructor for index_table: frees every node still linked on the
 * entry's list, then the list head, then the entry.
 */
void indextable_free(void * data)
{
    struct index_table_data * entry = (struct index_table_data *)data;
    struct linklist_node * node = NULL;
    struct linklist_node * following = NULL;

    for(node = TAILQ_FIRST(entry->listhead); node != NULL; node = following)
    {
        /* fetch the successor before the current node is released */
        following = TAILQ_NEXT(node, listentry);
        free(node);
    }
    free(entry->listhead);
    free(entry);
}
/*
 * Tear down a handle created by GIE_create(): destroys the index table and
 * the id table, frees every element of the index-key list, then the list
 * head and the handle itself.
 */
void GIE_destory(GIE_handle_t * handle)
{
    GIE_handle_inner_t * inner = (GIE_handle_inner_t *)(handle);
    struct valuelist_node * node = NULL;
    struct valuelist_node * following = NULL;

    MESA_htable_destroy(inner->index_table, NULL);
    MESA_htable_destroy(inner->id_table, NULL);

    for(node = TAILQ_FIRST(inner->valuelist); node != NULL; node = following)
    {
        following = TAILQ_NEXT(node, vlistentry);
        free(node);
    }
    free(inner->valuelist);
    free(inner);
}
/*
 * Block size associated with an original length.
 *
 * Let q = orilen / (64 * BLOCKSIZE_MIN) (integer division). The result is
 * BLOCKSIZE_MIN times the largest power of two that does not exceed q.
 * When q is 0 the result is 0, which is what the previous floating-point
 * formulation (log/floor/pow) also produced for such inputs.
 *
 * Computed with integers only, so the result does not depend on
 * floating-point rounding or on precision loss for very large lengths.
 */
unsigned long long calc_fh_blocksize(unsigned long long orilen)
{
    unsigned long long units = orilen / (64 * BLOCKSIZE_MIN);
    unsigned long long power = 1;

    if(units == 0)
    {
        return 0;
    }
    /* power never exceeds units, so the doubling cannot overflow */
    while(power <= units / 2)
    {
        power *= 2;
    }
    return power * BLOCKSIZE_MIN;
}
/*
 * Debug dump of one index_table entry: prints the index key taken from the
 * first node of the entry's list, then the id and original length of every
 * node on that list. key, size and user are not used.
 */
void print_item_iterate(const uchar * key, uint size, void * data, void * user)
{
    struct index_table_data * entry = (struct index_table_data *)data;
    struct linklist_node * head_node = TAILQ_FIRST(entry->listhead);
    struct linklist_node * cursor = NULL;

    printf("index_key = %llu\n", head_node->index_key);
    for(cursor = TAILQ_FIRST(entry->listhead); cursor != NULL; cursor = TAILQ_NEXT(cursor, listentry))
    {
        struct id_table_data * item = cursor->basicinfo;
        printf("id = %u orilen = %llu ", item->id, item->origin_len);
    }
    printf("\n");
}
/*
 * Apply a batch of insert/delete operations.
 *
 * For GIE_INSERT_OPT a record is built from the digest (with its own copy of
 * the fuzzy hash string), added to id_table under the digest id, and filed in
 * the index table under two keys: origin_len rounded down to a multiple of
 * the index interval, and that key plus one interval.
 * For GIE_DELETE_OPT the work is delegated to GIE_delete().
 * Other operation values are ignored.
 *
 * Returns the number of operations that succeeded (0 for a NULL handle or
 * digest array).
 *
 * Ownership: once MESA_htable_add() has accepted the record, id_table owns
 * it. Later failure paths therefore remove it through the table instead of
 * freeing it directly, so the table never keeps a dangling pointer.
 */
int GIE_update(GIE_handle_t * handle, GIE_digest_t ** digests, int size)
{
    GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle);
    int success_cnt = 0;
    int i = 0;

    if(_handle == NULL || digests == NULL)
    {
        return 0;
    }
    for(i = 0; i < size; i++)
    {
        if(digests[i] == NULL)
        {
            continue;
        }
        switch(digests[i]->operation)
        {
            case GIE_INSERT_OPT:
            {
                struct id_table_data * info = NULL;
                unsigned long long first_index_key = 0;
                unsigned long long second_index_key = 0;
                size_t input_fh_len = 0;

                /* a zero interval would divide by zero; a NULL hash cannot be copied */
                if(_handle->user_precision == 0 || digests[i]->fuzzy_hash == NULL)
                {
                    printf("insert rejected: zero index interval or NULL fuzzy hash\n");
                    continue;
                }
                first_index_key = (digests[i]->origin_len)/(_handle->user_precision)*(_handle->user_precision);
                second_index_key = ((digests[i]->origin_len)/(_handle->user_precision) + 1)*(_handle->user_precision);

                info = (struct id_table_data *)malloc(sizeof(struct id_table_data));
                if(info == NULL)
                {
                    printf("malloc id_table_data failed!\n");
                    continue;
                }
                input_fh_len = strlen(digests[i]->fuzzy_hash);
                info->fh = (char *)calloc(input_fh_len + 1, sizeof(char));
                if(info->fh == NULL)
                {
                    printf("malloc id_table_data failed!\n");
                    free(info);
                    continue;
                }
                memcpy(info->fh, digests[i]->fuzzy_hash, input_fh_len);
                info->origin_len = digests[i]->origin_len;
                info->blocksize = calc_fh_blocksize(digests[i]->origin_len);
                info->tag = digests[i]->tag;
                info->id = digests[i]->id;
                info->cfds_lvl = digests[i]->cfds_lvl;
                info->first_backtrack = NULL;
                info->second_backtrack = NULL;

                if(MESA_htable_add(_handle->id_table, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), (const void *)info) < 0)
                {
                    /* the table did not take ownership: release the record here */
                    printf("add %u id_table failed!",info->id);
                    free(info->fh);
                    free(info);
                    continue;
                }
                if(GIE_insert_indextable(_handle, info, first_index_key, FIRST_INSERT) < 0)
                {
                    printf("insert %u first failed\n",info->id);
                    assert(0);
                    /* id_table owns info now; deleting the entry frees it via idtable_free */
                    MESA_htable_del(_handle->id_table, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free);
                    continue;
                }
                if(GIE_insert_indextable(_handle, info, second_index_key, SECOND_INSERT) < 0)
                {
                    printf("insert %u second failed\n",info->id);
                    assert(0);
                    /* undo the first filing so no index node keeps pointing at the record */
                    GIE_delete_from_indextable_by_key(_handle, info->first_backtrack);
                    MESA_htable_del(_handle->id_table, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free);
                    continue;
                }
                success_cnt++;
                break;
            }
            case GIE_DELETE_OPT:
            {
                success_cnt += GIE_delete(_handle, digests, i);
                break;
            }
            default:
                break;
        }
    }
    return success_cnt;
}
/*
 * File a record under one index key.
 *
 * A new list node referring to info is created and recorded in
 * info->first_backtrack (flag == FIRST_INSERT) or info->second_backtrack
 * (any other flag).
 *  - If the key already has an index entry, the node is inserted into the
 *    entry's list, which is kept sorted by ascending id. An id that is
 *    already on the list is rejected.
 *  - Otherwise the key is inserted into the ordered key list, the
 *    prev_value/next_value links of the neighbouring entries are updated,
 *    and a new entry with a one-element list is added to index_table.
 *
 * Returns 0 on success and -1 on failure. On failure everything allocated
 * or linked by this call is released again and the backtrack pointer is
 * reset to NULL, so the caller never sees a pointer to a freed node.
 */
int GIE_insert_indextable(GIE_handle_inner_t * handle, struct id_table_data * info, unsigned long long index_key, int flag)
{
    struct linklist_node ** backtrack = (flag == FIRST_INSERT) ? &(info->first_backtrack) : &(info->second_backtrack);
    struct linklist_node * node_data = (struct linklist_node *)malloc(sizeof(struct linklist_node));
    struct index_table_data * ret = NULL;

    if(node_data == NULL)
    {
        printf("malloc linklist_node failed!\n");
        return -1;
    }
    node_data->basicinfo = info;
    node_data->index_key = index_key;
    node_data->listname = NULL;
    /* backtracking pointer from the record to its node in the index table */
    *backtrack = node_data;

    ret = (struct index_table_data *)(MESA_htable_search_cb(handle->index_table, (const uchar *)(&index_key), sizeof(index_key), NULL, NULL, NULL));
    if(ret != NULL)
    {
        struct linklist_node * tmp = NULL;
        node_data->listname = ret->listhead;
        /* the key already has a list: insert in ascending id order */
        TAILQ_FOREACH(tmp, ret->listhead, listentry)
        {
            if(tmp->basicinfo->id > node_data->basicinfo->id)
            {
                TAILQ_INSERT_BEFORE(tmp, node_data, listentry);
                ret->cnt++;
                return 0;
            }
            if(node_data->basicinfo->id == tmp->basicinfo->id)
            {
                printf("invalid insert!");
                *backtrack = NULL;
                free(node_data);
                return -1;
            }
            /* TODO(review): the original note here was mis-encoded; it appears to ask
             * whether further id checks should also be treated as invalid inserts. */
        }
        TAILQ_INSERT_TAIL(ret->listhead, node_data, listentry);
        ret->cnt ++;
        return 0;
    }

    /* No entry for this key yet: create the entry, its list head and its key-list element. */
    struct index_table_data * index_data = (struct index_table_data *)malloc(sizeof(struct index_table_data));
    struct valuelist_node * value_data = (struct valuelist_node *)malloc(sizeof(struct valuelist_node));
    struct TQ * head = (struct TQ *)malloc(sizeof(struct TQ));
    if(index_data == NULL || value_data == NULL || head == NULL)
    {
        printf("malloc index_table_data failed!\n");
        free(index_data);
        free(value_data);
        free(head);
        *backtrack = NULL;
        free(node_data);
        return -1;
    }
    value_data->value = index_key;
    value_data->valuelist_name = handle->valuelist;

    /* keep the key list in ascending order */
    struct valuelist_node * tmp_t = NULL;
    TAILQ_FOREACH(tmp_t, handle->valuelist, vlistentry)
    {
        if(tmp_t->value > value_data->value)
        {
            break;
        }
    }
    if(tmp_t != NULL)
    {
        TAILQ_INSERT_BEFORE(tmp_t, value_data, vlistentry);
    }
    else
    {
        TAILQ_INSERT_TAIL(handle->valuelist, value_data, vlistentry);
    }

    /* splice the new key between its neighbours' prev/next links */
    struct valuelist_node * tmp_prev = TAILQ_PREV(value_data, VL, vlistentry);
    struct valuelist_node * tmp_next = TAILQ_NEXT(value_data, vlistentry);
    struct index_table_data * index_tmp_prev = NULL;
    struct index_table_data * index_tmp_next = NULL;
    if(tmp_prev != NULL)
    {
        index_tmp_prev = (struct index_table_data *)MESA_htable_search(handle->index_table, (const uchar *)(&(tmp_prev->value)),\
                                                                        sizeof(tmp_prev->value));
    }
    if(tmp_next != NULL)
    {
        index_tmp_next = (struct index_table_data *)MESA_htable_search(handle->index_table, (const uchar *)(&(tmp_next->value)),\
                                                                        sizeof(tmp_next->value));
    }
    index_data->prev_value = (tmp_prev != NULL) ? tmp_prev->value : MAX_UINT64;
    index_data->next_value = (tmp_next != NULL) ? tmp_next->value : MAX_UINT64;
    if(index_tmp_prev != NULL)
    {
        index_tmp_prev->next_value = value_data->value;
    }
    if(index_tmp_next != NULL)
    {
        index_tmp_next->prev_value = value_data->value;
    }

    /* new list with the node as its only element */
    TAILQ_INIT(head);
    TAILQ_INSERT_TAIL(head, node_data, listentry);
    index_data->listhead = head;
    index_data->cnt = 1;
    node_data->listname = head;

    if(MESA_htable_add(handle->index_table, (const uchar *)(&index_key), sizeof(index_key), (const void *)index_data) < 0)
    {
        printf("add index_table failed!\n");
        assert(0);
        /* roll back: relink the neighbours to each other and drop everything created above */
        if(index_tmp_prev != NULL)
        {
            index_tmp_prev->next_value = (tmp_next != NULL) ? tmp_next->value : MAX_UINT64;
        }
        if(index_tmp_next != NULL)
        {
            index_tmp_next->prev_value = (tmp_prev != NULL) ? tmp_prev->value : MAX_UINT64;
        }
        TAILQ_REMOVE(handle->valuelist, value_data, vlistentry);
        free(value_data);
        free(head);
        free(index_data);
        *backtrack = NULL;
        free(node_data);
        return -1;
    }
    return 0;
}
/*
 * Remove one index node (reached through a record's backtrack pointer) from
 * the index and free it.
 *
 * The node is unlinked from its list. If the list is still non-empty the
 * entry's cnt is decremented. If the list became empty, the entry is deleted
 * from index_table, its key is removed from the ordered key list, and the
 * prev_value/next_value links of the neighbouring entries are joined.
 *
 * Returns 0 on success, -1 when backtrack is NULL or the index_table entry
 * cannot be deleted. The node is freed in every case where it was unlinked.
 */
int GIE_delete_from_indextable_by_key(GIE_handle_inner_t * handle, struct linklist_node * backtrack)
{
    if(backtrack == NULL)
    {
        return -1;
    }
    unsigned long long tmp_key = backtrack->index_key;
    struct TQ * owner = backtrack->listname;

    TAILQ_REMOVE(owner, backtrack, listentry);

    if(!TAILQ_EMPTY(owner))
    {
        /* other records still use this key: only keep the element count in step */
        struct index_table_data * entry = (struct index_table_data *)MESA_htable_search_cb(handle->index_table, (const uchar *)(&tmp_key), \
                                                                                                sizeof(tmp_key), NULL, NULL, NULL);
        if(entry != NULL && entry->cnt > 0)
        {
            entry->cnt--;
        }
        free(backtrack);
        return 0;
    }

    /* the list is empty: drop the whole entry (indextable_free releases the list head) */
    if(MESA_htable_del(handle->index_table, (const uchar *)(&tmp_key), sizeof(tmp_key), indextable_free) < 0)
    {
        printf("indextable backtrack delete error!\n");
        assert(0);
        free(backtrack);
        return -1;
    }

    /* remove the key from the ordered key list and join its neighbours */
    struct valuelist_node * tmp = NULL;
    TAILQ_FOREACH(tmp, handle->valuelist, vlistentry)
    {
        if(tmp->value == tmp_key)
        {
            break;
        }
    }
    if(tmp != NULL)
    {
        struct valuelist_node * tmp_prev = TAILQ_PREV(tmp, VL, vlistentry);
        struct valuelist_node * tmp_next = TAILQ_NEXT(tmp, vlistentry);
        if(tmp_prev != NULL)
        {
            struct index_table_data * index_tmp_prev = (struct index_table_data *)MESA_htable_search_cb(handle->index_table, (const uchar *)(&(tmp_prev->value)), \
                                                                                                             sizeof(tmp_prev->value), NULL, NULL, NULL);
            if(index_tmp_prev != NULL)
            {
                index_tmp_prev->next_value = (tmp_next != NULL) ? tmp_next->value : MAX_UINT64;
            }
        }
        if(tmp_next != NULL)
        {
            struct index_table_data * index_tmp_next = (struct index_table_data *)MESA_htable_search_cb(handle->index_table, (const uchar *)(&(tmp_next->value)), \
                                                                                                             sizeof(tmp_next->value), NULL, NULL, NULL);
            if(index_tmp_next != NULL)
            {
                index_tmp_next->prev_value = (tmp_prev != NULL) ? tmp_prev->value : MAX_UINT64;
            }
        }
        TAILQ_REMOVE(handle->valuelist, tmp, vlistentry);
        free(tmp);
    }
    free(backtrack);
    return 0;
}
/*
 * Delete the record whose id is digests[i]->id.
 *
 * i is an index into digests (the forward declaration names it `size`).
 * When the id is unknown a message is printed and 0 is returned without
 * touching the tables. Otherwise both index nodes of the record are removed,
 * the record is deleted from id_table (which frees it through idtable_free),
 * and 1 is returned.
 */
int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t ** digests, int i)
{
    struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(handle->id_table, \
                                                                               (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id));
    if(ret == NULL)
    {
        /* nothing to delete: do not fall through to the table delete, whose failure branch asserts */
        printf("del %d doesn't exist!\n",digests[i]->id);
        return 0;
    }

    /* a backtrack pointer may be NULL if the corresponding index insert never completed */
    if(ret->first_backtrack != NULL)
    {
        GIE_delete_from_indextable_by_key(handle, ret->first_backtrack);
    }
    if(ret->second_backtrack != NULL)
    {
        GIE_delete_from_indextable_by_key(handle, ret->second_backtrack);
    }

    if(MESA_htable_del(handle->id_table, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free) < 0)
    {
        printf("delete id failed!");
        assert(0);
    }
    return 1;
}
int GIE_union(struct TQ ** union_list, int list_num, struct id_table_data ** result,\
unsigned long long min, unsigned long long max, unsigned long long query_blocksize)
{
struct TQ * tmp_list = (struct TQ *)malloc(sizeof(struct TQ));
TAILQ_INIT(tmp_list);
struct linklist_node * tmp_node = NULL;
int size = 0;
TAILQ_FOREACH(tmp_node, union_list[0], listentry)
{
if(tmp_node->basicinfo->origin_len >= min && tmp_node->basicinfo->origin_len <= max && tmp_node->basicinfo->blocksize == query_blocksize)
{
struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node));
new_node->index_key = tmp_node->index_key;
new_node->basicinfo = tmp_node->basicinfo;
new_node->listname = tmp_list;
TAILQ_INSERT_TAIL(tmp_list, new_node, listentry);
}
}
int i = 0;
for(i = 1; i < list_num; i++)
{
tmp_list = linklist_union(tmp_list, union_list[i], min, max, query_blocksize);
}
struct linklist_node * tmp_node_t = NULL;
TAILQ_FOREACH(tmp_node_t, tmp_list, listentry)
{
result[size++] = tmp_node_t->basicinfo;
}
struct linklist_node * first_node = TAILQ_FIRST(tmp_list);
while(first_node != NULL)
{
struct linklist_node * linklist_tmp = TAILQ_NEXT(first_node, listentry);
free(first_node);
first_node = linklist_tmp;
}
free(tmp_list);
return size;
}
struct TQ * linklist_union(struct TQ * list_first, struct TQ * list_second, unsigned long long min, unsigned long long max,\
unsigned long long query_blocksize)
{
struct TQ * link_result = (struct TQ *)malloc(sizeof(struct TQ));
TAILQ_INIT(link_result);
struct linklist_node * tmp_first = TAILQ_FIRST(list_first);
struct linklist_node * tmp_second = TAILQ_FIRST(list_second);
while(tmp_first != NULL && tmp_second != NULL)
{
//When combined final result in a relatively small deposit on id, id small pointer will move backward,
// if both are equal, both pointers move backward until a move to the tail end of the list
if(tmp_first->basicinfo->id < tmp_second->basicinfo->id)
{
if(tmp_first->basicinfo->origin_len >= min && tmp_first->basicinfo->origin_len <= max && tmp_first->basicinfo->blocksize == query_blocksize)
{
struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node));
new_node->index_key = tmp_first->index_key;
new_node->basicinfo = tmp_first->basicinfo;
new_node->listname = link_result;
TAILQ_INSERT_TAIL(link_result, new_node, listentry);
}
tmp_first = TAILQ_NEXT(tmp_first, listentry);
}
else if(tmp_first->basicinfo->id > tmp_second->basicinfo->id)
{
if(tmp_second->basicinfo->origin_len >= min && tmp_second->basicinfo->origin_len <= max && tmp_second->basicinfo->blocksize == query_blocksize)
{
struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node));
new_node->index_key = tmp_second->index_key;
new_node->basicinfo = tmp_second->basicinfo;
new_node->listname = link_result;
TAILQ_INSERT_TAIL(link_result, new_node, listentry);
}
tmp_second = TAILQ_NEXT(tmp_second, listentry);
}
else
{
if(tmp_first->basicinfo->origin_len >= min && tmp_first->basicinfo->origin_len <= max && tmp_first->basicinfo->blocksize == query_blocksize)
{
struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node));
new_node->index_key = tmp_first->index_key;
new_node->basicinfo = tmp_first->basicinfo;
new_node->listname = link_result;
TAILQ_INSERT_TAIL(link_result, new_node, listentry);
}
tmp_first = TAILQ_NEXT(tmp_first, listentry);
tmp_second = TAILQ_NEXT(tmp_second, listentry);
}
}
//The list is not linked to the end nodes remaining deposit to results
while(tmp_first != NULL)
{
if(tmp_first->basicinfo->origin_len >= min && tmp_first->basicinfo->origin_len <= max && tmp_first->basicinfo->blocksize == query_blocksize)
{
struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node));
new_node->index_key = tmp_first->index_key;
new_node->basicinfo = tmp_first->basicinfo;
new_node->listname = link_result;
TAILQ_INSERT_TAIL(link_result, new_node, listentry);
}
tmp_first = TAILQ_NEXT(tmp_first, listentry);
}
while(tmp_second != NULL)
{
if(tmp_second->basicinfo->origin_len >= min && tmp_second->basicinfo->origin_len <= max && tmp_second->basicinfo->blocksize == query_blocksize)
{
struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node));
new_node->index_key = tmp_second->index_key;
new_node->basicinfo = tmp_second->basicinfo;
new_node->listname = link_result;
TAILQ_INSERT_TAIL(link_result, new_node, listentry);
}
tmp_second = TAILQ_NEXT(tmp_second, listentry);
}
struct linklist_node * first_node = TAILQ_FIRST(list_first);
while(first_node != NULL)
{
struct linklist_node * linklist_tmp = TAILQ_NEXT(first_node, listentry);
free(first_node);
first_node = linklist_tmp;
}
free(list_first);
return link_result;
}
/* Return the smallest of the three arguments. */
int minof3(int x, int y, int z)
{
    int smallest = x;

    if(y <= smallest)
    {
        smallest = y;
    }
    if(z <= smallest)
    {
        smallest = z;
    }
    return smallest;
}
/*
 * Edit (Levenshtein) distance between the first l1 characters of w1 and the
 * first l2 characters of w2; insert, delete and replace each cost 1.
 *
 * Negative lengths are treated as 0, so a caller-side underflow cannot turn
 * into an out-of-bounds write. Only two rows of the dynamic-programming
 * table are kept, in a single allocation. If that allocation fails the
 * larger of the two lengths is returned, which is the largest value the
 * distance can take.
 */
int GIE_edit_distance(char* w1, int l1, const char* w2, int l2)
{
    int i, j;
    int * rows = NULL;
    int * prev = NULL;
    int * cur = NULL;
    int result = 0;

    if(l1 < 0)
    {
        l1 = 0;
    }
    if(l2 < 0)
    {
        l2 = 0;
    }
    rows = (int *)malloc(sizeof(int) * 2 * ((size_t)l2 + 1));
    if(rows == NULL)
    {
        return (l1 > l2) ? l1 : l2;
    }
    prev = rows;
    cur = rows + l2 + 1;

    /* distance from the empty prefix of w1 to each prefix of w2 */
    for(j = 0; j <= l2; j++)
    {
        prev[j] = j;
    }
    for(i = 1; i <= l1; i++)
    {
        cur[0] = i;
        for(j = 1; j <= l2; j++)
        {
            /* replace (free when the characters match), then insert, then delete */
            int best = prev[j-1] + ((w1[i-1] != w2[j-1]) ? 1 : 0);
            if(cur[j-1] + 1 < best)
            {
                best = cur[j-1] + 1;
            }
            if(prev[j] + 1 < best)
            {
                best = prev[j] + 1;
            }
            cur[j] = best;
        }
        /* the row just filled becomes the previous row */
        int * swap = prev;
        prev = cur;
        cur = swap;
    }
    result = prev[l2];
    free(rows);
    return result;
}
/*
 * Compare a position-annotated fuzzy string against a stored hash.
 *
 * fuzzy_string is scanned as a sequence of groups "text[left:right]". For
 * each group, left and right are divided by a block size to select the slice
 * of fh (the part of fh before its first '[') that the text is compared
 * with, and the edit distance between that slice and the text is added to
 * the result.
 *
 * The block size is (orilen - 1) / len(fh prefix), or calc_fh_blocksize(orilen)
 * when the prefix is empty; a block size of 0 is raised to 1 so the position
 * mapping never divides by zero.
 *
 * Outputs:
 *   *fuzzy_actual_size - total number of text characters compared
 *   *calculate_len     - sum of (right - left) over all groups with right > left
 * Returns the accumulated edit distance.
 *
 * Parsing is defensive: range numbers are truncated to the buffer size and
 * always NUL-terminated, the cursor never steps past the end of the string,
 * negative bounds are treated as 0, and the slice is clamped to fh.
 * After a group closes with ']', the next text segment starts right behind
 * it. (The earlier code tested for "not ']'" at this point, so segments after
 * the first still included the preceding text and bracket characters.)
 */
int GIE_edit_distance_with_position(char * fh, const char * fuzzy_string, unsigned long long orilen, int * fuzzy_actual_size, unsigned long long * calculate_len)
{
    const char * cursor = fuzzy_string;
    const char * segment = fuzzy_string;
    int segment_len = 0;
    int fh_actual_len = 0;
    int edit_distance = 0;
    unsigned long long blocksize = 0;

    *fuzzy_actual_size = 0;
    *calculate_len = 0;

    /* usable part of the stored hash: everything before its first '[' */
    while(fh[fh_actual_len] != '\0' && fh[fh_actual_len] != '[')
    {
        fh_actual_len++;
    }
    if(fh_actual_len != 0)
    {
        blocksize = (orilen - 1)/fh_actual_len;
    }
    else
    {
        blocksize = calc_fh_blocksize(orilen);
    }
    if(blocksize == 0)
    {
        blocksize = 1;
    }

    while(*cursor != '\0')
    {
        if(*cursor != '[')
        {
            segment_len++;
            cursor++;
            continue;
        }

        /* parse "[left:right]" */
        char numleft[100] = {0};
        char numright[100] = {0};
        size_t i = 0, j = 0;
        int left = 0;
        int right = 0;

        cursor++;
        while(*cursor != '\0' && *cursor != ':')
        {
            if(i < sizeof(numleft) - 1)
            {
                numleft[i++] = *cursor;
            }
            cursor++;
        }
        left = atoi(numleft);
        if(*cursor == ':')
        {
            cursor++;
        }
        while(*cursor != '\0' && *cursor != ']')
        {
            if(j < sizeof(numright) - 1)
            {
                numright[j++] = *cursor;
            }
            cursor++;
        }
        right = atoi(numright);
        if(left < 0)
        {
            left = 0;
        }
        if(right < 0)
        {
            right = 0;
        }
        if(right > left)
        {
            *calculate_len += (unsigned long long)(right - left);
        }

        /* map the byte range onto a slice of the stored hash, clamped to its bounds */
        long long index = (long long)((unsigned long long)left/blocksize) - TOLERENCE_SIZE;
        if(index < 0)
        {
            index = 0;
        }
        if(index > fh_actual_len)
        {
            index = fh_actual_len;
        }
        long long fh_size = (long long)((unsigned long long)right/blocksize) + TOLERENCE_SIZE - index;
        if(fh_size > fh_actual_len - index)
        {
            fh_size = fh_actual_len - index;
        }
        if(fh_size < 0)
        {
            fh_size = 0;
        }

        edit_distance += GIE_edit_distance(fh + index, (int)fh_size, segment, segment_len);
        *fuzzy_actual_size += segment_len;

        /* step over the closing bracket (if present) and start the next text segment there */
        if(*cursor == ']')
        {
            cursor++;
        }
        segment = cursor;
        segment_len = 0;
    }
    return edit_distance;
}
/*
 * Look up stored digests that resemble a fuzzy string.
 *
 * The length window [origin_len*(1-accuracy), origin_len*(1+accuracy)] is
 * mapped to a range of index keys. The index lists found in that range are
 * merged by GIE_union() (filtered by the window and by
 * calc_fh_blocksize(origin_len)), and each candidate is scored with
 * GIE_edit_distance_with_position(). A candidate is reported when its
 * confidence (0..CONF_MAX scale) is at least the candidate's own cfds_lvl.
 *
 * At most size entries are written to results. Returns the number written;
 * 0 is returned for a NULL handle/string/results, a non-positive size, a
 * zero index interval, when nothing is indexed in range, or when a working
 * buffer cannot be allocated.
 *
 * The list of index lists lives on the heap and grows on demand, so a wide
 * window cannot exhaust the stack.
 */
int GIE_query(GIE_handle_t * handle, unsigned long long origin_len, const char * fuzzy_string, GIE_result_t * results, int size)
{
    GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)handle;

    if(_handle == NULL || fuzzy_string == NULL || results == NULL || size <= 0 || _handle->user_precision == 0)
    {
        return 0;
    }

    unsigned long long precision = _handle->user_precision;
    /* lower bound of the window; clamp so an accuracy above 1 cannot go negative */
    double min_tmp = (double)(origin_len * (1 - _handle->user_query_accuracy));
    if(min_tmp < 0)
    {
        min_tmp = 0;
    }
    unsigned long long min_tmp_t = (unsigned long long)(floor(min_tmp));
    unsigned long long min_index = min_tmp_t/precision*precision;
    /* upper bound of the window */
    double max_tmp = (double)(origin_len * (1 + _handle->user_query_accuracy));
    unsigned long long max_tmp_t = (unsigned long long)(floor(max_tmp));
    unsigned long long max_index = (max_tmp_t/precision + 1)*precision;

    unsigned long long query_blocksize = calc_fh_blocksize(origin_len);
    struct TQ ** union_list = NULL;
    int list_cap = 0;
    int list_num = 0;
    int union_size = 0;
    int union_size_max = 0;
    int ret_size = 0;
    int k = 0;

    /* collect the index lists inside the key range */
    unsigned long long key = min_index;
    while(key <= max_index)
    {
        struct index_table_data * list_tmp = (struct index_table_data *)MESA_htable_search_cb(_handle->index_table, (const uchar * )(&key), \
                                                                                                sizeof(key), NULL, NULL, NULL);
        if(list_tmp != NULL)
        {
            if(list_num == list_cap)
            {
                int new_cap = (list_cap == 0) ? 16 : list_cap * 2;
                struct TQ ** grown = (struct TQ **)realloc(union_list, sizeof(struct TQ *) * new_cap);
                if(grown == NULL)
                {
                    free(union_list);
                    return 0;
                }
                union_list = grown;
                list_cap = new_cap;
            }
            union_list[list_num++] = list_tmp->listhead;
            union_size_max += list_tmp->cnt;
            /* jump straight to the next key that exists (MAX_UINT64 ends the loop) */
            key = list_tmp->next_value;
        }
        else
        {
            if(key > MAX_UINT64 - precision)
            {
                break;
            }
            key = key + precision;
        }
    }

    if(list_num == 0)
    {
        printf("the fh doesn't exsit!\n");
        free(union_list);
        return 0;
    }

    struct id_table_data ** result_union = (struct id_table_data **)malloc(sizeof(struct id_table_data *)*union_size_max);
    if(result_union == NULL)
    {
        free(union_list);
        return 0;
    }
    union_size = GIE_union(union_list, list_num, result_union, min_tmp_t, max_tmp_t, query_blocksize);
    free(union_list);

    /* score the candidates; stop as soon as the caller's array is full */
    for(k = 0; k < union_size && ret_size < size; k++)
    {
        int fuzzy_actual_len = 0;
        unsigned long long calculate_len = 0;
        short conf_tmp = 0;
        int edit_distance = GIE_edit_distance_with_position(result_union[k]->fh, fuzzy_string, origin_len, &fuzzy_actual_len, &calculate_len);

        if(fuzzy_actual_len != 0 && edit_distance < fuzzy_actual_len && origin_len != 0)
        {
            conf_tmp = (fuzzy_actual_len - edit_distance)*(calculate_len + 1)*CONF_MAX/(fuzzy_actual_len * origin_len);
        }
        if(conf_tmp >= result_union[k]->cfds_lvl)
        {
            results[ret_size].cfds_lvl = conf_tmp;
            results[ret_size].id = result_union[k]->id;
            results[ret_size].origin_len = result_union[k]->origin_len;
            results[ret_size++].tag = result_union[k]->tag;
        }
    }
    free(result_union);
    return ret_size;
}

736
src/entry/interval_index.c Normal file
View File

@@ -0,0 +1,736 @@
#include<stdio.h>
#include<stdlib.h>
#include"interval_index.h"
/**
* There is a trick here. In order to hide specific
* realization of some structures, we use some approaches.
* Then the inner structure is named with "shadow", and
* the outer structure is named with "light". These words
* come from movie <<The Grand Master>>. Enjoy it :)
**/
/**
* Structure of inner segment.
* The public IVI_seg_t is the first member; the functions in this file cast
* between IVI_seg_t * and IVI_shadow_seg_t * on that basis.
**/
typedef struct __IVI_shadow_seg_t{
/* Public part of the segment (left, right, data). */
IVI_seg_t lightseg;
/* Tail-queue linkage inside the owning index. */
TAILQ_ENTRY(__IVI_shadow_seg_t) ENTRY;
}IVI_shadow_seg_t;
/* List-head type for queues of inner segments. */
TAILQ_HEAD(TQ, __IVI_shadow_seg_t);
/* Structure of inner InterVal Index */
typedef struct __IVI_shadow_t{
/* Queue of the segments held by this index. */
struct TQ ivi_queue;
/* Incremented by IVI_insert for every segment added. */
int segs_cnt;
/* Running sum of (right - left + 1) over the segments added by IVI_insert. */
OFFSET_TYPE segs_length;
}IVI_shadow_t;
/**
 * Decide from which end of the queue a search for target should start.
 * Compares the absolute distance between target and the left point of the
 * head segment with the distance to the left point of the tail segment.
 * Return 1 if target is closer to head than to tail, or if either
 * head or tail is NULL.
 * Else return 0
 */
int closer_to_head(IVI_shadow_seg_t * head, IVI_shadow_seg_t * tail, OFFSET_TYPE target)
{
    S_OFFSET_TYPE distance_to_head;
    S_OFFSET_TYPE distance_to_tail;

    if(NULL == head || NULL == tail)
    {
        return 1;
    }

    distance_to_head = (S_OFFSET_TYPE)(target - head->lightseg.left);
    if(!(distance_to_head > 0))
    {
        distance_to_head = -distance_to_head;
    }
    distance_to_tail = (S_OFFSET_TYPE)(target - tail->lightseg.left);
    if(!(distance_to_tail > 0))
    {
        distance_to_tail = -distance_to_tail;
    }
    return (distance_to_tail - distance_to_head > 0);
}
/**
 * Return the segment queued directly before seg if continuous() holds
 * between that segment's right point and seg's left point.
 * Return NULL if seg is NULL, has no predecessor, or the test fails.
 **/
IVI_seg_t * IVI_prev_continuous_seg(IVI_seg_t * seg)
{
    IVI_shadow_seg_t * self = NULL;
    IVI_shadow_seg_t * neighbour = NULL;

    if(seg == NULL)
    {
        return NULL;
    }
    self = (IVI_shadow_seg_t *)seg;
    neighbour = TAILQ_PREV(self, TQ, ENTRY);
    if(neighbour != NULL && continuous((neighbour->lightseg).right, seg->left))
    {
        return (IVI_seg_t *)neighbour;
    }
    return NULL;
}
/**
 * Return the segment queued directly after seg if continuous() holds
 * between seg's right point and that segment's left point.
 * Return NULL if seg is NULL, has no successor, or the test fails.
 **/
IVI_seg_t * IVI_next_continuous_seg(IVI_seg_t * seg)
{
    IVI_shadow_seg_t * self = NULL;
    IVI_shadow_seg_t * neighbour = NULL;

    if(seg == NULL)
    {
        return NULL;
    }
    self = (IVI_shadow_seg_t *)seg;
    neighbour = TAILQ_NEXT(self, ENTRY);
    if(neighbour != NULL && continuous(seg->right, (neighbour->lightseg).left))
    {
        return (IVI_seg_t *)neighbour;
    }
    return NULL;
}
/**
 * Name:
 *   IVI_relative_position
 * Description:
 *   Classify where seg1 lies relative to seg2, using the before()/after()
 *   comparisons on the segments' end points. The cases are tested in this
 *   order and the first match wins:
 *     LEFT_NO_OVERLAP, LEFT_OVERLAP, CONTAINED, CONTAIN,
 *     RIGHT_OVERLAP, RIGHT_NO_OVERLAP.
 * Params:
 *   seg1: Subject of relation
 *   seg2: Object of relation
 * Return:
 *   The matching relation, or ERROR if either segment is NULL or no case
 *   matches.
 **/
Relation_t IVI_relative_position(IVI_seg_t * seg1, IVI_seg_t * seg2)
{
    Relation_t relation = ERROR;

    if(seg1 == NULL || seg2 == NULL)
    {
        return relation;
    }

    if(before(seg1->right, seg2->left))
    {
        relation = LEFT_NO_OVERLAP;
    }
    else if(!before(seg1->right, seg2->left) && before(seg1->right, seg2->right) && before(seg1->left, seg2->left))
    {
        relation = LEFT_OVERLAP;
    }
    else if(!before(seg1->left, seg2->left) && !after(seg1->right, seg2->right))
    {
        relation = CONTAINED;
    }
    else if(!after(seg1->left, seg2->left) && !before(seg1->right, seg2->right))
    {
        relation = CONTAIN;
    }
    else if(!after(seg1->left, seg2->right) && after(seg1->right, seg2->right) && after(seg1->left, seg2->left))
    {
        relation = RIGHT_OVERLAP;
    }
    else if(after(seg1->left, seg2->right))
    {
        relation = RIGHT_NO_OVERLAP;
    }
    return relation;
}
/**
 * Name:
 *   IVI_create
 * Description:
 *   Create an empty InterVal Index (no segments, zero total length)
 * Params:
 *   void
 * Return:
 *   Return a handler of this InterVal Index, or NULL if memory
 *   cannot be allocated
 **/
IVI_t * IVI_create(void)
{
    IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)malloc(sizeof(IVI_shadow_t));
    if(NULL == shadow_ivi)
    {
        return NULL;
    }
    TAILQ_INIT(&(shadow_ivi->ivi_queue));
    shadow_ivi->segs_cnt = 0;
    shadow_ivi->segs_length = 0;
    return (IVI_t *)shadow_ivi;
}
/**
 * Name:
 *   IVI_destroy
 * Description:
 *   Release an InterVal Index together with every segment it holds.
 *   A NULL handler is ignored.
 * Params:
 *   handler:  The InterVal Index you want to destroy
 *   cb:       If not NULL, called once per segment (before the segment is
 *             freed) so the user can release the data attached to it
 *   usr_para: Passed through to cb unchanged
 * Return:
 *   void
 **/
void IVI_destroy(IVI_t * handler, IVI_callback_t cb, void * usr_para)
{
    IVI_shadow_t * shadow_ivi = NULL;
    IVI_shadow_seg_t * current = NULL;
    IVI_shadow_seg_t * following = NULL;

    if(NULL == handler)
    {
        return;
    }
    shadow_ivi = (IVI_shadow_t *)handler;

    for(current = TAILQ_FIRST(&(shadow_ivi->ivi_queue)); current != NULL; current = following)
    {
        /* remember the successor before the current segment goes away */
        following = TAILQ_NEXT(current, ENTRY);
        if(cb != NULL)
        {
            cb(&(current->lightseg), usr_para);
        }
        free(current);
    }

    free(shadow_ivi);
}
/**
 * Name:
 *   IVI_seg_malloc
 * Description:
 *   Malloc a segment with given parameters. The segment is not linked
 *   into any index yet.
 * Params:
 *   left:  Left point of segment
 *   right: Right point of segment
 *   data:  User data
 * Return:
 *   Return a pointer of segment structure, or NULL if left is after
 *   right or memory cannot be allocated.
 **/
IVI_seg_t * IVI_seg_malloc(OFFSET_TYPE left, OFFSET_TYPE right, void * data)
{
    /* Left must <= Right */
    if(after(left, right))
    {
        return NULL;
    }
    IVI_shadow_seg_t * shadow_seg = (IVI_shadow_seg_t *)malloc(sizeof(IVI_shadow_seg_t));
    if(NULL == shadow_seg)
    {
        return NULL;
    }
    shadow_seg->lightseg.left = left;
    shadow_seg->lightseg.right= right;
    shadow_seg->lightseg.data = data;
    return (IVI_seg_t *)shadow_seg;
}
/**
 * Name:
 *   IVI_seg_free
 * Description:
 *   Free the memory of given segment
 * Params:
 *   seg:      The segment that you want to free
 *   cb:       If not NULL, called with seg and usr_para before the
 *             segment is freed, so the user can release *data
 *   usr_para: User parameter for cb
 * Return:
 *   void
 **/
void IVI_seg_free(IVI_seg_t * seg, IVI_callback_t cb, void * usr_para)
{
    /* Let the user release the attached data first */
    if(NULL != cb)
    {
        cb(seg, usr_para);
    }
    /* Then release the segment through its inner type */
    free((IVI_shadow_seg_t *)seg);
}
/**
 * Name:
 *   IVI_insert
 * Description:
 *   Insert a segment to an InterVal Index handler, and the segment
 *   MUST not be overlapped with others in handler.
 *   The queue is scanned from the head or from the tail, whichever
 *   closer_to_head() selects for the new segment's left point, until the
 *   position of the new segment is found.
 * Params:
 *   handler: The handler of InterVal Index created by IVI_create
 *   seg:     A segment that user wants to add. It MUST be created
 *            by IVI_seg_malloc.
 * Return:
 *   On success, 0 is returned and the handler's segment count and total
 *   length are updated;
 *   Else when an overlap occurs or an argument is NULL, -1 is returned
 *   and the handler is left unchanged.
 **/
int IVI_insert(IVI_t * handler, IVI_seg_t * seg)
{
    IVI_shadow_t * shadow_ivi = NULL;
    IVI_shadow_seg_t * incoming = NULL;
    IVI_shadow_seg_t * cursor = NULL;
    IVI_shadow_seg_t * first = NULL;
    IVI_shadow_seg_t * last = NULL;
    int linked = 0;

    if(NULL == handler || NULL == seg)
    {
        return -1;
    }
    shadow_ivi = (IVI_shadow_t *)handler;
    incoming = (IVI_shadow_seg_t *)seg;
    first = TAILQ_FIRST(&(shadow_ivi->ivi_queue));
    last = TAILQ_LAST(&(shadow_ivi->ivi_queue), TQ);

    if(closer_to_head(first, last, seg->left))
    {
        /* Forward scan: stop at the first segment lying entirely after the new one */
        TAILQ_FOREACH(cursor, &(shadow_ivi->ivi_queue), ENTRY)
        {
            if(after(cursor->lightseg.left, incoming->lightseg.right))
            {
                TAILQ_INSERT_BEFORE(cursor, incoming, ENTRY);
                linked = 1;
                break;
            }
            if(!before(cursor->lightseg.right, incoming->lightseg.left))
            {
                /* Neither entirely after nor entirely before: overlap */
                return -1;
            }
        }
        if(!linked)
        {
            /* Reached the end of the list: the new segment becomes the tail */
            TAILQ_INSERT_TAIL(&(shadow_ivi->ivi_queue), incoming, ENTRY);
        }
    }
    else
    {
        /* Backward scan: stop at the first segment lying entirely before the new one */
        TAILQ_FOREACH_REVERSE(cursor, &(shadow_ivi->ivi_queue), TQ, ENTRY)
        {
            if(before(cursor->lightseg.right, incoming->lightseg.left))
            {
                TAILQ_INSERT_AFTER(&(shadow_ivi->ivi_queue), cursor, incoming, ENTRY);
                linked = 1;
                break;
            }
            if(!after(cursor->lightseg.left, incoming->lightseg.right))
            {
                /* Neither entirely before nor entirely after: overlap */
                return -1;
            }
        }
        if(!linked)
        {
            /* Reached the head of the list: the new segment becomes the head */
            TAILQ_INSERT_HEAD(&(shadow_ivi->ivi_queue), incoming, ENTRY);
        }
    }

    shadow_ivi->segs_cnt ++;
    shadow_ivi->segs_length += (seg->right - seg->left + 1);
    return 0;
}
/**
 * Name:
 *     IVI_remove
 * Description:
 *     Unlink a given segment from given InterVal Index handler and update
 *     the segment count and total length. The segment itself is not freed.
 *     No membership check is done: seg must currently be linked in handler.
 * Params:
 *     handler: The handler of InterVal Index created by IVI_create
 *     seg:     The segment to unlink. It MUST be created by IVI_seg_malloc.
 * Return:
 *     On success, 0 is returned;
 *     when handler or seg is NULL, -1 is returned.
 **/
int IVI_remove(IVI_t * handler, IVI_seg_t * seg)
{
    IVI_shadow_t * index = (IVI_shadow_t *)handler;
    IVI_shadow_seg_t * victim = (IVI_shadow_seg_t *)seg;

    if(NULL == index || NULL == victim)
    {
        return -1;
    }
    TAILQ_REMOVE(&(index->ivi_queue), victim, ENTRY);
    index->segs_cnt --;
    index->segs_length -= (seg->right - seg->left + 1);
    return 0;
}
/**
 * Name:
 *     IVI_query
 * Description:
 *     Query given InterVal Index for the segments that overlap the
 *     interval [left, right] and return them, in ascending order, in a
 *     freshly allocated array.
 * Params:
 *     handler: The handler of interval index created by IVI_create
 *     left:    Left point of given interval
 *     right:   Right point of given interval
 *     segs:    Address of a segment pointer array. The array is allocated
 *              here; the caller must free(*segs) but MUST NOT free its
 *              elements. *segs is set to NULL whenever the return value
 *              is not positive.
 * Return:
 *     The number of overlapping segments (0 when there are none);
 *     -1 when an argument is invalid (NULL handler, NULL segs, left after
 *     right) or when the result array cannot be allocated.
 **/
int IVI_query(IVI_t * handler, OFFSET_TYPE left, OFFSET_TYPE right, IVI_seg_t *** segs)
{
    IVI_shadow_t * shadow_ivi;
    IVI_shadow_seg_t *head, *tail, *tmp, *anchor;
    int interval_cnt = 0, i;

    if(NULL == segs)
    {
        return -1;
    }
    /* Give the caller a well-defined pointer on every exit path */
    *segs = NULL;
    if(NULL == handler || after(left, right))
    {
        return -1;
    }
    shadow_ivi = (IVI_shadow_t *)handler;
    head = TAILQ_FIRST(&(shadow_ivi->ivi_queue));
    tail = TAILQ_LAST(&(shadow_ivi->ivi_queue), TQ);

    /* Traverse from head or tail? We need to decide */
    if(closer_to_head(head, tail, left))
    {
        /* Skip every segment that ends before the interval starts */
        tmp = head;
        while(tmp != NULL && after(left, tmp->lightseg.right))
        {
            tmp = TAILQ_NEXT(tmp, ENTRY);
        }
        /* anchor: leftmost candidate (may be NULL) */
        anchor = tmp;
        /* Count the segments that start no later than the interval ends */
        while(tmp != NULL && !before(right, tmp->lightseg.left))
        {
            interval_cnt ++;
            tmp = TAILQ_NEXT(tmp, ENTRY);
        }
        if(interval_cnt == 0)
        {
            return 0;
        }
        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
        if(NULL == *segs)
        {
            return -1;
        }
        tmp = anchor;
        for(i = 0; i < interval_cnt; i++)
        {
            (*segs)[i] = (IVI_seg_t *)tmp;
            tmp = TAILQ_NEXT(tmp, ENTRY);
        }
    }
    else
    {
        /* Skip every segment that starts after the interval ends */
        tmp = tail;
        while(tmp != NULL && before(right, tmp->lightseg.left))
        {
            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
        }
        /* anchor: rightmost candidate (may be NULL) */
        anchor = tmp;
        /* Count the segments that end no earlier than the interval starts */
        while(tmp != NULL && !after(left, tmp->lightseg.right))
        {
            interval_cnt ++;
            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
        }
        if(interval_cnt == 0)
        {
            return 0;
        }
        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
        if(NULL == *segs)
        {
            return -1;
        }
        /* Fill back-to-front so the array is still in ascending order */
        tmp = anchor;
        for(i = interval_cnt - 1; i >= 0; i--)
        {
            (*segs)[i] = (IVI_seg_t *)tmp;
            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
        }
    }
    return interval_cnt;
}
/**
 * Name:
 *     IVI_query_continuous
 * Description:
 *     Like IVI_query, but the result stops at the first gap: only a run
 *     of mutually continuous segments overlapping [left, right] is
 *     returned. When scanning from the head the run starts at the
 *     leftmost overlapping segment; when scanning from the tail it ends
 *     at the rightmost overlapping segment.
 * Params:
 *     handler: The handler of InterVal Index created by IVI_create.
 *     left:    Left point of given interval
 *     right:   Right point of given interval
 *     segs:    Address of a segment pointer array. The array is allocated
 *              here; the caller must free(*segs) but MUST NOT free its
 *              elements. *segs is set to NULL whenever the return value
 *              is not positive.
 * Return:
 *     The number of continuous overlapping segments (0 when none);
 *     -1 when an argument is invalid (NULL handler, NULL segs, left after
 *     right) or when the result array cannot be allocated.
 **/
int IVI_query_continuous(IVI_t * handler, OFFSET_TYPE left, OFFSET_TYPE right, IVI_seg_t *** segs)
{
    IVI_shadow_t * shadow_ivi;
    IVI_shadow_seg_t *head, *tail, *tmp, *anchor, *counted;
    int interval_cnt = 0, i;

    if(NULL == segs)
    {
        return -1;
    }
    /* Give the caller a well-defined pointer on every exit path */
    *segs = NULL;
    if(NULL == handler || after(left, right))
    {
        return -1;
    }
    shadow_ivi = (IVI_shadow_t *)handler;
    head = TAILQ_FIRST(&(shadow_ivi->ivi_queue));
    tail = TAILQ_LAST(&(shadow_ivi->ivi_queue), TQ);

    /* Traverse from head or tail? We need to decide */
    if(closer_to_head(head, tail, left))
    {
        /* Skip every segment that ends before the interval starts */
        tmp = head;
        while(tmp != NULL && after(left, tmp->lightseg.right))
        {
            tmp = TAILQ_NEXT(tmp, ENTRY);
        }
        anchor = tmp;
        /* Count overlapping segs, stopping at the first discontinuity */
        while(tmp != NULL && !before(right, tmp->lightseg.left))
        {
            counted = tmp;
            interval_cnt ++;
            tmp = TAILQ_NEXT(tmp, ENTRY);
            /* tmp is NULL at the end of the list: test it BEFORE looking
             * at its neighbour (the segment just counted). */
            if(tmp != NULL && !continuous(counted->lightseg.right, tmp->lightseg.left))
            {
                break;
            }
        }
        if(interval_cnt == 0)
        {
            return 0;
        }
        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
        if(NULL == *segs)
        {
            return -1;
        }
        tmp = anchor;
        for(i = 0; i < interval_cnt; i++)
        {
            (*segs)[i] = (IVI_seg_t *)tmp;
            tmp = TAILQ_NEXT(tmp, ENTRY);
        }
    }
    else
    {
        /* Skip every segment that starts after the interval ends */
        tmp = tail;
        while(tmp != NULL && before(right, tmp->lightseg.left))
        {
            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
        }
        anchor = tmp;
        /* Count overlapping segs, stopping at the first discontinuity */
        while(tmp != NULL && !after(left, tmp->lightseg.right))
        {
            counted = tmp;
            interval_cnt ++;
            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
            /* Same NULL-before-use ordering as in the forward scan */
            if(tmp != NULL && !continuous(tmp->lightseg.right, counted->lightseg.left))
            {
                break;
            }
        }
        if(interval_cnt == 0)
        {
            return 0;
        }
        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
        if(NULL == *segs)
        {
            return -1;
        }
        /* Fill back-to-front so the array is still in ascending order */
        tmp = anchor;
        for(i = interval_cnt - 1; i >= 0; i--)
        {
            (*segs)[i] = (IVI_seg_t *)tmp;
            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
        }
    }
    return interval_cnt;
}
/**
 * Name:
 *     IVI_seg_cnt
 * Description:
 *     Report how many segments are currently linked in the given
 *     interval index handler.
 * Params:
 *     handler: The handler of InterVal Index created by IVI_create.
 * Return:
 *     The segment count, or -1 when handler is NULL.
 **/
int IVI_seg_cnt(IVI_t * handler)
{
    IVI_shadow_t * index = (IVI_shadow_t *)handler;

    if(NULL != index)
    {
        return index->segs_cnt;
    }
    return -1;
}
/**
 * Name:
 *     IVI_seg_length
 * Description:
 *     Report the summed length of all segments currently linked in the
 *     given interval index handler.
 * Params:
 *     handler: The handler of InterVal Index created by IVI_create.
 * Return:
 *     The total length, or -1 converted to OFFSET_TYPE when handler is
 *     NULL.
 **/
OFFSET_TYPE IVI_seg_length(IVI_t * handler)
{
    IVI_shadow_t * index = (IVI_shadow_t *)handler;

    if(NULL != index)
    {
        return index->segs_length;
    }
    return -1;
}
/**
 * Name:
 *     IVI_traverse
 * Description:
 *     Visit every segment of the given InterVal Index from head to tail
 *     and invoke the callback exactly once per segment.
 * Params:
 *     handler:  The handler of InterVal Index created by IVI_create.
 *     cb:       Callback function supplied by the user.
 *     usr_para: Parameter passed through to the callback.
 * Return:
 *     void
 **/
void IVI_traverse(IVI_t * handler, IVI_callback_t cb, void * usr_para)
{
    IVI_shadow_t * index;
    IVI_shadow_seg_t * cur;
    IVI_shadow_seg_t * upcoming;

    if(NULL == handler || NULL == cb)
    {
        return;
    }
    index = (IVI_shadow_t *)handler;
    /*
     * TAILQ_FOREACH is avoided on purpose: we do not know what the
     * callback does to the current segment, so its successor is read
     * before the callback runs.
     */
    for(cur = TAILQ_FIRST(&(index->ivi_queue)); cur != NULL; cur = upcoming)
    {
        upcoming = TAILQ_NEXT(cur, ENTRY);
        cb((IVI_seg_t *)cur, usr_para);
    }
}

828
src/entry/mesa_fuzzy.c Normal file
View File

@@ -0,0 +1,828 @@
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include "mesa_fuzzy.h"
#include "interval_index.h"
#define ROLLING_WINDOW 7
#define BLOCKSIZE_MIN 3
#define MAXSIZE 10000
#define HASH_PRIME 0x01000193
#define HASH_INIT 0x28021967
#define DEBUG (0)
/* Rolling-hash state updated one byte at a time by roll_hash(). */
struct roll_state
{
unsigned char window[ROLLING_WINDOW]; /* circular buffer of the last ROLLING_WINDOW bytes fed in */
unsigned int h1, h2, h3; /* h1: sum of the bytes in window; h2: position-weighted sum; h3: shift-and-xor accumulator */
unsigned int n; /* index in window that the next byte will overwrite */
};
/* Per-segment hashing state, stored as the user data of an IVI segment. */
typedef struct
{
char * left_data; // heap copy of the first left_len bytes of the segment's data
unsigned int left_len; // bytes up to and including the first slice boundary; the whole segment length when no boundary was found
char * hash_result; // NUL-terminated string, one base64 character per slice boundary found in this segment
unsigned long long left_offset; // set by fuzzy_hash_calculate: stream offset where this node's effective range starts
unsigned long long right_offset; // set by fuzzy_hash_calculate: stream offset where this node's effective range ends
struct roll_state * right_status_r; // rolling-hash state after the last byte of the segment
unsigned int right_status_shash; // FNV accumulator after the last byte (restarted at each slice boundary)
unsigned int right_len;// bytes after the last slice boundary; 0 when no boundary was found
int slice_num; // number of slice boundaries found in this segment
}fuzzy_node;
/* Concrete layout behind the opaque fuzzy_handle_t. */
typedef struct
{
unsigned long long orilen; // length given to fuzzy_create_handle; get_blocksize() derives the block size from it
IVI_t * ivi; // interval index created by IVI_create(); holds one segment per stored piece of fed data
unsigned long long effective_length; // running sum of the lengths returned by fuzzy_feed
}fuzzy_handle_inner_t;
/* Context handed to the IVI_traverse callback by fuzzy_digest. */
typedef struct
{
char * head; // caller's output buffer (fuzzy_digest stores its `result` argument here)
unsigned int size; // capacity of head in bytes (fuzzy_digest stores its `size` argument here)
unsigned int offset; // initialised to 0 by fuzzy_digest; presumably the current write position in head - TODO confirm against the merge callback
unsigned long long first_FNV_offset; // initialised to 0; the merge callback records a segment's left offset here
unsigned long long last_FNV_offset; // initialised to 0; the merge callback records seg->right - right_len here
}final_result;
/* NOTE(review): not used in the visible part of this file; presumably the
 * traverse context for fuzzy_hash_length - confirm against its definition. */
typedef struct
{
unsigned long long first_FNV_offset;
unsigned long long last_FNV_offset;
unsigned long long hash_length;
}final_length;
/* Forward declarations so the functions below can be called before their definitions. */
unsigned int fuzzy_hash_calculate(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize);
void fuzzy_calculate_self(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize);
void fuzzy_calculate_self_with_prev(IVI_seg_t * prev_seg, IVI_seg_t * seg, const char * data, unsigned long long blocksize);
void fuzzy_modify_next(IVI_seg_t * seg, IVI_seg_t * next_seg, unsigned long long blocksize);
unsigned long long get_prev_continous_length(IVI_seg_t * seg);
unsigned int segment_overlap(fuzzy_handle_t * handle, fuzzy_node * fnode, unsigned int size, unsigned long long offset, const char * data);
void fuzzy_hash_merge(IVI_seg_t * seg, void * user_para);
void fuzzy_hash_merge_new(IVI_seg_t * seg, void * user_para);
void fuzzy_hash_length(IVI_seg_t * seg, void * user_para);
unsigned long long fuzzy_status(fuzzy_handle_t * handle, int type);
/* Base64 alphabet: each slice's FNV value is reduced modulo 64 and mapped to one of these characters. */
char * b64 =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/**
 * Reset a rolling-hash state: every field, including the window, becomes zero.
 */
static void roll_init(struct roll_state * self)
{
    memset(self, 0, sizeof(*self));
}
/**
 * Feed one byte into the rolling hash: the byte replaces the oldest entry of
 * the ROLLING_WINDOW-byte window and h1/h2/h3 are updated accordingly.
 * All arithmetic is unsigned and wraps modulo 2^32.
 */
static void roll_hash(struct roll_state * self, unsigned char c)
{
    const unsigned int incoming = (unsigned int)c;
    const unsigned int outgoing = (unsigned int)self->window[self->n];

    /* h2 is updated with the OLD h1, so it must go first */
    self->h2 = self->h2 - self->h1 + ROLLING_WINDOW * incoming;
    /* h1 tracks the plain sum of the bytes in the window */
    self->h1 = self->h1 + incoming - outgoing;

    /* Overwrite the oldest byte and advance the circular index */
    self->window[self->n] = c;
    self->n = (self->n + 1 == ROLLING_WINDOW) ? 0 : self->n + 1;

    /* h3: shift-and-xor; older bytes are gradually shifted out */
    self->h3 = (self->h3 << 5) ^ c;
}
/**
 * Combine the three rolling-hash components into the value that the callers
 * test against the block size to decide where a slice ends.
 */
static unsigned int roll_sum(const struct roll_state * self)
{
    unsigned int combined = self->h1;

    combined += self->h2;
    combined += self->h3;
    return combined;
}
/**
 * One step of the per-slice FNV-style hash: multiply the running value by
 * HASH_PRIME, then xor in the new byte.
 */
static unsigned int sum_hash(unsigned char c, unsigned int h)
{
    unsigned int mixed = h * HASH_PRIME;

    mixed ^= c;
    return mixed;
}
/**
 * Create a fuzzy-hash handle.
 *
 * origin_len: stored as orilen; the block size used for slicing is derived
 *             from it by get_blocksize().
 *
 * Returns the new handle, or NULL when memory cannot be allocated or the
 * interval index cannot be created. Release it with fuzzy_destroy_handle().
 */
fuzzy_handle_t * fuzzy_create_handle(unsigned long long origin_len)
{
    fuzzy_handle_inner_t * handle = (fuzzy_handle_inner_t *)malloc(sizeof(fuzzy_handle_inner_t));

    if(NULL == handle)
    {
        return NULL;
    }
    handle->orilen = origin_len;
    handle->effective_length = 0;
    handle->ivi = IVI_create();
    /* NOTE(review): assumes IVI_create reports failure by returning NULL - confirm */
    if(NULL == handle->ivi)
    {
        free(handle);
        return NULL;
    }
    return (fuzzy_handle_t *)handle;
}
/**
 * IVI callback that releases the fuzzy_node attached to a segment: its
 * left_data copy, its hash string, its rolling-hash state and the node
 * itself. The segment is not freed here.
 *
 * seg:      segment whose data field points to a fuzzy_node; a NULL segment
 *           or a segment without data is ignored.
 * usr_para: unused.
 */
void fuzzy_node_free(IVI_seg_t * seg, void * usr_para)
{
    fuzzy_node * temp;

    (void)usr_para;
    if(NULL == seg || NULL == seg->data)
    {
        return;
    }
    temp = (fuzzy_node *)(seg->data);
    /* free(NULL) is a no-op, so the members need no individual guards */
    free(temp->left_data);
    free(temp->hash_result);
    free(temp->right_status_r);
    free(temp);
    /* Do not leave a dangling pointer behind in the segment */
    seg->data = NULL;
}
/**
 * Destroy a handle created by fuzzy_create_handle: every segment of the
 * interval index is released together with its fuzzy_node, then the handle
 * itself. A NULL handle is ignored.
 */
void fuzzy_destroy_handle(fuzzy_handle_t * handle)
{
    fuzzy_handle_inner_t * inner = (fuzzy_handle_inner_t *)handle;

    if(NULL == inner)
    {
        return;
    }
    IVI_destroy(inner->ivi, fuzzy_node_free, NULL);
    free(inner);
}
/**
* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݣ<EFBFBD><DDA3><EFBFBD><EFBFBD>Ҽ<EFBFBD><D2BC><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݵ<EFBFBD>fuzzy_hashֵ
*/
unsigned int fuzzy_feed(fuzzy_handle_t * handle, const char * data, unsigned int size, unsigned long long offset)
{
fuzzy_node * node = (fuzzy_node *)calloc(sizeof(fuzzy_node), 1);
node->right_status_r = (struct roll_state *)calloc(sizeof (struct roll_state), 1);
roll_init(node->right_status_r);
node->slice_num = 0;
unsigned int length = segment_overlap(handle, node, size, offset, data);
if(offset == 0)
{
((fuzzy_handle_inner_t *)handle)->effective_length += size - node->right_len;
return (size - node->right_len);
}
else
{
((fuzzy_handle_inner_t *)handle)->effective_length += length;
}
return length; //<2F><><EFBFBD><EFBFBD><EFBFBD>Ѿ<EFBFBD><D1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ч<EFBFBD><D0A7><EFBFBD><EFBFBD>
}
unsigned long long get_blocksize(unsigned long long orilen)
{
double tmp = orilen/(64 * BLOCKSIZE_MIN);
double index = floor(log(tmp)/log(2));
double tmp_t = pow(2, index);
unsigned long long blocksize = (unsigned long long)(tmp_t * BLOCKSIZE_MIN);
return blocksize;
}
/**
* <20>ж<EFBFBD><D0B6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƿ<EFBFBD><C7B7><EFBFBD><EFBFBD>Ѿ<EFBFBD><D1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>и<EFBFBD><D0B8><EFBFBD>
*/
unsigned int segment_overlap(fuzzy_handle_t * handle, fuzzy_node * fnode, unsigned int size, unsigned long long offset, const char * data)
{
IVI_seg_t ** overlap_segs = NULL;
IVI_seg_t * seg = IVI_seg_malloc(offset, offset + size -1, (void *)fnode);
int overlap_segnum = 0;
unsigned int effective_length = 0;
unsigned int total_length = 0;
unsigned long long blocksize = get_blocksize(((fuzzy_handle_inner_t *)handle)->orilen);
/*<2A><>ѯ<EFBFBD>Ƿ<EFBFBD><C7B7>и<EFBFBD><D0B8>ǣ<EFBFBD><C7A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD>и<EFBFBD><D0B8>ǣ<EFBFBD><C7A3><EFBFBD><EFBFBD>ظ<EFBFBD><D8B8>ǵ<EFBFBD>segment<6E><74>Ƭ<EFBFBD><C6AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>û<EFBFBD>и<EFBFBD><D0B8>ǣ<EFBFBD><C7A3><EFBFBD><EFBFBD><EFBFBD>0*/
overlap_segnum = IVI_query(((fuzzy_handle_inner_t *)handle)->ivi, offset, offset + size - 1, &overlap_segs);
/*<2A><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵΪ<D6B5><CEAA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˵<EFBFBD><CBB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>IJ<EFBFBD><C4B2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><E2A3AC>ӡ<EFBFBD><D3A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ*/
if(overlap_segnum < 0)
{
printf("fragment info error!\n");
IVI_seg_free(seg, fuzzy_node_free, NULL);
return 0;
}
/*<2A><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵΪ0<CEAA><30>˵<EFBFBD><CBB5>û<EFBFBD>и<EFBFBD><D0B8>ǵ<EFBFBD><C7B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֱ<EFBFBD>Ӳ<EFBFBD><D3B2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*/
if(overlap_segnum == 0)
{
IVI_insert(((fuzzy_handle_inner_t *)handle)->ivi,seg);
effective_length = fuzzy_hash_calculate(seg, data, offset, blocksize);
total_length = seg->right - seg->left + 1;
return effective_length;
}
/*<2A><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵΪ<D6B5><CEAA><EFBFBD>ǵ<EFBFBD>Ƭ<EFBFBD><C6AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><D2AA><EFBFBD>ݸ<EFBFBD><DDB8><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>һһ<D2BB><D2BB><EFBFBD>д<EFBFBD><D0B4><EFBFBD>*/
int flag = 0;
int i;
for(i = 0; i < overlap_segnum; i++)
{
switch(IVI_relative_position(seg, overlap_segs[i]))
{
case LEFT_OVERLAP: //<2F>󸲸ǣ<F3B8B2B8><C7A3><EFBFBD>seg<65><67><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>Ϊoverlap_seg<65><67><EFBFBD><EFBFBD>ֵ
{
seg->right = overlap_segs[i]->left - 1;
break;
}
case CONTAIN: //<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Dz<EFBFBD><C7B2><EFBFBD>ֱ<EFBFBD>Ӳ<EFBFBD><D3B2>룬Ȼ<EBA3AC><C8BB><EFBFBD>ı<EFBFBD>seg<65><67><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>data<74>ƶ<EFBFBD><C6B6><EFBFBD>ָ<EFBFBD><D6B8><EFBFBD><EFBFBD>λ<EFBFBD><CEBB>
{
if(overlap_segs[i]->left - 1 >= seg->left)
{
fuzzy_node * node = (fuzzy_node *)calloc(sizeof(fuzzy_node), 1);
memcpy(node, fnode, sizeof(fuzzy_node));
node->right_status_r = (struct roll_state *)calloc(sizeof (struct roll_state), 1);
roll_init(node->right_status_r);
IVI_seg_t * thseg = IVI_seg_malloc(seg->left, overlap_segs[i]->left - 1, (void *)node);
IVI_insert(((fuzzy_handle_inner_t *)handle)->ivi,thseg);
effective_length += fuzzy_hash_calculate(thseg, data, offset, blocksize);
total_length += thseg->right - thseg->left + 1;
}
seg->left = overlap_segs[i]->right + 1;
data = data + ((seg->left) - offset);
offset = seg->left;
break;
}
case RIGHT_OVERLAP: //<2F>Ҹ<EFBFBD><D2B8>ǣ<EFBFBD><C7A3><EFBFBD>seg<65><67><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>Ϊoverlap_seg<65><67><EFBFBD><EFBFBD>ֵ
{
seg->left = overlap_segs[i]->right + 1;
data = data + ((seg->left) - offset);
offset = seg->left;
break;
}
case CONTAINED: //<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֱ<EFBFBD><D6B1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƭ
{
flag = 1;
//printf("contained! free seg\n");
IVI_seg_free(seg, fuzzy_node_free, NULL);
free(overlap_segs);
break;
}
default:
break;
}
if(flag == 1)
{
return 0;
}
}
/*<2A><><EFBFBD><EFBFBD><EFBFBD>µ<EFBFBD><C2B5><EFBFBD><EFBFBD>ݲ<EFBFBD><DDB2><EFBFBD><EBB5BD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><E6A3AC><EFBFBD>ҽ<EFBFBD><D2BD>м<EFBFBD><D0BC><EFBFBD>*/
if(seg->left <= seg->right)
{
IVI_insert(((fuzzy_handle_inner_t *)handle)->ivi, seg);
effective_length += fuzzy_hash_calculate(seg, data, offset, blocksize);
total_length += seg->right - seg->left + 1;
//((fuzzy_handle_inner_t *)handle)->effective_length += effective_length;
}
else
{
IVI_seg_free(seg, fuzzy_node_free, NULL);
}
free(overlap_segs);
return effective_length;
}
/**
 * Hash a segment that has just been inserted into the index and stitch it
 * to its continuous neighbours.
 *
 * seg:       the freshly inserted segment (its data is a fuzzy_node)
 * data:      the segment's bytes
 * offset:    stream offset of data[0]
 * blocksize: slicing modulus from get_blocksize()
 *
 * Also records left_offset / right_offset in the segment's node.
 * Returns the effective length attributed to this segment.
 */
unsigned int fuzzy_hash_calculate(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize)
{
    IVI_seg_t * left_nb = IVI_prev_continuous_seg(seg);
    IVI_seg_t * right_nb = IVI_next_continuous_seg(seg);
    unsigned int seg_len = seg->right - seg->left + 1;
    fuzzy_node * self = (fuzzy_node *)(seg->data);
    unsigned int effective;

    if(left_nb != NULL)
    {
        /* Continue from the left neighbour's rolling/FNV state; its
         * unfinished tail is now completed by this segment. */
        fuzzy_node * left_node;

        fuzzy_calculate_self_with_prev(left_nb, seg, data, blocksize);
        left_node = (fuzzy_node *)(left_nb->data);
        effective = seg_len + left_node->right_len;
        self->left_offset = offset - left_node->right_len;
    }
    else
    {
        /* Nothing continuous on the left: start from a clean state. The
         * bytes before the first boundary are not counted. */
        roll_init(self->right_status_r);
        fuzzy_calculate_self(seg, data, offset, blocksize);
        effective = seg_len - self->left_len;
        self->left_offset = offset + self->left_len;
    }

    if(NULL == right_nb)
    {
        /* No continuous successor: the unfinished tail does not count */
        effective -= self->right_len;
        self->right_offset = offset + (seg_len - (self->right_len));
    }
    else
    {
        /* Re-hash the head of the right neighbour with our end state; its
         * (possibly updated) left part now counts as well. */
        fuzzy_node * right_node;

        fuzzy_modify_next(seg, right_nb, blocksize);
        right_node = (fuzzy_node *)(right_nb->data);
        effective += right_node->left_len;
        self->right_offset = offset + seg_len + right_node->left_len;
    }
    return effective;
}
/**
 * Hash a segment that has no continuous predecessor.
 *
 * The rolling hash in the node's right_status_r is advanced over every byte.
 * A slice boundary is declared at byte pos when at least ROLLING_WINDOW
 * bytes have been seen and roll_sum % blocksize == blocksize - 1; each
 * boundary emits one base64 character derived from the FNV value of the
 * slice. On return the node holds: slice_num, left_len, right_len,
 * right_status_shash, hash_result and a copy of the first left_len bytes
 * in left_data. The offset parameter is not used.
 */
void fuzzy_calculate_self(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize)
{
    fuzzy_node * fn = (fuzzy_node *)(seg->data);
    struct roll_state * roll = fn->right_status_r;
    unsigned long long seg_len = seg->right - seg->left + 1;
    unsigned long long pos = 0;
    unsigned long long last_cut = 0;
    unsigned long long emitted = 0;
    unsigned int fnv = HASH_INIT;
    unsigned int rolled;
    /* At most one character per byte can be emitted */
    char * scratch = (char *)malloc(sizeof(char) * seg_len);

    while(pos < seg_len)
    {
        roll_hash(roll, data[pos]);
        rolled = roll_sum(roll);
        fnv = sum_hash(data[pos], fnv);
        if(pos >= ROLLING_WINDOW - 1 && rolled % blocksize == blocksize - 1)
        {
            fn->slice_num ++;
            if(1 == fn->slice_num)
            {
                /* First boundary: everything up to here is the "left" part */
                fn->left_len = pos + 1;
            }
            last_cut = pos;
            scratch[emitted ++] = b64[fnv % 64];
            /* The next slice starts with a fresh FNV value */
            fnv = HASH_INIT;
        }
        pos ++;
    }

    if(0 == fn->slice_num)
    {
        /* No boundary at all: the whole segment is "left" */
        fn->left_len = seg_len;
        fn->right_len = 0;
    }
    else
    {
        fn->right_len = seg_len - last_cut - 1;
    }
    /* FNV value of the unfinished tail, for a later continuation */
    fn->right_status_shash = fnv;

    /* Publish the emitted characters as a NUL-terminated string */
    fn->hash_result = (char *)malloc(sizeof(char) * (emitted + 1));
    memcpy(fn->hash_result, scratch, emitted);
    (fn->hash_result)[emitted] = '\0';

    /* Keep the left part so it can be re-hashed if a predecessor arrives */
    fn->left_data = (char *)malloc(sizeof(char) * (fn->left_len));
    memcpy(fn->left_data, data, fn->left_len);

    free(scratch);
    return;
}
/**
 * Sum the lengths of seg and of the continuous segments before it, walking
 * backwards. The walk stops as soon as the sum reaches ROLLING_WINDOW, so
 * the result only says "how many bytes, up to at least a full window".
 * Returns 0 for a NULL segment.
 */
unsigned long long get_prev_continous_length(IVI_seg_t * seg)
{
    unsigned long long total = 0;
    IVI_seg_t * cur;

    for(cur = seg; cur != NULL; cur = IVI_prev_continuous_seg(cur))
    {
        total += cur->right - cur->left + 1;
        if(total >= ROLLING_WINDOW)
        {
            break;
        }
    }
    return total;
}
/**
 * Hash a segment that directly continues prev_seg.
 *
 * Same procedure as fuzzy_calculate_self, except that the rolling state and
 * the running FNV value are inherited from the predecessor's end state, and
 * the "window is full" test takes the bytes already seen in the continuous
 * run before this segment into account.
 *
 * NOTE(review): fuzzy_calculate_self accepts a boundary once
 * pos >= ROLLING_WINDOW - 1, whereas the test here is
 * pos + prev_len >= ROLLING_WINDOW, i.e. one byte later when the run before
 * this segment is shorter than a window. Looks like an off-by-one between
 * the two paths - confirm which one is intended.
 */
void fuzzy_calculate_self_with_prev(IVI_seg_t * prev_seg, IVI_seg_t * seg, const char * data, unsigned long long blocksize)
{
    fuzzy_node * before_fn = (fuzzy_node *)(prev_seg->data);
    fuzzy_node * fn = (fuzzy_node *)(seg->data);
    struct roll_state * roll = fn->right_status_r;
    unsigned long long seg_len = seg->right - seg->left + 1;
    unsigned long long pos = 0;
    unsigned long long last_cut = 0;
    unsigned long long emitted = 0;
    unsigned long long seen_before;
    unsigned int fnv = before_fn->right_status_shash;
    unsigned int rolled;
    char * scratch;

    /* Start from the predecessor's rolling state */
    memcpy(roll, before_fn->right_status_r, sizeof(struct roll_state));
    scratch = (char *)malloc(sizeof(char) * seg_len);
    seen_before = get_prev_continous_length(prev_seg);

    while(pos < seg_len)
    {
        roll_hash(roll, data[pos]);
        rolled = roll_sum(roll);
        fnv = sum_hash(data[pos], fnv);
        if(pos + seen_before >= ROLLING_WINDOW
            && rolled % blocksize == blocksize - 1)
        {
            fn->slice_num ++;
            if(1 == fn->slice_num)
            {
                fn->left_len = pos + 1;
            }
            last_cut = pos;
            scratch[emitted ++] = b64[fnv % 64];
            /* The next slice starts with a fresh FNV value */
            fnv = HASH_INIT;
        }
        pos ++;
    }

    if(0 == fn->slice_num)
    {
        /* No boundary at all: the whole segment is "left" */
        fn->left_len = seg_len;
        fn->right_len = 0;
    }
    else
    {
        fn->right_len = seg_len - last_cut - 1;
    }
    fn->right_status_shash = fnv;

    /* Publish the emitted characters as a NUL-terminated string */
    fn->hash_result = (char *)malloc(sizeof(char) * (emitted + 1));
    memcpy(fn->hash_result, scratch, emitted);
    (fn->hash_result)[emitted] = '\0';

    /* Keep the left part so it can be re-hashed later */
    fn->left_data = (char *)malloc(sizeof(char) * (fn->left_len));
    memcpy(fn->left_data, data, fn->left_len);

    free(scratch);
}
/**
 * Re-hash an already stored segment after a continuous predecessor has
 * appeared in front of it.
 *
 * Works like fuzzy_calculate_self_with_prev, but the node already owns a
 * hash_result and a left_data buffer; both are freed and replaced. data
 * must therefore NOT alias the node's own left_data (fuzzy_modify_next
 * passes a private copy).
 */
void fuzzy_modify_self_with_prev(IVI_seg_t * prev_seg, IVI_seg_t * seg, char * data, unsigned long long blocksize)
{
    fuzzy_node * before_fn = (fuzzy_node *)(prev_seg->data);
    fuzzy_node * fn = (fuzzy_node *)(seg->data);
    struct roll_state * roll = fn->right_status_r;
    unsigned long long seg_len = seg->right - seg->left + 1;
    unsigned long long pos = 0;
    unsigned long long last_cut = 0;
    unsigned long long emitted = 0;
    unsigned long long seen_before;
    unsigned int fnv = before_fn->right_status_shash;
    unsigned int rolled;
    char * scratch;

    /* Start from the predecessor's rolling state */
    memcpy(roll, before_fn->right_status_r, sizeof(struct roll_state));
    scratch = (char *)malloc(sizeof(char) * seg_len);
    seen_before = get_prev_continous_length(prev_seg);

    while(pos < seg_len)
    {
        roll_hash(roll, data[pos]);
        rolled = roll_sum(roll);
        fnv = sum_hash(data[pos], fnv);
        if(pos + seen_before >= ROLLING_WINDOW
            && rolled % blocksize == blocksize - 1)
        {
            fn->slice_num ++;
            if(1 == fn->slice_num)
            {
                fn->left_len = pos + 1;
            }
            last_cut = pos;
            scratch[emitted ++] = b64[fnv % 64];
            /* The next slice starts with a fresh FNV value */
            fnv = HASH_INIT;
        }
        pos ++;
    }

    if(0 == fn->slice_num)
    {
        /* Still no boundary: the whole segment stays "left" */
        fn->left_len = seg_len;
        fn->right_len = 0;
    }
    else
    {
        fn->right_len = seg_len - last_cut - 1;
    }
    fn->right_status_shash = fnv;

    /* Replace the old hash string */
    free(fn->hash_result);
    fn->hash_result = (char *)malloc(sizeof(char) * (emitted + 1));
    memcpy(fn->hash_result, scratch, emitted);
    (fn->hash_result)[emitted] = '\0';

    /* Replace the old left part */
    free(fn->left_data);
    fn->left_data = (char *)malloc(sizeof(char) * (fn->left_len));
    memcpy(fn->left_data, data, fn->left_len);

    free(scratch);
}
/**
 * Propagate the end state of seg into the continuous segments after it.
 *
 * Phase 1: every following segment that has no slice boundary yet is
 * re-hashed completely (fuzzy_modify_self_with_prev), each one continuing
 * from the segment before it.
 * Phase 2: for the first following segment that does have a boundary, only
 * its left part (left_data) is re-hashed with the inherited state. The first
 * character of its hash string - which was computed without a predecessor -
 * is replaced by the characters produced now; the rest of the string is
 * kept. left_len shrinks to the first boundary found, and slice_num is
 * adjusted by (new boundaries - 1).
 *
 * The function returns early, leaving the remaining segments untouched,
 * if a temporary buffer cannot be allocated.
 */
void fuzzy_modify_next(IVI_seg_t * seg, IVI_seg_t * next_seg, unsigned long long blocksize)
{
    IVI_seg_t * tmp_curr_seg = seg;
    IVI_seg_t * tmp_next_seg = next_seg;
    fuzzy_node * tmp_curr_node;
    fuzzy_node * tmp_next_node;
    unsigned long long prev_len;
    unsigned long long size;
    unsigned long long fnv_index = 0;
    unsigned long long i;
    unsigned long long old_len;
    unsigned long long kept_len;
    unsigned int roll_hash_value;
    unsigned int FNV_hash_value;
    struct roll_state rs;
    char * data;
    char * FNV_hash;
    char * old_hash;
    char * new_hash;

    /* Phase 1: segments without any boundary are re-hashed as a whole */
    while(tmp_next_seg != NULL)
    {
        tmp_next_node = (fuzzy_node *)(tmp_next_seg->data);
        if(tmp_next_node->slice_num != 0)
        {
            break;
        }
        /* Private copy: the callee frees and replaces the node's left_data */
        data = (char *)malloc(sizeof(char) * (tmp_next_node->left_len));
        if(NULL == data)
        {
            return;
        }
        memcpy(data, tmp_next_node->left_data, tmp_next_node->left_len);
        fuzzy_modify_self_with_prev(tmp_curr_seg, tmp_next_seg, data, blocksize);
        free(data);
        tmp_curr_seg = tmp_next_seg;
        tmp_next_seg = IVI_next_continuous_seg(tmp_next_seg);
    }
    if(NULL == tmp_next_seg)
    {
        return;
    }

    /* Phase 2: tmp_next_seg already has boundaries; redo its left part only */
    prev_len = get_prev_continous_length(tmp_curr_seg);
    tmp_curr_node = (fuzzy_node *)(tmp_curr_seg->data);
    tmp_next_node = (fuzzy_node *)(tmp_next_seg->data);
    size = tmp_next_node->left_len;
    FNV_hash = (char *)malloc(sizeof(char) * size);
    if(NULL == FNV_hash)
    {
        return;
    }
    /* Work on a copy of the rolling state: the stored state of the current
     * segment must stay what it was at that segment's end. */
    memcpy(&rs, tmp_curr_node->right_status_r, sizeof(struct roll_state));
    data = tmp_next_node->left_data;
    FNV_hash_value = tmp_curr_node->right_status_shash;
    for(i = 0; i < size; i++)
    {
        roll_hash(&rs, data[i]);
        roll_hash_value = roll_sum(&rs);
        FNV_hash_value = sum_hash(data[i], FNV_hash_value);
        if((i + prev_len >= ROLLING_WINDOW)
            && roll_hash_value % blocksize == blocksize - 1)
        {
            tmp_next_node->slice_num ++;
            FNV_hash[fnv_index ++] = b64[FNV_hash_value % 64];
            FNV_hash_value = HASH_INIT;
            if(fnv_index == 1)
            {
                /* The left part now ends at the first boundary found */
                tmp_next_node->left_len = i + 1;
            }
        }
    }
    /* The old first boundary is replaced, not added to */
    tmp_next_node->slice_num --;

    /* New hash = freshly produced characters + old hash minus its first
     * character. Built directly in a heap buffer: no variable-length stack
     * array, and an empty old hash cannot underflow the length. */
    old_hash = tmp_next_node->hash_result;
    old_len = (old_hash != NULL) ? strlen(old_hash) : 0;
    kept_len = (old_len > 0) ? old_len - 1 : 0;
    new_hash = (char *)malloc(sizeof(char) * (fnv_index + kept_len + 1));
    if(new_hash != NULL)
    {
        memcpy(new_hash, FNV_hash, fnv_index);
        if(kept_len > 0)
        {
            memcpy(new_hash + fnv_index, old_hash + 1, kept_len);
        }
        new_hash[fnv_index + kept_len] = '\0';
        free(old_hash);
        tmp_next_node->hash_result = new_hash;
    }
    free(FNV_hash);
    return;
}
/**
 * Produce the textual digest of everything fed into the handle so far.
 *
 * Every segment of the index is visited in order and fuzzy_hash_merge_new
 * appends its contribution to result (hash characters followed by
 * "[first:last]" offset ranges). The last byte of the buffer is always set
 * to NUL.
 *
 * handle: handle created by fuzzy_create_handle
 * result: caller-supplied output buffer
 * size:   capacity of result in bytes, must be at least 1
 *
 * Returns 0 on success, -1 when handle or result is NULL or size is 0.
 *
 * NOTE(review): result is not cleared before the traversal; confirm that
 * the merge callback leaves a terminated string when the index is empty.
 */
int fuzzy_digest(fuzzy_handle_t * handle, char * result, unsigned int size)
{
    final_result ctx;

    if(NULL == handle || NULL == result || 0 == size)
    {
        /* size 0 would make result[size - 1] write far out of bounds */
        return -1;
    }
    /* The context only has to live for the duration of the traversal,
     * so it can sit on the stack. */
    ctx.head = result;
    ctx.size = size;
    ctx.offset = 0;
    ctx.first_FNV_offset = 0;
    ctx.last_FNV_offset = 0;
    IVI_traverse(((fuzzy_handle_inner_t *)handle)->ivi, fuzzy_hash_merge_new, (void *)&ctx);
    result[size - 1] = '\0';
    return 0;
}
void fuzzy_hash_merge_new(IVI_seg_t * seg, void * user_para)
{
IVI_seg_t * prev_seg;
IVI_seg_t * next_seg;
prev_seg = IVI_prev_continuous_seg(seg);
next_seg = IVI_next_continuous_seg(seg);
char buffer[MAXSIZE];
final_result * tmp = (final_result *)user_para;
fuzzy_node * node = (fuzzy_node *)(seg->data);
if(node->slice_num != 0)
{
tmp->last_FNV_offset = seg->right - node->right_len;
}
if(prev_seg == NULL && next_seg == NULL) //<2F><><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0>Ƭ<EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>Ƭ<EFBFBD><C6AC>Ϊ<EFBFBD>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ƴ<EFBFBD><C6B4>
{
tmp->first_FNV_offset = seg->left;
tmp->last_FNV_offset = seg->right - node->right_len;
sprintf(buffer, "%s[%llu:%llu]", node->hash_result, tmp->first_FNV_offset, seg->right);
}
if(prev_seg == NULL && next_seg != NULL) //<2F><><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0>ƬΪ<C6AC>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD>Ƭ<EFBFBD><C6AC>Ϊ<EFBFBD>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5><EFBFBD><EFBFBD>FNVֵ<56><D6B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȥ
{
tmp->first_FNV_offset = seg->left;
sprintf(buffer, "%s", node->hash_result);
}
if(prev_seg != NULL && next_seg == NULL) //<2F><><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0>Ƭ<EFBFBD><C6AC>Ϊ<EFBFBD>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD>ƬΪ<C6AC>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5><EFBFBD><EFBFBD>ƫ<EFBFBD><C6AB>
{
sprintf(buffer, "%s[%llu:%llu]", node->hash_result, tmp->first_FNV_offset, seg->right);
}
if(prev_seg != NULL && next_seg != NULL) //<2F><><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0>Ƭ<EFBFBD><C6AC>Ϊ<EFBFBD>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD>Ƭ<EFBFBD><C6AC>Ϊ<EFBFBD>գ<EFBFBD><D5A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>FNVֵ<56><D6B5>ȥ
{
sprintf(buffer, "%s", node->hash_result);
}
unsigned int inner_size = strlen(buffer);
tmp->offset += inner_size;
if(tmp->offset <= tmp->size)
{
memcpy(tmp->head, buffer, inner_size);
tmp->head += inner_size;
}
else
{
unsigned int length = (tmp->size - (tmp->offset - inner_size));
if(length != 0)
{
memcpy(tmp->head, buffer, length);
}
tmp->offset = tmp->size;
tmp->head += length;
}
return;
}
/**
 * Report one of the lengths associated with a fuzzy hash handle.
 *
 * @param handle  fuzzy handle to query.
 * @param type    TOTAL_LENGTH, EFFECTIVE_LENGTH or HASH_LENGTH.
 * @return the requested length, or 0 when type is not one of the above.
 */
unsigned long long fuzzy_status(fuzzy_handle_t * handle, int type)
{
	fuzzy_handle_inner_t * inner = (fuzzy_handle_inner_t *)(handle);

	/* Length reported by the interval index for the stored segments. */
	if(type == TOTAL_LENGTH)
	{
		return IVI_seg_length(inner->ivi);
	}

	/* Effective length recorded on the handle itself. */
	if(type == EFFECTIVE_LENGTH)
	{
		return inner->effective_length;
	}

	/* Sum of the formatted length of every segment, plus one
	 * (presumably room for the terminating NUL of the digest string). */
	if(type == HASH_LENGTH)
	{
		final_length acc;

		acc.hash_length = 0;
		acc.first_FNV_offset = 0;
		acc.last_FNV_offset = 0;
		IVI_traverse(inner->ivi, fuzzy_hash_length, (void *)&acc);
		return acc.hash_length + 1;
	}

	return 0;
}
/**
 * IVI_traverse callback: add the formatted length of one segment to the
 * running total, using the same formatting rules as fuzzy_hash_merge_new().
 *
 * Every segment counts strlen(hash_result). A segment with no following
 * continuous segment also counts the length of "[first:right]".
 *
 * @param seg        segment being visited; seg->data is a fuzzy_node.
 * @param user_para  final_length accumulator (running hash_length and the
 *                   first/last FNV offsets).
 */
void fuzzy_hash_length(IVI_seg_t * seg, void * user_para)
{
	IVI_seg_t * prev_seg = IVI_prev_continuous_seg(seg);
	IVI_seg_t * next_seg = IVI_next_continuous_seg(seg);
	final_length * tmp = (final_length *)user_para;
	fuzzy_node * node = (fuzzy_node *)(seg->data);
	/* "[" + 20 digits + ":" + 20 digits + "]" + NUL fits in 44 bytes. */
	char suffix[48];
	int suffix_len;

	if(node->slice_num != 0)
	{
		tmp->last_FNV_offset = seg->right - node->right_len;
	}
	if(prev_seg == NULL)
	{
		/* Start of a continuous run: remember where it begins. */
		tmp->first_FNV_offset = seg->left;
		if(next_seg == NULL)
		{
			/* Isolated segment: it is also the end of its own run. */
			tmp->last_FNV_offset = seg->right - node->right_len;
		}
	}

	/* Only the length is needed, so hash_result is measured directly instead
	 * of being sprintf'ed into a fixed-size stack buffer that a long hash
	 * could overflow. */
	tmp->hash_length += strlen(node->hash_result);

	if(next_seg == NULL)
	{
		/* End of a continuous run: account for the "[first:right]" suffix. */
		suffix_len = snprintf(suffix, sizeof(suffix), "[%llu:%llu]",
				(unsigned long long)tmp->first_FNV_offset,
				(unsigned long long)seg->right);
		if(suffix_len > 0)
		{
			if((size_t)suffix_len >= sizeof(suffix))
			{
				suffix_len = (int)sizeof(suffix) - 1;
			}
			tmp->hash_length += (size_t)suffix_len;
		}
	}
	return;
}