From f1da6a2b8168834abfd6bafd26fb68c84bfe8950 Mon Sep 17 00:00:00 2001 From: zhengchao Date: Fri, 13 Nov 2015 11:41:52 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86mesafuzzy=E5=92=8CGIE=E7=9A=84?= =?UTF-8?q?=E6=BA=90=E4=BB=A3=E7=A0=81=E9=9B=86=E6=88=90=E8=BF=9B=E5=85=A5?= =?UTF-8?q?Maat=EF=BC=8C=E5=B9=B6=E4=BF=AE=E6=94=B9Makefile=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/entry/Makefile | 7 +- src/entry/great_index_engine.c | 864 ++++++++++++++++++++++++++++++ src/entry/interval_index.c | 736 +++++++++++++++++++++++++ src/entry/mesa_fuzzy.c | 828 ++++++++++++++++++++++++++++ src/inc_internal/interval_index.h | 298 +++++++++++ src/inc_internal/mesa_fuzzy.h | 3 +- src/inc_internal/queue.h | 113 ++++ 7 files changed, 2845 insertions(+), 4 deletions(-) create mode 100644 src/entry/great_index_engine.c create mode 100644 src/entry/interval_index.c create mode 100644 src/entry/mesa_fuzzy.c create mode 100644 src/inc_internal/interval_index.h create mode 100644 src/inc_internal/queue.h diff --git a/src/entry/Makefile b/src/entry/Makefile index e3e8521..1d17c62 100644 --- a/src/entry/Makefile +++ b/src/entry/Makefile @@ -1,11 +1,11 @@ #opt: OPTFLAGS = -O2 #export OPTFLAGS -CC = g++ +CC = gcc CCC = g++ CFLAGS = -Wall -g -fPIC CFLAGS += $(OPTFLAGS) -LDFLAGS = -lMESA_handle_logger -lMESA_htable -lpthread +LDFLAGS = -lMESA_handle_logger -lMESA_htable -lpthread -lm MAILLIB = ../lib G_H_DIR =../inc_internal @@ -13,7 +13,8 @@ H_DIR =-I$(G_H_DIR) -I../../inc LIBMAAT = libmaatframe.a LIBMAAT_SO = libmaatframe.so -OBJS=config_monitor.o Maat_rule.o Maat_api.o UniversalBoolMatch.o dynamic_array.o cJSON.o json2iris.o map_str2int.o +OBJS=config_monitor.o Maat_rule.o Maat_api.o UniversalBoolMatch.o dynamic_array.o cJSON.o json2iris.o map_str2int.o\ + interval_index.o great_index_engine.o mesa_fuzzy.o .c.o: $(CC) -c $(CFLAGS) -I. 
$(H_DIR) $< diff --git a/src/entry/great_index_engine.c b/src/entry/great_index_engine.c new file mode 100644 index 0000000..6f663df --- /dev/null +++ b/src/entry/great_index_engine.c @@ -0,0 +1,864 @@ +#include +#include +#include +#include +#include +#include +#include "great_index_engine.h" +#include "queue.h" +int GIE_VERSION_1_0_20151109=1; +#define HTABLE_SIZE 1024*1024 +#define MAX 10000 +#define FIRST_INSERT 1 +#define SECOND_INSERT 0 +#define TOLERENCE_SIZE 0 +#define CONF_MAX 10 +#define BLOCKSIZE_MIN 3 +#define MAX_UINT64 (0xFFFFFFFFFFFFFFFF) + +typedef struct +{ + unsigned long long user_precision; + //int user_confidence_level_threshold; + double user_query_accuracy; + MESA_htable_handle id_table; + MESA_htable_handle index_table; + struct VL * valuelist; +}GIE_handle_inner_t; + +struct valuelist_node +{ + unsigned long long value; + struct VL * valuelist_name; + TAILQ_ENTRY(valuelist_node) vlistentry; +}; + +struct linklist_node +{ + unsigned long long index_key; + struct TQ * listname; + struct id_table_data * basicinfo; + TAILQ_ENTRY(linklist_node) listentry; +}; + +struct index_table_data +{ + struct TQ * listhead; + int cnt; + unsigned long long prev_value; + unsigned long long next_value; +}; + +struct id_table_data +{ + unsigned int id; + unsigned long long origin_len; + unsigned long long blocksize; + char * fh; + short cfds_lvl; + void * tag; + struct linklist_node * first_backtrack; + struct linklist_node * second_backtrack; +}; + +TAILQ_HEAD(TQ, linklist_node); +TAILQ_HEAD(VL, valuelist_node); + +void idtable_free(void * data); +void indextable_free(void * data); +int GIE_insert_indextable(GIE_handle_inner_t * handle, struct id_table_data * info, unsigned long long index_key, int flag); +int GIE_delete_from_indextable_by_key(GIE_handle_inner_t * handle, struct linklist_node * backtrack); +int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t ** digests, int size); +int GIE_union(struct TQ ** union_list, int list_num, struct id_table_data 
** result,\ + unsigned long long min, unsigned long long max, unsigned long long query_blocksize); + +struct TQ * linklist_union(struct TQ * list_first, struct TQ * list_second, unsigned long long min, unsigned long long max,\ + unsigned long long query_blocksize); + + +int minof3(int x, int y, int z); +int GIE_edit_distance(char* w1, int l1, const char* w2, int l2); +int GIE_edit_distance_with_position(char * fh, const char * fuzzy_string, unsigned long long orilen, int * fuzzy_actual_size,\ + unsigned long long * calculate_len); + +GIE_handle_t * GIE_create(const GIE_create_para_t * para) +{ + GIE_handle_inner_t * handle = (GIE_handle_inner_t *)malloc(sizeof(GIE_handle_inner_t)); + handle->user_precision = para->index_interval; + //handle->user_confidence_level_threshold = para->confidence_level_threshold; + handle->user_query_accuracy = para->query_accuracy; + + struct VL * head = (struct VL *)malloc(sizeof(struct VL)); + TAILQ_INIT(head); + handle->valuelist = head; + + + MESA_htable_create_args_t idtable_args,indextable_args; + memset(&idtable_args, 0, sizeof(idtable_args)); + memset(&indextable_args, 0, sizeof(indextable_args)); + + + idtable_args.thread_safe = 0; + idtable_args.hash_slot_size = HTABLE_SIZE; + idtable_args.max_elem_num = 4 * HTABLE_SIZE; + idtable_args.expire_time = 0; + idtable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU; + idtable_args.key_comp = NULL; + idtable_args.key2index = NULL; + idtable_args.data_free = idtable_free; + idtable_args.data_expire_with_condition = NULL; + idtable_args.recursive = 1; + + indextable_args.thread_safe = 0; + indextable_args.hash_slot_size = HTABLE_SIZE; + indextable_args.max_elem_num = 4 * HTABLE_SIZE; + indextable_args.expire_time = 0; + indextable_args.eliminate_type = HASH_ELIMINATE_ALGO_LRU; + indextable_args.key_comp = NULL; + indextable_args.key2index = NULL; + indextable_args.data_free = indextable_free; + indextable_args.data_expire_with_condition = NULL; + indextable_args.recursive = 1; + + 
#ifndef BLOCKSIZE_MIN
#define BLOCKSIZE_MIN 3	/* fallback only; the file defines this near its top */
#endif

/**
 * Derive the fuzzy-hash block size used for an input of `orilen` bytes.
 *
 * The result is BLOCKSIZE_MIN * 2^k, where 2^k is the largest power of two
 * that does not exceed orilen / (64 * BLOCKSIZE_MIN).
 *
 * Inputs shorter than 64 * BLOCKSIZE_MIN bytes used to go through log(0) and
 * come back as block size 0; they are now clamped to BLOCKSIZE_MIN.
 * NOTE(review): the position-aware edit distance in this file appears to
 * divide by the block size, so 0 was not a usable value - confirm.
 *
 * The search is done in integer arithmetic, so the result no longer depends
 * on floating-point rounding of log()/pow().
 *
 * @param orilen  length of the original input, in bytes
 * @return block size, always >= BLOCKSIZE_MIN
 */
unsigned long long calc_fh_blocksize(unsigned long long orilen)
{
	/* How many minimum-size windows (64 blocks of BLOCKSIZE_MIN) fit in the input. */
	unsigned long long units = orilen / (64ULL * BLOCKSIZE_MIN);
	unsigned long long blocksize = BLOCKSIZE_MIN;

	/* Double the block size once per halving of `units`: floor(log2(units)) steps. */
	while(units > 1)
	{
		units >>= 1;
		blocksize <<= 1;
	}
	return blocksize;
}
NULL; + TAILQ_FOREACH(tmp_node, index_data->listhead, listentry) + { + printf("id = %u orilen = %llu ", tmp_node->basicinfo->id, tmp_node->basicinfo->origin_len); + } + printf("\n"); +} + +int GIE_update(GIE_handle_t * handle, GIE_digest_t ** digests, int size) +{ + GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)(handle); + struct id_table_data * info=NULL; + int success_cnt=0; + int i = 0; + + unsigned int input_fh_len=0; + + for(i = 0; i < size; i++) + { + switch(digests[i]->operation) + { + case GIE_INSERT_OPT: + { + unsigned long long first_index_key = (digests[i]->origin_len)/(_handle->user_precision)*(_handle->user_precision); + unsigned long long second_index_key = ((digests[i]->origin_len)/(_handle->user_precision) + 1)*(_handle->user_precision); + info = (struct id_table_data *)malloc(sizeof(struct id_table_data)); + //printf("malloc id_table_data!\n"); + input_fh_len=strlen(digests[i]->fuzzy_hash); + info->fh = (char *)calloc(sizeof(char),input_fh_len+1); + memcpy(info->fh, digests[i]->fuzzy_hash, input_fh_len); + + info->origin_len = digests[i]->origin_len; + info->blocksize = calc_fh_blocksize(digests[i]->origin_len); + info->tag = digests[i]->tag; + info->id = digests[i]->id; + info->cfds_lvl = digests[i]->cfds_lvl; + + info->first_backtrack = NULL; + info->second_backtrack = NULL; + if(MESA_htable_add(_handle->id_table, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), (const void *)info) < 0) + { + printf("add %d id_table failed!",digests[i]->id); + free(info->fh); + free(info); + continue; + } + if(GIE_insert_indextable(_handle, info, first_index_key, FIRST_INSERT) < 0) + { + printf("insert %d first failed\n",info->id); + assert(0); + free(info->fh); + free(info); + continue; + } + //printf("(info->first_backtrack)->index_key = %llu\n", (info->first_backtrack)->index_key); + + if(GIE_insert_indextable(_handle, info, second_index_key, SECOND_INSERT) < 0) + { + printf("insert %d second failed\n",info->id); + assert(0); + 
free(info->fh); + free(info); + continue; + } + + success_cnt++; + break; + } + case GIE_DELETE_OPT: + { + success_cnt += GIE_delete(_handle, digests, i); + break; + } + default: + break; + } + /*struct valuelist_node * tmp = NULL; + TAILQ_FOREACH(tmp, _handle->valuelist, vlistentry) + { + struct index_table_data * tmp_t = (struct index_table_data *)(MESA_htable_search_cb(_handle->index_table, (const uchar *)(&(tmp->value)), sizeof(tmp->value), NULL, NULL, NULL)); + printf("prev_value = %llu ", tmp_t->prev_value); + printf("next_value = %llu ", tmp_t->next_value); + printf("value = %llu\n", tmp->value); + }*/ + } + return success_cnt; +} + + +int GIE_insert_indextable(GIE_handle_inner_t * handle, struct id_table_data * info, unsigned long long index_key, int flag) +{ + struct linklist_node * node_data = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + // printf("linklist_node malloc success\n"); + node_data->basicinfo = info; + node_data->index_key = index_key; + node_data->listname = NULL; + + if(flag == FIRST_INSERT) + { + info->first_backtrack = node_data; //Backtracking pointer to index table, it is a pointer to a structure pointer + // printf("1: (info->first_backtrack)->index_key = %llu\n", (info->first_backtrack)->index_key); + } + else + { + info->second_backtrack = node_data; + } + + struct index_table_data * ret = (struct index_table_data *)(MESA_htable_search_cb(handle->index_table, (const uchar *)(&index_key), sizeof(index_key), NULL, NULL, NULL)); + if(ret != NULL) + { + //printf("ret != NULL\n"); + struct linklist_node * tmp = NULL; + node_data->listname = ret->listhead; + //If there are linked list exists in index table, sorted according to id + TAILQ_FOREACH(tmp, ret->listhead, listentry) + { + if(tmp->basicinfo->id > node_data->basicinfo->id) + { + TAILQ_INSERT_BEFORE(tmp, node_data, listentry); + ret->cnt++; + return 0; + } + if(node_data->basicinfo->id == tmp->basicinfo->id) + { + printf("invalid insert!"); + return -1; + } + 
//TODO 如果id是以前插入过的id,就要返回invalid insert + } + TAILQ_INSERT_TAIL(ret->listhead, node_data, listentry); + ret->cnt ++; + } + else + { + struct index_table_data * index_data = (struct index_table_data *)malloc(sizeof(struct index_table_data)); + + struct valuelist_node * tmp_t = NULL; + struct valuelist_node * value_data = (struct valuelist_node *)malloc(sizeof(struct valuelist_node)); + value_data->value = index_key; + value_data->valuelist_name = handle->valuelist; + + int insert_flag = 0; + TAILQ_FOREACH(tmp_t, handle->valuelist, vlistentry) + { + if(tmp_t->value > value_data->value) + { + TAILQ_INSERT_BEFORE(tmp_t, value_data, vlistentry); + insert_flag = 1; + break; + } + } + if(!insert_flag) + { + TAILQ_INSERT_TAIL(handle->valuelist, value_data, vlistentry); + } + + struct valuelist_node * tmp_prev = TAILQ_PREV(value_data, VL, vlistentry); + struct valuelist_node * tmp_next = TAILQ_NEXT(value_data, vlistentry); + + if(tmp_prev != NULL && tmp_next != NULL) + { + struct index_table_data * index_tmp_prev = MESA_htable_search(handle->index_table, (const uchar *)(&(tmp_prev->value)),\ + sizeof(tmp_prev->value)); + + struct index_table_data * index_tmp_next = MESA_htable_search(handle->index_table, (const uchar *)(&(tmp_next->value)),\ + sizeof(tmp_next->value)); + index_tmp_prev->next_value = value_data->value; + index_data->prev_value = tmp_prev->value; + index_data->next_value = tmp_next->value; + index_tmp_next->prev_value = value_data->value; + } + if(tmp_prev != NULL && tmp_next == NULL) + { + + struct index_table_data * index_tmp_prev = MESA_htable_search(handle->index_table, (const uchar *)(&(tmp_prev->value)),\ + sizeof(tmp_prev->value)); + + index_tmp_prev->next_value = value_data->value; + index_data->prev_value = tmp_prev->value; + index_data->next_value = MAX_UINT64; + } + if(tmp_prev == NULL && tmp_next != NULL) + { + + struct index_table_data * index_tmp_next = MESA_htable_search(handle->index_table, (const uchar *)(&(tmp_next->value)),\ + 
sizeof(tmp_next->value)); + + index_data->prev_value = MAX_UINT64; + index_data->next_value = tmp_next->value; + index_tmp_next->prev_value = value_data->value; + } + if(tmp_prev == NULL && tmp_next == NULL) + { + index_data->prev_value = MAX_UINT64; + index_data->next_value = MAX_UINT64; + } + + + //If there are no entries, have to create a list head pointer, + //and add the corresponding entry in the index table, the data link to the back + + struct TQ * head = (struct TQ *)malloc(sizeof(struct TQ)); + index_data->listhead = head; + index_data->cnt = 0; + + TAILQ_INIT(head); + TAILQ_INSERT_TAIL(head, node_data, listentry); + index_data->cnt++; + node_data->listname = index_data->listhead; + + if(MESA_htable_add(handle->index_table, (const uchar *)(&index_key), sizeof(index_key), (const void *)index_data) < 0) + { + printf("add index_table failed!\n"); + assert(0); + return -1; + } + + // struct index_table_data * tmp_v = (struct index_table_data *)(MESA_htable_search_cb(handle->index_table, (const uchar *)(&index_key), sizeof(index_key), NULL, NULL, NULL)); + // printf("index_data->prev_value = %llu ", index_data->prev_value); + // printf("index_data->next_value = %llu ", index_data->next_value); + // printf("index_key = %llu ", index_key); + // printf("prev_value = %llu ", tmp_v->prev_value); + // printf("next_value = %llu\n", tmp_v->next_value); + + } + return 0; +} + + + +int GIE_delete_from_indextable_by_key(GIE_handle_inner_t * handle, struct linklist_node * backtrack) +{ + struct linklist_node * backtrack_node = backtrack; //Find the index table in the first meet of the list node pointer by backtracking + + //find the key + unsigned long long tmp_key = backtrack_node->index_key; + + //delete the node + TAILQ_REMOVE(backtrack_node->listname, backtrack, listentry); + + //if first node is NULL, linklist is NULL, delete the record in the hashtable + if(TAILQ_EMPTY(backtrack_node->listname) == 1) + { + if(MESA_htable_del(handle->index_table, (const uchar 
*)(&tmp_key), sizeof(tmp_key), indextable_free) < 0) + { + printf("indextable backtrack delete error!\n"); + assert(0); + return -1; + } + else + { + struct valuelist_node * tmp = NULL; + TAILQ_FOREACH(tmp, handle->valuelist, vlistentry) + { + if(tmp->value == backtrack_node->index_key) + { + break; + } + } + struct valuelist_node * tmp_prev = TAILQ_PREV(tmp, VL, vlistentry); + struct valuelist_node * tmp_next = TAILQ_NEXT(tmp, vlistentry); + if(tmp_prev != NULL && tmp_next != NULL) + { + struct index_table_data * index_tmp_prev = MESA_htable_search_cb(handle->index_table, (const uchar *)(&(tmp_prev->value)), \ + sizeof(tmp_prev->value), NULL, NULL, NULL); + struct index_table_data * index_tmp_next = MESA_htable_search_cb(handle->index_table, (const uchar *)(&(tmp_next->value)), \ + sizeof(tmp_next->value), NULL, NULL, NULL); + index_tmp_prev->next_value = tmp_next->value; + index_tmp_next->prev_value = tmp_prev->value; + } + if(tmp_prev != NULL && tmp_next == NULL) + { + struct index_table_data * index_tmp_prev = MESA_htable_search_cb(handle->index_table, (const uchar *)(&(tmp_prev->value)), \ + sizeof(tmp_prev->value), NULL, NULL, NULL); + index_tmp_prev->next_value = MAX_UINT64; + } + if(tmp_prev == NULL && tmp_next != NULL) + { + struct index_table_data * index_tmp_next = MESA_htable_search_cb(handle->index_table, (const uchar *)(&(tmp_next->value)), \ + sizeof(tmp_next->value), NULL, NULL, NULL); + index_tmp_next->prev_value = MAX_UINT64; + } + TAILQ_REMOVE(handle->valuelist, tmp, vlistentry); + free(tmp); + //printf("indextable backtrack delete success!\n"); + } + } + free(backtrack_node); + return 0; + +} + + +int GIE_delete(GIE_handle_inner_t * handle, GIE_digest_t ** digests, int i) +{ + int success_cnt=0; + struct id_table_data * ret = (struct id_table_data *) MESA_htable_search(handle->id_table, \ + (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id)); + + //if the record doesn't exist, printf delID doesn't exist! 
+ //printf("ret->id = %u\n", ret->id); + //printf("(ret->first_backtrack)->index_key = %llu\n", (ret->first_backtrack)->index_key); + if(ret == NULL) + { + printf("del %d doesn't exist!\n",digests[i]->id); + } + else + { + GIE_delete_from_indextable_by_key(handle, ret->first_backtrack); + GIE_delete_from_indextable_by_key(handle, ret->second_backtrack); + success_cnt++; + } + if(MESA_htable_del(handle->id_table, (const uchar *)(&(digests[i]->id)), sizeof(digests[i]->id), idtable_free) < 0) + { + printf("delete id failed!"); + assert(0); + } + return success_cnt; +} + + + +int GIE_union(struct TQ ** union_list, int list_num, struct id_table_data ** result,\ + unsigned long long min, unsigned long long max, unsigned long long query_blocksize) +{ + struct TQ * tmp_list = (struct TQ *)malloc(sizeof(struct TQ)); + TAILQ_INIT(tmp_list); + struct linklist_node * tmp_node = NULL; + int size = 0; + TAILQ_FOREACH(tmp_node, union_list[0], listentry) + { + if(tmp_node->basicinfo->origin_len >= min && tmp_node->basicinfo->origin_len <= max && tmp_node->basicinfo->blocksize == query_blocksize) + { + struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + new_node->index_key = tmp_node->index_key; + new_node->basicinfo = tmp_node->basicinfo; + new_node->listname = tmp_list; + TAILQ_INSERT_TAIL(tmp_list, new_node, listentry); + } + } + int i = 0; + for(i = 1; i < list_num; i++) + { + tmp_list = linklist_union(tmp_list, union_list[i], min, max, query_blocksize); + } + + struct linklist_node * tmp_node_t = NULL; + TAILQ_FOREACH(tmp_node_t, tmp_list, listentry) + { + result[size++] = tmp_node_t->basicinfo; + } + + struct linklist_node * first_node = TAILQ_FIRST(tmp_list); + while(first_node != NULL) + { + struct linklist_node * linklist_tmp = TAILQ_NEXT(first_node, listentry); + free(first_node); + first_node = linklist_tmp; + } + free(tmp_list); + return size; +} + + + + +struct TQ * linklist_union(struct TQ * list_first, struct TQ * 
list_second, unsigned long long min, unsigned long long max,\ + unsigned long long query_blocksize) +{ + struct TQ * link_result = (struct TQ *)malloc(sizeof(struct TQ)); + TAILQ_INIT(link_result); + struct linklist_node * tmp_first = TAILQ_FIRST(list_first); + struct linklist_node * tmp_second = TAILQ_FIRST(list_second); + while(tmp_first != NULL && tmp_second != NULL) + { + //When combined final result in a relatively small deposit on id, id small pointer will move backward, + // if both are equal, both pointers move backward until a move to the tail end of the list + if(tmp_first->basicinfo->id < tmp_second->basicinfo->id) + { + if(tmp_first->basicinfo->origin_len >= min && tmp_first->basicinfo->origin_len <= max && tmp_first->basicinfo->blocksize == query_blocksize) + { + struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + new_node->index_key = tmp_first->index_key; + new_node->basicinfo = tmp_first->basicinfo; + new_node->listname = link_result; + TAILQ_INSERT_TAIL(link_result, new_node, listentry); + + } + tmp_first = TAILQ_NEXT(tmp_first, listentry); + } + else if(tmp_first->basicinfo->id > tmp_second->basicinfo->id) + { + if(tmp_second->basicinfo->origin_len >= min && tmp_second->basicinfo->origin_len <= max && tmp_second->basicinfo->blocksize == query_blocksize) + { + struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + new_node->index_key = tmp_second->index_key; + new_node->basicinfo = tmp_second->basicinfo; + new_node->listname = link_result; + TAILQ_INSERT_TAIL(link_result, new_node, listentry); + } + tmp_second = TAILQ_NEXT(tmp_second, listentry); + } + else + { + if(tmp_first->basicinfo->origin_len >= min && tmp_first->basicinfo->origin_len <= max && tmp_first->basicinfo->blocksize == query_blocksize) + { + struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + new_node->index_key = tmp_first->index_key; + 
new_node->basicinfo = tmp_first->basicinfo; + new_node->listname = link_result; + TAILQ_INSERT_TAIL(link_result, new_node, listentry); + } + tmp_first = TAILQ_NEXT(tmp_first, listentry); + tmp_second = TAILQ_NEXT(tmp_second, listentry); + } + } + + //The list is not linked to the end nodes remaining deposit to results + while(tmp_first != NULL) + { + if(tmp_first->basicinfo->origin_len >= min && tmp_first->basicinfo->origin_len <= max && tmp_first->basicinfo->blocksize == query_blocksize) + { + struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + new_node->index_key = tmp_first->index_key; + new_node->basicinfo = tmp_first->basicinfo; + new_node->listname = link_result; + TAILQ_INSERT_TAIL(link_result, new_node, listentry); + } + tmp_first = TAILQ_NEXT(tmp_first, listentry); + } + while(tmp_second != NULL) + { + if(tmp_second->basicinfo->origin_len >= min && tmp_second->basicinfo->origin_len <= max && tmp_second->basicinfo->blocksize == query_blocksize) + { + struct linklist_node * new_node = (struct linklist_node *)malloc(sizeof(struct linklist_node)); + new_node->index_key = tmp_second->index_key; + new_node->basicinfo = tmp_second->basicinfo; + new_node->listname = link_result; + TAILQ_INSERT_TAIL(link_result, new_node, listentry); + } + tmp_second = TAILQ_NEXT(tmp_second, listentry); + } + + + struct linklist_node * first_node = TAILQ_FIRST(list_first); + while(first_node != NULL) + { + struct linklist_node * linklist_tmp = TAILQ_NEXT(first_node, listentry); + free(first_node); + first_node = linklist_tmp; + } + free(list_first); + + + return link_result; +} + + +int minof3(int x, int y, int z) +{ + x = (x 0 ? left/blocksize - TOLERENCE_SIZE: 0; + int fh_size = right/blocksize + TOLERENCE_SIZE - index > fh_actual_len - index ? 
fh_actual_len - index: right/blocksize + TOLERENCE_SIZE - index; + edit_distance += GIE_edit_distance(fh + index, fh_size, tmp_fuzzy, tmp_fuzzy_len); + *fuzzy_actual_size += tmp_fuzzy_len; + + if(*tmpstr !=']') + { + tmp_fuzzy = tmpstr + 1; + tmp_fuzzy_len = 0; + } + tmpstr ++; + } + else + { + tmp_fuzzy_len++; + tmpstr ++; + } + } + return edit_distance; +} + + +int GIE_query(GIE_handle_t * handle, unsigned long long origin_len, const char * fuzzy_string, GIE_result_t * results, int size) +{ + GIE_handle_inner_t * _handle = (GIE_handle_inner_t *)handle; + + //find min_index + double min_tmp = (double)(origin_len * (1 - _handle->user_query_accuracy)); + unsigned long long min_tmp_t = (unsigned long long )(floor(min_tmp)); + unsigned long long min_index = min_tmp_t/(_handle->user_precision)*(_handle->user_precision); + + //find max_index + double max_tmp = (double)(origin_len * (1 + _handle->user_query_accuracy)); + unsigned long long max_tmp_t = (unsigned long long)(floor(max_tmp)); + unsigned long long max_index = (max_tmp_t/(_handle->user_precision) + 1)*(_handle->user_precision); + + unsigned long long tmp_size = (max_index - min_index)/(_handle->user_precision) + 1; + struct TQ * union_list[tmp_size]; + + unsigned long long i = min_index; + unsigned long long query_blocksize = calc_fh_blocksize(origin_len); + int list_num = 0; + int union_size = 0; + int union_size_max = 0; + int ret_size = 0; + + //find + while(i <= max_index) + { + struct index_table_data * list_tmp = (struct index_table_data *)MESA_htable_search_cb(_handle->index_table, (const uchar * )(&i), \ + sizeof(i), NULL, NULL, NULL); + if(list_tmp != NULL) + { + union_list[list_num++] = list_tmp->listhead; + i = list_tmp->next_value; + union_size_max += list_tmp->cnt; + } + else + { + i = i + _handle->user_precision; + } + + } + + struct id_table_data ** result_union = (struct id_table_data **)malloc(sizeof(struct id_table_data *)*union_size_max); + + if(list_num != 0) + { + union_size = 
GIE_union(union_list, list_num, result_union, min_tmp_t, max_tmp_t, query_blocksize); + //printf("union_size = %d\n", union_size); + } + else + { + printf("the fh doesn't exsit!\n"); + free(result_union); + return 0; + } + + for(i = 0; i < union_size; i++) + { + int fuzzy_actual_len; + unsigned long long calculate_len; + /*if(result_union[i]->id == 2391) + { + printf("right\n"); + }*/ + int edit_distance = GIE_edit_distance_with_position(result_union[i]->fh, fuzzy_string, origin_len, &fuzzy_actual_len, &calculate_len); + //printf("fuzzy_actual_len = %d\n", fuzzy_actual_len); + short conf_tmp; + if(fuzzy_actual_len != 0 && edit_distance < fuzzy_actual_len) + { + //conf_tmp = CONF_MAX - (fuzzy_all_actual_len - (fuzzy_actual_len - edit_distance))*CONF_MAX/fuzzy_all_actual_len; + conf_tmp = (fuzzy_actual_len - edit_distance)*(calculate_len + 1)*CONF_MAX/(fuzzy_actual_len * origin_len); + //conf_tmp = CONF_MAX - edit_distance*CONF_MAX/fuzzy_actual_len; + } + else + { + conf_tmp = 0; + } + if(conf_tmp >= result_union[i]->cfds_lvl) + { + results[ret_size].cfds_lvl = conf_tmp; + results[ret_size].id = result_union[i]->id; + results[ret_size].origin_len = result_union[i]->origin_len; + results[ret_size++].tag = result_union[i]->tag; + } + if(ret_size == size) + { + break; + } + + } + + free(result_union); + return ret_size; +} + + + diff --git a/src/entry/interval_index.c b/src/entry/interval_index.c new file mode 100644 index 0000000..581acbe --- /dev/null +++ b/src/entry/interval_index.c @@ -0,0 +1,736 @@ +#include +#include +#include"interval_index.h" + + +/** + * There is a trick here. In order to hide specific + * realization of some structures, we use some approaches. + * Then the inner structure is named with "shadow", and + * the outer structure is named with "light". These words + * come from movie <>. 
Enjoy it :) + **/ + + +/** + * Structure of inner segment + **/ +typedef struct __IVI_shadow_seg_t{ + IVI_seg_t lightseg; + TAILQ_ENTRY(__IVI_shadow_seg_t) ENTRY; +}IVI_shadow_seg_t; + + +TAILQ_HEAD(TQ, __IVI_shadow_seg_t); + +/* Structure of inner InterVal Index */ +typedef struct __IVI_shadow_t{ + struct TQ ivi_queue; + int segs_cnt; + OFFSET_TYPE segs_length; +}IVI_shadow_t; + + + +/** + * new is closer to head or tail ? + * Return 1 if closer to head than tail + * Else return 0 + */ +int closer_to_head(IVI_shadow_seg_t * head, IVI_shadow_seg_t * tail, OFFSET_TYPE target) +{ + if(head == NULL || tail == NULL) + return 1; + S_OFFSET_TYPE tmp1 = (S_OFFSET_TYPE)(target - head->lightseg.left); + S_OFFSET_TYPE tmp2 = (S_OFFSET_TYPE)(target - tail->lightseg.left); + S_OFFSET_TYPE distance_to_head = tmp1 > 0 ? tmp1 : -tmp1; + S_OFFSET_TYPE distance_to_tail = tmp2 > 0 ? tmp2 : -tmp2; + return (distance_to_tail - distance_to_head > 0); +} + + +IVI_seg_t * IVI_prev_continuous_seg(IVI_seg_t * seg) +{ + if(NULL == seg) + { + return NULL; + } + + IVI_shadow_seg_t * _seg = (IVI_shadow_seg_t *)seg; + IVI_shadow_seg_t * prev = TAILQ_PREV(_seg, TQ, ENTRY); + if(NULL == prev) + { + return NULL; + } + if(continuous((prev->lightseg).right, seg->left)) + return (IVI_seg_t *)prev; + return NULL; +} + + +IVI_seg_t * IVI_next_continuous_seg(IVI_seg_t * seg) +{ + if(NULL == seg) + { + return NULL; + } + IVI_shadow_seg_t * _seg = (IVI_shadow_seg_t *)seg; + IVI_shadow_seg_t * next = TAILQ_NEXT(_seg, ENTRY); + if(NULL == next) + { + return NULL; + } + if(continuous(seg->right, (next->lightseg).left)) + return (IVI_seg_t *)next; + return NULL; +} + + +/** + * Name: + * IVI_relative_position + * Description: + * Get relative position of given two interval segments + * Params: + * seg1: Subject of relation + * seg2: Object of relation + * Relation: + * On success, return the relation of two segments with enum; + * Else, return ERROR in enum; + **/ +Relation_t IVI_relative_position(IVI_seg_t * 
seg1, IVI_seg_t * seg2) +{ + if(NULL == seg1 || NULL == seg2) + { + return ERROR; + } + + if(before(seg1->right, seg2->left)) + { + return LEFT_NO_OVERLAP; + } + + if(!before(seg1->right, seg2->left) && before(seg1->right, seg2->right) && before(seg1->left, seg2->left)) + { + return LEFT_OVERLAP; + } + + if(!before(seg1->left, seg2->left) && !after(seg1->right, seg2->right)) + { + return CONTAINED; + } + + if(!after(seg1->left, seg2->left) && !before(seg1->right, seg2->right)) + { + return CONTAIN; + } + + if(!after(seg1->left, seg2->right) && after(seg1->right, seg2->right) && after(seg1->left, seg2->left)) + { + return RIGHT_OVERLAP; + } + + if(after(seg1->left, seg2->right)) + { + return RIGHT_NO_OVERLAP; + } + return ERROR; +} + + + +/** + * Name: + * IVI_create + * Description: + * Create an InterVal Index + * Params: + * void + * Return: + * Return a handler of this InterVal Index + **/ +IVI_t * IVI_create(void) +{ + IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)malloc(sizeof(IVI_shadow_t)); + TAILQ_INIT(&(shadow_ivi->ivi_queue)); + shadow_ivi->segs_cnt = 0; + shadow_ivi->segs_length = 0; + return (IVI_t *)shadow_ivi; +} + + +/** + * Name: + * IVI_destroy + * Description: + * Destroy a given InterVal Index's handler + * Params: + * handler: The InterVal Index you want to destroy + * cb: Callback function for user to free data in segement + * usr_para: User parameter + * Return: + * void + **/ +void IVI_destroy(IVI_t * handler, IVI_callback_t cb, void * usr_para) +{ + if(handler == NULL) + { + return; + } + + IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)handler; + IVI_shadow_seg_t * tmpseg = TAILQ_FIRST(&(shadow_ivi->ivi_queue)); + IVI_shadow_seg_t * tmp; + /* Free each seg in IVI */ + while(tmpseg != NULL) + { + tmp = TAILQ_NEXT(tmpseg, ENTRY); + /* Free *data in seg */ + if(NULL != cb) + { + cb(&(tmpseg->lightseg), usr_para); + } + free(tmpseg); + tmpseg = tmp; + } + + /* Free IVI */ + free(shadow_ivi); + handler = NULL; +} + + + +/** + * Name: + * 
IVI_seg_malloc + * Description: + * Malloc a segment with given parameters + * Params: + * left: Left point of segment + * right: Right point of segment + * data: User data + * Return: + * Return a pointer of segment structure. + **/ +IVI_seg_t * IVI_seg_malloc(OFFSET_TYPE left, OFFSET_TYPE right, void * data) +{ + /* Left must <= Right */ + if(after(left, right)) + { + return NULL; + } + IVI_shadow_seg_t * shadow_seg = (IVI_shadow_seg_t *)malloc(sizeof(IVI_shadow_seg_t)); + shadow_seg->lightseg.left = left; + shadow_seg->lightseg.right= right; + shadow_seg->lightseg.data = data; + + return (IVI_seg_t *)shadow_seg; +} + + + +/** + * Name: + * IVI_seg_free + * Description: + * Free the memory of given segment + * Params: + * seg: The segment that you want to free + * cb: Callback function for user to free *data in seg + * usr_para: User parameter for cb + * Return: + * void + **/ +void IVI_seg_free(IVI_seg_t * seg, IVI_callback_t cb, void * usr_para) +{ + /* Free user data first */ + if(cb != NULL) + { + cb(seg, usr_para); + } + IVI_shadow_seg_t * shadow_seg = (IVI_shadow_seg_t *)seg; + + /* Free seg */ + free(shadow_seg); + seg = NULL; +} + + +/** + * Name: + * IVI_insert + * Description: + * Insert a segment to an InterVal Index handler,and the segment + * MUST not be overlapped with others in handler. + * Params: + * handler: The handler of InterVal Index created by IVI_create + * seg: A segment that user wants to add. It MUST be created + * by IVI_seg_malloc. + * Return: + * On success, 0 is returned; + * Else when overlapp occures or error occures, -1 is returned. 
+ **/
+int IVI_insert(IVI_t * handler, IVI_seg_t * seg)
+{
+    IVI_shadow_t * ivi;
+    IVI_shadow_seg_t *first, *last, *incoming, *cursor;
+    int linked = 0;
+
+    if(NULL == handler || NULL == seg)
+    {
+        return -1;
+    }
+
+    ivi = (IVI_shadow_t *)handler;
+    incoming = (IVI_shadow_seg_t *)seg;
+    first = TAILQ_FIRST(&(ivi->ivi_queue));
+    last = TAILQ_LAST(&(ivi->ivi_queue), TQ);
+
+    if(closer_to_head(first, last, seg->left))
+    {
+        /* Forward scan: link in front of the first seg lying wholly after the new one */
+        TAILQ_FOREACH(cursor, &(ivi->ivi_queue), ENTRY)
+        {
+            if(after(cursor->lightseg.left, incoming->lightseg.right))
+            {
+                TAILQ_INSERT_BEFORE(cursor, incoming, ENTRY);
+                linked = 1;
+                break;
+            }
+            /* A seg that is neither wholly after nor wholly before overlaps */
+            if(!before(cursor->lightseg.right, incoming->lightseg.left))
+            {
+                return -1;
+            }
+        }
+
+        /* Ran off the end of the list: the new seg becomes the tail */
+        if(!linked)
+        {
+            TAILQ_INSERT_TAIL(&(ivi->ivi_queue), incoming, ENTRY);
+        }
+    }
+    else
+    {
+        /* Backward scan: link behind the first seg lying wholly before the new one */
+        TAILQ_FOREACH_REVERSE(cursor, &(ivi->ivi_queue), TQ, ENTRY)
+        {
+            if(before(cursor->lightseg.right, incoming->lightseg.left))
+            {
+                TAILQ_INSERT_AFTER(&(ivi->ivi_queue), cursor, incoming, ENTRY);
+                linked = 1;
+                break;
+            }
+            /* A seg that is neither wholly before nor wholly after overlaps */
+            if(!after(cursor->lightseg.left, incoming->lightseg.right))
+            {
+                return -1;
+            }
+        }
+
+        /* Ran off the front of the list: the new seg becomes the head */
+        if(!linked)
+        {
+            TAILQ_INSERT_HEAD(&(ivi->ivi_queue), incoming, ENTRY);
+        }
+    }
+
+    /* Single bookkeeping point for every successful insertion */
+    ivi->segs_cnt ++;
+    ivi->segs_length += (seg->right - seg->left + 1);
+    return 0;
+}
+
+
+
+/**
+ *
Name:
+ *    IVI_remove
+ * Description:
+ *    Remove a given segment from given InterVal Index handler.
+ *    The segment is only unlinked and un-counted; it is not freed.
+ * Params:
+ *    handler: The handler of InterVal Index created by IVI_create
+ *    seg: A segment that user wants to delete. It MUST be created
+ *         by IVI_seg_malloc.
+ *         NOTE(review): presumably it must also be linked in handler
+ *         at the moment of the call; this is not checked here.
+ * Return:
+ *    On success, 0 is returned;
+ *    Else when handler or seg is NULL, -1 is returned.
+ **/
+int IVI_remove(IVI_t * handler, IVI_seg_t * seg)
+{
+    if(NULL == handler || NULL == seg)
+    {
+        return -1;
+    }
+
+    IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)handler;
+    IVI_shadow_seg_t * shadow_seg = (IVI_shadow_seg_t *)seg;
+
+    TAILQ_REMOVE(&(shadow_ivi->ivi_queue), shadow_seg, ENTRY);
+    shadow_ivi->segs_cnt --;
+    shadow_ivi->segs_length -= (seg->right - seg->left + 1);
+    return 0;
+}
+
+
+
+/**
+ * Name:
+ *    IVI_query
+ * Description:
+ *    Query from given InterVal Index and get the number of segments
+ *    which are overlapped with given interval, and store those segments
+ *    in the last parameter.
+ * Params:
+ *    handler: The handler of interval index created by IVI_create
+ *    left: Left point of given interval
+ *    right: Right point of given interval
+ *    segs: An address of a segment pointer array to store those segments which
+ *          are overlapped with given interval. NOTE that user should not malloc
+ *          the array, and segs need to be freed by user. The element of *segs
+ *          MUST not be freed by user. *segs is set to NULL when 0 is returned.
+ * Return:
+ *    Return the number of segments which are overlapped with given interval;
+ *    -1 when handler or segs is NULL, when left is after right, or when
+ *    the result array cannot be allocated.
+ **/
+int IVI_query(IVI_t * handler, OFFSET_TYPE left, OFFSET_TYPE right, IVI_seg_t *** segs)
+{
+    IVI_shadow_t * shadow_ivi;
+    IVI_shadow_seg_t *head, *tail, *tmp;
+    IVI_shadow_seg_t *left_tmp = NULL, *right_tmp = NULL;
+    int interval_cnt = 0, i;
+
+    if(NULL == handler || NULL == segs || after(left, right))
+    {
+        return -1;
+    }
+
+    shadow_ivi = (IVI_shadow_t *)handler;
+    head = TAILQ_FIRST(&(shadow_ivi->ivi_queue));
+    tail = TAILQ_LAST(&(shadow_ivi->ivi_queue), TQ);
+
+    /* Traverse from head or tail? We need to decide */
+    if(closer_to_head(head, tail, left))
+    {
+        /* Skip every seg that ends before the interval begins */
+        tmp = head;
+        while(tmp != NULL && after(left, tmp->lightseg.right))
+        {
+            tmp = TAILQ_NEXT(tmp, ENTRY);
+        }
+        if(tmp == NULL)
+        {
+            *segs = NULL;
+            return 0;
+        }
+        /* First seg whose right end is at or after the interval's left */
+        left_tmp = tmp;
+
+        /* Get the num of overlapped segs */
+        while(tmp != NULL && !before(right, tmp->lightseg.left))
+        {
+            interval_cnt ++;
+            tmp = TAILQ_NEXT(tmp, ENTRY);
+        }
+        if(interval_cnt == 0)
+        {
+            *segs = NULL;
+            return 0;
+        }
+
+        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
+        if(*segs == NULL)
+        {
+            return -1;
+        }
+        tmp = left_tmp;
+        for(i = 0; i < interval_cnt; i++)
+        {
+            (*segs)[i] = (IVI_seg_t *)tmp;
+            tmp = TAILQ_NEXT(tmp, ENTRY);
+        }
+    }
+    else
+    {
+        /* Skip every seg that begins after the interval ends */
+        tmp = tail;
+        while(tmp != NULL && before(right, tmp->lightseg.left))
+        {
+            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
+        }
+        if(tmp == NULL)
+        {
+            *segs = NULL;
+            return 0;
+        }
+        /* Last seg whose left end is at or before the interval's right */
+        right_tmp = tmp;
+
+        /* Get the num of overlapped segs */
+        while(tmp != NULL && !after(left, tmp->lightseg.right))
+        {
+            interval_cnt ++;
+            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
+        }
+        if(interval_cnt == 0)
+        {
+            *segs = NULL;
+            return 0;
+        }
+
+        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
+        if(*segs == NULL)
+        {
+            return -1;
+        }
+        /* Fill from the back so the array stays in ascending order */
+        tmp = right_tmp;
+        for(i = interval_cnt - 1; i >= 0; i--)
+        {
+            (*segs)[i] = (IVI_seg_t *)tmp;
+            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
+        }
+    }
+    return interval_cnt;
+}
+
+
+
+/**
+ * Name:
+ *    IVI_query_continuous
+ * Description:
+ *    Query from interval index handler and get the number of continuous segments
+ *    which are overlapped with given interval.
+ * Params:
+ *    handler: The handler of InterVal Index created by IVI_create.
+ *    left: Left point of given interval
+ *    right: Right point of given interval
+ *    segs: An address of a segment pointer array to store those segments which
+ *          are overlapped with given interval.
NOTE that user should not malloc
+ *          the array, and segs need to be freed by user. The element of *segs
+ *          MUST not be freed by user. *segs is set to NULL when 0 is returned.
+ * Return:
+ *    Return the number of continuous segments which are overlapped with given
+ *    interval; -1 when handler or segs is NULL, when left is after right, or
+ *    when the result array cannot be allocated.
+ **/
+int IVI_query_continuous(IVI_t * handler, OFFSET_TYPE left, OFFSET_TYPE right, IVI_seg_t *** segs)
+{
+    IVI_shadow_t * shadow_ivi;
+    IVI_shadow_seg_t *head, *tail, *tmp, *counted;
+    IVI_shadow_seg_t *left_tmp = NULL, *right_tmp = NULL;
+    int interval_cnt = 0, i;
+
+    if(NULL == handler || NULL == segs || after(left, right))
+    {
+        return -1;
+    }
+    shadow_ivi = (IVI_shadow_t *)handler;
+    head = TAILQ_FIRST(&(shadow_ivi->ivi_queue));
+    tail = TAILQ_LAST(&(shadow_ivi->ivi_queue), TQ);
+
+    /* Traverse from head or tail? We need to decide */
+    if(closer_to_head(head, tail, left))
+    {
+        /* Skip every seg that ends before the interval begins */
+        tmp = head;
+        while(tmp != NULL && after(left, tmp->lightseg.right))
+        {
+            tmp = TAILQ_NEXT(tmp, ENTRY);
+        }
+        if(tmp == NULL)
+        {
+            *segs = NULL;
+            return 0;
+        }
+        /* First seg whose right end is at or after the interval's left */
+        left_tmp = tmp;
+
+        /* Get the num of overlapped segs, stopping at the first gap */
+        while(tmp != NULL && !before(right, tmp->lightseg.left))
+        {
+            interval_cnt ++;
+            /*
+             * Remember the seg just counted before stepping on, so that
+             * the gap test never has to look backwards from a NULL cursor
+             * once the tail of the list has been passed.
+             */
+            counted = tmp;
+            tmp = TAILQ_NEXT(tmp, ENTRY);
+            if(tmp != NULL && !continuous(counted->lightseg.right, tmp->lightseg.left))
+            {
+                break;
+            }
+        }
+        if(interval_cnt == 0)
+        {
+            *segs = NULL;
+            return 0;
+        }
+
+        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
+        if(*segs == NULL)
+        {
+            return -1;
+        }
+        tmp = left_tmp;
+        for(i = 0; i < interval_cnt; i++)
+        {
+            (*segs)[i] = (IVI_seg_t *)tmp;
+            tmp = TAILQ_NEXT(tmp, ENTRY);
+        }
+    }
+    else
+    {
+        /* Skip every seg that begins after the interval ends */
+        tmp = tail;
+        while(tmp != NULL && before(right, tmp->lightseg.left))
+        {
+            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
+        }
+        if(tmp == NULL)
+        {
+            *segs = NULL;
+            return 0;
+        }
+        /* Last seg whose left end is at or before the interval's right */
+        right_tmp = tmp;
+
+        /* Get the num of overlapped segs, stopping at the first gap */
+        while(tmp != NULL && !after(left, tmp->lightseg.right))
+        {
+            interval_cnt ++;
+            /* Same precaution as above, mirrored for the head of the list */
+            counted = tmp;
+            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
+            if(tmp != NULL && !continuous(tmp->lightseg.right, counted->lightseg.left))
+            {
+                break;
+            }
+        }
+        if(interval_cnt == 0)
+        {
+            *segs = NULL;
+            return 0;
+        }
+
+        *segs = (IVI_seg_t **)malloc(interval_cnt * sizeof(IVI_seg_t *));
+        if(*segs == NULL)
+        {
+            return -1;
+        }
+        /* Fill from the back so the array stays in ascending order */
+        tmp = right_tmp;
+        for(i = interval_cnt - 1; i >= 0; i--)
+        {
+            (*segs)[i] = (IVI_seg_t *)tmp;
+            tmp = TAILQ_PREV(tmp, TQ, ENTRY);
+        }
+    }
+
+    return interval_cnt;
+}
+
+
+
+/**
+ * Name:
+ *    IVI_seg_cnt
+ * Description:
+ *    Get the count of segments in given interval index handler
+ * Params:
+ *    handler: The handler of InterVal Index created by IVI_create.
+ * Return:
+ *    Return the count of segments in given interval index handler,
+ *    or -1 when handler is NULL
+ **/
+int IVI_seg_cnt(IVI_t * handler)
+{
+    if(handler == NULL)
+    {
+        return -1;
+    }
+    IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)handler;
+    return shadow_ivi->segs_cnt;
+}
+
+
+
+/**
+ * Name:
+ *    IVI_seg_length
+ * Description:
+ *    Get the length of whole segments in given interval index handler
+ * Params:
+ *    handler: The handler of InterVal Index created by IVI_create.
+ * Return:
+ *    Return the length of whole segments in given interval index handler.
+ *    When handler is NULL, (OFFSET_TYPE)-1 is returned; OFFSET_TYPE is
+ *    unsigned, so this is its largest value and not a negative number.
+ **/
+OFFSET_TYPE IVI_seg_length(IVI_t * handler)
+{
+    if(handler == NULL)
+    {
+        return (OFFSET_TYPE)-1;
+    }
+    IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)handler;
+    return shadow_ivi->segs_length;
+}
+
+
+
+/**
+ * Name:
+ *    IVI_traverse
+ * Description:
+ *    Traverse given InterVal Index and execute given callback function
+ *    one time for each seg in InterVal Index.
+ * Params:
+ *    handler: The handler of InterVal Index created by IVI_create.
+ *    IVI_callback_t: Callback function for user to define.
+ *    usr_para: Parameter user want to pass to callback function.
+ * Return: + * void + **/ +void IVI_traverse(IVI_t * handler, IVI_callback_t cb, void * usr_para) +{ + if(NULL == handler || NULL == cb) + { + return; + } + + IVI_shadow_t * shadow_ivi = (IVI_shadow_t *)handler; + IVI_shadow_seg_t * tmp_seg = TAILQ_FIRST(&(shadow_ivi->ivi_queue)); + IVI_shadow_seg_t * tmp; + /* Traverse the IVI */ + while(tmp_seg != NULL) + { + /* + * The place we can't use TAILQ_FOREACH because we + * do not no what will callback funciton does. + * */ + tmp = TAILQ_NEXT(tmp_seg, ENTRY); + cb((IVI_seg_t *)tmp_seg, usr_para); + tmp_seg = tmp; + } +} diff --git a/src/entry/mesa_fuzzy.c b/src/entry/mesa_fuzzy.c new file mode 100644 index 0000000..b9666f7 --- /dev/null +++ b/src/entry/mesa_fuzzy.c @@ -0,0 +1,828 @@ +#include +#include +#include +#include +#include +#include +#include +#include "mesa_fuzzy.h" +#include "interval_index.h" + + +#define ROLLING_WINDOW 7 +#define BLOCKSIZE_MIN 3 +#define MAXSIZE 10000 +#define HASH_PRIME 0x01000193 +#define HASH_INIT 0x28021967 + +#define DEBUG (0) + +struct roll_state +{ + unsigned char window[ROLLING_WINDOW]; + unsigned int h1, h2, h3; + unsigned int n; +}; + + +typedef struct +{ + char * left_data; //指向数据的头指针 + unsigned int left_len; //左边保留数据的长度 + + char * hash_result; //这个segment的FNV值 + unsigned long long left_offset; + unsigned long long right_offset; + + struct roll_state * right_status_r; //右边界的rollhash状态 + unsigned int right_status_shash; //右边界的FNV值 + unsigned int right_len;//右边界的长度 + int slice_num; + +}fuzzy_node; + + +typedef struct +{ + unsigned long long orilen; + IVI_t * ivi; //每一个handle里面保存一个IVI指针,一个IVI里面保存的是一个文件里的片 + unsigned long long effective_length; +}fuzzy_handle_inner_t; + + +typedef struct +{ + char * head; //最后输出结果的char数组 + unsigned int size; + unsigned int offset; //数组长度 + unsigned long long first_FNV_offset; + unsigned long long last_FNV_offset; +}final_result; + + +typedef struct +{ + unsigned long long first_FNV_offset; + unsigned long long last_FNV_offset; + unsigned long long 
hash_length; +}final_length; + + +unsigned int fuzzy_hash_calculate(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize); +void fuzzy_calculate_self(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize); +void fuzzy_calculate_self_with_prev(IVI_seg_t * prev_seg, IVI_seg_t * seg, const char * data, unsigned long long blocksize); +void fuzzy_modify_next(IVI_seg_t * seg, IVI_seg_t * next_seg, unsigned long long blocksize); +unsigned long long get_prev_continous_length(IVI_seg_t * seg); +unsigned int segment_overlap(fuzzy_handle_t * handle, fuzzy_node * fnode, unsigned int size, unsigned long long offset, const char * data); +void fuzzy_hash_merge(IVI_seg_t * seg, void * user_para); +void fuzzy_hash_merge_new(IVI_seg_t * seg, void * user_para); +void fuzzy_hash_length(IVI_seg_t * seg, void * user_para); +unsigned long long fuzzy_status(fuzzy_handle_t * handle, int type); + +char * b64 = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + +/** + * roll_state初始化 + */ +static void roll_init(struct roll_state * self) +{ + memset(self, 0, sizeof(struct roll_state)); +} + + +/** + * 计算roll_hash值,将外部数据读取到窗口中 + */ +static void roll_hash(struct roll_state * self, unsigned char c) +{ + self->h2 -= self->h1; + self->h2 += ROLLING_WINDOW * (unsigned int)c; + + self->h1 += (unsigned int)c; + self->h1 -= (unsigned int)self->window[self->n]; + + self->window[self->n] = c; + self->n++; + if (self->n == ROLLING_WINDOW) + self->n = 0; + self->h3 <<= 5; + self->h3 ^= c; +} + + +/** + * 计算窗口里面的roll_hash值,每次roll_hash值满足一定条件,分片 + */ +static unsigned int roll_sum(const struct roll_state * self) +{ + return self->h1 + self->h2 + self->h3; + /* return self->h1 + self->h2; */ +} + + +/** + * 计算分片的FNV值 + */ +static unsigned int sum_hash(unsigned char c, unsigned int h) +{ + return (h * HASH_PRIME) ^ c; +} + + +/** + * 创建handle + */ +fuzzy_handle_t * fuzzy_create_handle(unsigned long long 
origin_len)
+{
+    fuzzy_handle_inner_t * handle = (fuzzy_handle_inner_t *)malloc(sizeof(fuzzy_handle_inner_t));
+    if(handle == NULL)
+    {
+        return NULL;
+    }
+    handle->orilen = origin_len;
+    handle->ivi = IVI_create();
+    if(handle->ivi == NULL)
+    {
+        /* A handle without an interval index cannot accept any data */
+        free(handle);
+        return NULL;
+    }
+    handle->effective_length = 0;
+    return (fuzzy_handle_t *)handle;
+}
+
+
+/**
+ * Callback for IVI_destroy / IVI_seg_free: release the fuzzy_node stored
+ * in seg->data together with the buffers it owns. usr_para is unused.
+ */
+void fuzzy_node_free(IVI_seg_t * seg, void * usr_para)
+{
+    fuzzy_node * temp;
+
+    (void)usr_para;
+    if(seg == NULL || seg->data == NULL)
+    {
+        return;
+    }
+
+    //printf("free seg[%lu, %lu]\n", seg->left, seg->right);
+    temp = (fuzzy_node *)(seg->data);
+    free(temp->left_data);
+    free(temp->hash_result);
+    free(temp->right_status_r);
+    free(temp);
+    /* Do not leave a dangling pointer behind in the segment */
+    seg->data = NULL;
+}
+
+
+
+/**
+ * Destroy a handle: free every segment of its interval index, the node
+ * attached to each segment, and the handle itself. NULL is ignored.
+ */
+void fuzzy_destroy_handle(fuzzy_handle_t * handle)
+{
+    if(handle == NULL)
+    {
+        return;
+    }
+    IVI_destroy(((fuzzy_handle_inner_t *)handle)->ivi, fuzzy_node_free, NULL);
+    free((fuzzy_handle_inner_t *)handle);
+}
+
+
+/**
+ * Tell whether the node whose address was captured in node_key is still
+ * attached to a segment of ivi inside [left, right].
+ * The address is compared as an integer taken while the node was known
+ * to be alive, so a possibly dangling pointer is never used.
+ */
+static int fuzzy_node_is_indexed(IVI_t * ivi, OFFSET_TYPE left, OFFSET_TYPE right, size_t node_key)
+{
+    IVI_seg_t ** segs = NULL;
+    int found = 0;
+    int i;
+    int cnt = IVI_query(ivi, left, right, &segs);
+
+    for(i = 0; i < cnt; i++)
+    {
+        if((size_t)(segs[i]->data) == node_key)
+        {
+            found = 1;
+            break;
+        }
+    }
+    /* Only the array belongs to us; the segments stay in the index */
+    free(segs);
+    return found;
+}
+
+
+/**
+ * Feed a piece of data located at the given offset of the original stream
+ * and compute its fuzzy hash.
+ * Params:
+ *    handle: handle returned by fuzzy_create_handle
+ *    data:   the bytes to feed
+ *    size:   number of bytes in data
+ *    offset: position of data[0] in the original stream
+ * Return:
+ *    The effective length contributed by this call, which is also added to
+ *    the handle's effective_length. 0 is returned for NULL/empty input or
+ *    when memory cannot be allocated.
+ */
+unsigned int fuzzy_feed(fuzzy_handle_t * handle, const char * data, unsigned int size, unsigned long long offset)
+{
+    fuzzy_handle_inner_t * inner = (fuzzy_handle_inner_t *)handle;
+    fuzzy_node * node;
+    size_t node_key;
+    unsigned int length;
+
+    /*
+     * size == 0 would describe the interval [offset, offset - 1], which
+     * IVI_seg_malloc rejects; bail out before anything is allocated.
+     */
+    if(handle == NULL || data == NULL || size == 0)
+    {
+        return 0;
+    }
+
+    node = (fuzzy_node *)calloc(1, sizeof(fuzzy_node));
+    if(node == NULL)
+    {
+        return 0;
+    }
+    node->right_status_r = (struct roll_state *)calloc(1, sizeof(struct roll_state));
+    if(node->right_status_r == NULL)
+    {
+        free(node);
+        return 0;
+    }
+    roll_init(node->right_status_r);
+    node->slice_num = 0;
+    node_key = (size_t)node;
+
+    length = segment_overlap(handle, node, size, offset, data);
+
+    /*
+     * segment_overlap() frees the node when the fed range is rejected or is
+     * already fully covered. Only read node->right_len when the node is
+     * still attached to a segment of the index.
+     */
+    if(offset == 0 && fuzzy_node_is_indexed(inner->ivi, offset, offset + size - 1, node_key))
+    {
+        length = size - node->right_len;
+    }
+
+    inner->effective_length += length;
+    return length; /* effective length computed by this call */
+}
+
+
+
+/**
+ * Derive the block size from the original length:
+ * BLOCKSIZE_MIN * 2^k, where 2^k is the largest power of two not greater
+ * than orilen / (64 * BLOCKSIZE_MIN).
+ * Inputs shorter than 64 * BLOCKSIZE_MIN yield BLOCKSIZE_MIN, so the result
+ * is never 0 and is always safe to use as a modulus.
+ */
+unsigned long long get_blocksize(unsigned long long orilen)
+{
+    unsigned long long units = orilen / (64 * BLOCKSIZE_MIN);
+    unsigned long long blocksize = BLOCKSIZE_MIN;
+
+    /* Integer form of floor(log2(units)): one doubling per halving */
+    while(units > 1)
+    {
+        units >>= 1;
+        blocksize <<= 1;
+    }
+    return blocksize;
+}
+
+/**
+ * Check whether the fed data overlaps data that has already been processed
+ */
+unsigned int
segment_overlap(fuzzy_handle_t * handle, fuzzy_node * fnode, unsigned int size, unsigned long long offset, const char * data) +{ + IVI_seg_t ** overlap_segs = NULL; + IVI_seg_t * seg = IVI_seg_malloc(offset, offset + size -1, (void *)fnode); + int overlap_segnum = 0; + unsigned int effective_length = 0; + unsigned int total_length = 0; + unsigned long long blocksize = get_blocksize(((fuzzy_handle_inner_t *)handle)->orilen); + + /*查询是否有覆盖,如果有覆盖,返回覆盖的segment的片数,如果没有覆盖,返回0*/ + overlap_segnum = IVI_query(((fuzzy_handle_inner_t *)handle)->ivi, offset, offset + size - 1, &overlap_segs); + + /*如果返回值为负数,说明输入的参数有问题,打印错误信息*/ + if(overlap_segnum < 0) + { + printf("fragment info error!\n"); + IVI_seg_free(seg, fuzzy_node_free, NULL); + return 0; + } + + /*如果返回值为0,说明没有覆盖的情况,直接插入就行*/ + if(overlap_segnum == 0) + { + IVI_insert(((fuzzy_handle_inner_t *)handle)->ivi,seg); + effective_length = fuzzy_hash_calculate(seg, data, offset, blocksize); + + total_length = seg->right - seg->left + 1; + return effective_length; + } + + /*如果返回值为覆盖的片数,则需要根据覆盖类型一一进行处理*/ + int flag = 0; + int i; + for(i = 0; i < overlap_segnum; i++) + { + switch(IVI_relative_position(seg, overlap_segs[i])) + { + case LEFT_OVERLAP: //左覆盖,把seg的右值改为overlap_seg的左值 + { + seg->right = overlap_segs[i]->left - 1; + break; + } + case CONTAIN: //包含关系,将左边那部分直接插入,然后改变seg的左值,并将data移动到指定的位置 + { + if(overlap_segs[i]->left - 1 >= seg->left) + { + fuzzy_node * node = (fuzzy_node *)calloc(sizeof(fuzzy_node), 1); + memcpy(node, fnode, sizeof(fuzzy_node)); + node->right_status_r = (struct roll_state *)calloc(sizeof (struct roll_state), 1); + roll_init(node->right_status_r); + IVI_seg_t * thseg = IVI_seg_malloc(seg->left, overlap_segs[i]->left - 1, (void *)node); + IVI_insert(((fuzzy_handle_inner_t *)handle)->ivi,thseg); + effective_length += fuzzy_hash_calculate(thseg, data, offset, blocksize); + total_length += thseg->right - thseg->left + 1; + } + seg->left = overlap_segs[i]->right + 1; + data = data + ((seg->left) - offset); + 
offset = seg->left; + break; + } + case RIGHT_OVERLAP: //右覆盖,把seg的左值改为overlap_seg的右值 + { + seg->left = overlap_segs[i]->right + 1; + data = data + ((seg->left) - offset); + offset = seg->left; + break; + } + case CONTAINED: //如果被包含,就直接舍弃掉这片 + { + flag = 1; + //printf("contained! free seg\n"); + IVI_seg_free(seg, fuzzy_node_free, NULL); + free(overlap_segs); + break; + } + default: + break; + } + if(flag == 1) + { + return 0; + } + + } + + /*将余下的数据插入到区间链表里面,并且进行计算*/ + if(seg->left <= seg->right) + { + IVI_insert(((fuzzy_handle_inner_t *)handle)->ivi, seg); + effective_length += fuzzy_hash_calculate(seg, data, offset, blocksize); + total_length += seg->right - seg->left + 1; + //((fuzzy_handle_inner_t *)handle)->effective_length += effective_length; + } + else + { + IVI_seg_free(seg, fuzzy_node_free, NULL); + } + + free(overlap_segs); + return effective_length; +} + + +/** + * 处理前后分片,计算fuzzy_hash值 + */ +unsigned int fuzzy_hash_calculate(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize) +{ + IVI_seg_t * prev_seg; + IVI_seg_t * next_seg; + unsigned int effective_length = 0; + + prev_seg = IVI_prev_continuous_seg(seg); + next_seg = IVI_next_continuous_seg(seg); + //printf("seg->right = %lu, seg->left = %lu\n", seg->right, seg->left); + unsigned int size = seg->right - seg->left + 1; + fuzzy_node * node = (fuzzy_node *)(seg->data); + if(NULL == prev_seg) + { + //如果不存在前分片,直接初始化roll_state进行计算 + roll_init(node->right_status_r); + fuzzy_calculate_self(seg, data, offset, blocksize); + effective_length = size - node->left_len; + node->left_offset = offset + node->left_len; + } + else + { + //如果存在前分片,取出前分片的右边界的中间状态值进行计算 + + fuzzy_calculate_self_with_prev(prev_seg, seg, data, blocksize); + effective_length = size + ((fuzzy_node *)(prev_seg->data))->right_len; + node->left_offset = offset - ((fuzzy_node *)(prev_seg->data))->right_len; + } + + /* 如果有后分片,并且自己计算的结果里需要分片,则修改后面的分片 */ + if(next_seg != NULL) + { + 
//如果存在后分片,将本分片的右边界的中间状态值取出来和后分片的左边界的中间状态进行计算 + fuzzy_modify_next(seg, next_seg, blocksize); + + effective_length += ((fuzzy_node *)(next_seg->data))->left_len; + node->right_offset = offset + size + ((fuzzy_node *)(next_seg->data))->left_len; + + } + else + { + effective_length -= node->right_len; + node->right_offset = offset + (size - (node->right_len)); + } + return effective_length; +} + + + +void fuzzy_calculate_self(IVI_seg_t * seg, const char * data, unsigned long long offset, unsigned long long blocksize) +{ + fuzzy_node * node = (fuzzy_node *)(seg->data); + struct roll_state * rs = node->right_status_r; + unsigned long long size = seg->right - seg->left + 1; + unsigned int FNV_hash_value = HASH_INIT; + + char * FNV_hash = (char *)malloc(sizeof(char)*size); + unsigned long long fnv_index = 0, i, last_slice_index; + unsigned int roll_hash_value; + for(i = 0; i < size; i++) + { + roll_hash(rs, data[i]); + roll_hash_value = roll_sum(rs); + FNV_hash_value = sum_hash(data[i], FNV_hash_value); + if(i >= ROLLING_WINDOW - 1 && roll_hash_value % blocksize == blocksize - 1) + { + node->slice_num ++; + + if(node->slice_num == 1) + { + node->left_len = i + 1; + } + last_slice_index = i; + /* 保存FNV的值 */ + FNV_hash[fnv_index ++] = b64[FNV_hash_value % 64]; + //printf("data[%lu]=%c, FNV_hash = %c\n", i, data[i], b64[FNV_hash_value % 64]); + FNV_hash_value = HASH_INIT; + } + } + + /* 一片都没有找到 */ + if(node->slice_num == 0) + { + node->left_len = size; + node->right_len = 0; + } + else + { + node->right_len = size - last_slice_index - 1; + } + node->right_status_shash = FNV_hash_value; + + /* 复制结果到hash_result里 */ + node->hash_result = (char *)malloc(sizeof(char) * (fnv_index + 1)); + memcpy(node->hash_result, FNV_hash, fnv_index); + (node->hash_result)[fnv_index] = '\0'; + + node->left_data = (char *)malloc(sizeof(char) * (node->left_len)); + memcpy(node->left_data, data, node->left_len); + + free(FNV_hash); + return; +} + + +unsigned long long 
get_prev_continous_length(IVI_seg_t * seg) +{ + unsigned long long length = 0; + IVI_seg_t * temp = seg; + while(temp != NULL) + { + length += temp->right - temp->left + 1; + if(length >= ROLLING_WINDOW) + return length; + temp = IVI_prev_continuous_seg(temp); + } + return length; +} + +/** + * 处理前段的保留数据 + */ +void fuzzy_calculate_self_with_prev(IVI_seg_t * prev_seg, IVI_seg_t * seg, const char * data, unsigned long long blocksize) +{ + fuzzy_node * prev_node = (fuzzy_node *)(prev_seg->data); + fuzzy_node * node = (fuzzy_node *)(seg->data); + + /* 使用前段的roll state */ + memcpy(node->right_status_r, prev_node->right_status_r, sizeof(struct roll_state)); + struct roll_state * rs = node->right_status_r; + unsigned long long size = seg->right - seg->left + 1; + unsigned int FNV_hash_value = prev_node->right_status_shash; + + + char * FNV_hash = (char *)malloc(sizeof(char)*size); + unsigned long long fnv_index = 0, i, last_slice_index; + unsigned int roll_hash_value; + unsigned long long prev_len = get_prev_continous_length(prev_seg); + + for(i = 0; i < size; i++) + { + roll_hash(rs, data[i]); + roll_hash_value = roll_sum(rs); + FNV_hash_value = sum_hash(data[i], FNV_hash_value); + if(i + prev_len >= ROLLING_WINDOW \ + && roll_hash_value % blocksize == blocksize - 1) + { + node->slice_num ++; + if(node->slice_num == 1) + { + node->left_len = i + 1; + } + + last_slice_index = i; + /* 保存FNV的值 */ + FNV_hash[fnv_index ++] = b64[FNV_hash_value % 64]; + //printf("data[%lu]=%c, FNV_hash = %c\n", i, data[i], b64[FNV_hash_value % 64]); + FNV_hash_value = HASH_INIT; + } + } + + /* 一片都没有找到 */ + if(node->slice_num == 0) + { + node->left_len = size; + node->right_len = 0; + } + else + { + node->right_len = size - last_slice_index - 1; + } + node->right_status_shash = FNV_hash_value; + + /* 复制结果到hash_result里 */ + node->hash_result = (char *)malloc(sizeof(char) * (fnv_index + 1)); + memcpy(node->hash_result, FNV_hash, fnv_index); + (node->hash_result)[fnv_index] = '\0'; + + 
node->left_data = (char *)malloc(sizeof(char) * (node->left_len)); + memcpy(node->left_data, data, node->left_len); + + free(FNV_hash); +} + + + +void fuzzy_modify_self_with_prev(IVI_seg_t * prev_seg, IVI_seg_t * seg, char * data, unsigned long long blocksize) +{ + fuzzy_node * prev_node = (fuzzy_node *)(prev_seg->data); + fuzzy_node * node = (fuzzy_node *)(seg->data); + + /* 使用前段的roll state */ + memcpy(node->right_status_r, prev_node->right_status_r, sizeof(struct roll_state)); + struct roll_state * rs = node->right_status_r; + unsigned long long size = seg->right - seg->left + 1; + unsigned int FNV_hash_value = prev_node->right_status_shash; + + + char * FNV_hash = (char *)malloc(sizeof(char)*size); + unsigned long long fnv_index = 0, i, last_slice_index; + unsigned int roll_hash_value; + unsigned long long prev_len = get_prev_continous_length(prev_seg); + for(i = 0; i < size; i++) + { + roll_hash(rs, data[i]); + roll_hash_value = roll_sum(rs); + FNV_hash_value = sum_hash(data[i], FNV_hash_value); + if(i + prev_len >= ROLLING_WINDOW \ + && roll_hash_value % blocksize == blocksize- 1) + { + node->slice_num ++; + if(node->slice_num == 1) + { + node->left_len = i + 1; + } + + last_slice_index = i; + /* 保存FNV的值 */ + FNV_hash[fnv_index ++] = b64[FNV_hash_value % 64]; + //printf("data[%lu]=%c, FNV_hash = %c\n", i, data[i], b64[FNV_hash_value % 64]); + FNV_hash_value = HASH_INIT; + } + } + + /* 一片都没有找到 */ + if(node->slice_num == 0) + { + node->left_len = size; + node->right_len = 0; + } + else + { + node->right_len = size - last_slice_index - 1; + } + node->right_status_shash = FNV_hash_value; + + /* 复制结果到hash_result里 */ + free(node->hash_result); + node->hash_result = (char *)malloc(sizeof(char) * (fnv_index + 1)); + memcpy(node->hash_result, FNV_hash, fnv_index); + (node->hash_result)[fnv_index] = '\0'; + + //printf("old node->left_data = %s\n", node->left_data); + free(node->left_data); + node->left_data = (char *)malloc(sizeof(char) * (node->left_len)); + 
memcpy(node->left_data, data, node->left_len); + //printf("new node->left_data = %s\n", node->left_data); + free(FNV_hash); +} + + + +/** + * 处理后段的保留数据 + */ +void fuzzy_modify_next(IVI_seg_t * seg, IVI_seg_t * next_seg, unsigned long long blocksize) +{ + IVI_seg_t * tmp_curr_seg = seg; + IVI_seg_t * tmp_next_seg = next_seg; + while(tmp_next_seg != NULL) + { + fuzzy_node * tmp_next_node = (fuzzy_node *)(tmp_next_seg->data); + if(tmp_next_node->slice_num != 0) + { + break; + } + + /* 下一段没有分片, 则重新计算 */ + + char * data = (char *)malloc(sizeof(char) * (tmp_next_node->left_len)); + memcpy(data, tmp_next_node->left_data, tmp_next_node->left_len); + fuzzy_modify_self_with_prev(tmp_curr_seg, tmp_next_seg, data, blocksize); + free(data); + + tmp_curr_seg = tmp_next_seg; + tmp_next_seg = IVI_next_continuous_seg(tmp_next_seg); + } + + unsigned long long prev_len = get_prev_continous_length(tmp_curr_seg); + /* tmp_next_seg中是有分片的 */ + if(tmp_next_seg != NULL) + { + fuzzy_node * tmp_curr_node = (fuzzy_node *)(tmp_curr_seg->data); + fuzzy_node * tmp_next_node = (fuzzy_node *)(tmp_next_seg->data); + + unsigned long long size = tmp_next_node->left_len; + + char * FNV_hash = (char *)malloc(sizeof(char)*size); + unsigned long long fnv_index = 0, i; + unsigned int roll_hash_value; + + struct roll_state rs; + memcpy(&rs, tmp_curr_node->right_status_r, sizeof(struct roll_state)); + char * data = tmp_next_node->left_data; + unsigned int FNV_hash_value = tmp_curr_node->right_status_shash; + for(i = 0; i < size; i++) + { + roll_hash(&rs, data[i]); + roll_hash_value = roll_sum(&rs); + FNV_hash_value = sum_hash(data[i], FNV_hash_value); + + if((i + prev_len >= ROLLING_WINDOW) \ + && roll_hash_value % blocksize == blocksize - 1) + { + tmp_next_node->slice_num ++; + FNV_hash[fnv_index ++] = b64[FNV_hash_value % 64]; + //printf("data[%lu]=%c, FNV_hash = %c\n", i, data[i], b64[FNV_hash_value % 64]); + FNV_hash_value = HASH_INIT; + + if(fnv_index == 1) + { + tmp_next_node->left_len = i + 1; + } + 
} + } + + tmp_next_node->slice_num --; + + + /* 复制结果到hash_result里 */ + unsigned long long old_len = strlen(tmp_next_node->hash_result); + if(old_len == 1) + { + free(tmp_next_node->hash_result); + tmp_next_node->hash_result = (char *)malloc(sizeof(char) * (fnv_index + 1)); + memcpy(tmp_next_node->hash_result, FNV_hash, fnv_index); + (tmp_next_node->hash_result)[fnv_index] = '\0'; + } + else + { + unsigned long long new_len = old_len - 1 + fnv_index; + char tmp[old_len - 1]; + char * old_hash = (tmp_next_node->hash_result) + 1; + memcpy(tmp, old_hash, old_len - 1); + free(tmp_next_node->hash_result); + tmp_next_node->hash_result = (char *)malloc(sizeof(char) * (new_len + 1)); + memset(tmp_next_node->hash_result, '\0', (new_len + 1)); + memcpy(tmp_next_node->hash_result, FNV_hash, fnv_index); + strncat(tmp_next_node->hash_result, tmp, old_len - 1); + (tmp_next_node->hash_result)[new_len] = '\0'; + } + free(FNV_hash); + } + return; +} + + + +/** + * 取出区间链表里面的hash_result值,并进行拼接,形成最后的result输出,并且满足abc[1:100]def[200:300]这种格式 + */ +int fuzzy_digest(fuzzy_handle_t * handle, char * result, unsigned int size) +{ + final_result * temp = (final_result *)malloc(sizeof(final_result)); + temp->head = result; + temp->size = size; + temp->offset = 0; + temp->first_FNV_offset = 0; + temp->last_FNV_offset = 0; + //final_result * temp = (final_result *)malloc(sizeof(final_result)); + //temp->offset = 0; + IVI_traverse(((fuzzy_handle_inner_t *)handle)->ivi, fuzzy_hash_merge_new, (void *) temp); + result[size - 1] = '\0'; + //memcpy(result, temp->result, size); + free(temp); + return 0; +} + + +void fuzzy_hash_merge_new(IVI_seg_t * seg, void * user_para) +{ + IVI_seg_t * prev_seg; + IVI_seg_t * next_seg; + prev_seg = IVI_prev_continuous_seg(seg); + next_seg = IVI_next_continuous_seg(seg); + char buffer[MAXSIZE]; + final_result * tmp = (final_result *)user_para; + fuzzy_node * node = (fuzzy_node *)(seg->data); + if(node->slice_num != 0) + { + tmp->last_FNV_offset = seg->right - 
node->right_len; + } + + if(prev_seg == NULL && next_seg == NULL) //如果前分片和后分片都为空,则不用拼接 + { + tmp->first_FNV_offset = seg->left; + tmp->last_FNV_offset = seg->right - node->right_len; + sprintf(buffer, "%s[%llu:%llu]", node->hash_result, tmp->first_FNV_offset, seg->right); + } + if(prev_seg == NULL && next_seg != NULL) //如果前分片为空,后分片不为空,更新左值,将FNV值链接上去 + { + tmp->first_FNV_offset = seg->left; + + sprintf(buffer, "%s", node->hash_result); + } + if(prev_seg != NULL && next_seg == NULL) //如果前分片不为空,后分片为空,更新右值输出偏移 + { + + sprintf(buffer, "%s[%llu:%llu]", node->hash_result, tmp->first_FNV_offset, seg->right); + } + if(prev_seg != NULL && next_seg != NULL) //如果前分片不为空,后分片不为空,将链接FNV值上去 + { + sprintf(buffer, "%s", node->hash_result); + } + + unsigned int inner_size = strlen(buffer); + tmp->offset += inner_size; + if(tmp->offset <= tmp->size) + { + memcpy(tmp->head, buffer, inner_size); + tmp->head += inner_size; + } + else + { + unsigned int length = (tmp->size - (tmp->offset - inner_size)); + if(length != 0) + { + memcpy(tmp->head, buffer, length); + } + tmp->offset = tmp->size; + tmp->head += length; + } + return; +} + + +/** + * 计算fuzzy_hash的各种长度 + */ +unsigned long long fuzzy_status(fuzzy_handle_t * handle, int type) +{ + unsigned long long length; + fuzzy_handle_inner_t * _handle = (fuzzy_handle_inner_t *)(handle); + switch(type) + { + case TOTAL_LENGTH: //已经计算过hash值的全部长度 + { + length = IVI_seg_length(_handle->ivi); + break; + } + case EFFECTIVE_LENGTH: //包含在计算hash值里面的有效长度 + { + length = _handle->effective_length; + break; + } + case HASH_LENGTH: //最后输出哈希结果的长度 + { + final_length tmp_length; + tmp_length.hash_length = 0; + tmp_length.first_FNV_offset = 0; + tmp_length.last_FNV_offset = 0; + IVI_traverse(_handle->ivi, fuzzy_hash_length, (void *)&tmp_length); + length = tmp_length.hash_length + 1; + break; + } + default: + return 0; + } + return length; +} + + +void fuzzy_hash_length(IVI_seg_t * seg, void * user_para) +{ + + IVI_seg_t * prev_seg; + IVI_seg_t * next_seg; + 
prev_seg = IVI_prev_continuous_seg(seg); + next_seg = IVI_next_continuous_seg(seg); + char buffer[MAXSIZE]; + final_length * tmp = (final_length *)user_para; + fuzzy_node * node = (fuzzy_node *)(seg->data); + if(node->slice_num != 0) + { + //printf("node->slice_num != 0\n"); + tmp->last_FNV_offset = seg->right - node->right_len; + //printf("%lu\n", tmp->last_FNV_offset); + } + + if(prev_seg == NULL && next_seg == NULL) //如果前分片和后分片都为空,则不用拼接 + { + tmp->first_FNV_offset = seg->left; + tmp->last_FNV_offset = seg->right - node->right_len; + sprintf(buffer, "%s[%llu:%llu]", node->hash_result, tmp->first_FNV_offset, seg->right); + } + if(prev_seg == NULL && next_seg != NULL) //如果前分片为空,后分片不为空,更新左值,将FNV值链接上去 + { + tmp->first_FNV_offset = seg->left; + + sprintf(buffer, "%s", node->hash_result); + } + if(prev_seg != NULL && next_seg == NULL) //如果前分片不为空,后分片为空,更新右值输出偏移 + { + + sprintf(buffer, "%s[%llu:%llu]", node->hash_result, tmp->first_FNV_offset, seg->right); + } + if(prev_seg != NULL && next_seg != NULL) //如果前分片不为空,后分片不为空,将链接FNV值上去 + { + sprintf(buffer, "%s", node->hash_result); + } + tmp->hash_length += strlen(buffer); + return; +} + diff --git a/src/inc_internal/interval_index.h b/src/inc_internal/interval_index.h new file mode 100644 index 0000000..ad2dca4 --- /dev/null +++ b/src/inc_internal/interval_index.h @@ -0,0 +1,298 @@ +/************************************************************************ + * InterVal Index interface + * NOTE that: + * (1) There are no overlapping intervals in InterVal Index; + * (2) Each interval is closed; + * (3) The interval supports rollback. 
+ *
+ * author: zhengchao@iie.ac.cn tangqi@iie.ac.cn
+ * last modify time: 2015-08-29
+ *************************************************************************/
+
+#ifndef _INTERVAL_INDEX_H_
+#define _INTERVAL_INDEX_H_
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "queue.h"
+
+#define SIZE_8 /* selects the (un)signed long long offset types below */
+
+#ifdef SIZE_8
+typedef unsigned long long OFFSET_TYPE;
+typedef signed long long S_OFFSET_TYPE;
+#else
+typedef unsigned int OFFSET_TYPE;
+typedef signed int S_OFFSET_TYPE;
+#endif
+/* Handle type returned by IVI_create(); it exposes no members here.
+ * NOTE(review): an empty struct is a GNU C extension, not ISO C, and its size differs in C++ -- confirm intended. */
+typedef struct{
+}IVI_t;
+
+
+/**
+ * A segment: the closed interval [left, right] plus a user-owned data pointer
+ **/
+typedef struct __IVI_seg_t{
+    OFFSET_TYPE left;
+    OFFSET_TYPE right;
+    void * data;
+}IVI_seg_t;
+
+/* Callback signature taken by IVI_destroy, IVI_seg_free and IVI_traverse. */
+typedef void IVI_callback_t(IVI_seg_t * seg, void * usr_para);
+
+/**
+ * Rollback (wrap-around) safe ordering test: non-zero when off1 precedes off2.
+ * Follows the approach the Linux kernel uses to handle TCP sequence number rollback.
+ **/
+static inline int before(OFFSET_TYPE off1, OFFSET_TYPE off2)
+{
+    return (S_OFFSET_TYPE)(off1 - off2) < 0;
+}
+#define after(off2, off1) before(off1, off2)
+/* Non-zero when next is exactly one past prev (next - prev == 1). */
+static inline int continuous(OFFSET_TYPE prev, OFFSET_TYPE next)
+{
+    return ((next - prev) == 1);
+}
+/* Neighbouring segment that is continuous with seg; callers in mesa_fuzzy.c test the result against NULL.
+ * NOTE(review): the exact contract lives in interval_index.c -- confirm there. */
+IVI_seg_t * IVI_prev_continuous_seg(IVI_seg_t * seg);
+IVI_seg_t * IVI_next_continuous_seg(IVI_seg_t * seg);
+/**
+ * Relation of two segments, read as "A <relation> B": judging by CONTAINED and
+ * CONTAIN, A is the subject and B the object.
+ * NOTE(review): the horizontal spacing of the diagrams was reconstructed -- confirm.
+ **/
+typedef enum __Relation_t{
+    LEFT_NO_OVERLAP = 1,    // |___A___|
+                            //             |___B___|
+
+    LEFT_OVERLAP,           // |___A___|
+                            //      |___B___|
+
+    CONTAINED,              //   |___A___|
+                            // |_____B_____|
+
+    CONTAIN,                // |_____A_____|
+                            //   |___B___|
+
+    RIGHT_OVERLAP,          //      |___A___|
+                            // |___B___|
+
+    RIGHT_NO_OVERLAP,       //             |___A___|
+                            // |___B___|
+
+    ERROR
+}Relation_t;
+
+
+/**
+ * Name:
+ *     IVI_relative_position
+ * Description:
+ *     Get the relative position of the two given interval segments
+ * Params:
+ *     seg1: Subject of relation
+ *     seg2: Object of relation
+ * Return:
+ *     On success, return the relation of the two segments as a Relation_t value;
+ *     Else, return ERROR from the same enum;
+ **/
+Relation_t IVI_relative_position(IVI_seg_t * seg1, IVI_seg_t * seg2);
+
+
+
+/**
+ * Name:
+ *     IVI_create
+ * Description:
+ *     Create an InterVal Index
+ * Params:
+ *     void
+ * Return:
+ *     Return a handler of this InterVal Index
+ **/
+IVI_t * IVI_create(void);
+
+
+
+/**
+ * Name:
+ *     IVI_destroy
+ * Description:
+ *     Destroy a given InterVal Index's handler
+ * Params:
+ *     handler: The InterVal Index you want to destroy
+ *     cb: Callback function for user to free data in segment
+ *     usr_para: User parameter
+ * Return:
+ *     void
+ **/
+void IVI_destroy(IVI_t * handler, IVI_callback_t cb, void * usr_para);
+
+
+
+/**
+ * Name:
+ *     IVI_seg_malloc
+ * Description:
+ *     Allocate a segment with the given parameters
+ * Params:
+ *     left: Left point of segment
+ *     right: Right point of segment
+ *     data: User data
+ * Return:
+ *     Return a pointer to the segment structure.
+ **/
+IVI_seg_t * IVI_seg_malloc(OFFSET_TYPE left, OFFSET_TYPE right, void * data);
+
+
+
+/**
+ * Name:
+ *     IVI_seg_free
+ * Description:
+ *     Free the memory of given segment
+ * Params:
+ *     seg: The segment that you want to free
+ *     cb: Callback function for user to free *data in seg
+ *     usr_para: User parameter for cb
+ * Return:
+ *     void
+ **/
+void IVI_seg_free(IVI_seg_t * seg, IVI_callback_t cb, void * usr_para);
+
+
+
+/**
+ * Name:
+ *     IVI_insert
+ * Description:
+ *     Insert a segment into an InterVal Index handler, and the segment
+ *     MUST NOT overlap with others in the handler.
+ * Params:
+ *     handler: The handler of InterVal Index created by IVI_create
+ *     seg: A segment that user wants to add. It MUST be created
+ *          by IVI_seg_malloc.
+ * Return:
+ *     On success, 0 is returned;
+ *     Else when an overlap or another error occurs, -1 is returned.
+ **/
+int IVI_insert(IVI_t * handler, IVI_seg_t * seg);
+
+
+
+/**
+ * Name:
+ *     IVI_remove
+ * Description:
+ *     Remove a given segment from given InterVal Index handler.
+ * Params:
+ *     handler: The handler of InterVal Index created by IVI_create
+ *     seg: A segment that user wants to delete. It MUST be created
+ *          by IVI_seg_malloc.
+ * Return:
+ *     On success, 0 is returned;
+ *     Else when overlap occurs, -1 is returned.
+ **/
+int IVI_remove(IVI_t * handler, IVI_seg_t * seg);
+
+
+
+/**
+ * Name:
+ *     IVI_query
+ * Description:
+ *     Query from given InterVal Index and get the number of segments
+ *     which are overlapped with given interval, and store those segments
+ *     in the last parameter.
+ * Params:
+ *     handler: The handler of interval index created by IVI_create
+ *     left: Left point of given interval
+ *     right: Right point of given interval
+ *     segs: An address of a segment pointer array to store those segments which
+ *           are overlapped with given interval. NOTE that user should not malloc
+ *           the array, and segs need to be freed by user. The elements of *segs
+ *           MUST NOT be freed by user.
+ * Return:
+ *     Return the number of segments which are overlapped with given interval
+ **/
+int IVI_query(IVI_t * handler, OFFSET_TYPE left, OFFSET_TYPE right, IVI_seg_t *** segs);
+
+
+
+/**
+ * Name:
+ *     IVI_query_continuous
+ * Description:
+ *     Query from interval index handler and get the number of continuous segments
+ *     which are overlapped with given interval.
+ * Params:
+ *     handler: The handler of InterVal Index created by IVI_create.
+ *     left: Left point of given interval
+ *     right: Right point of given interval
+ *     segs: An address of a segment pointer array to store those segments which
+ *           are overlapped with given interval. NOTE that user should not malloc
+ *           the array, and segs need to be freed by user. The elements of *segs
+ *           MUST NOT be freed by user.
+ * Return:
+ *     Return the number of continuous segments which are overlapped with given interval
+ **/
+int IVI_query_continuous(IVI_t * handler, OFFSET_TYPE left, OFFSET_TYPE right, IVI_seg_t *** segs);
+
+
+
+/**
+ * Name:
+ *     IVI_seg_cnt
+ * Description:
+ *     Get the count of segments in given interval index handler
+ * Params:
+ *     handler: The handler of InterVal Index created by IVI_create.
+ * Return:
+ *     Return the count of segments in given interval index handler
+ **/
+int IVI_seg_cnt(IVI_t * handler);
+
+
+/**
+ * Name:
+ *     IVI_seg_length
+ * Description:
+ *     Get the length of whole segments in given interval index handler
+ * Params:
+ *     handler: The handler of InterVal Index created by IVI_create.
+ * Return:
+ *     Return the length of whole segments in given interval index handler
+ **/
+OFFSET_TYPE IVI_seg_length(IVI_t * handler);
+
+
+
+/**
+ * Name:
+ *     IVI_traverse
+ * Description:
+ *     Traverse given InterVal Index and execute given callback function
+ *     one time for each seg in InterVal Index.
+ * Params:
+ *     handler: The handler of InterVal Index created by IVI_create.
+ *     cb: Callback function (of type IVI_callback_t) for user to define.
+ *     usr_para: Parameter the user wants to pass to the callback function.
+ * Return:
+ *     void
+ **/
+void IVI_traverse(IVI_t * handler, IVI_callback_t cb, void * usr_para);
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INTERVAL_INDEX_H_ */
diff --git a/src/inc_internal/mesa_fuzzy.h b/src/inc_internal/mesa_fuzzy.h
index 6497d89..ddd97c6 100644
--- a/src/inc_internal/mesa_fuzzy.h
+++ b/src/inc_internal/mesa_fuzzy.h
@@ -34,7 +34,7 @@ typedef struct
  * create a fuzzy hash handle and return it.
  * @return [handle]
  */
-fuzzy_handle_t * fuzzy_create_handle(unsigned long long total_len);
+fuzzy_handle_t * fuzzy_create_handle(unsigned long long origin_len);
 
 /**
  * destroy context by a fuzzy hash handle.
@@ -82,3 +82,4 @@ unsigned long long fuzzy_status(fuzzy_handle_t * handle, int type);
 #endif
 
 #endif
+
diff --git a/src/inc_internal/queue.h b/src/inc_internal/queue.h
new file mode 100644
index 0000000..eec1e9b
--- /dev/null
+++ b/src/inc_internal/queue.h
@@ -0,0 +1,113 @@
+/***************************************************
+* TAILQ (tail queue) macros, after Linux's <sys/queue.h>
+****************************************************/
+
+
+#ifndef _QUEUE_H_
+#define _QUEUE_H_
+
+/**
+ * Tail queue definitions.
+ */
+#define _TAILQ_HEAD(name, type, qual) \
+struct name { \
+    qual type *tqh_first; /* first element */ \
+    qual type *qual *tqh_last; /* addr of last next element */ \
+}
+#define TAILQ_HEAD(name, type) _TAILQ_HEAD(name, struct type,)
+/* Static initializer for an empty queue: tqh_first is NULL and tqh_last points at tqh_first. */
+#define TAILQ_HEAD_INITIALIZER(head) \
+    { NULL, &(head).tqh_first }
+/* Link fields to embed in each element; the "field" argument of the macros below names that member. */
+#define _TAILQ_ENTRY(type, qual) \
+struct { \
+    qual type *tqe_next; /* next element */ \
+    qual type *qual *tqe_prev; /* address of previous next element */\
+}
+#define TAILQ_ENTRY(type) _TAILQ_ENTRY(struct type,)
+
+/*
+ * Tail queue functions (statement macros, each wrapped in do { } while (0)).
+ */
+#define TAILQ_INIT(head) do { \
+    (head)->tqh_first = NULL; \
+    (head)->tqh_last = &(head)->tqh_first; \
+} while (/*CONSTCOND*/0)
+/* Insert elm as the new first element; also becomes the last one if the queue was empty. */
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+    if (((elm)->field.tqe_next = (head)->tqh_first) != NULL) \
+        (head)->tqh_first->field.tqe_prev = \
+            &(elm)->field.tqe_next; \
+    else \
+        (head)->tqh_last = &(elm)->field.tqe_next; \
+    (head)->tqh_first = (elm); \
+    (elm)->field.tqe_prev = &(head)->tqh_first; \
+} while (/*CONSTCOND*/0)
+/* Append elm as the new last element, using tqh_last (no walk over the queue). */
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+    (elm)->field.tqe_next = NULL; \
+    (elm)->field.tqe_prev = (head)->tqh_last; \
+    *(head)->tqh_last = (elm); \
+    (head)->tqh_last = &(elm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+/* Insert elm directly after listelm; updates tqh_last when listelm was the last element. */
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+    if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\
+        (elm)->field.tqe_next->field.tqe_prev = \
+            &(elm)->field.tqe_next; \
+    else \
+        (head)->tqh_last = &(elm)->field.tqe_next; \
+    (listelm)->field.tqe_next = (elm); \
+    (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+/* Insert elm directly before listelm; needs no head because tqe_prev addresses the preceding link. */
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+    (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+    (elm)->field.tqe_next = (listelm); \
+    *(listelm)->field.tqe_prev = (elm); \
+    (listelm)->field.tqe_prev = &(elm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+/* Unlink elm from the queue; elm's own link fields are left as they were and elm is not freed. */
+#define TAILQ_REMOVE(head, elm, field) do { \
+    if (((elm)->field.tqe_next) != NULL) \
+        (elm)->field.tqe_next->field.tqe_prev = \
+            (elm)->field.tqe_prev; \
+    else \
+        (head)->tqh_last = (elm)->field.tqe_prev; \
+    *(elm)->field.tqe_prev = (elm)->field.tqe_next; \
+} while (/*CONSTCOND*/0)
+/* Forward iteration: var visits each element from first to last and is NULL after the loop. */
+#define TAILQ_FOREACH(var, head, field) \
+    for ((var) = ((head)->tqh_first); \
+        (var); \
+        (var) = ((var)->field.tqe_next))
+/* Backward iteration: reads a link's second pointer by viewing it as a struct headname (same two-pointer layout). */
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+    for ((var) = (*(((struct headname *)((head)->tqh_last))->tqh_last)); \
+        (var); \
+        (var) = (*(((struct headname *)((var)->field.tqe_prev))->tqh_last)))
+/* Append every element of head2 to head1 and re-initialize head2 as empty; no-op when head2 is empty. */
+#define TAILQ_CONCAT(head1, head2, field) do { \
+    if (!TAILQ_EMPTY(head2)) { \
+        *(head1)->tqh_last = (head2)->tqh_first; \
+        (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+        (head1)->tqh_last = (head2)->tqh_last; \
+        TAILQ_INIT((head2)); \
+    } \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Tail queue access methods (expression macros).
+ */
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+/* TAILQ_LAST / TAILQ_PREV use the same struct headname cast as TAILQ_FOREACH_REVERSE. */
+#define TAILQ_LAST(head, headname) \
+    (*(((struct headname *)((head)->tqh_last))->tqh_last))
+#define TAILQ_PREV(elm, headname, field) \
+    (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+
+
+#endif