#include #include #include #include #include #include #include #include #include "mesa_fuzzy.h" #include "interval_index.h" //#define DEBUG_PRINT #define INIT_SIZE 128 #define ENTROPY_THRESHOLD 0.5 #define MULTIPLE 4 int count = 0; const char * map_to64bytes = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; double get_rs_entropy(unsigned int * r_array, unsigned int r_index); int cmp(const void * a, const void * b); void sfh_rs_entropy(IVI_seg_t * seg, void * user_para); void sfh_tune_simulation(IVI_seg_t * seg, void * user_para); void sfh_output_state_t(IVI_seg_t * seg, void * user_para); int write_uint_array(unsigned int ** array, unsigned int *index,unsigned int *size,unsigned int value); /** * roll_state初始化 */ static inline void roll_init(struct roll_state_t * self) { memset(self, 0, sizeof(struct roll_state_t)); } /** * 计算roll_hash值,将外部数据读取到窗口中 */ static inline void roll_hash(struct roll_state_t * self, unsigned char c) { self->h2 -= self->h1; self->h2 += ROLLING_WINDOW * (unsigned int)c; self->h1 += (unsigned int)c; self->h1 -= (unsigned int)self->window[self->n]; self->window[self->n] = c; self->n++; if (self->n == ROLLING_WINDOW) { self->n = 0; } self->h3 <<= 5; self->h3 ^= c; } /** * 计算窗口里面的roll_hash值,每次roll_hash值满足一定条件,分片 */ static inline unsigned int roll_sum(const struct roll_state_t * self) { return self->h1 + self->h2 + self->h3; } /** * 计算分片的FNV值 */ static inline unsigned int sum_hash(unsigned char c, unsigned int h) { return (h * HASH_PRIME) ^ c; } /** * 创建handle */ sfh_instance_t * SFH_instance(unsigned long long origin_len) { fuzzy_handle_inner_t * handle = NULL; unsigned long long tmp_blksize = 0; tmp_blksize = get_blocksize(origin_len); if(tmp_blksize==0) { return NULL; } handle = (fuzzy_handle_inner_t *)calloc(1,sizeof(fuzzy_handle_inner_t)); handle->fuzzy_node_memory = 0; handle->IVI_memory = 0; handle->fuzzy_node_memory += sizeof(fuzzy_handle_inner_t); handle->orilen = origin_len; handle->ivi = IVI_create(); handle->effective_length = 0; handle->length_increase = 0; handle->sim_tuned_rs_cnt = 0; //handle->blocksize=tmp_blksize; handle->blocksize = 3; handle->do_tune=1; return (sfh_instance_t *)handle; } /** * IVI_destroy的回调函数,销毁IVI中的数据 */ void fuzzy_node_free(IVI_seg_t * seg, void * usr_para) { fuzzy_handle_inner_t * _handle = (fuzzy_handle_inner_t *)usr_para; sfh_seg_t * temp = (sfh_seg_t*)(seg->data); _handle->fuzzy_node_memory-=destroy_sfh_seg(temp); return; } void SFH_release(sfh_instance_t * handle) { IVI_destroy(((fuzzy_handle_inner_t *)handle)->ivi, fuzzy_node_free, (void *)handle); ((fuzzy_handle_inner_t *)handle)->fuzzy_node_memory -= sizeof(fuzzy_handle_inner_t); free((fuzzy_handle_inner_t *)handle); return; } unsigned int SFH_feed(sfh_instance_t * handle, const char * data, unsigned int size, unsigned long long offset) { fuzzy_handle_inner_t * _handle=(fuzzy_handle_inner_t *)handle; if(data == NULL || size == 0) { return 0; } unsigned int length = segment_overlap(_handle, size, offset, data); _handle->effective_length += length; _handle->length_increase += length; if(_handle->s_state_cnt>EXPECT_SIGNATURE_LEN&&_handle->do_tune==1) { //printf("s_state_cnt before:%d\n", _handle->s_state_cnt); //printf("blocksize before:%llu\n", _handle->blocksize); unsigned long long check_length = (_handle->effective_length/_handle->s_state_cnt)*EXPECT_SIGNATURE_LEN; if(_handle->length_increase > check_length) { IVI_traverse(_handle->ivi, sfh_tune_simulation, (void *)_handle); //printf("sim_rs_cnt:%d\n", _handle->sim_tuned_rs_cnt); if(_handle->sim_tuned_rs_cnt>EXPECT_SIGNATURE_LEN) { _handle->blocksize*= MULTIPLE; IVI_traverse(_handle->ivi, sfh_tune_callback, (void *)_handle); } _handle->sim_tuned_rs_cnt = 0; _handle->length_increase = 0; } //printf("s_state_cnt after:%d\n", _handle->s_state_cnt); //printf("blocksize after:%llu\n", _handle->blocksize); } #if 0 SFH_digest(handle,result, sizeof(result)); printf("%llu %s\n",offset,result); #endif return length; } void sfh_tune_simulation(IVI_seg_t * seg, void * user_para) { sfh_seg_t * tmp = (sfh_seg_t *)(seg->data); int i = 0; fuzzy_handle_inner_t * _handle = (fuzzy_handle_inner_t *)user_para; unsigned long long blocksize = _handle->blocksize * MULTIPLE; for(i = 0; i < tmp->r_cnt; i++) { if(tmp->r_array[i] % blocksize == blocksize -1) { _handle->sim_tuned_rs_cnt ++; } } } unsigned long long get_blocksize(unsigned long long orilen) { double tmp = orilen/(64 * BLOCKSIZE_MIN); double index = floor(log(tmp)/log(2)); double tmp_t = pow(2, index); unsigned long long blocksize = (unsigned long long)(tmp_t * BLOCKSIZE_MIN); if(blocksize == 0) { blocksize = BLOCKSIZE_MIN; } return blocksize; // return BLOCKSIZE_MIN; } sfh_seg_t* create_sfh_seg(fuzzy_handle_inner_t * _handle) { sfh_seg_t*p=(sfh_seg_t*)calloc(sizeof(sfh_seg_t),1); roll_init(&(p->r_state)); p->s_size = INIT_SIZE; p->s_cnt=0; p->r_size = INIT_SIZE; p->r_cnt=0; p->r_array = (unsigned int*)malloc(sizeof(unsigned int)*(p->r_size)); _handle->fuzzy_node_memory+=sizeof(unsigned int)*(p->r_size); p->s_array = (struct zt_state_t*)malloc(sizeof(struct zt_state_t)*(p->s_size)); _handle->fuzzy_node_memory+=sizeof(struct zt_state_t)*(p->s_size); zt_hash_initial(&(p->s_state)); zt_hash_initial(&(p->p_state)); _handle->fuzzy_node_memory += sizeof(sfh_seg_t); return p; } //return freed memory size int destroy_sfh_seg(sfh_seg_t*p) { int ret_size=0; if(p->s_array != NULL) { free(p->s_array); p->s_array=NULL; ret_size+=p->s_size*sizeof(struct zt_state_t); } if(p->r_array != NULL) { free(p->r_array); p->r_array=NULL; ret_size+=p->r_size*sizeof(unsigned int); } ret_size+=sizeof(sfh_seg_t); free(p); p=NULL; return ret_size; } /** * 判断数据是否与已经计算过的数据有覆盖 */ unsigned int segment_overlap(fuzzy_handle_inner_t * _handle, unsigned int size, unsigned long long offset, const char * data) { IVI_seg_t ** overlap_segs = NULL; IVI_seg_t *new_seg=NULL,*target_seg=NULL; sfh_seg_t* sfh_seg=NULL; int overlap_segnum = 0,i=0,co_seg_num=0,ret=0; unsigned int effective_length = 0; unsigned long long calc_begin=offset; unsigned long long calc_end=offset+size-1; //printf("size: %u\n",size); //printf("before query\n"); /*查询是否有覆盖,如果有覆盖,返回覆盖的segment的片数,如果没有覆盖,返回0*/ if(offset>0) { overlap_segnum = IVI_query(_handle->ivi, offset-1, offset + size, &overlap_segs); } else { overlap_segnum = IVI_query(_handle->ivi, 0, offset + size, &overlap_segs); } IVI_seg_t * co_overlap_segs[overlap_segnum+1]; assert(overlap_segnum>=0); if(overlap_segnum==0||offsetleft) { sfh_seg=create_sfh_seg(_handle); calc_begin=offset; if(overlap_segnum == 0) { calc_end=offset+size-1; } else { calc_end=MIN(overlap_segs[0]->left-1,offset+size-1); } new_seg = IVI_seg_malloc(calc_begin, calc_end, (void *)sfh_seg); _handle->s_state_cnt+=sfh_update_seg(_handle, sfh_seg,data+calc_begin-offset, calc_end-calc_begin+1, _handle->blocksize); effective_length+=(calc_end-calc_begin+1); co_overlap_segs[co_seg_num]=new_seg; co_seg_num++; } for(i=0;iivi,overlap_segs[i]); _handle->IVI_memory = IVI_mem_occupy(_handle->ivi); assert(ret==0); } for(i=0;iright+1,calc_begin); if(i+1left-1,offset+size-1); } else { calc_end=offset+size-1; } if(!after(calc_begin,calc_end)) { sfh_seg=(sfh_seg_t*)(co_overlap_segs[i]->data); _handle->s_state_cnt+=sfh_update_seg(_handle,sfh_seg,data+calc_begin-offset, calc_end-calc_begin+1, _handle->blocksize); effective_length+=(calc_end-calc_begin+1); co_overlap_segs[i]->right+=calc_end-calc_begin+1; calc_begin=calc_end+1; } } target_seg=co_overlap_segs[0]; for(i=0;idata)->r_index>0&&((sfh_seg_t*)co_overlap_segs[i]->data)->r_index>0) { memset(&result_p,0,sizeof(result_p)); result_p.data=rp_buff; result_p.size=sizeof(rp_buff); sfh_output_callback(target_seg,&result_p); memset(&result_n,0,sizeof(result_n)); result_n.data=rn_buff; result_n.size=sizeof(rn_buff); sfh_output_callback(co_overlap_segs[i],&result_n); printf("%s[%llu:%llu] %s[%llu:%llu]\n",rp_buff,target_seg->left, target_seg->right, rn_buff,co_overlap_segs[i]->left, co_overlap_segs[i]->right); } #endif _handle->s_state_cnt+=sfh_merge_seg(_handle,(sfh_seg_t*)target_seg->data, (sfh_seg_t*)co_overlap_segs[i]->data, _handle->blocksize); target_seg->right=co_overlap_segs[i]->right; IVI_seg_free(co_overlap_segs[i], fuzzy_node_free, (void *)_handle); } //IVI_seg_t * insert_seg=NULL; //insert_seg = IVI_seg_malloc(target_seg->left, target_seg->right, target_seg->data); ret=IVI_insert(_handle->ivi,target_seg); _handle->IVI_memory = IVI_mem_occupy(_handle->ivi); assert(ret==0); free(overlap_segs); return effective_length; } int cmp(const void * a, const void * b) { unsigned int tmp_a = *(unsigned int *)a; unsigned int tmp_b = *(unsigned int *)b; if(before(tmp_a, tmp_b)) { return -1; } else if(after(tmp_a, tmp_b)) { return 1; } else { return 0; } } double get_rs_entropy(unsigned int * r_array, unsigned int r_index) { qsort(r_array, r_index, sizeof(unsigned int), cmp); unsigned int current_r = r_array[0]; unsigned int * tmp_r = r_array; unsigned int count = 0; double sum = 0; int i = 0; for(i = 0; i <= r_index; i++) { if(i == r_index || *tmp_r != current_r) { double p = (double)count/r_index; //printf("count : %d\n",count); //printf("r_index: %u\n",r_index); //printf("p:%f\n",p); if(p != 0) { sum += p * (log(p)/log(2)); } current_r = *tmp_r; count = 0; } else { count++; } if(i < r_index) { tmp_r ++; } } return (-sum); } void sfh_tune_seg(sfh_seg_t * p, unsigned long long blocksize) { int i = 0, j = 0; struct zt_state_t tmp_zt; int new_zt_cnt=0; zt_hash_initial(&tmp_zt); for(j = 0; j < p->r_cnt; j++) { if(j == 0) { zt_hash_arymul(&tmp_zt, &(p->p_state)); } else { zt_hash_arymul(&tmp_zt, &(p->s_array[j - 1])); } if(p->r_array[j] % blocksize == blocksize - 1) { p->r_array[i]=p->r_array[j]; i++; if(i>1) { p->s_array[new_zt_cnt].val=tmp_zt.val; new_zt_cnt++; } else { p->p_state.val=tmp_zt.val; } zt_hash_initial(&tmp_zt); } } zt_hash_arymul(&tmp_zt, &(p->s_state)); if(i == 0) { zt_hash_initial(&(p->p_state)); } p->s_state.val = tmp_zt.val; p->s_cnt = new_zt_cnt; p->r_cnt = i; assert(p->r_cnt>=p->s_cnt); } void sfh_tune_callback(IVI_seg_t * seg, void * user_para) { sfh_seg_t * p = (sfh_seg_t *)(seg->data); if(p->r_cnt== 0) { return; } fuzzy_handle_inner_t * _handle = (fuzzy_handle_inner_t *)user_para; unsigned long long blocksize = _handle->blocksize; _handle->s_state_cnt-=p->s_cnt; sfh_tune_seg(p, blocksize); _handle->s_state_cnt+=p->s_cnt; //printf("after state_cnt:%d,block:%llu\n",_handle->s_state_cnt,_handle->blocksize); } int write_uint_array(unsigned int ** array,unsigned int *index, unsigned int *size,unsigned int value) { int mem_size=0; if(*index==*size) { (*size)*=2; mem_size+=*size; *array=(unsigned int*)realloc(*array,sizeof(unsigned int)*(*size)); } (*array)[*index]=value; (*index)++; return mem_size; } int sfh_update_seg(fuzzy_handle_inner_t * _handle, sfh_seg_t * p, const char * data, unsigned long data_size,unsigned long long blocksize) { unsigned long i = 0; unsigned int roll_hash_value = 0; int state_inc_cnt=0; if(p->msize < ROLLING_WINDOW - 1) { for(i = 0; i < ROLLING_WINDOW - p->msize && i < data_size; i++) { p->mbuf[p->msize + i] = data[i]; roll_hash(&(p->r_state), data[i]); } p->msize += i; } for(; i < data_size; i++) { roll_hash(&(p->r_state), data[i]); roll_hash_value = roll_sum(&(p->r_state)); zt_hash(&(p->s_state),data[i]); if((roll_hash_value % (blocksize)) == blocksize - 1) { p->slice_num ++; if(p->r_cnt==0) { p->p_state.val=p->s_state.val; } else { #ifdef DEBUG_PRINT printf("p->s_cnt:%u\n",p->s_cnt); printf("p->s_size:%u\n",p->s_size); #endif _handle->fuzzy_node_memory+=write_uint_array((unsigned int**)(&(p->s_array)), &(p->s_cnt),&(p->s_size),p->s_state.val); state_inc_cnt++; } #ifdef DEBUG_PRINT printf("p->r_cnt:%u\n",p->s_cnt); printf("p->r_size:%u\n",p->s_size); #endif _handle->fuzzy_node_memory+=write_uint_array(&(p->r_array),&(p->r_cnt),&(p->r_size),roll_hash_value); zt_hash_initial(&(p->s_state)); } } assert(p->r_cnt>=p->s_cnt); p->right_offset+=data_size; return state_inc_cnt; } int sfh_merge_seg(fuzzy_handle_inner_t * _handle, sfh_seg_t * p, sfh_seg_t * n,unsigned long long blocksize) { unsigned int roll_hash_value = 0; int i = 0,state_inc_cnt=0; struct roll_state_t * rs = &(p->r_state); for(i = 0; i < n->msize; i++) { roll_hash(rs, n->mbuf[i]); roll_hash_value = roll_sum(rs); zt_hash(&(p->s_state), n->mbuf[i]); if(roll_hash_value % blocksize == blocksize - 1) { p->slice_num ++; if(p->r_cnt == 0) { p->p_state.val = p->s_state.val; } else { _handle->fuzzy_node_memory+=write_uint_array((unsigned int **)(&(p->s_array)), &(p->s_cnt), &(p->s_size), p->s_state.val); state_inc_cnt++; } _handle->fuzzy_node_memory+=write_uint_array(&(p->r_array),&(p->r_cnt), &(p->r_size), roll_hash_value); zt_hash_initial(&(p->s_state)); } } if(n->r_cnt==0) { zt_hash_arymul(&(p->s_state),&(n->p_state)); zt_hash_arymul(&(p->s_state), &(n->s_state)); } else { if(p->r_cnt==0) { zt_hash_arymul(&(p->s_state),&(n->p_state)); p->p_state.val=p->s_state.val; } else { zt_hash_arymul(&(p->s_state), &(n->p_state)); _handle->fuzzy_node_memory+=write_uint_array((unsigned int **)(&(p->s_array)), &(p->s_cnt), &(p->s_size), p->s_state.val); state_inc_cnt++; } p->s_state.val=n->s_state.val; } for(i=0;ir_cnt;i++) { _handle->fuzzy_node_memory+=write_uint_array(&(p->r_array),&(p->r_cnt), &(p->r_size), n->r_array[i]); } for(i=0;is_cnt;i++) { _handle->fuzzy_node_memory+=write_uint_array((unsigned int **)(&(p->s_array)), &(p->s_cnt), &(p->s_size), n->s_array[i].val); } memcpy(&(p->r_state),&(n->r_state),sizeof(p->r_state)); assert(p->r_cnt>=p->s_cnt); return state_inc_cnt; } /** * 取出区间链表里面的hash_result值,并进行拼接,形成最后的result输出,并且满足abc[1:100]def[200:300]这种格式 */ int SFH_digest(sfh_instance_t * handle, char * hash_buffer, unsigned int size) { fuzzy_handle_inner_t* _handle=(fuzzy_handle_inner_t *)handle; unsigned int estimate_len=_handle->s_state_cnt+IVI_seg_cnt(_handle->ivi)*24+1; int actual_len=0; char* p=NULL; sfh_output_t result; memset(&result,0,sizeof(result)); result.size_b1 = estimate_len; result.size_b2 = estimate_len; result.hash_b1 = (char*)calloc(sizeof(char),estimate_len); result.hash_b2 = (char*)calloc(sizeof(char),estimate_len); result.offset_b1 = 0; result.offset_b2 = 0; result.b1=_handle->blocksize; result.b2=_handle->blocksize*MULTIPLE; IVI_traverse(_handle->ivi, sfh_output_callback, (void *) &result); if(result.offset_b1==0||result.offset_b2==0) { hash_buffer[0]='\0'; goto fast_out; } p =strrchr(result.hash_b1,'['); assert(p!=NULL); memmove(p+1,p,strlen(p)); *p=result.last_char_b1; p =strrchr(result.hash_b2,'['); assert(p!=NULL); memmove(p+1,p,strlen(p)); *p=result.last_char_b2; actual_len=snprintf(hash_buffer,size,"%llu:%s#%llu:%s",result.b1,result.hash_b1, result.b2,result.hash_b2); fast_out: free(result.hash_b1); result.hash_b1=NULL; free(result.hash_b2); result.hash_b2=NULL; return actual_len; } sfh_seg_t* sfh_clone_seg(sfh_seg_t* origin) { sfh_seg_t* clone=NULL; clone=(sfh_seg_t*)calloc(sizeof(sfh_seg_t),1); memcpy(clone,origin,sizeof(sfh_seg_t)); clone->s_array=calloc(sizeof(struct zt_state_t),clone->s_size); memcpy(clone->s_array,origin->s_array,sizeof(struct zt_state_t)*clone->s_size); clone->r_array=calloc(sizeof(unsigned int),clone->r_size); memcpy(clone->r_array,origin->r_array,sizeof(unsigned int)*clone->r_size); return clone; } int sfh_print_seg(sfh_seg_t* p, char* hash_result, int size,char* last_char) { int idx=0,i=0; if(p->left_offset== 0) { hash_result[idx] = map_to64bytes[zt_hash_code(&(p->p_state)) & 0x3F]; idx++; } for(i = 0; i < p->s_cnt&&idxs_array[i].val) & 0x3F]; } if(p->s_state.val!=*((unsigned int*)ZT_INIT_VAL)) { *last_char=map_to64bytes[zt_hash_code(&(p->s_state)) & 0x3F]; } else { *last_char='\0'; } // p->right_offset-1 to get a closed interval idx+=snprintf(hash_result+idx,size-idx,"[%llu:%llu]",p->left_offset, p->right_offset-1); assert(idxdata); sfh_seg_t* tmp; if(node->s_cnt==0&&!(seg->left==0&&node->s_cnt > 0)) { return; } result->offset_b1+=sfh_print_seg(node,result->hash_b1+result->offset_b1,result->size_b1-result->offset_b1,&(result->last_char_b1)); tmp=sfh_clone_seg(node); sfh_tune_seg(tmp, result->b2); result->offset_b2+=sfh_print_seg(tmp,result->hash_b2+result->offset_b2,result->size_b2-result->offset_b2,&(result->last_char_b2)); destroy_sfh_seg(tmp); tmp=NULL; return; } /** * 计算fuzzy_hash的各种长度 */ unsigned long long SFH_status(sfh_instance_t * handle, int type) { unsigned long long length; fuzzy_handle_inner_t * _handle = (fuzzy_handle_inner_t *)(handle); final_length tmp_length; char buffer[64]; switch(type) { case TOTAL_LENGTH: //已经计算过hash值的全部长度 length = IVI_seg_length(_handle->ivi); break; case EFFECTIVE_LENGTH: //包含在计算hash值里面的有效长度 length = _handle->effective_length; break; case HASH_LENGTH: //最后输出哈希结果的长度 tmp_length.hash_length = 0; tmp_length.first_ZTH_offset = 0; tmp_length.last_ZTH_offset = 0; tmp_length.hash_length+=snprintf(buffer,sizeof(buffer),"%llu:",_handle->blocksize); IVI_traverse(_handle->ivi, fuzzy_hash_length, (void *)&tmp_length); length = tmp_length.hash_length + 1; break; case MEMORY_OCCUPY: length = _handle->fuzzy_node_memory + _handle->IVI_memory; break; default: return 0; } return length; } void fuzzy_hash_length(IVI_seg_t * seg, void * user_para) { char buffer[100]; final_length * tmp = (final_length *)user_para; sfh_seg_t * node = (sfh_seg_t *)(seg->data); if(node->s_cnt==0&&!(seg->left==0&&node->r_cnt > 0)) { return; } snprintf(buffer, sizeof(buffer), "[%llu:%llu]", seg->left, seg->right); tmp->hash_length += 2*node->r_cnt*sizeof(char) + 2*strlen(buffer); return; }