#include #include #include #include #include #include #include #include #include #include #include "edit_element.h" int cjson_element_foreach(cJSON *a, int *depth, int *step, int *step_level, char **node, const struct edit_element_rule * rules, int *match_num, int loop); static void html_node_list(const struct edit_element_rule * rules, xmlNodePtr node, xmlNodePtr *parent_array, size_t *n_parent, int *match); size_t parse_json_output_unformatted(const char * in, size_t in_sz, const struct edit_element_rule * rules, char** out); enum search_scope scope_name_to_id(const char * name) { const char * std_name[] = {"inside_element","whole_file"}; size_t i = 0; for (i = 0; i < sizeof(std_name) / sizeof(const char *); i++) { if (0 == strcasecmp(name, std_name[i])) { break; } } return (enum search_scope) i; } int match_start_indicator(xmlNodePtr parent, char * start_indicator) { if(parent->properties == NULL) { return 0; } struct _xmlAttr *properties = parent->properties; if(properties->children == NULL || properties->children->content == NULL) { return 0; } xmlNodePtr children = properties->children; if(!strcasecmp((char *)children->content, start_indicator)) { return 1; } return 0; } int construct_cjson_by_treatment(cJSON *a_element, char **node, int *step, int *step_array_level, const struct edit_element_rule * rules) { const char *element_treatment=rules->element_treatment; char * start_indicator = rules->start_indicator; if(element_treatment != NULL && !strcasecmp(element_treatment, "mark")) { if (rules->scope == kScopeInside) { if(a_element->type == cJSON_Object) { if(*node != NULL && strcasecmp(*node, start_indicator) != 0) { return 0; } } if(a_element->type == cJSON_Array) { if(a_element->string != NULL && strcasecmp(a_element->string, start_indicator)) { return 0; } } } if(a_element->type==cJSON_Object) { cJSON_AddBoolToObject(a_element, "need_filter", true); } if(a_element->type == cJSON_Array) { cJSON *object = NULL; object = cJSON_GetArrayItem(a_element, step_array_level[*step]); if(object != NULL) { cJSON_AddBoolToObject(object, "need_filter", true); } } } if(element_treatment != NULL && !strcasecmp(element_treatment, "remove")) { if (rules->scope == kScopeInside) { if(a_element->type == cJSON_Object) { if(*node != NULL && strcasecmp(*node, start_indicator) != 0) { return 0; } } if(a_element->type == cJSON_Array) { if(a_element->string != NULL && strcasecmp(a_element->string, start_indicator)) { return 0; } } } if(a_element->type == cJSON_Object && *node != NULL) { cJSON_DeleteItemFromObject(a_element, *node); } if(a_element->type == cJSON_Array) { cJSON_DeleteItemFromArray(a_element, step_array_level[*step]); } } return 0; } int construct_html_by_treatment(const struct edit_element_rule * rules, xmlNodePtr node, xmlNodePtr *parent_array, size_t *n_parent, int *match) { int k=0; char *new_out=NULL; size_t output_size=0; char * token = NULL, * saveptr = NULL; if(strcasestr((char *)node->content, "var ytInitialData")) { token = strtok_r((char *)node->content, "=", &saveptr); if(token != NULL && ((saveptr[0] == '{') || (saveptr[1] == '{'))) { output_size = parse_json_output_unformatted(saveptr, strlen(saveptr), rules, &new_out); if(output_size != 0 && new_out != NULL) { new_out[output_size] = ';'; FREE(&node->content); node->content = (xmlChar*)new_out; *match =1; return 0; } } } if(strcasestr((char *)node->content, rules->contained_keyword) == NULL) { return 0; } char * start_indicator = rules->start_indicator; const char *element_treatment=rules->element_treatment; int distane_from_matching = (rules->distane_from_matching + 1); if(element_treatment != NULL && !strcasecmp(element_treatment, "mark")) { xmlNodePtr parent = node->parent; k++; while (parent != NULL) { if(k == distane_from_matching) { if (rules->scope == kScopeInside && match_start_indicator(parent, start_indicator) == 0) { break; } xmlNewProp(parent, (const xmlChar *)"need_filter", (const xmlChar *)"true"); *match =1; break; } k++; parent = parent->parent; } } if(element_treatment != NULL && !strcasecmp(element_treatment, "remove")) { xmlNodePtr parent = node->parent; k++; while (parent != NULL) { if(k == distane_from_matching) { if (rules->scope == kScopeInside && match_start_indicator(parent, start_indicator) == 0) { break; } /*This is the top floor, Don't deal with**/ if(parent->parent == NULL) { break; } if(*n_parent < 16) { parent_array[*n_parent] = parent; *n_parent = *n_parent+1; } *match =1; break; } k++; parent = parent->parent; } } return 0; } int cjson_dump_array(cJSON *a, int *depth, int *step, int *step_array_level, char **node, const struct edit_element_rule * rules, int *match_num, int loop) { int xret=0, array_cnt=0; cJSON *a_element = a->child; *step= *step + 1; for (; (a_element != NULL);) { xret = cjson_element_foreach(a_element, depth, step, step_array_level, node, rules, match_num, loop); if(xret == -1) { return -1; } if(*depth == 0) { construct_cjson_by_treatment(a_element, node, step, step_array_level, rules); } if(xret == 1) { *step = (*step >= 2047) ? 2047 : *step; step_array_level[*step] = array_cnt; *node = a_element->string; *depth = *depth -1; return 1; } array_cnt++; a_element = a_element->next; } return xret; } int cjson_dump_object(cJSON *a, int *depth, int *step, int *step_array_level, char **node, const struct edit_element_rule * rules, int *match_num, int loop) { int xret=0; cJSON *a_element=NULL; cJSON_ArrayForEach(a_element, a) { xret = cjson_element_foreach(a_element, depth, step, step_array_level, node, rules, match_num, loop); if(xret == -1) { return -1; } if(*depth == 0) { construct_cjson_by_treatment(a_element, node, step, step_array_level, rules); } if(xret == 1) { *node = a_element->string; *depth = *depth -1; return 1; } } return xret; } int cjson_dump_string(cJSON *a, int *depth, const struct edit_element_rule * rules, int *match_num, int loop) { int xret=0; if((a->valuestring != NULL) && strcasestr(a->valuestring, rules->contained_keyword)) { if(*depth != -1) { if(!strcasecmp(rules->element_treatment, "mark")) { if(*match_num == loop) { xret = 1; goto finish; } } else { xret = 1; goto finish; } } *match_num = *match_num + 1; } finish: return xret; } int cjson_element_foreach(cJSON *a, int *depth, int *step, int *step_array_level, char **node, const struct edit_element_rule * rules, int *match_num, int loop) { if ((a == NULL) || cJSON_IsInvalid(a)) { return -1; } switch (a->type & 0xFF) { case cJSON_String: case cJSON_Raw: return cjson_dump_string(a, depth, rules, match_num, loop); case cJSON_Array: return cjson_dump_array(a, depth, step, step_array_level, node, rules, match_num, loop); case cJSON_Object: return cjson_dump_object(a, depth, step, step_array_level, node, rules, match_num, loop); case cJSON_Number: case cJSON_False: case cJSON_True: case cJSON_NULL: return 0; default: return -1; } return 0; } static void html_namespace_list(xmlNsPtr ns) { while (ns != NULL) { ns = ns->next; } } static void html_attr_list(const struct edit_element_rule * rules, xmlAttrPtr attr, xmlNodePtr *parent_array, size_t *n_parent, int *match) { while (attr != NULL) { if (attr->children != NULL) { html_node_list(rules, attr->children, parent_array, n_parent, match); } attr = attr->next; } } static void html_dump_one_node(const struct edit_element_rule * rules, xmlNodePtr node, xmlNodePtr *parent_array, size_t *n_parent, int *match) { switch (node->type) { case XML_ELEMENT_NODE: case XML_ELEMENT_DECL: case XML_CDATA_SECTION_NODE: case XML_ENTITY_REF_NODE: case XML_ENTITY_NODE: case XML_PI_NODE: case XML_COMMENT_NODE: case XML_DOCUMENT_TYPE_NODE: case XML_DOCUMENT_FRAG_NODE: case XML_NOTATION_NODE: case XML_TEXT_NODE: break; default: return; } if ((node->type == XML_ELEMENT_NODE) && (node->nsDef != NULL)) { html_namespace_list(node->nsDef); } if ((node->type == XML_ELEMENT_NODE) && (node->properties != NULL)) { html_attr_list(rules, node->properties, parent_array, n_parent, match); } if (node->type != XML_ENTITY_REF_NODE) { if ((node->type != XML_ELEMENT_NODE) && (node->content != NULL)) { construct_html_by_treatment(rules, node, parent_array, n_parent, match); } } } static void html_dump_node(const struct edit_element_rule * rules, xmlNodePtr node, xmlNodePtr *parent_array, size_t *n_parent, int *match) { html_dump_one_node(rules, node, parent_array, n_parent, match); if ((node->type != XML_NAMESPACE_DECL) && (node->children != NULL) && (node->type != XML_ENTITY_REF_NODE)) { html_node_list(rules, node->children, parent_array, n_parent, match); } } static void html_node_list(const struct edit_element_rule * rules, xmlNodePtr node, xmlNodePtr *parent_array, size_t *n_parent, int *match) { while (node != NULL) { html_dump_node(rules, node, parent_array, n_parent, match); node = node->next; } } static void html_element_foreach(const struct edit_element_rule * rules, xmlDocPtr doc, xmlNodePtr *parent_array, size_t *n_parent, int *match) { if (((doc->type == XML_DOCUMENT_NODE) || (doc->type == XML_HTML_DOCUMENT_NODE)) && (doc->children != NULL)) { html_node_list(rules, doc->children, parent_array, n_parent, match); } } size_t parse_json_output_unformatted(const char * in, size_t in_sz, const struct edit_element_rule * rules, char** out) { int match_num_peer=0; int step=0, depth=0, match_num=0,i=0, match=0; cJSON* interator=NULL; char* new_out = NULL, *node=NULL; size_t outlen=0; char *element_treatment=NULL; int step_array_level[2048] = {0}; interator = cJSON_Parse(in); if(interator==NULL) { goto finish; } depth = -1; element_treatment=rules->element_treatment; cjson_element_foreach(interator, &depth, &step, step_array_level, &node, rules, &match_num, 0); match_num_peer = match_num; for(i=0; i< match_num_peer; i++) { depth = (rules->distane_from_matching + 1); step=0; node=NULL; match_num=0; memset(step_array_level, 0, sizeof(step_array_level)); match |= cjson_element_foreach(interator, &depth, &step, step_array_level, &node, rules, &match_num, i); if(!strcasecmp(element_treatment, "remove") && match == 1 && node != NULL && depth == 0) { cJSON_DeleteItemFromObject(interator, node); } } if(match==1 && element_treatment != NULL && !strcasecmp(element_treatment, "mark")) { if(interator->type==cJSON_Object) { cJSON_AddBoolToObject(interator, "need_check", true); } if(interator->type==cJSON_Array) { cJSON *child = interator->child; for (; (child != NULL);) { cJSON_AddBoolToObject(child, "need_check", true); child = child->next; } } } new_out = cJSON_PrintUnformatted(interator); if(new_out!=NULL) { *out = new_out; outlen = strlen(new_out); } finish: if(interator != NULL) cJSON_Delete(interator); return outlen; } size_t format_json_file_type(const char * in, size_t in_sz, const struct edit_element_rule * rules, char** out) { int match_num_peer=0; int step=0, depth=0, match=0, i=0; cJSON* interator=NULL; char* new_out = NULL, *node=NULL; size_t outlen=0; int match_num=0; char *element_treatment=NULL; int step_array_level[2048] = {0}; char*new_in = ALLOC(char, in_sz+1); memcpy(new_in, in, in_sz); interator = cJSON_Parse(new_in); if(interator==NULL) { goto finish; } depth = -1; element_treatment=rules->element_treatment; /*When the node has inclusion relation, cjson is not null when deleted So multiple loops delete **/ cjson_element_foreach(interator, &depth, &step, step_array_level, &node, rules, &match_num, 0); match_num_peer = match_num; for(i=0; i< match_num_peer; i++) { depth = (rules->distane_from_matching + 1); step=0; node=NULL; match_num=0; memset(step_array_level, 0, sizeof(step_array_level)); match |= cjson_element_foreach(interator, &depth, &step, step_array_level, &node, rules, &match_num, i); if(!strcasecmp(element_treatment, "remove") && match == 1 && node != NULL && depth == 0) { cJSON_DeleteItemFromObject(interator, node); } match_num--; } if(match == 0) { goto finish; } if(element_treatment != NULL && !strcasecmp(element_treatment, "mark")) { if(interator->type==cJSON_Object) { cJSON_AddBoolToObject(interator, "need_check", true); } if(interator->type==cJSON_Array) { cJSON *child = interator->child; for (; (child != NULL);) { cJSON_AddBoolToObject(child, "need_check", true); child = child->next; } } } new_out = cJSON_PrintUnformatted(interator); if(new_out!=NULL) { *out = new_out; outlen = strlen(*out); } finish: if(interator != NULL) cJSON_Delete(interator); FREE(&new_in); return outlen; } size_t format_multidelete_json_type(const char * in, size_t in_sz, const struct edit_element_rule * rules, char** out) { char *new_out=NULL, *pre_out=NULL; char * tmp = ALLOC(char, in_sz+1); char * token = NULL, * sub_token = NULL, * saveptr = NULL; size_t output_size = 0; size_t new_out_len=0; /**Follow-up optimization */ new_out = ALLOC(char, in_sz+in_sz/3); memcpy(tmp, in, in_sz); for (token = tmp;; token = NULL) { sub_token = strtok_r(token, "\n", &saveptr); if (sub_token == NULL) { new_out[new_out_len-2]='\0'; break; } output_size = parse_json_output_unformatted(sub_token, strlen(sub_token), rules, &pre_out); if(output_size>0 && pre_out!=NULL) { memcpy(new_out+new_out_len, pre_out, strlen(pre_out)); new_out_len += strlen(pre_out); memcpy(new_out+new_out_len, "\r\n", 2); new_out_len +=2; FREE(&pre_out); } } if(new_out) { *out = new_out; output_size = strlen(new_out); } free(tmp); tmp = NULL; return output_size; } size_t construct_format_html(htmlDocPtr doc, char**out) { size_t outlen=0; xmlBufferPtr out_buffer; const xmlChar *xmlCharBuffer; xmlSaveCtxtPtr saveCtxtPtr; out_buffer = xmlBufferCreate(); if (out_buffer == NULL) { goto finish; } saveCtxtPtr = xmlSaveToBuffer(out_buffer, "UTF-8", XML_SAVE_NO_DECL | XML_SAVE_AS_HTML); if (xmlSaveDoc(saveCtxtPtr, doc) < 0) { goto finish; } xmlSaveClose(saveCtxtPtr); xmlCharBuffer = xmlBufferContent(out_buffer); if(xmlCharBuffer != NULL) { char*new_out = ALLOC(char, strlen((char *)xmlCharBuffer)+1); memcpy(new_out, (char *)xmlCharBuffer, strlen((char *)xmlCharBuffer)); *out = new_out; outlen = strlen((char *)xmlCharBuffer); } finish: if(out_buffer != NULL) { xmlBufferFree(out_buffer); } return outlen; } size_t format_input_html(const char * in, size_t in_sz, const struct edit_element_rule * rules, char** out) { size_t outlen=0, n_parent=0, n_parent_peer=0; int match=0, i=0; htmlDocPtr doc = NULL; const char *element_treatment=NULL; xmlNodePtr parent_array[16]; int options = XML_PARSE_NOERROR | HTML_PARSE_NODEFDTD; doc = htmlReadMemory(in, in_sz, NULL, NULL, options); if (doc == NULL) { goto finish; } /*When the node has inclusion relation, libxml2 is not null when deleted So multiple loops delete **/ html_element_foreach(rules, doc, parent_array, &n_parent, &match); if(match != 1) { goto finish; } n_parent_peer = n_parent; element_treatment=rules->element_treatment; if(element_treatment != NULL && !strcasecmp(element_treatment, "remove")) { for(i=0; i < (int)n_parent_peer; i++) { match =0; n_parent = 0; html_element_foreach(rules, doc, parent_array, &n_parent, &match); if(match == 1) { xmlUnlinkNode(parent_array[0]); xmlFreeNode(parent_array[0]); } } } if(element_treatment != NULL && !strcasecmp(element_treatment, "mark")) { if(doc->children != NULL && doc->children->next != NULL) { xmlNewProp(doc->children->next, (const xmlChar *)"need_check", (const xmlChar *)"true"); } else if(doc->children != NULL) { xmlNewProp(doc->children, (const xmlChar *)"need_check", (const xmlChar *)"true"); } } outlen = construct_format_html(doc, out); if(outlen<=0) { outlen=0; } finish: if(doc!=NULL) { xmlFreeDoc(doc); } return outlen; } size_t format_html_file_type(const char * interator, size_t interator_sz, const struct edit_element_rule *rule, char **new_out) { size_t output_size=0; if(interator[0] == '{') { output_size = format_multidelete_json_type(interator, interator_sz, rule, new_out); } else { output_size = format_input_html(interator, interator_sz, rule, new_out); } return output_size; } size_t parse_string(const char * interator, size_t interator_sz, const struct edit_element_rule *rule, char **new_out, int options) { size_t output_size=0; if(options) { output_size = format_json_file_type(interator, interator_sz, rule, new_out); } else { output_size = format_html_file_type(interator, interator_sz, rule, new_out); } return output_size; } size_t execute_edit_element_rule(const char * in, size_t in_sz, const struct edit_element_rule *rules, size_t n_rule, char** out, int options) { const struct edit_element_rule * todo = rules; size_t i = 0, interator_sz=0, pre_out_sz=0; const char * interator = NULL; char* new_out = NULL, * pre_out = NULL; size_t output_size=0; if (in_sz == 0 || in==NULL) { return 0; } interator = in; interator_sz = in_sz; for (i = 0; i < n_rule; i++) { output_size = parse_string(interator, interator_sz, &(todo[i]), &new_out, options); if (output_size == 0) { continue; } if (pre_out != NULL) { free(pre_out); pre_out = NULL; } pre_out = new_out; pre_out_sz = output_size; interator = new_out; interator_sz = output_size; new_out=NULL; output_size=0; } if(pre_out_sz>0) { *out=pre_out; return pre_out_sz; } else { return 0; } } size_t __attribute__((__unused__)) format_edit_element_rule(struct edit_element_rule *edit_element, const char *user_region, size_t n_edit_element) { size_t idx=0; cJSON *json=NULL, *rules=NULL, *item=NULL, *sub_item=NULL; json=cJSON_Parse(user_region); if(json !=NULL ) { rules = cJSON_GetObjectItem(json, "rules"); if(rules == NULL) { goto finish; } idx = 0; for (item = rules->child; item != NULL; item = item->next) { sub_item=cJSON_GetObjectItem(item,"anchor_element"); if(sub_item != NULL && sub_item->type ==cJSON_Object) { char * search_scope = cJSON_GetObjectItem(sub_item , "search_scope")->valuestring; if (search_scope == NULL) break; edit_element[idx].scope = scope_name_to_id(search_scope); if (edit_element[idx].scope == KScopeMax) { break; } if(edit_element[idx].scope == kScopeInside) { edit_element[idx].start_indicator = tfe_strdup(cJSON_GetObjectItem(sub_item , "start_indicator")->valuestring); } edit_element[idx].contained_keyword = tfe_strdup(cJSON_GetObjectItem(sub_item,"contained_keyword")->valuestring); } sub_item=cJSON_GetObjectItem(item,"target_element"); if(sub_item != NULL && sub_item->type ==cJSON_Object) { edit_element[idx].distane_from_matching = cJSON_GetObjectItem(sub_item , "target_distance_from_matching")->valueint; edit_element[idx].element_treatment = tfe_strdup(cJSON_GetObjectItem(sub_item,"element_treatment")->valuestring); } if (idx == n_edit_element) { break; } idx++; } } finish: if (json) cJSON_Delete(json); return idx; } void simple_edit_element(const char *user_region, const char* input, size_t in_sz, char** output, size_t *output_sz, int options) { size_t n_got_rule=0, i=0; struct edit_element_rule rules[16]; memset(rules, 0, sizeof(struct edit_element_rule)*16); n_got_rule=format_edit_element_rule(rules, user_region, sizeof(rules)/sizeof(rules[0])); *output_sz=execute_edit_element_rule(input, strlen(input), rules, n_got_rule, output, options); for(i=0; i