From b8f5a1ae0ffbb9a224ebf82fe2c81f40ab96ef0f Mon Sep 17 00:00:00 2001 From: zhengchao Date: Thu, 27 Sep 2018 19:06:57 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BD=BF=E7=94=A8pcre2=E6=9B=BF=E4=BB=A3glib?= =?UTF-8?q?=E4=BB=8E=E4=B8=AD=E7=9A=84regex=E8=BF=9B=E8=A1=8C=E6=9B=BF?= =?UTF-8?q?=E6=8D=A2=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- platform/CMakeLists.txt | 2 +- plugin/business/pangu-http/CMakeLists.txt | 8 +- plugin/business/pangu-http/pangu_http.cpp | 263 +------------ .../business/pangu-http/pattern_replace.cpp | 212 +++++++++++ plugin/business/pangu-http/pattern_replace.h | 26 ++ .../pangu-http/test_data/facebook_index.html | 69 ++++ .../test_data/google_search_gtest_cnblog.html | 353 ++++++++++++++++++ .../pangu-http/test_pattern_replace.cpp | 45 +++ vendor/CMakeLists.txt | 14 + 9 files changed, 731 insertions(+), 261 deletions(-) create mode 100644 plugin/business/pangu-http/pattern_replace.cpp create mode 100644 plugin/business/pangu-http/pattern_replace.h create mode 100644 plugin/business/pangu-http/test_data/facebook_index.html create mode 100644 plugin/business/pangu-http/test_data/google_search_gtest_cnblog.html create mode 100644 plugin/business/pangu-http/test_pattern_replace.cpp diff --git a/platform/CMakeLists.txt b/platform/CMakeLists.txt index 8bb50b2..98c61bf 100644 --- a/platform/CMakeLists.txt +++ b/platform/CMakeLists.txt @@ -7,7 +7,7 @@ target_link_libraries(tfe common) target_link_libraries(tfe pthread dl openssl-ssl-static openssl-crypto-static - pthread libevent-static + libevent-static libevent-static-openssl libevent-static-pthreads MESA_handle_logger diff --git a/plugin/business/pangu-http/CMakeLists.txt b/plugin/business/pangu-http/CMakeLists.txt index f064e3e..e0f6758 100644 --- a/plugin/business/pangu-http/CMakeLists.txt +++ b/plugin/business/pangu-http/CMakeLists.txt @@ -1,4 +1,8 @@ -add_library(pangu-http pangu_logger.cpp pangu_http.cpp) +add_library(pangu-http pangu_logger.cpp pangu_http.cpp pattern_replace.cpp) target_link_libraries(pangu-http common http) -target_link_libraries(pangu-http librdkafka-static ctemplate-static cjson) +target_link_libraries(pangu-http librdkafka-static ctemplate-static cjson pcre2-static) target_link_libraries(pangu-http maatframe) + +add_executable(test_pattern_replace test_pattern_replace.cpp pattern_replace.cpp) +target_link_libraries(test_pattern_replace common libevent-static gtest pcre2-static) +file(COPY test_data DESTINATION ./) diff --git a/plugin/business/pangu-http/pangu_http.cpp b/plugin/business/pangu-http/pangu_http.cpp index cbcb11f..f17b08b 100644 --- a/plugin/business/pangu-http/pangu_http.cpp +++ b/plugin/business/pangu-http/pangu_http.cpp @@ -1,4 +1,5 @@ #include "pangu_logger.h" +#include "pattern_replace.h" #include #include @@ -19,11 +20,10 @@ #include #include #include -#include + #define MAX_SCAN_RESULT 16 #define MAX_EDIT_ZONE_NUM 64 -#define MAX_EDIT_MATCHES 16 enum pangu_action//Bigger action number is prior. { @@ -221,21 +221,7 @@ static void _wrap_non_std_field_write(struct tfe_http_half * half, const char* f return; } #endif -enum replace_zone -{ - kZoneRequestUri = 0, - kZoneRequestHeaders, - kZoneRequestBody, - kZoneResponseHeader, - kZoneResponseBody, - kZoneMax -}; -struct replace_rule -{ - enum replace_zone zone; - char * find; - char * replace_with; -}; + struct replace_ctx { struct replace_rule * rule; @@ -365,10 +351,10 @@ static enum pangu_action decide_ctrl_action(const struct Maat_rule_t * hit_rules return prior_action; } -//https://github.com/AndiDittrich/HttpErrorPages +//HTML template is downloaded from https://github.com/AndiDittrich/HttpErrorPages static void html_generate(int cfg_id, int status_code, char ** page_buff, size_t * page_size) { - ctemplate::TemplateDictionary dict("pg_page_dict"); + ctemplate::TemplateDictionary dict("pg_page_dict"); //dict is automatically finalized after function returned. dict.SetIntValue("cfg_id", cfg_id); std::string output; ctemplate::Template * tpl = NULL; @@ -385,7 +371,6 @@ static void html_generate(int cfg_id, int status_code, char ** page_buff, size_t } tpl->Expand(&output, &dict); - //todo: do I need to delete dict? *page_size = output.length(); *page_buff = ALLOC(char, *page_size); memcpy(*page_buff, output.c_str(), *page_size); @@ -409,244 +394,6 @@ static int is_http_request(enum tfe_http_event events) } } -enum replace_zone zone_name_to_id(const char * name) -{ - const char * std_name[] = {"http_req_uri", - "http_req_header", - "http_req_body", - "http_resp_header", - "http_resp_body", - "http_resp_body"}; - size_t i = 0; - for (i = 0; i < sizeof(std_name) / sizeof(const char *); i++) - { - if (0 == strcasecmp(name, std_name[i])) - { - break; - } - } - return (enum replace_zone) i; -} -static char * strchr_esc(char * s, const char delim) -{ - char * token; - if (s == NULL) - return NULL; - for (token = s; *token != '\0'; token++) - { - if (*token == '\\') - { - token++; - continue; - } - if (*token == delim) - break; - } - if (*token == '\0') - { - return NULL; - } - else - { - return token; - } -} -static char * strtok_r_esc(char * s, const char delim, char ** save_ptr) -{ - char * token; - - if (s == NULL) s = *save_ptr; - - /* Scan leading delimiters. */ - token = strchr_esc(s, delim); - if (token == NULL) - { - *save_ptr = token; - return s; - } - /* Find the end of the token. */ - *token = '\0'; - token++; - *save_ptr = token; - - return s; -} - -size_t format_replace_rule(const char * exec_para, struct replace_rule * replace, size_t n_replace) -{ - char * tmp = ALLOC(char, strlen(exec_para) + 1); - char * token = NULL, * sub_token = NULL, * saveptr = NULL, * saveptr2 = NULL; - size_t idx = 0; - - const char * str_zone = "zone="; - const char * str_subs = "substitute="; - memcpy(tmp, exec_para, strlen(exec_para)); - - for (token = tmp;; token = NULL) - { - sub_token = strtok_r(token, ";", &saveptr); - if (sub_token == NULL) break; - - if (0 == strncasecmp(sub_token, str_zone, strlen(str_zone))) - { - replace[idx].zone = zone_name_to_id(sub_token + strlen(str_zone)); - if (replace[idx].zone == kZoneMax) - { - break; - } - } - - sub_token = strtok_r(NULL, ";", &saveptr); - if (0 == strncasecmp(sub_token, str_subs, strlen(str_subs))) - { - sub_token += strlen(str_subs) + 1; - replace[idx].find = tfe_strdup(strtok_r_esc(sub_token, '/', &saveptr2)); - replace[idx].replace_with = tfe_strdup(strtok_r_esc(NULL, '/', &saveptr2)); - - idx++; - if (idx == n_replace) - { - break; - } - } - } - - free(tmp); - tmp = NULL; - return idx; -} - -size_t select_replace_rule(enum replace_zone zone, const struct replace_rule * replace, size_t n_replace, - const struct replace_rule ** selected, size_t n_selected) -{ - size_t i = 0, j = 0; - for (i = 0; i < n_replace && j < n_selected; i++) - { - if (replace[i].zone == zone) - { - selected[j] = replace + i; - j++; - } - } - return j; -} - -static struct evbuffer * replace_string(const char * in, const struct replace_rule * zone) -{ - //Reference to https://www.lemoda.net/c/unix-regex/ - // Regular Expression test: https://regex101.com/ - regex_t reg; - int status = 0, is_replaced = 0; - struct evbuffer * out = NULL; - size_t in_sz = strlen(in); - - size_t replace_len = strlen(zone->replace_with); - - assert(strlen(zone->find) != 0); - status = regcomp(®, zone->find, REG_EXTENDED | REG_NEWLINE); - if (status != 0) - { - char error_message[TFE_STRING_MAX]; - regerror(status, ®, error_message, sizeof(error_message)); - TFE_LOG_ERROR(g_pangu_rt->local_logger, "Regex error compiling '%s': %s\n", - zone->find, error_message); - regfree(®); - return NULL; - } - - /* "p" is a pointer into the string which points to the end of the previous match. */ - const char * p = in; - /* "pre_sub_expr_end" is a pointer into the string which points to the end of the previous sub expression match. */ - const char * pre_sub_expr_end = NULL; - - /* "N_matches" is the maximum number of matches allowed. */ - const int n_matches = MAX_EDIT_MATCHES; - /* "M" contains the matches found. */ - regmatch_t m[n_matches]; - int i = 0; - - while (1) - { - int nomatch = regexec(®, p, n_matches, m, 0); - if (nomatch) - { - break; - } - if (is_replaced == 0) - { - out = evbuffer_new(); - is_replaced = 1; - } - assert(m[0].rm_so != -1); - pre_sub_expr_end = p; - if (m[1].rm_so == -1)//no sub expr, replace the entire expr. - { - evbuffer_add(out, pre_sub_expr_end, m[0].rm_so - (pre_sub_expr_end - p)); - evbuffer_add(out, zone->replace_with, replace_len); - pre_sub_expr_end = p + m[0].rm_eo; - } - else //have sub expr, replace the sub expr. - { - for (i = 1, pre_sub_expr_end = p; i < n_matches; i++) - { - if (m[i].rm_so == -1) - { - break; - } - evbuffer_add(out, pre_sub_expr_end, m[i].rm_so - (pre_sub_expr_end - p)); - evbuffer_add(out, zone->replace_with, replace_len); - pre_sub_expr_end = p + m[i].rm_eo; - } - } - p += m[0].rm_eo; - } - - if (is_replaced) - { - evbuffer_add(out, pre_sub_expr_end, in_sz - (pre_sub_expr_end - p)); - } - - regfree(®); - return out; -} - -struct evbuffer * execute_replace_rule(const char * in, size_t in_sz, - enum replace_zone zone, const struct replace_rule * rules, size_t n_rule) -{ - const struct replace_rule * todo[MAX_EDIT_ZONE_NUM]; - size_t n_todo = 0, i = 0; - struct evbuffer * out = NULL; - const char * interator = NULL; - struct evbuffer * new_out = NULL, * pre_out = NULL; - if (in == 0) - { - return NULL; - } - //Do not process buffer that contains '\0'. - if (0 != memchr(in, '\0', in_sz)) - { - return NULL; - } - n_todo = select_replace_rule(zone, rules, n_rule, todo, MAX_EDIT_ZONE_NUM); - interator = in; - for (i = 0; i < n_todo; i++) - { - new_out = replace_string(interator, todo[i]); - if (new_out != NULL) - { - pre_out = out; - out = new_out; - interator = (char *) evbuffer_pullup(out, -1); - - if (pre_out != NULL) - { - evbuffer_free(pre_out); - pre_out = NULL; - } - } - } - return out; -} void http_replace(const struct tfe_stream * stream, const struct tfe_http_session * session, enum tfe_http_event events, const unsigned char * body_frag, size_t frag_size, struct pangu_http_ctx * ctx) diff --git a/plugin/business/pangu-http/pattern_replace.cpp b/plugin/business/pangu-http/pattern_replace.cpp new file mode 100644 index 0000000..66d93ab --- /dev/null +++ b/plugin/business/pangu-http/pattern_replace.cpp @@ -0,0 +1,212 @@ +#include "pattern_replace.h" + +#include + +#define PCRE2_CODE_UNIT_WIDTH 8 +#include + +#include +#include +#include +#include + +#define MAX_EDIT_MATCHES 16 + +enum replace_zone zone_name_to_id(const char * name) +{ + const char * std_name[] = {"http_req_uri", + "http_req_header", + "http_req_body", + "http_resp_header", + "http_resp_body", + "http_resp_body"}; + size_t i = 0; + for (i = 0; i < sizeof(std_name) / sizeof(const char *); i++) + { + if (0 == strcasecmp(name, std_name[i])) + { + break; + } + } + return (enum replace_zone) i; +} +static char * strchr_esc(char * s, const char delim) +{ + char * token; + if (s == NULL) + return NULL; + for (token = s; *token != '\0'; token++) + { + if (*token == '\\') + { + token++; + continue; + } + if (*token == delim) + break; + } + if (*token == '\0') + { + return NULL; + } + else + { + return token; + } +} +static char * strtok_r_esc(char * s, const char delim, char ** save_ptr) +{ + char * token; + + if (s == NULL) s = *save_ptr; + + /* Scan leading delimiters. */ + token = strchr_esc(s, delim); + if (token == NULL) + { + *save_ptr = token; + return s; + } + /* Find the end of the token. */ + *token = '\0'; + token++; + *save_ptr = token; + + return s; +} + + +size_t format_replace_rule(const char * exec_para, struct replace_rule * replace, size_t n_replace) +{ + char * tmp = ALLOC(char, strlen(exec_para) + 1); + char * token = NULL, * sub_token = NULL, * saveptr = NULL, * saveptr2 = NULL; + size_t idx = 0; + + const char * str_zone = "zone="; + const char * str_subs = "substitute="; + memcpy(tmp, exec_para, strlen(exec_para)); + + for (token = tmp;; token = NULL) + { + sub_token = strtok_r(token, ";", &saveptr); + if (sub_token == NULL) break; + + if (0 == strncasecmp(sub_token, str_zone, strlen(str_zone))) + { + replace[idx].zone = zone_name_to_id(sub_token + strlen(str_zone)); + if (replace[idx].zone == kZoneMax) + { + break; + } + } + + sub_token = strtok_r(NULL, ";", &saveptr); + if (0 == strncasecmp(sub_token, str_subs, strlen(str_subs))) + { + sub_token += strlen(str_subs) + 1; + replace[idx].find = tfe_strdup(strtok_r_esc(sub_token, '/', &saveptr2)); + replace[idx].replace_with = tfe_strdup(strtok_r_esc(NULL, '/', &saveptr2)); + + idx++; + if (idx == n_replace) + { + break; + } + } + } + + free(tmp); + tmp = NULL; + return idx; +} + +size_t select_replace_rule(enum replace_zone zone, const struct replace_rule * replace, size_t n_replace, + const struct replace_rule ** selected, size_t n_selected) +{ + size_t i = 0, j = 0; + for (i = 0; i < n_replace && j < n_selected; i++) + { + if (replace[i].zone == zone) + { + selected[j] = replace + i; + j++; + } + } + return j; +} + +static struct evbuffer * replace_string(const char * in, size_t in_sz, const struct replace_rule * zone) +{ + + int status = 0, is_replaced = 0; + struct evbuffer * out = NULL; + + size_t replace_len = strlen(zone->replace_with); + + assert(strlen(zone->find) != 0); + + int error; + PCRE2_SIZE erroffset; + + const PCRE2_SPTR pattern = (PCRE2_SPTR)zone->find; + const PCRE2_SPTR subject = (PCRE2_SPTR)in; + const PCRE2_SPTR replacement = (PCRE2_SPTR)zone->replace_with; + + pcre2_code *re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0, &error, &erroffset, 0); + if (re == 0) + return NULL; + + pcre2_jit_compile(re, PCRE2_JIT_COMPLETE); + + PCRE2_SIZE outlen = in_sz*2; + PCRE2_UCHAR* output = (PCRE2_UCHAR*)malloc(sizeof(PCRE2_UCHAR)*outlen); + + int rc = pcre2_substitute(re, subject, in_sz, 0, PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED, 0, 0, replacement, PCRE2_ZERO_TERMINATED, output, &outlen); + if (rc >= 0) + printf("%s\n", output); + + pcre2_code_free(re); + free(output); + return NULL; +} + +struct evbuffer * execute_replace_rule(const char * in, size_t in_sz, + enum replace_zone zone, const struct replace_rule * rules, size_t n_rule) +{ + const struct replace_rule * todo[n_rule]; + size_t n_todo = 0, i = 0, interator_sz=0; + struct evbuffer * out = NULL; + const char * interator = NULL; + struct evbuffer * new_out = NULL, * pre_out = NULL; + if (in == 0) + { + return NULL; + } + //Do not process buffer that contains '\0'. + if (0 != memchr(in, '\0', in_sz)) + { + return NULL; + } + n_todo = select_replace_rule(zone, rules, n_rule, todo, n_rule); + interator = in; + interator_sz = in_sz; + for (i = 0; i < n_todo; i++) + { + new_out = replace_string(interator, interator_sz, todo[i]); + if (new_out != NULL) + { + pre_out = out; + out = new_out; + interator = (char *) evbuffer_pullup(out, -1); + interator_sz = evbuffer_get_length(out); + if (pre_out != NULL) + { + evbuffer_free(pre_out); + pre_out = NULL; + } + } + } + return out; +} + + diff --git a/plugin/business/pangu-http/pattern_replace.h b/plugin/business/pangu-http/pattern_replace.h new file mode 100644 index 0000000..716c6d2 --- /dev/null +++ b/plugin/business/pangu-http/pattern_replace.h @@ -0,0 +1,26 @@ +#pragma once + +#include +enum replace_zone +{ + kZoneRequestUri = 0, + kZoneRequestHeaders, + kZoneRequestBody, + kZoneResponseHeader, + kZoneResponseBody, + kZoneMax +}; +struct replace_rule +{ + enum replace_zone zone; + char * find; + char * replace_with; +}; + + +//@parm exec_para example input: +//zone=http_req_body; substitute=/中華民國/中华人民共和国;zone=http_resp_header; substitute=/Content-Type:\btext\/html/Content-Type:\bvideo\/mp4 +//@return formated rule number. +size_t format_replace_rule(const char * exec_para, struct replace_rule * replace, size_t n_replace); +struct evbuffer * execute_replace_rule(const char * in, size_t in_sz, enum replace_zone zone, const struct replace_rule * rules, size_t n_rule); + diff --git a/plugin/business/pangu-http/test_data/facebook_index.html b/plugin/business/pangu-http/test_data/facebook_index.html new file mode 100644 index 0000000..65919b8 --- /dev/null +++ b/plugin/business/pangu-http/test_data/facebook_index.html @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + + + + +Facebook
+ + + + + +
+ + + +
+ + + + + + + + + +
+ + + +
+ + + +
+ + + + + + + + + +
+ + + +
+ + + + + + diff --git a/plugin/business/pangu-http/test_data/google_search_gtest_cnblog.html b/plugin/business/pangu-http/test_data/google_search_gtest_cnblog.html new file mode 100644 index 0000000..917949e --- /dev/null +++ b/plugin/business/pangu-http/test_data/google_search_gtest_cnblog.html @@ -0,0 +1,353 @@ + + +gtest cnblog - Google Search

Accessibility links

Skip to main contentAccessibility help
Accessibility feedback
About 2,880,000 results (0.31 seconds) 

Searches related to gtest cnblog

Footer links

Russia
Russia - From your Internet address - Use precise location - Learn more
\ No newline at end of file diff --git a/plugin/business/pangu-http/test_pattern_replace.cpp b/plugin/business/pangu-http/test_pattern_replace.cpp new file mode 100644 index 0000000..3465997 --- /dev/null +++ b/plugin/business/pangu-http/test_pattern_replace.cpp @@ -0,0 +1,45 @@ +#include "pattern_replace.h" + +#include //fstat +#include +#include +#include +#include +TEST(PatternReplace, Pure) +{ + const char* filename="./test_data/facebook_index.html"; + const char* exec_para="zone=http_resp_body;substitute=/添加手机号/Mobile\bPhone"; + size_t n_got_rule=0; + struct replace_rule rules[16]; + n_got_rule=format_replace_rule(exec_para, rules, sizeof(rules)/sizeof(rules[0])); + EXPECT_EQ(n_got_rule, 1); + + FILE* fp=NULL; + struct stat file_info; + stat(filename, &file_info); + size_t file_size=file_info.st_size; + + fp=fopen(filename,"r"); + ASSERT_FALSE(fp==NULL); + if(fp==NULL) + { + return; + } + char* file_buff=(char*)malloc(file_size); + fread(file_buff,1,file_size,fp); + fclose(fp); + + struct evbuffer* output=NULL; + output=execute_replace_rule(file_buff, file_size, kZoneResponseBody, rules,n_got_rule); + EXPECT_FALSE(output==NULL); + return; +} + + +int main(int argc, char ** argv) +{ + + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/vendor/CMakeLists.txt b/vendor/CMakeLists.txt index ce03ca9..70c6662 100644 --- a/vendor/CMakeLists.txt +++ b/vendor/CMakeLists.txt @@ -193,3 +193,17 @@ add_library(ctemplate-static STATIC IMPORTED GLOBAL) add_dependencies(ctemplate-static ctemplate) set_property(TARGET ctemplate-static PROPERTY IMPORTED_LOCATION ${INSTALL_DIR}/lib/libctemplate.a) set_property(TARGET ctemplate-static PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include) + +### pcre2 +ExternalProject_Add(pcre2 PREFIX pcre2 + URL ${CMAKE_CURRENT_SOURCE_DIR}/pcre2-10.32.tar.gz + URL_MD5 a660db882ff171e6a0de5fb1decd5ff5 + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}) + +ExternalProject_Get_Property(pcre2 INSTALL_DIR) +file(MAKE_DIRECTORY ${INSTALL_DIR}/include) + +add_library(pcre2-static STATIC IMPORTED GLOBAL) +add_dependencies(pcre2-static ctemplate) +set_property(TARGET pcre2-static PROPERTY IMPORTED_LOCATION ${INSTALL_DIR}/lib/libpcre2-8.a) +set_property(TARGET pcre2-static PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)