使用pcre2替代glibc中的regex进行替换。

This commit is contained in:
zhengchao
2018-09-27 19:06:57 +08:00
parent 20c55b74d8
commit 5e6dde5e51
9 changed files with 731 additions and 261 deletions

View File

@@ -7,7 +7,7 @@ target_link_libraries(tfe common)
target_link_libraries(tfe pthread dl
openssl-ssl-static
openssl-crypto-static
pthread libevent-static
libevent-static
libevent-static-openssl
libevent-static-pthreads
MESA_handle_logger

View File

@@ -1,4 +1,8 @@
add_library(pangu-http pangu_logger.cpp pangu_http.cpp)
add_library(pangu-http pangu_logger.cpp pangu_http.cpp pattern_replace.cpp)
target_link_libraries(pangu-http common http)
target_link_libraries(pangu-http librdkafka-static ctemplate-static cjson)
target_link_libraries(pangu-http librdkafka-static ctemplate-static cjson pcre2-static)
target_link_libraries(pangu-http maatframe)
add_executable(test_pattern_replace test_pattern_replace.cpp pattern_replace.cpp)
target_link_libraries(test_pattern_replace common libevent-static gtest pcre2-static)
file(COPY test_data DESTINATION ./)

View File

@@ -1,4 +1,5 @@
#include "pangu_logger.h"
#include "pattern_replace.h"
#include <tfe_stream.h>
#include <tfe_utils.h>
@@ -19,11 +20,10 @@
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <regex.h>
#define MAX_SCAN_RESULT 16
#define MAX_EDIT_ZONE_NUM 64
#define MAX_EDIT_MATCHES 16
enum pangu_action//Bigger action number is prior.
{
@@ -221,21 +221,7 @@ static void _wrap_non_std_field_write(struct tfe_http_half * half, const char* f
return;
}
#endif
// Part of an HTTP transaction that a replace rule targets.
// zone_name_to_id() converts a textual zone name to one of these values by
// casting the index of the matching name, so the order here has to follow
// the order of its name table.
enum replace_zone
{
kZoneRequestUri = 0, // "http_req_uri"
kZoneRequestHeaders, // "http_req_header"
kZoneRequestBody, // "http_req_body"
kZoneResponseHeader, // "http_resp_header"
kZoneResponseBody, // "http_resp_body"
kZoneMax // sentinel: format_replace_rule() stops parsing when a zone equals this
};
// One find/replace instruction, filled in by format_replace_rule().
struct replace_rule
{
enum replace_zone zone; // zone the rule applies to; compared by select_replace_rule()
char * find; // pattern text, compiled as a regular expression by replace_string(); copy made with tfe_strdup()
char * replace_with; // replacement text; copy made with tfe_strdup()
};
struct replace_ctx
{
struct replace_rule * rule;
@@ -365,10 +351,10 @@ static enum pangu_action decide_ctrl_action(const struct Maat_rule_t * hit_rules
return prior_action;
}
//https://github.com/AndiDittrich/HttpErrorPages
//HTML template is downloaded from https://github.com/AndiDittrich/HttpErrorPages
static void html_generate(int cfg_id, int status_code, char ** page_buff, size_t * page_size)
{
ctemplate::TemplateDictionary dict("pg_page_dict");
ctemplate::TemplateDictionary dict("pg_page_dict"); //dict is automatically finalized after function returned.
dict.SetIntValue("cfg_id", cfg_id);
std::string output;
ctemplate::Template * tpl = NULL;
@@ -385,7 +371,6 @@ static void html_generate(int cfg_id, int status_code, char ** page_buff, size_t
}
tpl->Expand(&output, &dict);
//todo: do I need to delete dict?
*page_size = output.length();
*page_buff = ALLOC(char, *page_size);
memcpy(*page_buff, output.c_str(), *page_size);
@@ -409,244 +394,6 @@ static int is_http_request(enum tfe_http_event events)
}
}
enum replace_zone zone_name_to_id(const char * name)
{
const char * std_name[] = {"http_req_uri",
"http_req_header",
"http_req_body",
"http_resp_header",
"http_resp_body",
"http_resp_body"};
size_t i = 0;
for (i = 0; i < sizeof(std_name) / sizeof(const char *); i++)
{
if (0 == strcasecmp(name, std_name[i]))
{
break;
}
}
return (enum replace_zone) i;
}
/*
 * Locate the first occurrence of `delim` in `s` that is not preceded by a
 * backslash. A backslash always consumes the character that follows it.
 *
 * @param s      NUL-terminated string to scan; may be NULL.
 * @param delim  delimiter to look for.
 * @return pointer to the first unescaped delimiter, or NULL when `s` is NULL
 *         or no unescaped delimiter exists.
 */
static char * strchr_esc(char * s, const char delim)
{
	if (s == NULL)
	{
		return NULL;
	}
	for (char * cur = s; *cur != '\0'; cur++)
	{
		if (*cur == '\\')
		{
			/* A backslash at the very end escapes nothing; stop here so the
			 * scan never steps over the terminating NUL. */
			if (cur[1] == '\0')
			{
				return NULL;
			}
			cur++; /* skip the escaped character */
			continue;
		}
		if (*cur == delim)
		{
			return cur;
		}
	}
	return NULL;
}
/*
 * Escape-aware variant of strtok_r() for a single delimiter character.
 *
 * Scanning starts at `s`, or at `*save_ptr` when `s` is NULL. The first
 * unescaped delimiter (as found by strchr_esc) is overwritten with NUL and
 * `*save_ptr` is left pointing just past it; when there is no such delimiter
 * `*save_ptr` becomes NULL. The start of the scanned region is returned in
 * both cases, so the result is NULL only when the starting pointer was NULL.
 */
static char * strtok_r_esc(char * s, const char delim, char ** save_ptr)
{
	char * const begin = (s != NULL) ? s : *save_ptr;
	char * const cut = strchr_esc(begin, delim);

	if (cut != NULL)
	{
		*cut = '\0';          /* terminate the current token */
		*save_ptr = cut + 1;  /* resume after the delimiter next time */
	}
	else
	{
		*save_ptr = NULL;     /* nothing left for a later call */
	}
	return begin;
}
/*
 * Parse an action parameter string into replace rules.
 *
 * Expected layout: "zone=<name>;substitute=/<find>/<replace>" repeated and
 * joined with ';'. Inside <find>/<replace> a '/' is written as "\/".
 * Leading blanks in front of "zone=" / "substitute=" are tolerated.
 *
 * @param exec_para  NUL-terminated parameter string.
 * @param replace    output array receiving the parsed rules; `find` and
 *                   `replace_with` are copies made with tfe_strdup().
 * @param n_replace  capacity of `replace`.
 * @return number of rules written to `replace` (0..n_replace).
 *
 * Parsing stops at the first unknown zone name or when input runs out.
 * A "substitute=" entry lacking either half, or with an empty pattern, is
 * skipped rather than stored, so callers never see NULL strings.
 * NOTE(review): when a token does not start with "zone=", the zone of the
 * rule being built is left as the caller initialised it (unchanged from the
 * previous implementation) - confirm this is intended.
 */
size_t format_replace_rule(const char * exec_para, struct replace_rule * replace, size_t n_replace)
{
	const char * str_zone = "zone=";
	const char * str_subs = "substitute=";
	size_t idx = 0;

	if (exec_para == NULL || replace == NULL || n_replace == 0)
	{
		return 0;
	}

	const size_t para_len = strlen(exec_para);
	char * tmp = ALLOC(char, para_len + 1);
	char * token = NULL, * sub_token = NULL, * saveptr = NULL;
	/* Copy the terminator too, so the result does not depend on ALLOC zeroing memory. */
	memcpy(tmp, exec_para, para_len + 1);

	for (token = tmp; idx < n_replace; token = NULL)
	{
		sub_token = strtok_r(token, ";", &saveptr);
		if (sub_token == NULL) break;
		while (*sub_token == ' ' || *sub_token == '\t') sub_token++;
		if (0 == strncasecmp(sub_token, str_zone, strlen(str_zone)))
		{
			replace[idx].zone = zone_name_to_id(sub_token + strlen(str_zone));
			/* ">=" so that any out-of-range id is rejected, not only kZoneMax. */
			if (replace[idx].zone >= kZoneMax)
			{
				break;
			}
		}

		sub_token = strtok_r(NULL, ";", &saveptr);
		if (sub_token == NULL) break; /* "zone=" without a following entry */
		while (*sub_token == ' ' || *sub_token == '\t') sub_token++;
		if (0 != strncasecmp(sub_token, str_subs, strlen(str_subs)))
		{
			continue;
		}
		sub_token += strlen(str_subs);
		if (*sub_token == '\0')
		{
			continue; /* nothing after "substitute=" */
		}
		sub_token++; /* step over the leading '/' */

		char * saveptr2 = NULL;
		char * find = strtok_r_esc(sub_token, '/', &saveptr2);
		char * replace_with = strtok_r_esc(NULL, '/', &saveptr2);
		if (find == NULL || *find == '\0' || replace_with == NULL)
		{
			continue; /* malformed: missing pattern or replacement */
		}
		replace[idx].find = tfe_strdup(find);
		replace[idx].replace_with = tfe_strdup(replace_with);
		idx++;
	}
	free(tmp);
	tmp = NULL;
	return idx;
}
/*
 * Collect pointers to the rules that belong to `zone`, in their original order.
 *
 * @param zone        zone to filter by.
 * @param replace     rule array to scan.
 * @param n_replace   number of entries in `replace`.
 * @param selected    output array of pointers into `replace`.
 * @param n_selected  capacity of `selected`; scanning stops once it is full.
 * @return number of pointers stored in `selected`.
 */
size_t select_replace_rule(enum replace_zone zone, const struct replace_rule * replace, size_t n_replace,
	const struct replace_rule ** selected, size_t n_selected)
{
	size_t n_picked = 0;
	for (size_t k = 0; k < n_replace; k++)
	{
		if (n_picked >= n_selected)
		{
			break;
		}
		if (replace[k].zone != zone)
		{
			continue;
		}
		selected[n_picked] = &replace[k];
		n_picked++;
	}
	return n_picked;
}
// Apply a single replace rule to the NUL-terminated string `in` using POSIX
// extended regular expressions (regcomp/regexec).
//
// @param in    subject text; its length is taken with strlen().
// @param zone  rule supplying the pattern (`find`) and the replacement text
//              (`replace_with`); `find` must not be empty (asserted).
// @return a new evbuffer holding the rewritten text, or NULL when the pattern
//         fails to compile or never matches.
//
// For each match: if the pattern has no capture group the whole match is
// replaced; otherwise every captured group is replaced by `replace_with`
// and the text between groups is copied through.
static struct evbuffer * replace_string(const char * in, const struct replace_rule * zone)
{
//Reference to https://www.lemoda.net/c/unix-regex/
// Regular Expression test: https://regex101.com/
regex_t reg;
int status = 0, is_replaced = 0;
struct evbuffer * out = NULL;
size_t in_sz = strlen(in);
size_t replace_len = strlen(zone->replace_with);
assert(strlen(zone->find) != 0);
status = regcomp(&reg, zone->find, REG_EXTENDED | REG_NEWLINE);
if (status != 0)
{
char error_message[TFE_STRING_MAX];
regerror(status, &reg, error_message, sizeof(error_message));
TFE_LOG_ERROR(g_pangu_rt->local_logger, "Regex error compiling '%s': %s\n",
zone->find, error_message);
// NOTE(review): regfree() is called on a regex_t whose compilation failed - confirm this is safe on the target libc.
regfree(&reg);
return NULL;
}
/* "p" is a pointer into the string which points to the end of the previous match. */
const char * p = in;
/* "pre_sub_expr_end" is a pointer into the string which points to the end of the previous sub expression match. */
const char * pre_sub_expr_end = NULL;
/* "N_matches" is the maximum number of matches allowed. */
const int n_matches = MAX_EDIT_MATCHES;
/* "M" contains the matches found. */
regmatch_t m[n_matches];
int i = 0;
// Scan the subject match by match; regexec offsets in m[] are relative to p.
// NOTE(review): a pattern that can match the empty string leaves p unchanged (rm_eo == 0),
// which looks like it would loop forever - confirm.
while (1)
{
int nomatch = regexec(&reg, p, n_matches, m, 0);
if (nomatch)
{
break;
}
// The output buffer is created lazily, on the first match only.
if (is_replaced == 0)
{
out = evbuffer_new();
is_replaced = 1;
}
assert(m[0].rm_so != -1);
pre_sub_expr_end = p;
if (m[1].rm_so == -1)//no sub expr, replace the entire expr.
{
// Copy the unmatched text before the match, then the replacement.
evbuffer_add(out, pre_sub_expr_end, m[0].rm_so - (pre_sub_expr_end - p));
evbuffer_add(out, zone->replace_with, replace_len);
pre_sub_expr_end = p + m[0].rm_eo;
}
else //have sub expr, replace the sub expr.
{
// Walk the captured groups in order; the first unset group ends the walk.
for (i = 1, pre_sub_expr_end = p; i < n_matches; i++)
{
if (m[i].rm_so == -1)
{
break;
}
evbuffer_add(out, pre_sub_expr_end, m[i].rm_so - (pre_sub_expr_end - p));
evbuffer_add(out, zone->replace_with, replace_len);
pre_sub_expr_end = p + m[i].rm_eo;
}
// NOTE(review): text between the last group and the end of the whole match (m[0].rm_eo)
// does not appear to be copied before p advances - confirm.
}
p += m[0].rm_eo;
}
if (is_replaced)
{
// Append whatever follows the last replaced span.
// NOTE(review): p has already been advanced past the last match here, so
// (pre_sub_expr_end - p) is <= 0 and the length is >= in_sz although the source pointer
// is inside the string; this looks like a read past the end of `in` - confirm.
evbuffer_add(out, pre_sub_expr_end, in_sz - (pre_sub_expr_end - p));
}
regfree(&reg);
return out;
}
struct evbuffer * execute_replace_rule(const char * in, size_t in_sz,
enum replace_zone zone, const struct replace_rule * rules, size_t n_rule)
{
const struct replace_rule * todo[MAX_EDIT_ZONE_NUM];
size_t n_todo = 0, i = 0;
struct evbuffer * out = NULL;
const char * interator = NULL;
struct evbuffer * new_out = NULL, * pre_out = NULL;
if (in == 0)
{
return NULL;
}
//Do not process buffer that contains '\0'.
if (0 != memchr(in, '\0', in_sz))
{
return NULL;
}
n_todo = select_replace_rule(zone, rules, n_rule, todo, MAX_EDIT_ZONE_NUM);
interator = in;
for (i = 0; i < n_todo; i++)
{
new_out = replace_string(interator, todo[i]);
if (new_out != NULL)
{
pre_out = out;
out = new_out;
interator = (char *) evbuffer_pullup(out, -1);
if (pre_out != NULL)
{
evbuffer_free(pre_out);
pre_out = NULL;
}
}
}
return out;
}
void http_replace(const struct tfe_stream * stream, const struct tfe_http_session * session,
enum tfe_http_event events, const unsigned char * body_frag, size_t frag_size, struct pangu_http_ctx * ctx)

View File

@@ -0,0 +1,212 @@
#include "pattern_replace.h"
#include <tfe_utils.h>
#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>
#include <string.h>
#include <sys/types.h>
#include <stdlib.h>
#include <assert.h>
#define MAX_EDIT_MATCHES 16
enum replace_zone zone_name_to_id(const char * name)
{
const char * std_name[] = {"http_req_uri",
"http_req_header",
"http_req_body",
"http_resp_header",
"http_resp_body",
"http_resp_body"};
size_t i = 0;
for (i = 0; i < sizeof(std_name) / sizeof(const char *); i++)
{
if (0 == strcasecmp(name, std_name[i]))
{
break;
}
}
return (enum replace_zone) i;
}
/*
 * Locate the first occurrence of `delim` in `s` that is not preceded by a
 * backslash. A backslash always consumes the character that follows it.
 *
 * @param s      NUL-terminated string to scan; may be NULL.
 * @param delim  delimiter to look for.
 * @return pointer to the first unescaped delimiter, or NULL when `s` is NULL
 *         or no unescaped delimiter exists.
 */
static char * strchr_esc(char * s, const char delim)
{
	if (s == NULL)
	{
		return NULL;
	}
	for (char * cur = s; *cur != '\0'; cur++)
	{
		if (*cur == '\\')
		{
			/* A backslash at the very end escapes nothing; stop here so the
			 * scan never steps over the terminating NUL. */
			if (cur[1] == '\0')
			{
				return NULL;
			}
			cur++; /* skip the escaped character */
			continue;
		}
		if (*cur == delim)
		{
			return cur;
		}
	}
	return NULL;
}
/*
 * Escape-aware variant of strtok_r() for a single delimiter character.
 *
 * Scanning starts at `s`, or at `*save_ptr` when `s` is NULL. The first
 * unescaped delimiter (as found by strchr_esc) is overwritten with NUL and
 * `*save_ptr` is left pointing just past it; when there is no such delimiter
 * `*save_ptr` becomes NULL. The start of the scanned region is returned in
 * both cases, so the result is NULL only when the starting pointer was NULL.
 */
static char * strtok_r_esc(char * s, const char delim, char ** save_ptr)
{
	char * const begin = (s != NULL) ? s : *save_ptr;
	char * const cut = strchr_esc(begin, delim);

	if (cut != NULL)
	{
		*cut = '\0';          /* terminate the current token */
		*save_ptr = cut + 1;  /* resume after the delimiter next time */
	}
	else
	{
		*save_ptr = NULL;     /* nothing left for a later call */
	}
	return begin;
}
/*
 * Parse an action parameter string into replace rules.
 *
 * Expected layout: "zone=<name>;substitute=/<find>/<replace>" repeated and
 * joined with ';'. Inside <find>/<replace> a '/' is written as "\/".
 * Leading blanks in front of "zone=" / "substitute=" are tolerated.
 *
 * @param exec_para  NUL-terminated parameter string.
 * @param replace    output array receiving the parsed rules; `find` and
 *                   `replace_with` are copies made with tfe_strdup().
 * @param n_replace  capacity of `replace`.
 * @return number of rules written to `replace` (0..n_replace).
 *
 * Parsing stops at the first unknown zone name or when input runs out.
 * A "substitute=" entry lacking either half, or with an empty pattern, is
 * skipped rather than stored, so callers never see NULL strings.
 * NOTE(review): when a token does not start with "zone=", the zone of the
 * rule being built is left as the caller initialised it (unchanged from the
 * previous implementation) - confirm this is intended.
 */
size_t format_replace_rule(const char * exec_para, struct replace_rule * replace, size_t n_replace)
{
	const char * str_zone = "zone=";
	const char * str_subs = "substitute=";
	size_t idx = 0;

	if (exec_para == NULL || replace == NULL || n_replace == 0)
	{
		return 0;
	}

	const size_t para_len = strlen(exec_para);
	char * tmp = ALLOC(char, para_len + 1);
	char * token = NULL, * sub_token = NULL, * saveptr = NULL;
	/* Copy the terminator too, so the result does not depend on ALLOC zeroing memory. */
	memcpy(tmp, exec_para, para_len + 1);

	for (token = tmp; idx < n_replace; token = NULL)
	{
		sub_token = strtok_r(token, ";", &saveptr);
		if (sub_token == NULL) break;
		while (*sub_token == ' ' || *sub_token == '\t') sub_token++;
		if (0 == strncasecmp(sub_token, str_zone, strlen(str_zone)))
		{
			replace[idx].zone = zone_name_to_id(sub_token + strlen(str_zone));
			/* ">=" so that any out-of-range id is rejected, not only kZoneMax. */
			if (replace[idx].zone >= kZoneMax)
			{
				break;
			}
		}

		sub_token = strtok_r(NULL, ";", &saveptr);
		if (sub_token == NULL) break; /* "zone=" without a following entry */
		while (*sub_token == ' ' || *sub_token == '\t') sub_token++;
		if (0 != strncasecmp(sub_token, str_subs, strlen(str_subs)))
		{
			continue;
		}
		sub_token += strlen(str_subs);
		if (*sub_token == '\0')
		{
			continue; /* nothing after "substitute=" */
		}
		sub_token++; /* step over the leading '/' */

		char * saveptr2 = NULL;
		char * find = strtok_r_esc(sub_token, '/', &saveptr2);
		char * replace_with = strtok_r_esc(NULL, '/', &saveptr2);
		if (find == NULL || *find == '\0' || replace_with == NULL)
		{
			continue; /* malformed: missing pattern or replacement */
		}
		replace[idx].find = tfe_strdup(find);
		replace[idx].replace_with = tfe_strdup(replace_with);
		idx++;
	}
	free(tmp);
	tmp = NULL;
	return idx;
}
/*
 * Collect pointers to the rules that belong to `zone`, in their original order.
 *
 * @param zone        zone to filter by.
 * @param replace     rule array to scan.
 * @param n_replace   number of entries in `replace`.
 * @param selected    output array of pointers into `replace`.
 * @param n_selected  capacity of `selected`; scanning stops once it is full.
 * @return number of pointers stored in `selected`.
 */
size_t select_replace_rule(enum replace_zone zone, const struct replace_rule * replace, size_t n_replace,
	const struct replace_rule ** selected, size_t n_selected)
{
	size_t n_picked = 0;
	for (size_t k = 0; k < n_replace; k++)
	{
		if (n_picked >= n_selected)
		{
			break;
		}
		if (replace[k].zone != zone)
		{
			continue;
		}
		selected[n_picked] = &replace[k];
		n_picked++;
	}
	return n_picked;
}
static struct evbuffer * replace_string(const char * in, size_t in_sz, const struct replace_rule * zone)
{
int status = 0, is_replaced = 0;
struct evbuffer * out = NULL;
size_t replace_len = strlen(zone->replace_with);
assert(strlen(zone->find) != 0);
int error;
PCRE2_SIZE erroffset;
const PCRE2_SPTR pattern = (PCRE2_SPTR)zone->find;
const PCRE2_SPTR subject = (PCRE2_SPTR)in;
const PCRE2_SPTR replacement = (PCRE2_SPTR)zone->replace_with;
pcre2_code *re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, 0, &error, &erroffset, 0);
if (re == 0)
return NULL;
pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
PCRE2_SIZE outlen = in_sz*2;
PCRE2_UCHAR* output = (PCRE2_UCHAR*)malloc(sizeof(PCRE2_UCHAR)*outlen);
int rc = pcre2_substitute(re, subject, in_sz, 0, PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED, 0, 0, replacement, PCRE2_ZERO_TERMINATED, output, &outlen);
if (rc >= 0)
printf("%s\n", output);
pcre2_code_free(re);
free(output);
return NULL;
}
struct evbuffer * execute_replace_rule(const char * in, size_t in_sz,
enum replace_zone zone, const struct replace_rule * rules, size_t n_rule)
{
const struct replace_rule * todo[n_rule];
size_t n_todo = 0, i = 0, interator_sz=0;
struct evbuffer * out = NULL;
const char * interator = NULL;
struct evbuffer * new_out = NULL, * pre_out = NULL;
if (in == 0)
{
return NULL;
}
//Do not process buffer that contains '\0'.
if (0 != memchr(in, '\0', in_sz))
{
return NULL;
}
n_todo = select_replace_rule(zone, rules, n_rule, todo, n_rule);
interator = in;
interator_sz = in_sz;
for (i = 0; i < n_todo; i++)
{
new_out = replace_string(interator, interator_sz, todo[i]);
if (new_out != NULL)
{
pre_out = out;
out = new_out;
interator = (char *) evbuffer_pullup(out, -1);
interator_sz = evbuffer_get_length(out);
if (pre_out != NULL)
{
evbuffer_free(pre_out);
pre_out = NULL;
}
}
}
return out;
}

View File

@@ -0,0 +1,26 @@
#pragma once
#include <event2/buffer.h>
// Part of an HTTP transaction that a replace rule targets.
// zone_name_to_id() converts a textual zone name to one of these values by
// casting the index of the matching name, so the order here has to follow
// the order of its name table.
enum replace_zone
{
kZoneRequestUri = 0, // "http_req_uri"
kZoneRequestHeaders, // "http_req_header"
kZoneRequestBody, // "http_req_body"
kZoneResponseHeader, // "http_resp_header"
kZoneResponseBody, // "http_resp_body"
kZoneMax // sentinel: format_replace_rule() stops parsing when a zone equals this
};
// One find/replace instruction, filled in by format_replace_rule().
struct replace_rule
{
enum replace_zone zone; // zone the rule applies to; compared by select_replace_rule()
char * find; // pattern text, compiled as a regular expression by replace_string(); copy made with tfe_strdup()
char * replace_with; // replacement text; copy made with tfe_strdup()
};
// Parses an action parameter string into an array of replace rules.
//@param exec_para example input (entries are separated by ';' with no blanks, '/' inside a pattern is written "\/"):
//zone=http_req_body;substitute=/中華民國/中华人民共和国;zone=http_resp_header;substitute=/Content-Type:\btext\/html/Content-Type:\bvideo\/mp4
//@param replace output array that receives the parsed rules.
//@param n_replace capacity of the output array.
//@return number of rules that were formatted.
size_t format_replace_rule(const char * exec_para, struct replace_rule * replace, size_t n_replace);
// Applies, in order, every rule whose zone equals `zone` to the `in_sz` bytes at `in`.
// Returns NULL when `in` is NULL or contains a '\0' byte; otherwise returns the buffer produced by
// the last rule that yielded output, or NULL if no rule did.
// NOTE(review): presumably the caller owns the returned evbuffer and must evbuffer_free() it - confirm.
struct evbuffer * execute_replace_rule(const char * in, size_t in_sz, enum replace_zone zone, const struct replace_rule * rules, size_t n_rule);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,45 @@
#include "pattern_replace.h"
#include <sys/types.h>//fstat
#include <sys/ioctl.h>
#include <string.h>
#include <stdio.h>
#include <gtest/gtest.h>
// Loads a sample HTML page, applies one response-body rule to it and expects
// execute_replace_rule() to hand back a rewritten buffer.
TEST(PatternReplace, Pure)
{
	const char* filename="./test_data/facebook_index.html";
	// NOTE(review): "\b" inside a C string literal is the backspace character (0x08),
	// not a blank or a regex word boundary - confirm this is what the rule should contain.
	const char* exec_para="zone=http_resp_body;substitute=/添加手机号/Mobile\bPhone";
	struct replace_rule rules[16];
	size_t n_got_rule=format_replace_rule(exec_para, rules, sizeof(rules)/sizeof(rules[0]));
	// Stop here if parsing failed; the rest of the test needs exactly one rule.
	ASSERT_EQ((size_t)1, n_got_rule);

	// Without this check a missing file would leave file_info uninitialised.
	struct stat file_info;
	ASSERT_EQ(0, stat(filename, &file_info));
	size_t file_size=(size_t)file_info.st_size;
	ASSERT_GT(file_size, (size_t)0);

	FILE* fp=fopen(filename,"r");
	ASSERT_FALSE(fp==NULL);
	char* file_buff=(char*)malloc(file_size);
	if(file_buff==NULL)
	{
		fclose(fp);
		FAIL() << "out of memory reading " << filename;
	}
	size_t n_read=fread(file_buff,1,file_size,fp);
	fclose(fp);
	EXPECT_EQ(file_size, n_read);

	// Pass only the bytes actually read so no uninitialised memory is scanned.
	struct evbuffer* output=execute_replace_rule(file_buff, n_read, kZoneResponseBody, rules,n_got_rule);
	EXPECT_FALSE(output==NULL);

	if(output!=NULL)
	{
		evbuffer_free(output);
	}
	free(file_buff);
	// The rule strings come from tfe_strdup(); its matching release function is not
	// visible from this file, so they are left to process exit.
	return;
}
int main(int argc, char ** argv)
{
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

14
vendor/CMakeLists.txt vendored
View File

@@ -193,3 +193,17 @@ add_library(ctemplate-static STATIC IMPORTED GLOBAL)
add_dependencies(ctemplate-static ctemplate)
set_property(TARGET ctemplate-static PROPERTY IMPORTED_LOCATION ${INSTALL_DIR}/lib/libctemplate.a)
set_property(TARGET ctemplate-static PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)
### pcre2
# Build the bundled PCRE2 tarball and expose its 8-bit static library as the
# imported target pcre2-static.
ExternalProject_Add(pcre2 PREFIX pcre2
URL ${CMAKE_CURRENT_SOURCE_DIR}/pcre2-10.32.tar.gz
URL_MD5 a660db882ff171e6a0de5fb1decd5ff5
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})
ExternalProject_Get_Property(pcre2 INSTALL_DIR)
# The include directory must exist at configure time for INTERFACE_INCLUDE_DIRECTORIES.
file(MAKE_DIRECTORY ${INSTALL_DIR}/include)
add_library(pcre2-static STATIC IMPORTED GLOBAL)
# Depend on the pcre2 external project (not ctemplate) so libpcre2-8.a is built
# before anything that links against pcre2-static.
add_dependencies(pcre2-static pcre2)
set_property(TARGET pcre2-static PROPERTY IMPORTED_LOCATION ${INSTALL_DIR}/lib/libpcre2-8.a)
set_property(TARGET pcre2-static PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include)