This repository has been archived on 2025-09-14. You can view files and clone it, but cannot push or open issues or pull requests.
Files
tango-maat/scanner/expr_matcher/expr_matcher.cpp

681 lines
23 KiB
C++

/*
**********************************************************************************************
* File: expr_matcher.cpp
* Description:
* Authors: Liu wentan <liuwentan@geedgenetworks.com>
* Date: 2023-06-30
* Copyright: (c) Since 2023 Geedge Networks, Ltd. All rights reserved.
***********************************************************************************************
*/
#include <unistd.h>
#include <assert.h>
#include <sys/syscall.h>
#include "log/log.h"
#include "bloom/bloom.h"
#include "maat_utils.h"
#include "../bool_matcher/bool_matcher.h"
#include "expr_matcher_inc.h"
#include "adapter_hs/adapter_hs.h"
#include "adapter_rs/adapter_rs.h"
/*
 * Return the id of the calling thread as reported by the raw gettid
 * system call.
 */
pid_t expr_matcher_gettid()
{
    long tid = syscall(SYS_gettid);

    return (pid_t)tid;
}
/*
 * Format the log module tag "<name>(<tid>)" for the calling thread.
 *
 * The text is written into a 64-byte thread-local buffer, so the returned
 * pointer is overwritten by the next call made on the same thread and longer
 * text is truncated by snprintf.
 */
static const char *expr_matcher_module_name_str(const char *name)
{
    static __thread char tag_buf[64];
    pid_t tid = expr_matcher_gettid();

    snprintf(tag_buf, sizeof(tag_buf), "%s(%d)", name, tid);

    return tag_buf;
}

/* Module tag passed to the log_* calls in this file. */
#define MODULE_EXPR_MATCHER expr_matcher_module_name_str("maat.expr_matcher")
/*
 * A compiled matcher: the literal/regex scanning runtimes plus the boolean
 * expression layer that turns hit pattern ids into hit expressions.
 * Built by expr_matcher_new() and released by expr_matcher_free().
 */
struct expr_matcher {
/* thread count given to expr_matcher_new(); also the length of bool_match_buffs */
size_t n_thread;
/* selects the engine_ops entry used for the literal runtime */
enum expr_engine_type engine_type;
/* literal engine from engine_ops[engine_type].engine_new; only assigned when a literal db was built */
void *lit_runtime;
/* regex engine from hs_regex_engine_new; only assigned when a regex db was built */
void *regex_runtime;
/* one entry per pattern, filled by bool_exprs_new() and indexed by pattern id */
struct pattern_attribute *pat_attr;
/* bool matcher built from bool_exprs by bool_matcher_new() */
struct bool_matcher *bm;
/* n_thread result buffers, each with MAX_HIT_PATTERN_NUM entries */
struct bool_expr_match **bool_match_buffs;
/* logger passed to expr_matcher_new(); expr_matcher_free() does not release it */
struct log_handle *logger;
/* one bool_expr per rule; each user_tag points back at its own entry, so this array
   has to stay alive for as long as bm results are read */
struct bool_expr *bool_exprs;
};
/*
 * Per-stream scanning state created by expr_matcher_stream_open() and
 * released by expr_matcher_stream_close().
 */
struct expr_matcher_stream {
/* thread id given at open time; selects the matcher's bool_match_buffs entry */
int thread_id;
/* copied from the matcher at open time; selects the engine_ops entry for lit_stream */
enum expr_engine_type engine_type;
/* literal stream handle from engine_ops[engine_type].stream_open */
void *lit_stream;
/* regex stream handle from hs_regex_stream_open */
void *regex_stream;
/* literal pattern ids hit so far on this stream; kept sorted and free of duplicates */
UT_array *all_hit_lit_pattern_ids;
/* matcher this stream was opened on; expr_matcher_stream_close() does not release it */
struct expr_matcher *ref_matcher;
};
/*
 * Function table for building the literal pattern database of one engine.
 * Entries live in db_ops[] and are selected by engine type.
 */
struct db_operations {
/* engine this entry belongs to */
enum expr_engine_type type;
/* create compile data with room for n_pattern patterns of kind pat_type */
void *(*compile_data_new)(enum expr_pattern_type pat_type, size_t n_pattern);
/* release compile data created by compile_data_new */
void (*compile_data_free)(void *compile_data);
/* store one pattern (pattern_id, text, length, case flag) at slot index of compile_data */
void (*populate_compile_data)(void *compile_data, size_t index, int pattern_id,
char *pat, size_t pat_len, int case_sensitive);
/* build the database into *lit_db from compile_data; callers treat a negative return as failure */
int (*build_db)(void **lit_db, void *compile_data, struct log_handle *logger);
};
/* utarray element descriptor for arrays of pattern ids (unsigned long long);
   the three NULL members request no custom element hooks. */
UT_icd ut_pattern_id_icd = {sizeof(unsigned long long), NULL, NULL, NULL};
/*
 * Literal-database builders, indexed directly by enum expr_engine_type.
 * NOTE(review): indexing assumes EXPR_ENGINE_TYPE_HS and EXPR_ENGINE_TYPE_RS
 * have the values 0 and 1, matching the entry order below — confirm against
 * the enum declaration.
 */
struct db_operations db_ops[EXPR_ENGINE_TYPE_AUTO] = {
{
.type = EXPR_ENGINE_TYPE_HS,
.compile_data_new = hs_compile_data_new,
.compile_data_free = hs_compile_data_free,
.populate_compile_data = hs_populate_compile_data,
.build_db = hs_build_lit_db
},
{
.type = EXPR_ENGINE_TYPE_RS,
.compile_data_new = rs_compile_data_new,
.compile_data_free = rs_compile_data_free,
.populate_compile_data = rs_populate_compile_data,
.build_db = rs_build_lit_db
}
};
/*
 * Function table for the literal scanning engine of one engine type.
 * Entries live in engine_ops[] and are selected by engine type.
 */
struct engine_operations {
/* engine this entry belongs to */
enum expr_engine_type type;
/* create an engine from a built literal database; callers treat NULL as failure */
void *(*engine_new)(struct expr_rule *rules, size_t n_rule,
struct pattern_attribute *pat_attr,
void *hs_lit_db, size_t n_thread,
struct log_handle *logger);
/* release an engine created by engine_new */
void (*engine_free)(void *engine);
/* block scan: writes hit pattern ids into pattern_id_array (capacity array_size) and
   their count into *n_pattern_id; callers treat a negative return as failure */
int (*engine_scan)(void *engine, int thread_id,
const char *data, size_t data_len,
unsigned long long *pattern_id_array,
size_t array_size, size_t *n_pattern_id);
/* open a stream on the engine for thread_id; callers treat NULL as failure */
void *(*stream_open)(void *engine, int thread_id);
/* close a stream returned by stream_open */
void (*stream_close)(void *stream);
/* stream scan of one data chunk; same output convention as engine_scan */
int (*scan_stream)(void *stream, const char *data, size_t data_len,
unsigned long long *pattern_id_array, size_t array_size,
size_t *n_pattern_id);
};
/*
 * Literal scanning engines, indexed directly by enum expr_engine_type.
 * NOTE(review): indexing assumes EXPR_ENGINE_TYPE_HS and EXPR_ENGINE_TYPE_RS
 * have the values 0 and 1, matching the entry order below — confirm against
 * the enum declaration.
 */
struct engine_operations engine_ops[EXPR_ENGINE_TYPE_AUTO] = {
{
.type = EXPR_ENGINE_TYPE_HS,
.engine_new = hs_lit_engine_new,
.engine_free = hs_lit_engine_free,
.engine_scan = hs_lit_engine_scan,
.stream_open = hs_lit_stream_open,
.stream_close = hs_lit_stream_close,
.scan_stream = hs_lit_stream_scan
},
{
.type = EXPR_ENGINE_TYPE_RS,
.engine_new = rs_lit_engine_new,
.engine_free = rs_lit_engine_free,
.engine_scan = rs_lit_engine_scan,
.stream_open = rs_lit_stream_open,
.stream_close = rs_lit_stream_close,
.scan_stream = rs_lit_stream_scan
}
};
/*
 * Run a regex expression through the verifiers of both adapters.
 *
 * The hs verifier is consulted first. If it returns 0, this function returns
 * 0 without consulting the rs verifier; otherwise the rs verifier's return
 * value is passed through unchanged.
 */
int expr_matcher_verify_regex_expression(const char *regex_expr,
                                         struct log_handle *logger)
{
    if (hs_verify_regex_expression(regex_expr, logger) != 0) {
        return rs_verify_regex_expression(regex_expr, logger);
    }

    return 0;
}
/*
 * Validate the rules and count their literal and regex patterns.
 *
 * @param rules        array of n_rule rules
 * @param n_rule       number of rules
 * @param n_lit_pat    out: number of EXPR_PATTERN_TYPE_STR patterns
 * @param n_regex_pat  out: number of patterns of any other type
 * @param logger       log sink for the failure messages
 *
 * @return 0 on success. -1 when a rule has more than MAX_EXPR_PATTERN_NUM
 *         patterns, a pattern has zero length, or no pattern exists at all;
 *         the out parameters are left untouched in that case.
 */
static int expr_rule_pattern_count(struct expr_rule *rules, size_t n_rule,
                                   size_t *n_lit_pat, size_t *n_regex_pat,
                                   struct log_handle *logger)
{
    size_t n_literal = 0;
    size_t n_regex = 0;
    char uuid_str[37]; /* size required by uuid_unparse: 36 characters + NUL */

    for (size_t r = 0; r < n_rule; r++) {
        struct expr_rule *rule = &rules[r];

        if (rule->n_patterns > MAX_EXPR_PATTERN_NUM) {
            uuid_unparse(rule->expr_uuid, uuid_str);
            log_fatal(logger, MODULE_EXPR_MATCHER,
                      "[%s:%d] the number of patterns in expr_rule(rule_id:%s)"
                      " should less than %d", __FUNCTION__, __LINE__,
                      uuid_str, MAX_EXPR_PATTERN_NUM);
            return -1;
        }

        for (size_t p = 0; p < rule->n_patterns; p++) {
            /* an empty pattern is rejected outright */
            if (rule->patterns[p].pat_len == 0) {
                uuid_unparse(rule->expr_uuid, uuid_str);
                log_fatal(logger, MODULE_EXPR_MATCHER,
                          "[%s:%d] expr rule %s pattern length should not 0",
                          __FUNCTION__, __LINE__, uuid_str);
                return -1;
            }

            if (rule->patterns[p].type != EXPR_PATTERN_TYPE_STR) {
                n_regex++;
            } else {
                n_literal++;
            }
        }
    }

    if (n_literal == 0 && n_regex == 0) {
        log_fatal(logger, MODULE_EXPR_MATCHER,
                  "[%s:%d] exprs has no valid pattern",
                  __FUNCTION__, __LINE__);
        return -1;
    }

    *n_lit_pat = n_literal;
    *n_regex_pat = n_regex;

    return 0;
}
static struct bool_expr *bool_exprs_new(struct expr_rule *rules, size_t n_rule,
enum expr_engine_type engine_type,
struct pattern_attribute *pat_attr,
void *lit_compile_data, void *regex_compile_data)
{
uint32_t pattern_index = 0;
uint32_t literal_index = 0;
uint32_t regex_index = 0;
struct bool_expr *bool_exprs = ALLOC(struct bool_expr, n_rule);
/* populate adpt_hs_compile_data and bool_expr */
for (size_t i = 0; i < n_rule; i++) {
for (size_t j = 0; j < rules[i].n_patterns; j++) {
pat_attr[pattern_index].pattern_id = pattern_index;
pat_attr[pattern_index].match_mode = rules[i].patterns[j].match_mode;
if (pat_attr[pattern_index].match_mode == EXPR_MATCH_MODE_SUB ||
pat_attr[pattern_index].match_mode == EXPR_MATCH_MODE_EXACTLY) {
pat_attr[pattern_index].offset.start = rules[i].patterns[j].start_offset;
pat_attr[pattern_index].offset.end = rules[i].patterns[j].end_offset;
}
/* literal pattern */
if (rules[i].patterns[j].type == EXPR_PATTERN_TYPE_STR) {
db_ops[engine_type].populate_compile_data(lit_compile_data, literal_index,
pattern_index, rules[i].patterns[j].pat,
rules[i].patterns[j].pat_len,
rules[i].patterns[j].case_sensitive);
literal_index++;
} else {
/* regex pattern */
hs_populate_compile_data(regex_compile_data, regex_index, pattern_index,
rules[i].patterns[j].pat, rules[i].patterns[j].pat_len,
rules[i].patterns[j].case_sensitive);
regex_index++;
}
bool_exprs[i].items[j].item_id = pattern_index++;
bool_exprs[i].items[j].negate_option = 0;
}
uuid_copy(bool_exprs[i].expr_uuid, rules[i].expr_uuid);
bool_exprs[i].item_num = rules[i].n_patterns;
bool_exprs[i].user_tag = &(bool_exprs[i]);
}
return bool_exprs;
}
/*
 * Release a matcher and everything it holds: both scanning runtimes, the bool
 * matcher, the per-thread match buffers, the pattern attribute table and the
 * bool expression array. The logger is left alone. NULL is accepted.
 */
void expr_matcher_free(struct expr_matcher *matcher)
{
    if (matcher == NULL) {
        return;
    }

    /* scanning runtimes */
    if (matcher->lit_runtime != NULL) {
        engine_ops[matcher->engine_type].engine_free(matcher->lit_runtime);
        matcher->lit_runtime = NULL;
    }

    if (matcher->regex_runtime != NULL) {
        hs_regex_engine_free(matcher->regex_runtime);
        matcher->regex_runtime = NULL;
    }

    /* boolean expression layer */
    if (matcher->bm != NULL) {
        bool_matcher_free(matcher->bm);
        matcher->bm = NULL;
    }

    /* per-thread result buffers first, then the table that holds them */
    if (matcher->bool_match_buffs != NULL) {
        size_t t = 0;

        while (t < matcher->n_thread) {
            if (matcher->bool_match_buffs[t] != NULL) {
                FREE(matcher->bool_match_buffs[t]);
            }
            t++;
        }

        FREE(matcher->bool_match_buffs);
    }

    if (matcher->pat_attr != NULL) {
        FREE(matcher->pat_attr);
    }

    if (matcher->bool_exprs != NULL) {
        FREE(matcher->bool_exprs);
    }

    FREE(matcher);
}
/*
 * Build a matcher from n_rule expression rules.
 *
 * rules, n_rule : rules to compile; rules must be non-NULL and n_rule non-zero.
 * engine_type   : engine used for literal patterns; must be EXPR_ENGINE_TYPE_HS or
 *                 EXPR_ENGINE_TYPE_RS. Regex patterns always go through the hs_* regex
 *                 functions regardless of this value.
 * n_thread      : non-zero thread count; one bool-match buffer is allocated per thread
 *                 and the value is forwarded to the engine constructors.
 * logger        : stored in the matcher and used for all messages.
 *
 * Returns the new matcher, or NULL when an argument is rejected, pattern counting
 * fails, or any build step (bool matcher, literal/regex db, literal/regex runtime)
 * fails. The result is released with expr_matcher_free().
 *
 * NOTE(review): the results of ALLOC are used without a NULL check — confirm that
 * ALLOC cannot return NULL.
 */
struct expr_matcher *
expr_matcher_new(struct expr_rule *rules, size_t n_rule,
enum expr_engine_type engine_type,
size_t n_thread, struct log_handle *logger)
{
/* NOTE(review): the message below only mentions the engine type, but this branch is
   also taken for NULL rules, n_rule == 0 and n_thread == 0. */
if (NULL == rules || 0 == n_rule || 0 == n_thread ||
(engine_type != EXPR_ENGINE_TYPE_HS &&
engine_type != EXPR_ENGINE_TYPE_RS)) {
log_fatal(logger, MODULE_EXPR_MATCHER,
"[%s:%d]engine type:%d is illegal",
__FUNCTION__, __LINE__, engine_type);
return NULL;
}
size_t lit_pat_cnt = 0;
size_t regex_pat_cnt = 0;
size_t pat_cnt = 0;
/* validate the rules and learn how many patterns of each kind must be compiled */
int ret = expr_rule_pattern_count(rules, n_rule, &lit_pat_cnt,
&regex_pat_cnt, logger);
if (ret < 0) {
return NULL;
}
pat_cnt = lit_pat_cnt + regex_pat_cnt;
/* compile data is only created for pattern kinds that actually occur */
void *lit_compile_data = NULL;
void *regex_compile_data = NULL;
if (lit_pat_cnt > 0) {
lit_compile_data = db_ops[engine_type].compile_data_new(EXPR_PATTERN_TYPE_STR,
lit_pat_cnt);
}
if (regex_pat_cnt > 0) {
regex_compile_data = hs_compile_data_new(EXPR_PATTERN_TYPE_REG, regex_pat_cnt);
}
/* bool_exprs_new() fills pat_attr and both compile data objects as a side effect,
   so it has to run before the databases are built */
struct pattern_attribute *pat_attr = ALLOC(struct pattern_attribute, pat_cnt);
struct bool_expr *bool_exprs = bool_exprs_new(rules, n_rule, engine_type,
pat_attr, lit_compile_data,
regex_compile_data);
size_t mem_size = 0;
/* bm_ret collects the failure of any later step; the steps keep running and the
   matcher is torn down once at the end */
int bm_ret = 0;
struct expr_matcher *matcher = ALLOC(struct expr_matcher, 1);
matcher->n_thread = n_thread;
matcher->pat_attr = pat_attr;
matcher->engine_type = engine_type;
matcher->logger = logger;
matcher->bool_exprs = bool_exprs;
matcher->bm = bool_matcher_new(bool_exprs, n_rule, &mem_size);
if (matcher->bm != NULL) {
log_info(logger, MODULE_EXPR_MATCHER,
"expr_matcher module: build bool matcher of %zu expressions"
" with %zu bytes memory", n_rule, mem_size);
} else {
log_fatal(logger, MODULE_EXPR_MATCHER,
"[%s:%d] expr_matcher module: build bool matcher failed",
__FUNCTION__, __LINE__);
bm_ret = -1;
}
/* one result buffer of MAX_HIT_PATTERN_NUM entries per thread */
matcher->bool_match_buffs = ALLOC(struct bool_expr_match *, n_thread);
for (size_t i = 0; i < n_thread; i++) {
matcher->bool_match_buffs[i] = ALLOC(struct bool_expr_match, MAX_HIT_PATTERN_NUM);
}
/* literal database; the compile data is released whether or not the build succeeded */
void *lit_db = NULL;
if (lit_compile_data != NULL) {
ret = db_ops[engine_type].build_db(&lit_db, lit_compile_data, logger);
if (ret < 0) {
bm_ret = -1;
}
db_ops[engine_type].compile_data_free(lit_compile_data);
}
/* NOTE(review): who owns lit_db when engine_new fails is not visible here — confirm
   it is not leaked on that path. */
if (lit_db != NULL) {
matcher->lit_runtime = engine_ops[engine_type].engine_new(rules, n_rule, pat_attr,
lit_db, n_thread, logger);
if (NULL == matcher->lit_runtime) {
log_fatal(logger, MODULE_EXPR_MATCHER,
"[%s:%d]expr_matcher new lit runtime failed.",
__FUNCTION__, __LINE__);
bm_ret = -1;
}
}
/* regex database and runtime, always through the hs adapter */
void *regex_db = NULL;
if (regex_compile_data != NULL) {
ret = hs_build_regex_db(&regex_db, regex_compile_data, logger);
if (ret < 0) {
bm_ret = -1;
}
hs_compile_data_free(regex_compile_data);
}
/* NOTE(review): same ownership question as lit_db applies to regex_db. */
if (regex_db != NULL) {
matcher->regex_runtime = hs_regex_engine_new(rules, n_rule, pat_attr,
regex_db, n_thread, logger);
if (NULL == matcher->regex_runtime) {
log_fatal(logger, MODULE_EXPR_MATCHER,
"[%s:%d]expr_matcher new regex runtime failed.",
__FUNCTION__, __LINE__);
bm_ret = -1;
}
}
if (bm_ret < 0) {
goto error;
}
return matcher;
error:
/* expr_matcher_free() releases every member that was set above */
expr_matcher_free(matcher);
return NULL;
}
/*
 * Three-way comparison of two pattern ids for qsort and the utarray
 * sort/find helpers.
 *
 * @param a  points to an unsigned long long
 * @param b  points to an unsigned long long
 * @return -1, 0 or 1 when *a is less than, equal to or greater than *b.
 *
 * The values are compared directly. Subtracting them and storing the result
 * in a signed integer yields the wrong sign once the two ids differ by more
 * than LLONG_MAX (for example 0 against 0xFFFFFFFFFFFFFFFF).
 */
static inline int compare_pattern_id(const void *a, const void *b)
{
    unsigned long long lhs = *(const unsigned long long *)a;
    unsigned long long rhs = *(const unsigned long long *)b;

    return (lhs > rhs) - (lhs < rhs);
}
/*
 * Feed a set of hit pattern ids to the bool matcher.
 *
 * @param bm               bool matcher to query
 * @param match_buff       receives the matched expressions
 * @param buff_size        capacity of match_buff, in entries
 * @param hit_pattern_ids  hit pattern ids; sorted and de-duplicated IN PLACE
 * @param n_hit_pattern    number of ids in hit_pattern_ids (may be 0)
 * @param n_hit_result     out: number of entries written to match_buff; set to 0
 *                         when the bool matcher reports an error
 *
 * @return the bool matcher's return value, clamped to buff_size; negative on error.
 */
static int expr_matcher_bool_matcher_match(struct bool_matcher *bm, struct bool_expr_match *match_buff,
                                           size_t buff_size, unsigned long long *hit_pattern_ids,
                                           size_t n_hit_pattern, size_t *n_hit_result)
{
    /* Give callers a defined count on every path, including the error path. */
    *n_hit_result = 0;

    qsort(hit_pattern_ids, n_hit_pattern, sizeof(unsigned long long), compare_pattern_id);

    /*
     * Squeeze out duplicates inside the (now sorted) input array. This needs no
     * variable-length scratch array, which would have zero length for an empty
     * input, and no sentinel value that a real id could collide with.
     */
    size_t n_unique_pat_id = 0;
    for (size_t i = 0; i < n_hit_pattern; i++) {
        if (n_unique_pat_id == 0 ||
            hit_pattern_ids[i] != hit_pattern_ids[n_unique_pat_id - 1]) {
            hit_pattern_ids[n_unique_pat_id++] = hit_pattern_ids[i];
        }
    }

    /* Pass the real capacity of match_buff rather than a fixed constant. */
    int bool_matcher_ret = bool_matcher_match(bm, hit_pattern_ids, n_unique_pat_id,
                                              match_buff, buff_size);
    if (bool_matcher_ret < 0) {
        return bool_matcher_ret;
    }

    if (bool_matcher_ret > (int)buff_size) {
        bool_matcher_ret = (int)buff_size;
    }
    *n_hit_result = (size_t)bool_matcher_ret;

    return bool_matcher_ret;
}
/*
 * Scan one block of data and report the expressions it hits.
 *
 * @param matcher        matcher built by expr_matcher_new()
 * @param thread_id      calling thread's index; must be in [0, matcher->n_thread)
 * @param data,data_len  data to scan; must be non-NULL and non-empty
 * @param result_array   receives the uuids of the hit expressions
 * @param array_size     capacity of result_array; must be non-zero
 * @param n_hit_result   out: number of uuids written (at most array_size)
 * @param n_hit_pattern  out, optional (may be NULL): literal plus regex pattern
 *                       hits reported by the engines, before any clamping
 *
 * @return -1 on invalid arguments or when both engine scans fail; otherwise the
 *         value returned by the bool matcher step (negative on its failure).
 */
int expr_matcher_match(struct expr_matcher *matcher, int thread_id,
                       const char *data, size_t data_len,
                       uuid_t *result_array,
                       size_t array_size, size_t *n_hit_result,
                       size_t *n_hit_pattern)
{
    if (NULL == matcher || thread_id < 0 || NULL == data || 0 == data_len
        || NULL == result_array || 0 == array_size || NULL == n_hit_result) {
        return -1;
    }

    /* thread_id indexes bool_match_buffs, which holds exactly n_thread entries */
    if ((size_t)thread_id >= matcher->n_thread) {
        return -1;
    }

    int err_count = 0;
    unsigned long long lit_pat_ids[MAX_HIT_PATTERN_NUM];
    unsigned long long regex_pat_ids[MAX_HIT_PATTERN_NUM];
    size_t lit_pat_cnt = 0;
    size_t regex_pat_cnt = 0;

    int ret = engine_ops[matcher->engine_type].engine_scan(matcher->lit_runtime, thread_id,
                                                          data, data_len, lit_pat_ids,
                                                          MAX_HIT_PATTERN_NUM, &lit_pat_cnt);
    if (ret < 0) {
        err_count++;
    }

    ret = hs_regex_engine_scan(matcher->regex_runtime, thread_id, data, data_len,
                               regex_pat_ids, MAX_HIT_PATTERN_NUM, &regex_pat_cnt);
    if (ret < 0) {
        err_count++;
    }

    /* a single failing scan is tolerated; only the failure of both aborts the match */
    if (err_count == 2) {
        return -1;
    }

    /* From here on the hit count is defined even if the bool matcher step fails. */
    *n_hit_result = 0;

    size_t pat_cnt = lit_pat_cnt + regex_pat_cnt;
    if (n_hit_pattern != NULL) {
        *n_hit_pattern = pat_cnt;
    }
    if (pat_cnt > MAX_HIT_PATTERN_NUM) {
        pat_cnt = MAX_HIT_PATTERN_NUM;
    }

    /* append the regex hits behind the literal hits, as far as the buffer allows */
    for (size_t i = lit_pat_cnt, j = 0; i < pat_cnt; i++, j++) {
        lit_pat_ids[i] = regex_pat_ids[j];
    }

    struct bool_expr_match *match_buff = matcher->bool_match_buffs[thread_id];
    ret = expr_matcher_bool_matcher_match(matcher->bm, match_buff, MAX_HIT_PATTERN_NUM,
                                          lit_pat_ids, pat_cnt, n_hit_result);

    /* never report more hits than the caller's array can take */
    if (*n_hit_result > array_size) {
        *n_hit_result = array_size;
    }
    for (size_t i = 0; i < *n_hit_result; i++) {
        uuid_copy(result_array[i], match_buff[i].expr_uuid);
    }

    return ret;
}
/*
 * Open a scanning stream on a matcher for one thread.
 *
 * @param matcher    matcher built by expr_matcher_new()
 * @param thread_id  calling thread's index; must be in [0, matcher->n_thread)
 *
 * @return the new stream, to be released with expr_matcher_stream_close(), or
 *         NULL on invalid arguments or when the stream of an existing runtime
 *         cannot be opened. A runtime the matcher does not have simply leaves
 *         the corresponding stream handle NULL.
 *
 * A failed open of either engine stream now fails the whole call. The previous
 * code closed the literal stream when the regex open failed but still stored
 * the closed handle in the returned stream, which led to a use-after-free on
 * the next scan and a double close on stream close.
 */
struct expr_matcher_stream *
expr_matcher_stream_open(struct expr_matcher *matcher, int thread_id)
{
    if (NULL == matcher || thread_id < 0) {
        return NULL;
    }

    /* the stream later uses thread_id to index the matcher's per-thread buffers */
    if ((size_t)thread_id >= matcher->n_thread) {
        return NULL;
    }

    void *lit_stream = engine_ops[matcher->engine_type].stream_open(matcher->lit_runtime,
                                                                    thread_id);
    if (NULL == lit_stream && matcher->lit_runtime != NULL) {
        log_fatal(matcher->logger, MODULE_EXPR_MATCHER,
                  "[%s:%d] expr_matcher open lit engine stream failed.",
                  __FUNCTION__, __LINE__);
        return NULL;
    }

    void *regex_stream = hs_regex_stream_open(matcher->regex_runtime, thread_id);
    if (NULL == regex_stream && matcher->regex_runtime != NULL) {
        log_fatal(matcher->logger, MODULE_EXPR_MATCHER,
                  "[%s:%d] expr_matcher open regex engine stream failed.",
                  __FUNCTION__, __LINE__);
        /* release the literal stream opened above; it is not handed out */
        if (lit_stream != NULL) {
            engine_ops[matcher->engine_type].stream_close(lit_stream);
        }
        return NULL;
    }

    struct expr_matcher_stream *stream = ALLOC(struct expr_matcher_stream, 1);
    stream->engine_type = matcher->engine_type;
    stream->thread_id = thread_id;
    stream->lit_stream = lit_stream;
    stream->regex_stream = regex_stream;
    stream->ref_matcher = matcher;
    utarray_new(stream->all_hit_lit_pattern_ids, &ut_pattern_id_icd);

    return stream;
}
/*
 * Tell whether an expression references at least one of the given pattern ids.
 *
 * @param expr     expression whose items are inspected
 * @param pat_ids  pattern ids to look for
 * @param n_pat    number of ids in pat_ids
 * @return 1 if any item id of expr equals any entry of pat_ids, otherwise 0.
 */
static int expr_has_pattern_id_in_array(struct bool_expr *expr, unsigned long long *pat_ids, size_t n_pat)
{
    int found = 0;

    for (size_t item = 0; !found && item < expr->item_num; item++) {
        for (size_t k = 0; k < n_pat; k++) {
            if (expr->items[item].item_id == pat_ids[k]) {
                found = 1;
                break;
            }
        }
    }

    return found;
}
/*
 * Scan one chunk of a stream and report the expressions hit by this chunk.
 *
 * @param stream         stream from expr_matcher_stream_open()
 * @param data,data_len  chunk to scan; must be non-NULL and non-empty
 * @param result_array   receives the uuids of the hit expressions
 * @param array_size     capacity of result_array; must be non-zero
 * @param n_hit_result   out: number of uuids written (at most array_size)
 * @param n_hit_pattern  out, optional (may be NULL): literal plus regex pattern
 *                       hits reported by the engines for this chunk
 *
 * @return -1 on invalid arguments or when both engine scans fail; otherwise the
 *         value returned by the bool matcher step (negative on its failure).
 */
int expr_matcher_stream_match(struct expr_matcher_stream *stream,
                              const char *data, size_t data_len,
                              uuid_t *result_array,
                              size_t array_size, size_t *n_hit_result,
                              size_t *n_hit_pattern)
{
    if (NULL == stream || NULL == data || 0 == data_len ||
        NULL == result_array || 0 == array_size ||
        NULL == n_hit_result) {
        return -1;
    }

    int err_count = 0;
    unsigned long long lit_pat_ids[MAX_HIT_PATTERN_NUM];
    unsigned long long regex_pat_ids[MAX_HIT_PATTERN_NUM];
    size_t lit_pat_cnt = 0;
    size_t regex_pat_cnt = 0;

    int ret = engine_ops[stream->engine_type].scan_stream(stream->lit_stream, data, data_len,
                                                         lit_pat_ids, MAX_HIT_PATTERN_NUM,
                                                         &lit_pat_cnt);
    if (ret < 0) {
        err_count++;
    }

    ret = hs_regex_stream_scan(stream->regex_stream, data, data_len, regex_pat_ids,
                               MAX_HIT_PATTERN_NUM, &regex_pat_cnt);
    if (ret < 0) {
        err_count++;
    }

    /* a single failing scan is tolerated; only the failure of both aborts the match */
    if (err_count == 2) {
        return -1;
    }

    if (n_hit_pattern != NULL) {
        *n_hit_pattern = lit_pat_cnt + regex_pat_cnt;
    }

    /*
     * 1. An expression may combine several patterns (e.g. "aaa&bbb") that are hit
     *    by different chunks, so every literal pattern hit on this stream is
     *    remembered and fed to the bool matcher again on each scan.
     * 2. Because of that, the bool matcher also returns expressions that were
     *    completed by earlier chunks. Only expressions that contain a pattern
     *    hit by THIS chunk are reported (see step 3).
     */

    /* 1. merge this chunk's literal hits into the sorted, duplicate-free history */
    for (size_t i = 0; i < lit_pat_cnt; i++) {
        if (utarray_find(stream->all_hit_lit_pattern_ids, &lit_pat_ids[i], compare_pattern_id) == NULL) {
            utarray_push_back(stream->all_hit_lit_pattern_ids, &lit_pat_ids[i]);
            /* utarray_find is a binary search, so the array is re-sorted after every insert */
            utarray_sort(stream->all_hit_lit_pattern_ids, compare_pattern_id);
        }
    }

    /* 2. evaluate the expressions over the accumulated literal hits plus this chunk's regex hits */
    unsigned long long all_hit_pat_ids[MAX_HIT_PATTERN_NUM];

    /*
     * The history grows across scans and may exceed the local buffer, so the
     * number of copied ids is clamped to the buffer capacity. Without the clamp
     * the copy loop below writes past the end of all_hit_pat_ids.
     */
    size_t all_hit_lit_pat_cnt = utarray_len(stream->all_hit_lit_pattern_ids);
    if (all_hit_lit_pat_cnt > MAX_HIT_PATTERN_NUM) {
        all_hit_lit_pat_cnt = MAX_HIT_PATTERN_NUM;
    }

    size_t all_hit_pat_cnt = all_hit_lit_pat_cnt + regex_pat_cnt;
    if (all_hit_pat_cnt > MAX_HIT_PATTERN_NUM) {
        all_hit_pat_cnt = MAX_HIT_PATTERN_NUM;
    }

    for (size_t i = 0; i < all_hit_lit_pat_cnt; i++) {
        all_hit_pat_ids[i] = *(unsigned long long *)utarray_eltptr(stream->all_hit_lit_pattern_ids, i);
    }
    for (size_t i = all_hit_lit_pat_cnt, j = 0; i < all_hit_pat_cnt; i++, j++) {
        all_hit_pat_ids[i] = regex_pat_ids[j];
    }

    struct expr_matcher *matcher = stream->ref_matcher;
    struct bool_expr_match *match_buff = matcher->bool_match_buffs[stream->thread_id];
    size_t n_hit_expr = 0;

    ret = expr_matcher_bool_matcher_match(matcher->bm, match_buff, MAX_HIT_PATTERN_NUM,
                                          all_hit_pat_ids, all_hit_pat_cnt, &n_hit_expr);

    /* 3. keep only the expressions that involve a pattern hit by this chunk */
    *n_hit_result = 0;
    for (size_t i = 0; i < n_hit_expr; i++) {
        struct bool_expr *expr = (struct bool_expr *)match_buff[i].user_tag;

        if (expr_has_pattern_id_in_array(expr, lit_pat_ids, lit_pat_cnt) ||
            expr_has_pattern_id_in_array(expr, regex_pat_ids, regex_pat_cnt)) {
            uuid_copy(result_array[*n_hit_result], expr->expr_uuid);
            (*n_hit_result)++;
            if (*n_hit_result >= array_size) {
                break;
            }
        }
    }

    return ret;
}
/*
 * Close a stream: both engine stream handles, the accumulated literal hit
 * history and the stream object itself. The matcher the stream refers to is
 * left untouched. NULL is accepted.
 */
void expr_matcher_stream_close(struct expr_matcher_stream *stream)
{
    if (stream == NULL) {
        return;
    }

    /* engine-side stream handles */
    if (stream->lit_stream != NULL) {
        engine_ops[stream->engine_type].stream_close(stream->lit_stream);
        stream->lit_stream = NULL;
    }

    if (stream->regex_stream != NULL) {
        hs_regex_stream_close(stream->regex_stream);
        stream->regex_stream = NULL;
    }

    /* history of literal pattern hits */
    if (stream->all_hit_lit_pattern_ids != NULL) {
        utarray_free(stream->all_hit_lit_pattern_ids);
    }

    FREE(stream);
}