ipmatcher rule_id -> long long & scanner engine centralization

This commit is contained in:
liuwentan
2023-03-01 09:32:36 +08:00
parent a6fb2b6fdd
commit 1566a30002
63 changed files with 4695 additions and 115 deletions

View File

@@ -0,0 +1,305 @@
#include <stdio.h>
#include <assert.h>
#include <malloc.h>
#include "cgranges.h"
/**************
* Radix sort *
**************/
/* Ranges of at most RS_MIN_SIZE elements are sorted with insertion sort. */
#define RS_MIN_SIZE 64
/* Maximum number of key bits consumed per radix pass (bucket array holds 1<<RS_MAX_BITS entries). */
#define RS_MAX_BITS 8
/*
 * KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) generates an in-place
 * radix sort over arrays of `rstype_t`, ordered ascending by `rskey(element)`.
 *
 *   name       - suffix appended to every generated identifier
 *   rstype_t   - element type
 *   rskey      - macro/function yielding the integer sort key of an element
 *   sizeof_key - key width in bytes; the first pass examines the bits starting
 *                at shift (sizeof_key - 1) * RS_MAX_BITS
 *
 * Generated symbols (none of them are `static`):
 *   rsbucket_<name>_t       - [b, e) pointer pair describing one bucket
 *   rs_insertsort_<name>()  - insertion sort of [beg, end)
 *   rs_sort_<name>()        - one radix pass on `n_bits` bits at shift `s`:
 *                             counts bucket sizes, permutes elements into their
 *                             buckets in place, then (while s != 0) handles each
 *                             bucket at the next lower shift, using insertion
 *                             sort for buckets of <= RS_MIN_SIZE elements
 *   radix_sort_<name>()     - entry point; insertion sort for small inputs,
 *                             otherwise rs_sort_<name>() from the top byte
 */
#define KRADIX_SORT_INIT(name, rstype_t, rskey, sizeof_key) \
typedef struct { \
rstype_t *b, *e; \
} rsbucket_##name##_t; \
void rs_insertsort_##name(rstype_t *beg, rstype_t *end) \
{ \
rstype_t *i; \
for (i = beg + 1; i < end; ++i) \
if (rskey(*i) < rskey(*(i - 1))) { \
rstype_t *j, tmp = *i; \
for (j = i; j > beg && rskey(tmp) < rskey(*(j-1)); --j) \
*j = *(j - 1); \
*j = tmp; \
} \
} \
void rs_sort_##name(rstype_t *beg, rstype_t *end, int n_bits, int s) \
{ \
rstype_t *i; \
int size = 1<<n_bits, m = size - 1; \
rsbucket_##name##_t *k, b[1<<RS_MAX_BITS], *be = b + size; \
assert(n_bits <= RS_MAX_BITS); \
for (k = b; k != be; ++k) k->b = k->e = beg; \
for (i = beg; i != end; ++i) ++b[rskey(*i)>>s&m].e; \
for (k = b + 1; k != be; ++k) \
k->e += (k-1)->e - beg, k->b = (k-1)->e; \
for (k = b; k != be;) { \
if (k->b != k->e) { \
rsbucket_##name##_t *l; \
if ((l = b + (rskey(*k->b)>>s&m)) != k) { \
rstype_t tmp = *k->b, swap; \
do { \
swap = tmp; tmp = *l->b; *l->b++ = swap; \
l = b + (rskey(tmp)>>s&m); \
} while (l != k); \
*k->b++ = tmp; \
} else ++k->b; \
} else ++k; \
} \
for (b->b = beg, k = b + 1; k != be; ++k) k->b = (k-1)->e; \
if (s) { \
s = s > n_bits? s - n_bits : 0; \
for (k = b; k != be; ++k) \
if (k->e - k->b > RS_MIN_SIZE) rs_sort_##name(k->b, k->e, n_bits, s); \
else if (k->e - k->b > 1) rs_insertsort_##name(k->b, k->e); \
} \
} \
void radix_sort_##name(rstype_t *beg, rstype_t *end) \
{ \
if (end - beg <= RS_MIN_SIZE) rs_insertsort_##name(beg, end); \
else rs_sort_##name(beg, end, RS_MAX_BITS, (sizeof_key - 1) * RS_MAX_BITS); \
}
/*********************
* Convenient macros *
*********************/
/* Round `x` up to the next power of two, in place. The bit smearing covers
 * 32 bits; `x` is evaluated (and modified) several times, so it must be a
 * plain lvalue without side effects. */
#ifndef kroundup32
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/* Allocate a zero-filled array of `len` objects of `type`. */
#define CALLOC(type, len) ((type*)calloc((len), sizeof(type)))
/* Resize `ptr` to hold `len` elements and store the result back into `ptr`.
 * NOTE(review): the realloc() result is not checked; on failure `ptr` becomes
 * NULL and the previous buffer is no longer reachable through it. */
#define REALLOC(ptr, len) ((ptr) = (__typeof__(ptr))realloc((ptr), (len) * sizeof(*(ptr))))
/* Grow array `a` whose capacity is tracked in `m`: capacity becomes 16 when it
 * was 0, otherwise it grows by 50%. Inherits REALLOC's unchecked failure mode. */
#define EXPAND(a, m) do { \
(m) = (m)? (m) + ((m)>>1) : 16; \
REALLOC((a), (m)); \
} while (0)
/********************
* Basic operations *
********************/
/* Sort key of an interval: its start coordinate `x`. */
#define cr_intv_key(r) ((r).x)
/* Instantiate radix_sort_cr_intv() and its helpers for cr_intv_t, sorting on
 * an 8-byte key. */
KRADIX_SORT_INIT(cr_intv, cr_intv_t, cr_intv_key, 8)
/*
 * Create an empty interval container.
 *
 * The object comes from calloc(), so every field starts out as zero / NULL.
 * Returns NULL when the allocation fails. Release with cr_destroy().
 */
cgranges_t *cr_init(void)
{
    return CALLOC(cgranges_t, 1);
}
/*
 * Release a container created by cr_init(), including its interval array.
 *
 * @param cr  container to free; NULL is accepted and ignored.
 *
 * The interval array is freed unconditionally: free(NULL) is a no-op, and
 * gating the free on n_r would leak the buffer if it had been allocated while
 * the count is zero.
 */
void cr_destroy(cgranges_t *cr)
{
    if (cr == NULL) return;
    free(cr->r);
    free(cr);
}
// int32_t cr_add_ctg(cgranges_t *cr, const char *ctg, int32_t len)
// {
// int absent;
// khint_t k;
// strhash_t *h = (strhash_t*)cr->hc;
// k = kh_put(str, h, ctg, &absent);
// if (absent) {
// cr_ctg_t *p;
// if (cr->n_ctg == cr->m_ctg)
// EXPAND(cr->ctg, cr->m_ctg);
// kh_val(h, k) = cr->n_ctg;
// p = &cr->ctg[cr->n_ctg++];
// p->name = strdup(ctg);
// kh_key(h, k) = p->name;
// p->len = len;
// p->n = 0, p->off = -1;
// }
// if (len > cr->ctg[kh_val(h, k)].len)
// cr->ctg[kh_val(h, k)].len = len;
// return kh_val(h, k);
// }
// int32_t cr_get_ctg(const cgranges_t *cr, const char *ctg)
// {
// khint_t k;
// strhash_t *h = (strhash_t*)cr->hc;
// k = kh_get(str, h, ctg);
// return k == kh_end(h)? -1 : kh_val(h, k);
// }
/*
 * Append the interval [st, en) with its label to the container.
 *
 * @param cr     container to append to
 * @param st     interval start
 * @param en     interval end; must not be smaller than st
 * @param label  user payload stored alongside the interval
 * @return pointer to the stored interval, or 0 when cr is NULL, st > en, or
 *         the interval array could not be grown. The pointer refers into the
 *         container's array and may be invalidated by a later cr_add().
 *
 * The array grows to 16 entries at first and by 50% afterwards. The realloc()
 * result is checked before it replaces cr->r, so an allocation failure leaves
 * the container unchanged instead of dereferencing NULL. The `y` field is not
 * initialised here; it is filled in by cr_index().
 */
cr_intv_t *cr_add(cgranges_t *cr, uint64_t st, uint64_t en, user_label_t label)
{
    cr_intv_t *p;
    if (cr == 0 || st > en) return 0;
    if (cr->n_r == cr->m_r) {
        int64_t new_cap = cr->m_r ? cr->m_r + (cr->m_r >> 1) : 16;
        cr_intv_t *grown = (cr_intv_t *)realloc(cr->r, (size_t)new_cap * sizeof(*grown));
        if (grown == 0) return 0;
        cr->r = grown;
        cr->m_r = new_cap;
    }
    p = &cr->r[cr->n_r++];
    p->x = st;
    p->real_y = en;
    p->label = label;
    /* track the largest end coordinate seen so far */
    if (cr->len < en)
        cr->len = en;
    return p;
}
/*
 * Sort the stored intervals by ascending start coordinate using the generated
 * radix sort. Nothing is done for an empty container.
 */
void cr_sort(cgranges_t *cr)
{
    if (cr->n_r != 0) {
        cr_intv_t *first = cr->r;
        radix_sort_cr_intv(first, first + cr->n_r);
    }
}
/*
 * Report whether the intervals are already ordered by non-decreasing start.
 *
 * Returns 1 when every adjacent pair is in order, 0 otherwise. The scan index
 * starts at 1, so an empty container (n_r == 0) yields 0.
 */
int32_t cr_is_sorted(const cgranges_t *cr)
{
    uint64_t pos = 1;
    while (pos < cr->n_r && cr->r[pos - 1].x <= cr->r[pos].x)
        ++pos;
    return (pos == cr->n_r);
}
/************
* Indexing *
************/
/*
 * Fill in the `y` field of the `n` intervals in `a` so the array can be
 * searched as an implicit binary tree by cr_overlap_int().
 *
 * For every node, `y` ends up as the maximum of its own `real_y` and the `y`
 * values of its left and right children. cr_index() sorts `a` by `x` before
 * calling this function.
 *
 * @param a  interval array
 * @param n  number of intervals
 * @return -1 when n <= 0; otherwise k - 1, where k is the smallest value with
 *         (1 << k) > n. cr_index() stores this as the root level.
 */
int64_t cr_index1(cr_intv_t *a, int64_t n)
{
int64_t i, last_i;
uint64_t last;
int64_t k;
if (n <= 0) return -1;
// level 0: even indices have no children, so y is just their own end;
// last_i/last finish as the final even index and its end value
for (i = 0; i < n; i += 2) last_i = i, last = a[i].y = (a[i].real_y);
for (k = 1; 1LL<<k <= n; ++k) {
// level-k nodes start at index (1<<k)-1 and are spaced 1<<(k+1) apart;
// their children sit x = 1<<(k-1) positions to the left and right
int64_t x = 1LL<<(k-1), i0 = (x<<1) - 1, step = x<<2;
for (i = i0; i < n; i += step) {
uint64_t el = a[i - x].y;
// when the right child index falls beyond the array, use `last` instead
uint64_t er = i + x < n? a[i + x].y : last;
uint64_t e = a[i].real_y;
e = e > el? e : el;
e = e > er? e : er;
a[i].y = e;
}
// NOTE(review): this appears to move last_i up to its parent at level k
// (which may lie beyond the array); `last` is only updated when that index
// is in range and holds a larger y
last_i = last_i>>k&1? last_i - x : last_i + x;
if (last_i < n && a[last_i].y > last)
last = a[last_i].y;
}
return k - 1;
}
/*
 * Prepare the container for overlap queries: sort the intervals by start if
 * cr_is_sorted() reports they are not in order, then build the implicit tree
 * and record its root level in cr->root_k (-1 when there are no intervals).
 */
void cr_index(cgranges_t *cr)
{
    const int32_t already_sorted = cr_is_sorted(cr);
    if (already_sorted == 0) {
        cr_sort(cr);
    }
    cr->root_k = cr_index1(cr->r, cr->n_r);
}
/*********
* Query *
*********/
// int64_t cr_min_start_int(const cgranges_t *cr, int32_t ctg_id, int32_t st) // find the smallest i such that cr_st(&r[i]) >= st
// {
// int64_t left, right;
// const cr_ctg_t *c;
// const cr_intv_t *r;
// if (ctg_id < 0 || ctg_id >= cr->n_ctg) return -1;
// c = &cr->ctg[ctg_id];
// r = &cr->r[c->off];
// if (c->n == 0) return -1;
// left = 0, right = c->n;
// while (right > left) {
// int64_t mid = left + ((right - left) >> 1);
// if (cr_st(&r[mid]) >= st) right = mid;
// else left = mid + 1;
// }
// assert(left == right);
// return left == c->n? -1 : c->off + left;
// }
/* One entry of the explicit traversal stack used by cr_overlap_int(). */
typedef struct {
int64_t x; // index of the node in the interval array
int64_t k, w; // k: level of the node; w: 0 until the left child has been handled, then 1
} istack_t;
/*
 * Collect the indices of all stored intervals that overlap [st, en).
 *
 * An interval r overlaps when cr_st(r) < en and st < cr_en(r).
 *
 * @param cr    indexed container (cr_index() must have been called)
 * @param st    query start
 * @param en    query end
 * @param b_    in/out: result index buffer; grown with realloc() as needed,
 *              the caller owns it and must free() it
 * @param m_b_  in/out: capacity of *b_ in elements
 * @return number of indices written to *b_; 0 for an empty container
 *
 * An empty or never-populated container returns 0 before the traversal starts.
 * cr_index() stores root_k == -1 for an empty set, and computing 1LL << -1
 * for the root position would be undefined behaviour.
 *
 * NOTE: buffer growth goes through EXPAND, which does not check realloc().
 */
int64_t cr_overlap_int(const cgranges_t *cr, uint64_t st, uint64_t en, int64_t **b_, int64_t *m_b_)
{
    int32_t t = 0;
    const cr_intv_t *r;
    int64_t *b = *b_, m_b = *m_b_, n = 0;
    istack_t stack[64], *p;
    if (cr->root_k < 0 || cr->n_r <= 0) return 0; // nothing indexed: no overlaps
    r = cr->r;
    p = &stack[t++];
    p->k = cr->root_k, p->x = (1LL<<p->k) - 1, p->w = 0; // push the root into the stack
    while (t) { // stack is not empty
        istack_t z = stack[--t];
        if (z.k <= 3) { // the subtree is no larger than (1<<(z.k+1))-1; do a linear scan
            int64_t i, i0 = z.x >> z.k << z.k, i1 = i0 + (1LL<<(z.k+1)) - 1;
            if (i1 >= cr->n_r) i1 = cr->n_r;
            for (i = i0; i < i1 && cr_st(&r[i]) < en; ++i)
                if (st < cr_en(&r[i])) {
                    if (n == m_b) EXPAND(b, m_b);
                    b[n++] = i;
                }
        } else if (z.w == 0) { // if left child not processed
            int64_t y = z.x - (1LL<<(z.k-1));
            p = &stack[t++];
            p->k = z.k, p->x = z.x, p->w = 1; // revisit this node after its left subtree
            if (y >= cr->n_r || r[y].y > st) { // left subtree may contain an end beyond st
                p = &stack[t++];
                p->k = z.k - 1, p->x = y, p->w = 0; // push the left child to the stack
            }
        } else if (z.x < cr->n_r && cr_st(&r[z.x]) < en) {
            if (st < cr_en(&r[z.x])) { // then z.x overlaps the query; write to the output array
                if (n == m_b) EXPAND(b, m_b);
                b[n++] = z.x;
            }
            p = &stack[t++];
            p->k = z.k - 1, p->x = z.x + (1LL<<(z.k-1)), p->w = 0; // push the right child
        }
    }
    *b_ = b, *m_b_ = m_b;
    return n;
}
// int64_t cr_contain_int(const cgranges_t *cr, int32_t ctg_id, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
// {
// int64_t n = 0, i, s, e, *b = *b_, m_b = *m_b_;
// s = cr_min_start_int(cr, ctg_id, st);
// if (s < 0) return 0;
// e = cr->ctg[ctg_id].off + cr->ctg[ctg_id].n;
// for (i = s; i < e; ++i) {
// const cr_intv_t *r = &cr->r[i];
// if (cr_st(r) >= en) break;
// if (cr_st(r) >= st && cr_en(r) <= en) {
// if (n == m_b) EXPAND(b, m_b);
// b[n++] = i;
// }
// }
// *b_ = b, *m_b_ = m_b;
// return n;
// }
// int64_t cr_min_start(const cgranges_t *cr, const char *ctg, int32_t st)
// {
// return cr_min_start_int(cr, cr_get_ctg(cr, ctg), st);
// }
/*
 * Public entry point for overlap queries; forwards all arguments to
 * cr_overlap_int() and returns its hit count unchanged.
 */
int64_t cr_overlap(const cgranges_t *cr, uint64_t st, uint64_t en, int64_t **b_, int64_t *m_b_)
{
    const int64_t n_hits = cr_overlap_int(cr, st, en, b_, m_b_);
    return n_hits;
}
// int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_)
// {
// return cr_contain_int(cr, cr_get_ctg(cr, ctg), st, en, b_, m_b_);
// }

View File

@@ -0,0 +1,103 @@
/* The MIT License
Copyright (c) 2019 Dana-Farber Cancer Institute
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
#ifndef CRANGES_H
#define CRANGES_H
#include <stdint.h>
#include "interval_matcher.h"
// payload stored with every interval: the matcher's result record (declared in interval_matcher.h)
typedef struct interval_result user_label_t;
// NOTE(review): the only code in the accompanying cgranges.c that mentions this
// type is commented out; it looks like a leftover from the contig-based
// original - confirm before relying on it.
typedef struct { // a contig
char *name; // name of the contig
int32_t len; // max length seen in data
int32_t root_k;
int64_t n, off; // sum of lengths of previous contigs
} cr_ctg_t;
typedef struct { // an interval
// legacy packed layout, kept for reference only:
// uint64_t x; // prior to cr_index(), x = ctg_id<<32|start_pos; after: x = start_pos<<32|end_pos
// uint32_t y:31, rev:1;
uint64_t x; // interval start; also the sort key
uint64_t y; // index field written by cr_index(): largest end within this node's subtree
uint64_t real_y; // interval end as passed to cr_add()
user_label_t label; // user payload stored by cr_add() and returned by cr_label()
} cr_intv_t;
// Container holding all intervals plus the index metadata.
typedef struct {
int64_t n_r, m_r; // number and max number of intervals
cr_intv_t *r; // list of intervals (of size _n_r_)
// int32_t n_ctg, m_ctg; // number and max number of contigs
// cr_ctg_t ctg; // list of contigs (of size _n_ctg_)
// void *hc; // dictionary for converting contig names to integers
uint64_t len; // largest interval end passed to cr_add() so far
int64_t root_k; // level of the tree root as computed by cr_index(); -1 when empty
} cgranges_t;
#ifdef __cplusplus
extern "C" {
#endif
// retrieve start and end positions from a cr_intv_t object
// static inline int32_t cr_st(const cr_intv_t *r) { return (int32_t)(r->x>>32); }
// static inline int32_t cr_en(const cr_intv_t *r) { return (int32_t)r->x; }
// start of interval r
static inline uint64_t cr_st(const cr_intv_t *r) { return r->x; }
// end of interval r (the value given to cr_add(), not the index field y)
static inline uint64_t cr_en(const cr_intv_t *r) { return r->real_y; }
// start / end / label of the i-th stored interval; i is not range-checked
static inline uint64_t cr_start(const cgranges_t *cr, int64_t i) { return cr_st(&cr->r[i]); }
static inline uint64_t cr_end(const cgranges_t *cr, int64_t i) { return cr_en(&cr->r[i]); }
static inline user_label_t cr_label(const cgranges_t *cr, int64_t i) { return cr->r[i].label; }
// Initialize: returns a zero-filled container, or NULL on allocation failure
cgranges_t *cr_init(void);
// Deallocate the container and its interval array; NULL is ignored
void cr_destroy(cgranges_t *cr);
// Add an interval [st, en) with a label. Returns a pointer to the stored
// interval, or 0 if the interval is rejected (e.g. st > en). The pointer can
// be invalidated by a later cr_add() because the array may be reallocated.
// cr_intv_t *cr_add(cgranges_t *cr, const char *ctg, int32_t st, int32_t en, user_label_t label);
cr_intv_t *cr_add(cgranges_t *cr, uint64_t st, uint64_t en, user_label_t label);
// Sort and index intervals; must be called before cr_overlap()
void cr_index(cgranges_t *cr);
// int64_t cr_overlap(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_);
// int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_);
// Find intervals overlapping [st, en). Indices into cr->r are written to *b_
// (grown with realloc; capacity tracked in *m_b_; the caller frees *b_).
// Returns the number of hits.
int64_t cr_overlap(const cgranges_t *cr, uint64_t st, uint64_t en, int64_t **b_, int64_t *m_b_);
// NOTE(review): cr_contain, cr_add_ctg and cr_get_ctg are declared here, but
// their definitions in the accompanying cgranges.c are commented out; calling
// them would presumably fail at link time - confirm and drop or restore them.
int64_t cr_contain(const cgranges_t *cr, const char *ctg, int32_t st, int32_t en, int64_t **b_, int64_t *m_b_);
// Add a contig and length. Call this for desired contig ordering. _len_ can be 0.
int32_t cr_add_ctg(cgranges_t *cr, const char *ctg, int32_t len);
// Get the contig ID given its name
int32_t cr_get_ctg(const cgranges_t *cr, const char *ctg);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,86 @@
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include "interval_matcher.h"
#include "cgranges.h"
/* Smaller of two values. Each argument may be evaluated twice, so do not pass
 * expressions with side effects. */
#ifndef MIN
#define MIN(a,b) ((a)>(b) ? (b) : (a))
#endif
/* Matcher instance: an indexed interval container plus the rule count it was built from. */
struct interval_matcher
{
cgranges_t *cr; // interval index, owned by the matcher
size_t n_rule; // number of rules passed to interval_matcher_new()
};
/*
 * Build a matcher from an array of closed intervals [start, end].
 *
 * @param rule    array of rules; must not be NULL
 * @param n_rule  number of rules; must not be 0
 * @return a new matcher (release with interval_matcher_free()), or NULL when
 *         the arguments are invalid, an allocation fails, or no rule could be
 *         indexed.
 *
 * Each rule is stored as the half-open interval [start, end + 1). A rule whose
 * start is greater than end + 1 is rejected by cr_add() and skipped. Note
 * that end == UINT64_MAX makes end + 1 wrap to 0; the documented maximum for
 * end is UINT64_MAX - 1.
 *
 * Every failure path releases what was allocated so far.
 */
struct interval_matcher *interval_matcher_new(struct interval_rule *rule, size_t n_rule)
{
    if (!rule || !n_rule)
    {
        return NULL;
    }
    struct interval_matcher *matcher = (struct interval_matcher *)calloc(1, sizeof(struct interval_matcher));
    if (matcher == NULL)
    {
        return NULL;
    }
    matcher->cr = cr_init();
    if (matcher->cr == NULL)
    {
        free(matcher);
        return NULL;
    }
    matcher->n_rule = n_rule;
    for (size_t i = 0; i < n_rule; i++)
    {
        /* the public end is inclusive; the index works on half-open ranges */
        cr_add(matcher->cr, rule[i].start, rule[i].end + 1, rule[i].result);
    }
    cr_index(matcher->cr);
    if (matcher->cr->root_k == -1)
    {
        /* nothing was indexed: do not leak the container or the matcher */
        cr_destroy(matcher->cr);
        free(matcher);
        return NULL;
    }
    return matcher;
}
/*
 * Destroy a matcher together with its interval index.
 * A NULL argument is accepted and ignored.
 */
void interval_matcher_free(struct interval_matcher *interval_matcher)
{
    if (interval_matcher != NULL)
    {
        cr_destroy(interval_matcher->cr);
        free(interval_matcher);
    }
}
/*
 * Look up every rule whose interval contains `target`.
 *
 * @param interval_matcher  matcher built by interval_matcher_new()
 * @param target            value to look up; UINT64_MAX is rejected because
 *                          the query range is [target, target + 1)
 * @param result            caller-allocated output array
 * @param n_result          capacity of `result` in elements
 * @return number of results written (possibly 0), or -1 on invalid arguments.
 *
 * The number of results is capped at both n_result and INT_MAX so that the
 * int return value can never turn negative on success. The temporary index
 * buffer produced by cr_overlap() is released on every path.
 */
int interval_matcher_match(struct interval_matcher *interval_matcher, uint64_t target, struct interval_result *result, size_t n_result)
{
    if (interval_matcher == NULL || result == NULL || n_result == 0 || target == (uint64_t)(-1))
    {
        return -1;
    }
    int64_t *b = NULL, max_b = 0;
    int64_t n = cr_overlap(interval_matcher->cr, target, target + 1, &b, &max_b);
    if (n <= 0 || b == NULL || max_b == 0)
    {
        free(b);
        return 0;
    }
    uint64_t n_copy = (uint64_t)n;
    if (n_copy > n_result)
    {
        n_copy = n_result;
    }
    if (n_copy > (uint64_t)INT_MAX)
    {
        n_copy = (uint64_t)INT_MAX;
    }
    for (uint64_t i = 0; i < n_copy; i++)
    {
        result[i] = interval_matcher->cr->r[b[i]].label;
    }
    free(b);
    return (int)n_copy;
}

View File

@@ -0,0 +1,72 @@
/*
* @Author: Yang Yubo yangyubo@geedgenetworks.com
* @Date: 2023-1-18
* @LastEditors: Yang Yubo yangyubo@geedgenetworks.com
* @FilePath: /interval_matcher/include/interval_matcher.h
*/
#ifndef INTERVAL_MATCHER_H
#define INTERVAL_MATCHER_H
#include <stdint.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C"
{
#endif
// Result record copied to the caller for every rule that matches: id and tag.
struct interval_result
{
uint64_t rule_id; // identifier of the matched rule, as supplied in the rule
/* A transparent user tag for convenient access;
the caller is responsible for its memory management. */
void *user_tag;
};
// One rule: a closed interval [start, end] and the result reported on a match.
struct interval_rule
{
uint64_t start; // interval's start (inclusive)
uint64_t end; // interval's end (inclusive); the max is ((uint64_t)(-1) - 1)
struct interval_result result; // copied into the caller's result array on a match
};
/* forward declaration;
The internal structure
is not open to the outside */
struct interval_matcher;
/**
 * @description: build an interval_matcher for matching;
 * @param {struct interval_rule} *rule: array of rules; must not be NULL;
 * @param {size_t} n_rule: number of rules in the array; must not be 0;
 * @return {struct interval_matcher*}: the new matcher, or NULL if the build failed
 *         (invalid arguments, or no rule could be indexed);
 *         release it with interval_matcher_free();
 */
struct interval_matcher *interval_matcher_new(struct interval_rule *rule, size_t n_rule);
/**
 * @description: destroy an interval_matcher after use;
 * @param {interval_matcher} *interval_matcher: the matcher to free; NULL is accepted and ignored;
 * @return {*}
 */
void interval_matcher_free(struct interval_matcher *interval_matcher);
/**
 * @description: match a value; on return the result array holds the matched rules' results;
 * @param {struct interval_matcher} *interval_matcher: a matcher;
 * @param {uint64_t} target: the value to match; the max is ((uint64_t)(-1) - 1),
 *        (uint64_t)(-1) is rejected as an invalid parameter;
 * @param {struct interval_result} *result: result array, memory allocated by the user;
 * @param {size_t} n_result: capacity of the result array, i.e. the MAX number of matches reported;
 * @return {int}: the number of matched rules written to result, which may be 0; -1 on invalid parameter;
 */
int interval_matcher_match(struct interval_matcher *interval_matcher, uint64_t target, struct interval_result *result, size_t n_result);
#ifdef __cplusplus
}
#endif
#endif // INTERVAL_MATCHER_H