267 lines
10 KiB
Rust
267 lines
10 KiB
Rust
use core::mem::size_of;
|
|
|
|
use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};
|
|
|
|
/// Smallest needle length that this searcher accepts.
///
/// The floor is 2 because a needle of length 1 is better served by memchr,
/// and an empty needle is not a case this searcher handles.
pub(crate) const MIN_NEEDLE_LEN: usize = 2;
|
|
|
|
/// Largest needle length that this searcher accepts.
///
/// Nothing in the code below actually requires an upper limit; needles of
/// any length would be processed correctly. (That may hint at optimizations
/// that have not been written yet.) The cap exists for two other reasons: it
/// is a performance heuristic, and it is what keeps the search linear time.
///
/// Heuristic: every candidate position is confirmed with memcmp. When the
/// needle is very large and false positives are frequent, those memcmp calls
/// can make the search quite slow.
///
/// Bound: with memcmp confirmation, the worst case is multiplicative in the
/// needle and haystack lengths, whereas we want it to stay additive. Capping
/// the needle length at a constant keeps that guarantee intact in theory.
/// This is not a bad-faith trick: memcmp on tiny needles is fast enough that
/// even the pathological cases (see the pathological vector benchmarks) run
/// as fast or faster in practice.
///
/// The value itself came from experimentation with the benchmarks. As one
/// data point, rare-medium-needle is roughly 5% faster with this algorithm
/// than with a prefilter-accelerated Two-Way. Keeping the number fairly small
/// also limits the damage pathological inputs can do. A cap of 64 was tried:
/// a few benchmarks improved slightly while others (the pathological ones in
/// particular) regressed badly, so 32 was kept.
pub(crate) const MAX_NEEDLE_LEN: usize = 32;
|
|
|
|
/// A forward substring searcher accelerated with vector instructions.
///
/// This closely mirrors the prefilter vector module of the same name. What
/// sets it apart is that it is a complete searcher rather than a prefilter:
/// candidates are verified right here. The price is that only small needles
/// are supported. The payoff is speed, since an inlined memcmp on a tiny
/// needle is cheap even for pathological inputs, whereas pairing a prefilter
/// with Two-Way pays Two-Way's higher latency on every confirmation.
///
/// Using this for every needle would likely perform very well on most
/// inputs, but the worst case is multiplicative and we want to guarantee
/// additive worst case time. The pathological benchmarks attempt to justify
/// that choice.
///
/// The prefilter variant carries more detailed comments. Only forward search
/// is implemented for now; if you have a compelling use case for accelerated
/// reverse search, please file an issue.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward {
    /// Offset into the needle of the first rare byte probed by the search.
    rare1i: u8,
    /// Offset into the needle of the second rare byte probed by the search.
    /// The minimum haystack length is derived from this offset.
    rare2i: u8,
}
|
|
|
|
impl Forward {
|
|
/// Create a new "generic simd" forward searcher. If one could not be
|
|
/// created from the given inputs, then None is returned.
|
|
pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
|
|
let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
|
|
// If the needle is too short or too long, give up. Also, give up
|
|
// if the rare bytes detected are at the same position. (It likely
|
|
// suggests a degenerate case, although it should technically not be
|
|
// possible.)
|
|
if needle.len() < MIN_NEEDLE_LEN
|
|
|| needle.len() > MAX_NEEDLE_LEN
|
|
|| rare1i == rare2i
|
|
{
|
|
return None;
|
|
}
|
|
Some(Forward { rare1i, rare2i })
|
|
}
|
|
|
|
/// Returns the minimum length of haystack that is needed for this searcher
|
|
/// to work for a particular vector. Passing a haystack with a length
|
|
/// smaller than this will cause `fwd_find` to panic.
|
|
#[inline(always)]
|
|
pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
|
|
self.rare2i as usize + size_of::<V>()
|
|
}
|
|
}
|
|
|
|
/// Searches the given haystack for the given needle. The needle given should
|
|
/// be the same as the needle that this searcher was initialized with.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// When the given haystack has a length smaller than `min_haystack_len`.
|
|
///
|
|
/// # Safety
|
|
///
|
|
/// Since this is meant to be used with vector functions, callers need to
|
|
/// specialize this inside of a function with a `target_feature` attribute.
|
|
/// Therefore, callers must ensure that whatever target feature is being used
|
|
/// supports the vector functions that this function is specialized for. (For
|
|
/// the specific vector functions used, see the Vector trait implementations.)
|
|
#[inline(always)]
|
|
pub(crate) unsafe fn fwd_find<V: Vector>(
|
|
fwd: &Forward,
|
|
haystack: &[u8],
|
|
needle: &[u8],
|
|
) -> Option<usize> {
|
|
// It would be nice if we didn't have this check here, since the meta
|
|
// searcher should handle it for us. But without this, I don't think we
|
|
// guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
|
|
// put it as part of the safety contract, but it makes it more complicated
|
|
// than necessary.
|
|
if haystack.len() < needle.len() {
|
|
return None;
|
|
}
|
|
let min_haystack_len = fwd.min_haystack_len::<V>();
|
|
assert!(haystack.len() >= min_haystack_len, "haystack too small");
|
|
debug_assert!(needle.len() <= haystack.len());
|
|
debug_assert!(
|
|
needle.len() >= MIN_NEEDLE_LEN,
|
|
"needle must be at least {} bytes",
|
|
MIN_NEEDLE_LEN,
|
|
);
|
|
debug_assert!(
|
|
needle.len() <= MAX_NEEDLE_LEN,
|
|
"needle must be at most {} bytes",
|
|
MAX_NEEDLE_LEN,
|
|
);
|
|
|
|
let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
|
|
let rare1chunk = V::splat(needle[rare1i]);
|
|
let rare2chunk = V::splat(needle[rare2i]);
|
|
|
|
let start_ptr = haystack.as_ptr();
|
|
let end_ptr = start_ptr.add(haystack.len());
|
|
let max_ptr = end_ptr.sub(min_haystack_len);
|
|
let mut ptr = start_ptr;
|
|
|
|
// N.B. I did experiment with unrolling the loop to deal with size(V)
|
|
// bytes at a time and 2*size(V) bytes at a time. The double unroll was
|
|
// marginally faster while the quadruple unroll was unambiguously slower.
|
|
// In the end, I decided the complexity from unrolling wasn't worth it. I
|
|
// used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
|
|
while ptr <= max_ptr {
|
|
let m = fwd_find_in_chunk(
|
|
fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
|
|
);
|
|
if let Some(chunki) = m {
|
|
return Some(matched(start_ptr, ptr, chunki));
|
|
}
|
|
ptr = ptr.add(size_of::<V>());
|
|
}
|
|
if ptr < end_ptr {
|
|
let remaining = diff(end_ptr, ptr);
|
|
debug_assert!(
|
|
remaining < min_haystack_len,
|
|
"remaining bytes should be smaller than the minimum haystack \
|
|
length of {}, but there are {} bytes remaining",
|
|
min_haystack_len,
|
|
remaining,
|
|
);
|
|
if remaining < needle.len() {
|
|
return None;
|
|
}
|
|
debug_assert!(
|
|
max_ptr < ptr,
|
|
"after main loop, ptr should have exceeded max_ptr",
|
|
);
|
|
let overlap = diff(ptr, max_ptr);
|
|
debug_assert!(
|
|
overlap > 0,
|
|
"overlap ({}) must always be non-zero",
|
|
overlap,
|
|
);
|
|
debug_assert!(
|
|
overlap < size_of::<V>(),
|
|
"overlap ({}) cannot possibly be >= than a vector ({})",
|
|
overlap,
|
|
size_of::<V>(),
|
|
);
|
|
// The mask has all of its bits set except for the first N least
|
|
// significant bits, where N=overlap. This way, any matches that
|
|
// occur in find_in_chunk within the overlap are automatically
|
|
// ignored.
|
|
let mask = !((1 << overlap) - 1);
|
|
ptr = max_ptr;
|
|
let m = fwd_find_in_chunk(
|
|
fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
|
|
);
|
|
if let Some(chunki) = m {
|
|
return Some(matched(start_ptr, ptr, chunki));
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Search for an occurrence of two rare bytes from the needle in the chunk
|
|
/// pointed to by ptr, with the end of the haystack pointed to by end_ptr. When
|
|
/// an occurrence is found, memcmp is run to check if a match occurs at the
|
|
/// corresponding position.
|
|
///
|
|
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
|
|
/// bytes repeated in each 8-bit lane, respectively.
|
|
///
|
|
/// mask should have bits set corresponding the positions in the chunk in which
|
|
/// matches are considered. This is only used for the last vector load where
|
|
/// the beginning of the vector might have overlapped with the last load in
|
|
/// the main loop. The mask lets us avoid visiting positions that have already
|
|
/// been discarded as matches.
|
|
///
|
|
/// # Safety
|
|
///
|
|
/// It must be safe to do an unaligned read of size(V) bytes starting at both
|
|
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
|
|
/// loads on ptr up to (end_ptr - needle.len()).
|
|
#[inline(always)]
|
|
unsafe fn fwd_find_in_chunk<V: Vector>(
|
|
fwd: &Forward,
|
|
needle: &[u8],
|
|
ptr: *const u8,
|
|
end_ptr: *const u8,
|
|
rare1chunk: V,
|
|
rare2chunk: V,
|
|
mask: u32,
|
|
) -> Option<usize> {
|
|
let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize));
|
|
let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize));
|
|
|
|
let eq0 = chunk0.cmpeq(rare1chunk);
|
|
let eq1 = chunk1.cmpeq(rare2chunk);
|
|
|
|
let mut match_offsets = eq0.and(eq1).movemask() & mask;
|
|
while match_offsets != 0 {
|
|
let offset = match_offsets.trailing_zeros() as usize;
|
|
let ptr = ptr.add(offset);
|
|
if end_ptr.sub(needle.len()) < ptr {
|
|
return None;
|
|
}
|
|
let chunk = core::slice::from_raw_parts(ptr, needle.len());
|
|
if memcmp(needle, chunk) {
|
|
return Some(offset);
|
|
}
|
|
match_offsets &= match_offsets - 1;
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Accepts a chunk-relative offset and returns a haystack relative offset
|
|
/// after updating the prefilter state.
|
|
///
|
|
/// See the same function with the same name in the prefilter variant of this
|
|
/// algorithm to learned why it's tagged with inline(never). Even here, where
|
|
/// the function is simpler, inlining it leads to poorer codegen. (Although
|
|
/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
|
|
#[cold]
|
|
#[inline(never)]
|
|
fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
|
|
diff(ptr, start_ptr) + chunki
|
|
}
|
|
|
|
/// Return the distance in bytes from `b` up to `a`, i.e. `a - b` on the
/// pointer addresses. `a` must not be less than `b`; this is checked in
/// debug builds.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    let (hi, lo) = (a as usize, b as usize);
    hi - lo
}
|