RPM build fix (reverted CI changes, which will need to be un-reverted or made conditional), and vendored Rust dependencies to make builds much faster in any CI system.
zeroidc/vendor/unicode-normalization/src/__test_api.rs (vendored, new file, 18 additions)
@@ -0,0 +1,18 @@
// This crate comprises hacks and glue required to test private functions from tests/
//
// Keep this as slim as possible.
//
// If you're caught using this outside this crate's tests/, you get to clean up the mess.

#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;

use crate::stream_safe::StreamSafe;

pub fn stream_safe(s: &str) -> String {
    StreamSafe::new(s.chars()).collect()
}

pub mod quick_check {
    pub use crate::quick_check::*;
}
zeroidc/vendor/unicode-normalization/src/decompose.rs (vendored, new file, 161 additions)
@@ -0,0 +1,161 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use core::iter::Fuse;
use core::ops::Range;
use tinyvec::TinyVec;

#[derive(Clone)]
enum DecompositionType {
    Canonical,
    Compatible,
}

/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
    kind: DecompositionType,
    iter: Fuse<I>,

    // This buffer stores pairs of (canonical combining class, character),
    // pushed onto the end in text order.
    //
    // It's divided into up to three sections:
    // 1) A prefix that is free space;
    // 2) "Ready" characters which are sorted and ready to emit on demand;
    // 3) A "pending" block which still needs more characters for us to be able
    //    to sort in canonical order and is not safe to emit.
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Canonical,
        iter: iter.fuse(),
        buffer: TinyVec::new(),
        ready: 0..0,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Compatible,
        iter: iter.fuse(),
        buffer: TinyVec::new(),
        ready: 0..0,
    }
}

impl<I> Decompositions<I> {
    #[inline]
    fn push_back(&mut self, ch: char) {
        let class = super::char::canonical_combining_class(ch);

        if class == 0 {
            self.sort_pending();
            self.buffer.push((class, ch));
            self.ready.end = self.buffer.len();
        } else {
            self.buffer.push((class, ch));
        }
    }

    #[inline]
    fn sort_pending(&mut self) {
        // NB: `sort_by_key` is stable, so it will preserve the original text's
        // order within a combining class.
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
    }

    #[inline]
    fn reset_buffer(&mut self) {
        // Equivalent to `self.buffer.drain(0..self.ready.end)`
        // but faster than drain() if the buffer is a SmallVec or TinyVec
        let pending = self.buffer.len() - self.ready.end;
        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }
        self.buffer.truncate(pending);
        self.ready = 0..0;
    }

    #[inline]
    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;
        if next == self.ready.end {
            self.reset_buffer();
        } else {
            self.ready.start = next;
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        while self.ready.end == 0 {
            match (self.iter.next(), &self.kind) {
                (Some(ch), &DecompositionType::Canonical) => {
                    super::char::decompose_canonical(ch, |d| self.push_back(d));
                }
                (Some(ch), &DecompositionType::Compatible) => {
                    super::char::decompose_compatible(ch, |d| self.push_back(d));
                }
                (None, _) => {
                    if self.buffer.is_empty() {
                        return None;
                    } else {
                        self.sort_pending();
                        self.ready.end = self.buffer.len();

                        // This implementation means that we can call `next`
                        // on an exhausted iterator; the last outer `next` call
                        // will result in an inner `next` call. To make this
                        // safe, we use `fuse`.
                        break;
                    }
                }
            }
        }

        // We can assume here that, if `self.ready.end` is greater than zero,
        // it's also greater than `self.ready.start`. That's because we only
        // increment `self.ready.start` inside `increment_next_ready`, and
        // whenever it reaches equality with `self.ready.end`, we reset both
        // to zero, maintaining the invariant that:
        //     self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
        //
        // This less-than-obviously-safe implementation is chosen for performance,
        // minimizing the number & complexity of branches in `next` in the common
        // case of buffering then unbuffering a single character with each call.
        let (_, ch) = self.buffer[self.ready.start];
        self.increment_next_ready();
        Some(ch)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
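The ready/pending buffering above is easiest to see end to end through the public API. A minimal sketch, assuming the vendored crate is consumed exactly like the published `unicode-normalization` package (the expected value comes from the crate's own tests):

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+1E0B (ḋ) decomposes canonically into 'd' followed by U+0307
    // (COMBINING DOT ABOVE); marks are buffered and emitted in sorted
    // canonical-combining-class order.
    let decomposed: String = "\u{1e0b}".nfd().collect();
    assert_eq!(decomposed, "d\u{307}");
}
```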
zeroidc/vendor/unicode-normalization/src/lib.rs (vendored, new file, 199 additions)
@@ -0,0 +1,199 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode character composition and decomposition utilities
//! as described in
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
//!
//! ```rust
//! extern crate unicode_normalization;
//!
//! use unicode_normalization::char::compose;
//! use unicode_normalization::UnicodeNormalization;
//!
//! fn main() {
//!     assert_eq!(compose('A','\u{30a}'), Some('Å'));
//!
//!     let s = "ÅΩ";
//!     let c = s.nfc().collect::<String>();
//!     assert_eq!(c, "ÅΩ");
//! }
//! ```
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-normalization = "0.1.19"
//! ```

#![deny(missing_docs, unsafe_code)]
#![doc(
    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(not(feature = "std"), no_std)]

#[cfg(not(feature = "std"))]
extern crate alloc;

#[cfg(feature = "std")]
extern crate core;

extern crate tinyvec;

pub use crate::decompose::Decompositions;
pub use crate::quick_check::{
    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
    IsNormalized,
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::str::Chars;

mod no_std_prelude;

mod decompose;
mod lookups;
mod normalize;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;

#[rustfmt::skip]
mod tables;

#[doc(hidden)]
pub mod __test_api;
#[cfg(test)]
mod test;

/// Methods for composing and decomposing characters.
pub mod char {
    pub use crate::normalize::{
        compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
    };

    pub use crate::lookups::{canonical_combining_class, is_combining_mark};

    /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
    /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
    /// of Unicode.
    pub use crate::tables::is_public_assigned;
}

/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
    /// Returns an iterator over the string in Unicode Normalization Form D
    /// (canonical decomposition).
    fn nfd(self) -> Decompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form KD
    /// (compatibility decomposition).
    fn nfkd(self) -> Decompositions<I>;

    /// An Iterator over the string in Unicode Normalization Form C
    /// (canonical decomposition followed by canonical composition).
    fn nfc(self) -> Recompositions<I>;

    /// An Iterator over the string in Unicode Normalization Form KC
    /// (compatibility decomposition followed by canonical composition).
    fn nfkc(self) -> Recompositions<I>;

    /// A transformation which replaces CJK Compatibility Ideograph codepoints
    /// with normal forms using Standardized Variation Sequences. This is not
    /// part of the canonical or compatibility decomposition algorithms, but
    /// performing it before those algorithms produces normalized output which
    /// better preserves the intent of the original text.
    ///
    /// Note that many systems today ignore variation selectors, so these
    /// may not immediately help text display as intended, but they at
    /// least preserve the information in a standardized form, giving
    /// implementations the option to recognize them.
    fn cjk_compat_variants(self) -> Replacements<I>;

    /// An Iterator over the string with Conjoining Grapheme Joiner characters
    /// inserted according to the Stream-Safe Text Process (UAX15-D4)
    fn stream_safe(self) -> StreamSafe<I>;
}

impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
    #[inline]
    fn nfd(self) -> Decompositions<Chars<'a>> {
        decompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkd(self) -> Decompositions<Chars<'a>> {
        decompose::new_compatible(self.chars())
    }

    #[inline]
    fn nfc(self) -> Recompositions<Chars<'a>> {
        recompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkc(self) -> Recompositions<Chars<'a>> {
        recompose::new_compatible(self.chars())
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
        replace::new_cjk_compat_variants(self.chars())
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
        StreamSafe::new(self.chars())
    }
}

impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
    #[inline]
    fn nfd(self) -> Decompositions<I> {
        decompose::new_canonical(self)
    }

    #[inline]
    fn nfkd(self) -> Decompositions<I> {
        decompose::new_compatible(self)
    }

    #[inline]
    fn nfc(self) -> Recompositions<I> {
        recompose::new_canonical(self)
    }

    #[inline]
    fn nfkc(self) -> Recompositions<I> {
        recompose::new_compatible(self)
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<I> {
        replace::new_cjk_compat_variants(self)
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<I> {
        StreamSafe::new(self)
    }
}
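Because the trait is implemented both for `&'a str` and for any `Iterator<Item = char>`, the adapters compose freely. A minimal sketch, assuming the crate is consumed as the published `unicode-normalization` package (the expected value comes from the crate's own tests):

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    let s = "a\u{301}"; // 'a' followed by COMBINING ACUTE ACCENT

    // Directly on &str ...
    let from_str: String = s.nfc().collect();
    assert_eq!(from_str, "\u{e1}"); // á

    // ... or chained on any char iterator: stream-safe first, then NFC.
    let from_iter: String = s.chars().stream_safe().nfc().collect();
    assert_eq!(from_str, from_iter);
}
```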
zeroidc/vendor/unicode-normalization/src/lookups.rs (vendored, new file, 135 additions)
@@ -0,0 +1,135 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Lookups of unicode properties using minimal perfect hashing.

use crate::perfect_hash::mph_lookup;
use crate::tables::*;

/// Look up the canonical combining class for a codepoint.
///
/// The value returned is as defined in the Unicode Character Database.
pub fn canonical_combining_class(c: char) -> u8 {
    mph_lookup(
        c.into(),
        CANONICAL_COMBINING_CLASS_SALT,
        CANONICAL_COMBINING_CLASS_KV,
        u8_lookup_fk,
        u8_lookup_fv,
        0,
    )
}

pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
    if c1 < '\u{10000}' && c2 < '\u{10000}' {
        mph_lookup(
            (c1 as u32) << 16 | (c2 as u32),
            COMPOSITION_TABLE_SALT,
            COMPOSITION_TABLE_KV,
            pair_lookup_fk,
            pair_lookup_fv_opt,
            None,
        )
    } else {
        composition_table_astral(c1, c2)
    }
}

pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        CANONICAL_DECOMPOSED_SALT,
        CANONICAL_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
}

pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        COMPATIBILITY_DECOMPOSED_SALT,
        COMPATIBILITY_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
}

pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
        CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
}

/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
    mph_lookup(
        c.into(),
        COMBINING_MARK_SALT,
        COMBINING_MARK_KV,
        bool_lookup_fk,
        bool_lookup_fv,
        false,
    )
}

pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
    mph_lookup(
        c.into(),
        TRAILING_NONSTARTERS_SALT,
        TRAILING_NONSTARTERS_KV,
        u8_lookup_fk,
        u8_lookup_fv,
        0,
    ) as usize
}

/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
    kv >> 8
}

/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
    (kv & 0xff) as u8
}

/// Extract the key for a boolean lookup.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
    kv
}

/// Extract the value for a boolean lookup.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
    true
}

/// Extract the key in a pair.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
    kv.0
}

/// Extract the value in a pair, returning an option.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
    Some(kv.1)
}
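The two public lookups here are re-exported through the `char` module in `lib.rs`. A small sketch of what they return (U+0301 has canonical combining class 230 in the Unicode Character Database):

```rust
use unicode_normalization::char::{canonical_combining_class, is_combining_mark};

fn main() {
    // COMBINING ACUTE ACCENT: a nonstarter (ccc 230) and a combining mark.
    assert_eq!(canonical_combining_class('\u{0301}'), 230);
    assert!(is_combining_mark('\u{0301}'));

    // ASCII letters are starters (ccc 0) and not combining marks.
    assert_eq!(canonical_combining_class('a'), 0);
    assert!(!is_combining_mark('a'));
}
```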
zeroidc/vendor/unicode-normalization/src/no_std_prelude.rs (vendored, new file, 6 additions)
@@ -0,0 +1,6 @@
#[cfg(not(feature = "std"))]
pub use alloc::{
    str::Chars,
    string::{String, ToString},
    vec::Vec,
};
zeroidc/vendor/unicode-normalization/src/normalize.rs (vendored, new file, 201 additions)
@@ -0,0 +1,201 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Functions for computing canonical and compatible decompositions for Unicode characters.
use crate::lookups::{
    canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
    compatibility_fully_decomposed, composition_table,
};

use core::{char, ops::FnMut};

/// Compute canonical Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_canonical<F>(c: char, emit_char: F)
where
    F: FnMut(char),
{
    decompose(c, canonical_fully_decomposed, emit_char)
}

/// Compute canonical or compatible Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
    let decompose_char =
        |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    decompose(c, decompose_char, emit_char)
}

/// Compute standard-variation decomposition for character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
    F: FnMut(char),
{
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        emit_char(c);
        return;
    }

    // Don't perform decomposition for Hangul

    if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
        for &d in decomposed {
            emit_char(d);
        }
        return;
    }

    // Finally bottom out.
    emit_char(c);
}

#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
    D: Fn(char) -> Option<&'static [char]>,
    F: FnMut(char),
{
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        emit_char(c);
        return;
    }

    // Perform decomposition for Hangul
    if is_hangul_syllable(c) {
        decompose_hangul(c, emit_char);
        return;
    }

    if let Some(decomposed) = decompose_char(c) {
        for &d in decomposed {
            emit_char(d);
        }
        return;
    }

    // Finally bottom out.
    emit_char(c);
}

/// Compose two characters into a single character, if possible.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
pub fn compose(a: char, b: char) -> Option<char> {
    compose_hangul(a, b).or_else(|| composition_table(a, b))
}

// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;

// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

pub(crate) fn is_hangul_syllable(c: char) -> bool {
    (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
}

// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
where
    F: FnMut(char),
{
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;
    unsafe {
        emit_char(char::from_u32_unchecked(L_BASE + l_index));

        let v_index = (s_index % N_COUNT) / T_COUNT;
        emit_char(char::from_u32_unchecked(V_BASE + v_index));

        let t_index = s_index % T_COUNT;
        if t_index > 0 {
            emit_char(char::from_u32_unchecked(T_BASE + t_index));
        }
    }
}

#[inline]
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
    let si = s as u32 - S_BASE;
    let ti = si % T_COUNT;
    if ti > 0 {
        3
    } else {
        2
    }
}

// Compose a pair of Hangul Jamo
#[allow(unsafe_code)]
#[inline(always)]
#[allow(ellipsis_inclusive_range_patterns)]
fn compose_hangul(a: char, b: char) -> Option<char> {
    let (a, b) = (a as u32, b as u32);
    match (a, b) {
        // Compose a leading consonant and a vowel together into an LV_Syllable
        (L_BASE...L_LAST, V_BASE...V_LAST) => {
            let l_index = a - L_BASE;
            let v_index = b - V_BASE;
            let lv_index = l_index * N_COUNT + v_index * T_COUNT;
            let s = S_BASE + lv_index;
            Some(unsafe { char::from_u32_unchecked(s) })
        }
        // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
        (S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
            Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
        }
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::compose_hangul;

    // Regression test from a bugfix where we were composing an LV_Syllable with
    // T_BASE directly. (We should only compose an LV_Syllable with a character
    // in the range `T_BASE + 1 ... T_LAST`.)
    #[test]
    fn test_hangul_composition() {
        assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
    }
}
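The Jamo arithmetic above is purely positional: a syllable index is split into L, V, and T indices by division and remainder against `N_COUNT` and `T_COUNT`. A worked sketch through the public re-exports (the values follow directly from the constants, since U+AC00 has `s_index == 0`):

```rust
use unicode_normalization::char::{compose, decompose_canonical};

fn main() {
    // U+AC00 (가): s_index = 0, so l_index = v_index = t_index = 0 and the
    // syllable decomposes into L_BASE (U+1100) and V_BASE (U+1161) only.
    let mut parts = Vec::new();
    decompose_canonical('\u{ac00}', |c| parts.push(c));
    assert_eq!(parts, ['\u{1100}', '\u{1161}']);

    // Composition runs the arithmetic in reverse: L + V -> LV syllable,
    // then LV + T (here U+11A8) -> the LVT syllable U+AC01 (각).
    assert_eq!(compose('\u{1100}', '\u{1161}'), Some('\u{ac00}'));
    assert_eq!(compose('\u{ac00}', '\u{11a8}'), Some('\u{ac01}'));
}
```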
zeroidc/vendor/unicode-normalization/src/perfect_hash.rs (vendored, new file, 50 additions)
@@ -0,0 +1,50 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Support for lookups based on minimal perfect hashing.

// This function is based on multiplication being fast and is "good enough". Also
// it can share some work between the unsalted and salted versions.
#[inline]
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

/// Do a lookup using minimal perfect hashing.
///
/// The table is stored as a sequence of "salt" values, then a sequence of
/// values that contain packed key/value pairs. The strategy is to hash twice.
/// The first hash retrieves a salt value that makes the second hash unique.
/// The hash function doesn't have to be very good, just good enough that the
/// resulting map is unique.
#[inline]
pub(crate) fn mph_lookup<KV, V, FK, FV>(
    x: u32,
    salt: &[u16],
    kv: &[KV],
    fk: FK,
    fv: FV,
    default: V,
) -> V
where
    KV: Copy,
    FK: Fn(KV) -> u32,
    FV: Fn(KV) -> V,
{
    let s = salt[my_hash(x, 0, salt.len())] as u32;
    let key_val = kv[my_hash(x, s, salt.len())];
    if x == fk(key_val) {
        fv(key_val)
    } else {
        default
    }
}
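Since `my_hash` is private, a standalone copy makes the bucket computation easy to poke at: the final multiply-and-shift maps the 32-bit mix into `[0, n)`, so both hash evaluations in `mph_lookup` index in bounds by construction. A minimal sketch (the key values are arbitrary, chosen only for illustration):

```rust
// Standalone copy of the crate-private hash, for illustration only.
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    // y < 2^32, so (y * n) >> 32 < n: always a valid index.
    (((y as u64) * (n as u64)) >> 32) as usize
}

fn main() {
    let n = 16;
    for key in [0x41u32, 0x301, 0xAC00] {
        let bucket = my_hash(key, 0, n);
        assert!(bucket < n);
        println!("key {:#06x} -> bucket {}", key, bucket);
    }
}
```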
zeroidc/vendor/unicode-normalization/src/quick_check.rs (vendored, new file, 187 additions)
@@ -0,0 +1,187 @@
use crate::lookups::canonical_combining_class;
use crate::stream_safe;
use crate::tables;
use crate::UnicodeNormalization;

/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
#[derive(Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The text is definitely normalized.
    Yes,
    /// The text is definitely not normalized.
    No,
    /// The text may be normalized.
    Maybe,
}

// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
#[inline]
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
where
    I: Iterator<Item = char>,
    F: Fn(char) -> IsNormalized,
{
    let mut last_cc = 0u8;
    let mut nonstarter_count = 0;
    let mut result = IsNormalized::Yes;
    for ch in s {
        // For ASCII we know it's always allowed and a starter
        if ch <= '\x7f' {
            last_cc = 0;
            nonstarter_count = 0;
            continue;
        }

        // Otherwise, lookup the combining class and QC property
        let cc = canonical_combining_class(ch);
        if last_cc > cc && cc != 0 {
            return IsNormalized::No;
        }
        match is_allowed(ch) {
            IsNormalized::Yes => (),
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => {
                result = IsNormalized::Maybe;
            }
        }
        if stream_safe {
            let decomp = stream_safe::classify_nonstarters(ch);

            // If we're above `MAX_NONSTARTERS`, we're definitely *not*
            // stream-safe normalized.
            if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
                return IsNormalized::No;
            }
            if decomp.leading_nonstarters == decomp.decomposition_len {
                nonstarter_count += decomp.decomposition_len;
            } else {
                nonstarter_count = decomp.trailing_nonstarters;
            }
        }
        last_cc = cc;
    }
    result
}

/// Quickly check if a string is in NFC, potentially returning
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
/// like `s.chars().nfc().eq(s.chars())` should suffice.
#[inline]
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfc, false)
}

/// Quickly check if a string is in NFKC.
#[inline]
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfkc, false)
}

/// Quickly check if a string is in NFD.
#[inline]
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfd, false)
}

/// Quickly check if a string is in NFKD.
#[inline]
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfkd, false)
}

/// Quickly check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfc, true)
}

/// Quickly check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfd, true)
}

/// Authoritatively check if a string is in NFC.
#[inline]
pub fn is_nfc(s: &str) -> bool {
    match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
    }
}

/// Authoritatively check if a string is in NFKC.
#[inline]
pub fn is_nfkc(s: &str) -> bool {
    match is_nfkc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
    }
}

/// Authoritatively check if a string is in NFD.
#[inline]
pub fn is_nfd(s: &str) -> bool {
    match is_nfd_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
    }
}

/// Authoritatively check if a string is in NFKD.
#[inline]
pub fn is_nfkd(s: &str) -> bool {
    match is_nfkd_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
    }
}

/// Authoritatively check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe(s: &str) -> bool {
    match is_nfc_stream_safe_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
    }
}

/// Authoritatively check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe(s: &str) -> bool {
    match is_nfd_stream_safe_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
    }
}

#[cfg(test)]
mod tests {
    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};

    #[test]
    fn test_stream_safe_nfd() {
        let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);

        let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }

    #[test]
    fn test_stream_safe_nfc() {
        let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);

        let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }
}
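The `Maybe`-then-verify pattern the doc comments describe looks like this in practice; a minimal sketch, assuming the crate is consumed as the published `unicode-normalization` package:

```rust
use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};

fn main() {
    let s = "a\u{301}"; // 'a' + COMBINING ACUTE ACCENT: composes, so not NFC
    let is_normalized = match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        // The tables can't decide; fall back to a full normalization pass.
        IsNormalized::Maybe => s.chars().nfc().eq(s.chars()),
    };
    assert!(!is_normalized);
}
```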
zeroidc/vendor/unicode-normalization/src/recompose.rs (vendored, new file, 154 additions)
@@ -0,0 +1,154 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use crate::decompose::Decompositions;
use core::fmt::{self, Write};
use tinyvec::TinyVec;

#[derive(Clone)]
enum RecompositionState {
    Composing,
    Purging(usize),
    Finished(usize),
}

/// External iterator for a string recomposition's characters.
#[derive(Clone)]
pub struct Recompositions<I> {
    iter: Decompositions<I>,
    state: RecompositionState,
    buffer: TinyVec<[char; 4]>,
    composee: Option<char>,
    last_ccc: Option<u8>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
    Recompositions {
        iter: super::decompose::new_canonical(iter),
        state: self::RecompositionState::Composing,
        buffer: TinyVec::new(),
        composee: None,
        last_ccc: None,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
    Recompositions {
        iter: super::decompose::new_compatible(iter),
        state: self::RecompositionState::Composing,
        buffer: TinyVec::new(),
        composee: None,
        last_ccc: None,
    }
}

impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        use self::RecompositionState::*;

        loop {
            match self.state {
                Composing => {
                    for ch in self.iter.by_ref() {
                        let ch_class = super::char::canonical_combining_class(ch);
                        let k = match self.composee {
                            None => {
                                if ch_class != 0 {
                                    return Some(ch);
                                }
                                self.composee = Some(ch);
                                continue;
                            }
                            Some(k) => k,
                        };
                        match self.last_ccc {
                            None => match super::char::compose(k, ch) {
                                Some(r) => {
                                    self.composee = Some(r);
                                    continue;
                                }
                                None => {
                                    if ch_class == 0 {
                                        self.composee = Some(ch);
                                        return Some(k);
                                    }
                                    self.buffer.push(ch);
                                    self.last_ccc = Some(ch_class);
                                }
                            },
                            Some(l_class) => {
                                if l_class >= ch_class {
                                    // `ch` is blocked from `composee`
                                    if ch_class == 0 {
                                        self.composee = Some(ch);
                                        self.last_ccc = None;
                                        self.state = Purging(0);
                                        return Some(k);
                                    }
                                    self.buffer.push(ch);
                                    self.last_ccc = Some(ch_class);
                                    continue;
                                }
                                match super::char::compose(k, ch) {
                                    Some(r) => {
                                        self.composee = Some(r);
                                        continue;
                                    }
                                    None => {
                                        self.buffer.push(ch);
                                        self.last_ccc = Some(ch_class);
                                    }
                                }
                            }
                        }
                    }
                    self.state = Finished(0);
                    if self.composee.is_some() {
                        return self.composee.take();
                    }
                }
                Purging(next) => match self.buffer.get(next).cloned() {
                    None => {
                        self.buffer.clear();
                        self.state = Composing;
                    }
                    s => {
                        self.state = Purging(next + 1);
                        return s;
                    }
                },
                Finished(next) => match self.buffer.get(next).cloned() {
                    None => {
                        self.buffer.clear();
                        return self.composee.take();
                    }
                    s => {
                        self.state = Finished(next + 1);
                        return s;
                    }
                },
            }
        }
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
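The composee/buffer machinery is visible in how blocked marks survive recomposition. This sketch reuses an expectation from the crate's own test suite:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // 'a' composes with U+0300 into U+00E0 (à). The remaining marks are
    // buffered, canonically reordered by combining class, and then purged
    // unchanged because no further composition applies.
    let s = "a\u{300}\u{305}\u{315}\u{5ae}b";
    let composed: String = s.nfc().collect();
    assert_eq!(composed, "\u{e0}\u{5ae}\u{305}\u{315}b");
}
```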
zeroidc/vendor/unicode-normalization/src/replace.rs (vendored, new file, 61 additions)
@@ -0,0 +1,61 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;

/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
    iter: I,
    // At this time, the longest replacement sequence has length 2, so we just
    // need buffer space for 1 codepoint.
    buffer: Option<char>,
}

#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
    Replacements { iter, buffer: None }
}

impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        if let Some(c) = self.buffer.take() {
            return Some(c);
        }

        match self.iter.next() {
            Some(ch) => {
                // At this time, the longest replacement sequence has length 2.
                let mut buffer = ArrayVec::<[char; 2]>::new();
                super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
                self.buffer = buffer.get(1).copied();
                Some(buffer[0])
            }
            None => None,
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
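A sketch of the replacement in action. It assumes U+FA10, a CJK Compatibility Ideograph, has a standardized variation sequence (Unicode 6.3 added them for the CJK compatibility ideographs); the exact selector emitted is left to the tables rather than asserted here:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // The replacement keeps a unified base ideograph plus a variation
    // selector, where plain NFC would collapse U+FA10 to its singleton
    // canonical decomposition and lose the distinction.
    let replaced: String = "\u{fa10}".cjk_compat_variants().collect();
    for c in replaced.chars() {
        println!("U+{:04X}", c as u32);
    }
    let nfc: String = "\u{fa10}".nfc().collect();
    assert_ne!(replaced, nfc);
}
```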
zeroidc/vendor/unicode-normalization/src/stream_safe.rs (vendored, new file, 170 additions)
@@ -0,0 +1,170 @@
use crate::lookups::{
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
    stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;

pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
        // nonstarters in NKFD.
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

#[derive(Debug)]
pub(crate) struct Decomposition {
    pub(crate) leading_nonstarters: usize,
    pub(crate) trailing_nonstarters: usize,
    pub(crate) decomposition_len: usize,
}

#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use crate::no_std_prelude::*;

    use core::char;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}
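The CGJ insertion point follows directly from `MAX_NONSTARTERS`: the 31st consecutive nonstarter forces a U+034F before it. A minimal sketch mirroring the `test_all_nonstarters` case above:

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // 31 copies of U+0300: after 30 consecutive nonstarters the iterator
    // emits COMBINING GRAPHEME JOINER, then resumes with the 31st mark.
    let s: String = core::iter::repeat('\u{300}').take(31).collect();
    let safe: String = s.chars().stream_safe().collect();

    let mut expected: String = core::iter::repeat('\u{300}').take(30).collect();
    expected.push('\u{34f}');
    expected.push('\u{300}');
    assert_eq!(safe, expected);
}
```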
zeroidc/vendor/unicode-normalization/src/tables.rs (vendored, new file, 26020 additions)
File diff suppressed because it is too large.
zeroidc/vendor/unicode-normalization/src/test.rs (vendored, new file, 125 additions)
@@ -0,0 +1,125 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::char::is_combining_mark;
use super::UnicodeNormalization;
use core::char;

#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;

#[test]
fn test_nfd() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfd().to_string(), $expected);
            // A dummy iterator that is not std::str::Chars directly;
            // note that `id_func` is used to ensure `Clone` implementation
            assert_eq!(
                $input.chars().map(|c| c).nfd().collect::<String>(),
                $expected
            );
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
    t!("\u{2026}", "\u{2026}");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
    t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
    t!("a\u{301}", "a\u{301}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
    t!("\u{ac1c}", "\u{1100}\u{1162}");
}

#[test]
fn test_nfkd() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfkd().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
    t!("\u{2026}", "...");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
    t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
    t!("a\u{301}", "a\u{301}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
    t!("\u{ac1c}", "\u{1100}\u{1162}");
}

#[test]
fn test_nfc() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfc().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
    t!("\u{2026}", "\u{2026}");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
    t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
    t!("a\u{301}", "\u{e1}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{d4db}");
    t!("\u{ac1c}", "\u{ac1c}");
    t!(
        "a\u{300}\u{305}\u{315}\u{5ae}b",
        "\u{e0}\u{5ae}\u{305}\u{315}b"
    );
}

#[test]
fn test_nfkc() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfkc().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
    t!("\u{2026}", "...");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
    t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
    t!("a\u{301}", "\u{e1}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{d4db}");
    t!("\u{ac1c}", "\u{ac1c}");
    t!(
        "a\u{300}\u{305}\u{315}\u{5ae}b",
        "\u{e0}\u{5ae}\u{305}\u{315}b"
    );
}

#[test]
fn test_is_combining_mark_ascii() {
    for cp in 0..0x7f {
        assert!(!is_combining_mark(char::from_u32(cp).unwrap()));
    }
}

#[test]
fn test_is_combining_mark_misc() {
    // https://github.com/unicode-rs/unicode-normalization/issues/16
    // U+11C3A BHAIKSUKI VOWEL SIGN O
    // Category: Mark, Nonspacing [Mn]
    assert!(is_combining_mark('\u{11C3A}'));

    // U+11C3F BHAIKSUKI SIGN VIRAMA
    // Category: Mark, Nonspacing [Mn]
    assert!(is_combining_mark('\u{11C3F}'));
}