Fix the RPM build (reverting CI changes that will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.

Adam Ierymenko
2022-06-08 07:32:16 -04:00
parent 373ca30269
commit d5ca4e5f52
12611 changed files with 2898014 additions and 284 deletions


@@ -0,0 +1,18 @@
// This crate comprises hacks and glue required to test private functions from tests/
//
// Keep this as slim as possible.
//
// If you're caught using this outside this crate's tests/, you get to clean up the mess.
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
use crate::stream_safe::StreamSafe;
pub fn stream_safe(s: &str) -> String {
StreamSafe::new(s.chars()).collect()
}
pub mod quick_check {
pub use crate::quick_check::*;
}


@@ -0,0 +1,161 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use core::iter::Fuse;
use core::ops::Range;
use tinyvec::TinyVec;
#[derive(Clone)]
enum DecompositionType {
Canonical,
Compatible,
}
/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
kind: DecompositionType,
iter: Fuse<I>,
// This buffer stores pairs of (canonical combining class, character),
// pushed onto the end in text order.
//
// It's divided into up to three sections:
// 1) A prefix that is free space;
// 2) "Ready" characters which are sorted and ready to emit on demand;
// 3) A "pending" block which still needs more characters for us to be able
// to sort in canonical order and is not safe to emit.
buffer: TinyVec<[(u8, char); 4]>,
ready: Range<usize>,
}
#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
Decompositions {
kind: self::DecompositionType::Canonical,
iter: iter.fuse(),
buffer: TinyVec::new(),
ready: 0..0,
}
}
#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
Decompositions {
kind: self::DecompositionType::Compatible,
iter: iter.fuse(),
buffer: TinyVec::new(),
ready: 0..0,
}
}
impl<I> Decompositions<I> {
#[inline]
fn push_back(&mut self, ch: char) {
let class = super::char::canonical_combining_class(ch);
if class == 0 {
self.sort_pending();
self.buffer.push((class, ch));
self.ready.end = self.buffer.len();
} else {
self.buffer.push((class, ch));
}
}
#[inline]
fn sort_pending(&mut self) {
// NB: `sort_by_key` is stable, so it will preserve the original text's
// order within a combining class.
self.buffer[self.ready.end..].sort_by_key(|k| k.0);
}
#[inline]
fn reset_buffer(&mut self) {
// Equivalent to `self.buffer.drain(0..self.ready.end)`
// but faster than drain() if the buffer is a SmallVec or TinyVec
let pending = self.buffer.len() - self.ready.end;
for i in 0..pending {
self.buffer[i] = self.buffer[i + self.ready.end];
}
self.buffer.truncate(pending);
self.ready = 0..0;
}
#[inline]
fn increment_next_ready(&mut self) {
let next = self.ready.start + 1;
if next == self.ready.end {
self.reset_buffer();
} else {
self.ready.start = next;
}
}
}
impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
while self.ready.end == 0 {
match (self.iter.next(), &self.kind) {
(Some(ch), &DecompositionType::Canonical) => {
super::char::decompose_canonical(ch, |d| self.push_back(d));
}
(Some(ch), &DecompositionType::Compatible) => {
super::char::decompose_compatible(ch, |d| self.push_back(d));
}
(None, _) => {
if self.buffer.is_empty() {
return None;
} else {
self.sort_pending();
self.ready.end = self.buffer.len();
// This implementation means that we can call `next`
// on an exhausted iterator; the last outer `next` call
// will result in an inner `next` call. To make this
// safe, we use `fuse`.
break;
}
}
}
}
// We can assume here that, if `self.ready.end` is greater than zero,
// it's also greater than `self.ready.start`. That's because we only
// increment `self.ready.start` inside `increment_next_ready`, and
// whenever it reaches equality with `self.ready.end`, we reset both
// to zero, maintaining the invariant that:
// self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
//
// This less-than-obviously-safe implementation is chosen for performance,
// minimizing the number & complexity of branches in `next` in the common
// case of buffering then unbuffering a single character with each call.
let (_, ch) = self.buffer[self.ready.start];
self.increment_next_ready();
Some(ch)
}
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
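To make the sort-by-combining-class behavior above concrete, a minimal sketch through the public `nfd` API (the classic UAX #15 reordering case: U+0307 has combining class 230 and U+0323 has class 220, so NFD swaps them):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // Dot above (class 230) then dot below (class 220): canonical
    // ordering sorts marks into ascending combining-class order,
    // which is what `sort_pending` implements above.
    let nfd: String = "q\u{0307}\u{0323}".nfd().collect();
    assert_eq!(nfd, "q\u{0323}\u{0307}");
}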


@@ -0,0 +1,199 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Unicode character composition and decomposition utilities
//! as described in
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
//!
//! ```rust
//! extern crate unicode_normalization;
//!
//! use unicode_normalization::char::compose;
//! use unicode_normalization::UnicodeNormalization;
//!
//! fn main() {
//! assert_eq!(compose('A','\u{30a}'), Some('Å'));
//!
//! let s = "ÅΩ";
//! let c = s.nfc().collect::<String>();
//! assert_eq!(c, "ÅΩ");
//! }
//! ```
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-normalization = "0.1.19"
//! ```
#![deny(missing_docs, unsafe_code)]
#![doc(
html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(not(feature = "std"), no_std)]
#[cfg(not(feature = "std"))]
extern crate alloc;
#[cfg(feature = "std")]
extern crate core;
extern crate tinyvec;
pub use crate::decompose::Decompositions;
pub use crate::quick_check::{
is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
IsNormalized,
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::str::Chars;
mod no_std_prelude;
mod decompose;
mod lookups;
mod normalize;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;
#[rustfmt::skip]
mod tables;
#[doc(hidden)]
pub mod __test_api;
#[cfg(test)]
mod test;
/// Methods for composing and decomposing characters.
pub mod char {
pub use crate::normalize::{
compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
};
pub use crate::lookups::{canonical_combining_class, is_combining_mark};
/// Return whether the given character is assigned (`General_Category` != `Unassigned`)
/// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
/// of Unicode.
pub use crate::tables::is_public_assigned;
}
/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
/// Returns an iterator over the string in Unicode Normalization Form D
/// (canonical decomposition).
fn nfd(self) -> Decompositions<I>;
/// Returns an iterator over the string in Unicode Normalization Form KD
/// (compatibility decomposition).
fn nfkd(self) -> Decompositions<I>;
/// Returns an iterator over the string in Unicode Normalization Form C
/// (canonical decomposition followed by canonical composition).
fn nfc(self) -> Recompositions<I>;
/// Returns an iterator over the string in Unicode Normalization Form KC
/// (compatibility decomposition followed by canonical composition).
fn nfkc(self) -> Recompositions<I>;
/// A transformation which replaces CJK Compatibility Ideograph codepoints
/// with normal forms using Standardized Variation Sequences. This is not
/// part of the canonical or compatibility decomposition algorithms, but
/// performing it before those algorithms produces normalized output which
/// better preserves the intent of the original text.
///
/// Note that many systems today ignore variation selectors, so these
/// may not immediately help text display as intended, but they at
/// least preserve the information in a standardized form, giving
/// implementations the option to recognize them.
fn cjk_compat_variants(self) -> Replacements<I>;
/// Returns an iterator over the string with Conjoining Grapheme Joiner
/// characters inserted according to the Stream-Safe Text Process (UAX15-D4).
fn stream_safe(self) -> StreamSafe<I>;
}
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
#[inline]
fn nfd(self) -> Decompositions<Chars<'a>> {
decompose::new_canonical(self.chars())
}
#[inline]
fn nfkd(self) -> Decompositions<Chars<'a>> {
decompose::new_compatible(self.chars())
}
#[inline]
fn nfc(self) -> Recompositions<Chars<'a>> {
recompose::new_canonical(self.chars())
}
#[inline]
fn nfkc(self) -> Recompositions<Chars<'a>> {
recompose::new_compatible(self.chars())
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
replace::new_cjk_compat_variants(self.chars())
}
#[inline]
fn stream_safe(self) -> StreamSafe<Chars<'a>> {
StreamSafe::new(self.chars())
}
}
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
#[inline]
fn nfd(self) -> Decompositions<I> {
decompose::new_canonical(self)
}
#[inline]
fn nfkd(self) -> Decompositions<I> {
decompose::new_compatible(self)
}
#[inline]
fn nfc(self) -> Recompositions<I> {
recompose::new_canonical(self)
}
#[inline]
fn nfkc(self) -> Recompositions<I> {
recompose::new_compatible(self)
}
#[inline]
fn cjk_compat_variants(self) -> Replacements<I> {
replace::new_cjk_compat_variants(self)
}
#[inline]
fn stream_safe(self) -> StreamSafe<I> {
StreamSafe::new(self)
}
}
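As a quick sanity check of the API above, a minimal usage sketch (assuming `unicode-normalization` as a dependency; expected values follow the crate's doc comment and tests):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // NFC: canonical decomposition followed by canonical composition.
    let composed: String = "A\u{30a}".nfc().collect();
    assert_eq!(composed, "\u{c5}"); // precomposed Å

    // NFD: canonical decomposition only.
    let decomposed: String = "\u{c5}".nfd().collect();
    assert_eq!(decomposed, "A\u{30a}");

    // The blanket impl makes the same methods available on any char iterator.
    let from_iter: String = "A\u{30a}".chars().nfc().collect();
    assert_eq!(from_iter, "\u{c5}");
}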


@@ -0,0 +1,135 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Lookups of unicode properties using minimal perfect hashing.
use crate::perfect_hash::mph_lookup;
use crate::tables::*;
/// Look up the canonical combining class for a codepoint.
///
/// The value returned is as defined in the Unicode Character Database.
pub fn canonical_combining_class(c: char) -> u8 {
mph_lookup(
c.into(),
CANONICAL_COMBINING_CLASS_SALT,
CANONICAL_COMBINING_CLASS_KV,
u8_lookup_fk,
u8_lookup_fv,
0,
)
}
pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
if c1 < '\u{10000}' && c2 < '\u{10000}' {
mph_lookup(
(c1 as u32) << 16 | (c2 as u32),
COMPOSITION_TABLE_SALT,
COMPOSITION_TABLE_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
} else {
composition_table_astral(c1, c2)
}
}
pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CANONICAL_DECOMPOSED_SALT,
CANONICAL_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}
pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
COMPATIBILITY_DECOMPOSED_SALT,
COMPATIBILITY_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}
pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
mph_lookup(
c.into(),
CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
pair_lookup_fk,
pair_lookup_fv_opt,
None,
)
}
/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
mph_lookup(
c.into(),
COMBINING_MARK_SALT,
COMBINING_MARK_KV,
bool_lookup_fk,
bool_lookup_fv,
false,
)
}
pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
mph_lookup(
c.into(),
TRAILING_NONSTARTERS_SALT,
TRAILING_NONSTARTERS_KV,
u8_lookup_fk,
u8_lookup_fv,
0,
) as usize
}
/// Extract the key from a u32 packing a 24-bit key and an 8-bit value.
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
kv >> 8
}
/// Extract the value from a u32 packing a 24-bit key and an 8-bit value.
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
(kv & 0xff) as u8
}
/// Extract the key for a boolean lookup.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
kv
}
/// Extract the value for a boolean lookup.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
true
}
/// Extract the key in a pair.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
kv.0
}
/// Extract the value in a pair, returning an option.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
Some(kv.1)
}
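To illustrate the packed layout these accessors decode, a self-contained sketch (the `pack` helper is hypothetical, for illustration only; U+0301 really does have canonical combining class 230):

// Hypothetical packer matching the layout decoded above:
// the codepoint occupies the high 24 bits, the value the low 8.
fn pack(key: u32, value: u8) -> u32 {
    debug_assert!(key < (1u32 << 24));
    (key << 8) | value as u32
}

fn u8_lookup_fk(kv: u32) -> u32 { kv >> 8 }
fn u8_lookup_fv(kv: u32) -> u8 { (kv & 0xff) as u8 }

fn main() {
    // e.g. U+0301 COMBINING ACUTE ACCENT, canonical combining class 230.
    let kv = pack(0x0301, 230);
    assert_eq!(u8_lookup_fk(kv), 0x0301);
    assert_eq!(u8_lookup_fv(kv), 230);
}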


@@ -0,0 +1,6 @@
#[cfg(not(feature = "std"))]
pub use alloc::{
str::Chars,
string::{String, ToString},
vec::Vec,
};


@@ -0,0 +1,201 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Functions for computing canonical and compatible decompositions for Unicode characters.
use crate::lookups::{
canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
compatibility_fully_decomposed, composition_table,
};
use core::{char, ops::FnMut};
/// Compute canonical Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_canonical<F>(c: char, emit_char: F)
where
F: FnMut(char),
{
decompose(c, canonical_fully_decomposed, emit_char)
}
/// Compute canonical or compatible Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
let decompose_char =
|c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
decompose(c, decompose_char, emit_char)
}
/// Compute standard-variation decomposition for character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
emit_char(c);
return;
}
// Don't perform decomposition for Hangul
if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
for &d in decomposed {
emit_char(d);
}
return;
}
// Finally bottom out.
emit_char(c);
}
#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
D: Fn(char) -> Option<&'static [char]>,
F: FnMut(char),
{
// 7-bit ASCII never decomposes
if c <= '\x7f' {
emit_char(c);
return;
}
// Perform decomposition for Hangul
if is_hangul_syllable(c) {
decompose_hangul(c, emit_char);
return;
}
if let Some(decomposed) = decompose_char(c) {
for &d in decomposed {
emit_char(d);
}
return;
}
// Finally bottom out.
emit_char(c);
}
/// Compose two characters into a single character, if possible.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
pub fn compose(a: char, b: char) -> Option<char> {
compose_hangul(a, b).or_else(|| composition_table(a, b))
}
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;
// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
const T_FIRST: u32 = T_BASE + 1;
pub(crate) fn is_hangul_syllable(c: char) -> bool {
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
}
// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
where
F: FnMut(char),
{
let s_index = s as u32 - S_BASE;
let l_index = s_index / N_COUNT;
unsafe {
emit_char(char::from_u32_unchecked(L_BASE + l_index));
let v_index = (s_index % N_COUNT) / T_COUNT;
emit_char(char::from_u32_unchecked(V_BASE + v_index));
let t_index = s_index % T_COUNT;
if t_index > 0 {
emit_char(char::from_u32_unchecked(T_BASE + t_index));
}
}
}
#[inline]
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
let si = s as u32 - S_BASE;
let ti = si % T_COUNT;
if ti > 0 {
3
} else {
2
}
}
// Compose a pair of Hangul Jamo
#[allow(unsafe_code)]
#[inline(always)]
#[allow(ellipsis_inclusive_range_patterns)]
fn compose_hangul(a: char, b: char) -> Option<char> {
let (a, b) = (a as u32, b as u32);
match (a, b) {
// Compose a leading consonant and a vowel together into an LV_Syllable
(L_BASE...L_LAST, V_BASE...V_LAST) => {
let l_index = a - L_BASE;
let v_index = b - V_BASE;
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
let s = S_BASE + lv_index;
Some(unsafe { char::from_u32_unchecked(s) })
}
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
(S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
}
_ => None,
}
}
#[cfg(test)]
mod tests {
use super::compose_hangul;
// Regression test from a bugfix where we were composing an LV_Syllable with
// T_BASE directly. (We should only compose an LV_Syllable with a character
// in the range `T_BASE + 1 ... T_LAST`.)
#[test]
fn test_hangul_composition() {
assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
}
}
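A worked example of the Jamo arithmetic above, standalone and using the same constants; the result matches this crate's own test vector `\u{ac1c}` → `\u{1100}\u{1162}`:

fn main() {
    const S_BASE: u32 = 0xAC00;
    const L_BASE: u32 = 0x1100;
    const V_BASE: u32 = 0x1161;
    const T_COUNT: u32 = 28;
    const N_COUNT: u32 = 21 * 28; // V_COUNT * T_COUNT = 588

    let s = '\u{ac1c}' as u32; // Hangul syllable GAE
    let s_index = s - S_BASE; // 0x1C = 28
    let l_index = s_index / N_COUNT; // 0 -> U+1100
    let v_index = (s_index % N_COUNT) / T_COUNT; // 1 -> U+1162
    let t_index = s_index % T_COUNT; // 0 -> no trailing consonant

    assert_eq!(char::from_u32(L_BASE + l_index), Some('\u{1100}'));
    assert_eq!(char::from_u32(V_BASE + v_index), Some('\u{1162}'));
    assert_eq!(t_index, 0);
}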


@@ -0,0 +1,50 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Support for lookups based on minimal perfect hashing.
// This function is based on multiplication being fast and is "good enough". It
// can also share some work between the unsalted and salted versions.
#[inline]
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
let y = key.wrapping_add(salt).wrapping_mul(2654435769);
let y = y ^ key.wrapping_mul(0x31415926);
(((y as u64) * (n as u64)) >> 32) as usize
}
/// Do a lookup using minimal perfect hashing.
///
/// The table is stored as a sequence of "salt" values, then a sequence of
/// values that contain packed key/value pairs. The strategy is to hash twice.
/// The first hash retrieves a salt value that makes the second hash unique.
/// The hash function doesn't have to be very good, just good enough that the
/// resulting map is unique.
#[inline]
pub(crate) fn mph_lookup<KV, V, FK, FV>(
x: u32,
salt: &[u16],
kv: &[KV],
fk: FK,
fv: FV,
default: V,
) -> V
where
KV: Copy,
FK: Fn(KV) -> u32,
FV: Fn(KV) -> V,
{
let s = salt[my_hash(x, 0, salt.len())] as u32;
let key_val = kv[my_hash(x, s, salt.len())];
if x == fk(key_val) {
fv(key_val)
} else {
default
}
}
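A standalone sketch of why the double indexing above is always in bounds: the final multiply-and-shift in `my_hash` maps any 32-bit hash into `[0, n)`. (The generated tables additionally guarantee the salted second probe is collision-free, which this sketch does not reproduce.)

fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

fn main() {
    // ((y as u64) * n) >> 32 is strictly less than n for any 32-bit y,
    // so both table lookups in `mph_lookup` stay in bounds.
    let n = 1024;
    for key in [0x0301u32, 0xAC00, 0x1F600] {
        assert!(my_hash(key, 0, n) < n); // unsalted first probe
        assert!(my_hash(key, 0xBEEF, n) < n); // salted second probe
    }
}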


@@ -0,0 +1,187 @@
use crate::lookups::canonical_combining_class;
use crate::stream_safe;
use crate::tables;
use crate::UnicodeNormalization;
/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
#[derive(Debug, Eq, PartialEq)]
pub enum IsNormalized {
/// The text is definitely normalized.
Yes,
/// The text is definitely not normalized.
No,
/// The text may be normalized.
Maybe,
}
// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
#[inline]
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
where
I: Iterator<Item = char>,
F: Fn(char) -> IsNormalized,
{
let mut last_cc = 0u8;
let mut nonstarter_count = 0;
let mut result = IsNormalized::Yes;
for ch in s {
// For ASCII we know it's always allowed and a starter
if ch <= '\x7f' {
last_cc = 0;
nonstarter_count = 0;
continue;
}
// Otherwise, lookup the combining class and QC property
let cc = canonical_combining_class(ch);
if last_cc > cc && cc != 0 {
return IsNormalized::No;
}
match is_allowed(ch) {
IsNormalized::Yes => (),
IsNormalized::No => return IsNormalized::No,
IsNormalized::Maybe => {
result = IsNormalized::Maybe;
}
}
if stream_safe {
let decomp = stream_safe::classify_nonstarters(ch);
// If we're above `MAX_NONSTARTERS`, we're definitely *not*
// stream-safe normalized.
if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
return IsNormalized::No;
}
if decomp.leading_nonstarters == decomp.decomposition_len {
nonstarter_count += decomp.decomposition_len;
} else {
nonstarter_count = decomp.trailing_nonstarters;
}
}
last_cc = cc;
}
result
}
/// Quickly check if a string is in NFC, potentially returning
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
/// like `s.chars().nfc().eq(s.chars())` should suffice.
#[inline]
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfc, false)
}
/// Quickly check if a string is in NFKC.
#[inline]
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfkc, false)
}
/// Quickly check if a string is in NFD.
#[inline]
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfd, false)
}
/// Quickly check if a string is in NFKD.
#[inline]
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfkd, false)
}
/// Quickly check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfc, true)
}
/// Quickly check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
quick_check(s, tables::qc_nfd, true)
}
/// Authoritatively check if a string is in NFC.
#[inline]
pub fn is_nfc(s: &str) -> bool {
match is_nfc_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
}
}
/// Authoritatively check if a string is in NFKC.
#[inline]
pub fn is_nfkc(s: &str) -> bool {
match is_nfkc_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
}
}
/// Authoritatively check if a string is in NFD.
#[inline]
pub fn is_nfd(s: &str) -> bool {
match is_nfd_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
}
}
/// Authoritatively check if a string is in NFKD.
#[inline]
pub fn is_nfkd(s: &str) -> bool {
match is_nfkd_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
}
}
/// Authoritatively check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe(s: &str) -> bool {
match is_nfc_stream_safe_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
}
}
/// Authoritatively check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe(s: &str) -> bool {
match is_nfd_stream_safe_quick(s.chars()) {
IsNormalized::Yes => true,
IsNormalized::No => false,
IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
}
}
#[cfg(test)]
mod tests {
use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};
#[test]
fn test_stream_safe_nfd() {
let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);
let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
}
#[test]
fn test_stream_safe_nfc() {
let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);
let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
}
}
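A usage sketch of the quick-check entry points above, handling `Maybe` with the full normalize-and-compare fallback suggested in the doc comments (this mirrors what `is_nfc` above already does):

use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};

fn ensure_nfc(s: &str) -> bool {
    match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        // Only in the Maybe case do we pay for a full pass.
        IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
    }
}

fn main() {
    assert!(ensure_nfc("abc"));
    assert!(!ensure_nfc("a\u{0301}")); // decomposed á is not NFC
}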


@@ -0,0 +1,154 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use crate::decompose::Decompositions;
use core::fmt::{self, Write};
use tinyvec::TinyVec;
#[derive(Clone)]
enum RecompositionState {
Composing,
Purging(usize),
Finished(usize),
}
/// External iterator for a string recomposition's characters.
#[derive(Clone)]
pub struct Recompositions<I> {
iter: Decompositions<I>,
state: RecompositionState,
buffer: TinyVec<[char; 4]>,
composee: Option<char>,
last_ccc: Option<u8>,
}
#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
Recompositions {
iter: super::decompose::new_canonical(iter),
state: self::RecompositionState::Composing,
buffer: TinyVec::new(),
composee: None,
last_ccc: None,
}
}
#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
Recompositions {
iter: super::decompose::new_compatible(iter),
state: self::RecompositionState::Composing,
buffer: TinyVec::new(),
composee: None,
last_ccc: None,
}
}
impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
use self::RecompositionState::*;
loop {
match self.state {
Composing => {
for ch in self.iter.by_ref() {
let ch_class = super::char::canonical_combining_class(ch);
let k = match self.composee {
None => {
if ch_class != 0 {
return Some(ch);
}
self.composee = Some(ch);
continue;
}
Some(k) => k,
};
match self.last_ccc {
None => match super::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
if ch_class == 0 {
self.composee = Some(ch);
return Some(k);
}
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
},
Some(l_class) => {
if l_class >= ch_class {
// `ch` is blocked from `composee`
if ch_class == 0 {
self.composee = Some(ch);
self.last_ccc = None;
self.state = Purging(0);
return Some(k);
}
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
continue;
}
match super::char::compose(k, ch) {
Some(r) => {
self.composee = Some(r);
continue;
}
None => {
self.buffer.push(ch);
self.last_ccc = Some(ch_class);
}
}
}
}
}
self.state = Finished(0);
if self.composee.is_some() {
return self.composee.take();
}
}
Purging(next) => match self.buffer.get(next).cloned() {
None => {
self.buffer.clear();
self.state = Composing;
}
s => {
self.state = Purging(next + 1);
return s;
}
},
Finished(next) => match self.buffer.get(next).cloned() {
None => {
self.buffer.clear();
return self.composee.take();
}
s => {
self.state = Finished(next + 1);
return s;
}
},
}
}
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
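A brief sketch of what the state machine above produces, with expected values taken from this crate's `test_nfc` cases:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // A starter followed by a composable mark recomposes...
    assert_eq!("a\u{301}".nfc().collect::<String>(), "\u{e1}");
    // ...but a leading mark with no starter before it passes through.
    assert_eq!("\u{301}a".nfc().collect::<String>(), "\u{301}a");
}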


@@ -0,0 +1,61 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;
/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
iter: I,
// At this time, the longest replacement sequence has length 2, so we just
// need buffer space for 1 codepoint.
buffer: Option<char>,
}
#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
Replacements { iter, buffer: None }
}
impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
if let Some(c) = self.buffer.take() {
return Some(c);
}
match self.iter.next() {
Some(ch) => {
// At this time, the longest replacement sequence has length 2.
let mut buffer = ArrayVec::<[char; 2]>::new();
super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
self.buffer = buffer.get(1).copied();
Some(buffer[0])
}
None => None,
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
let (lower, _) = self.iter.size_hint();
(lower, None)
}
}
impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for c in self.clone() {
f.write_char(c)?;
}
Ok(())
}
}
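The `Option<char>` buffer above is an instance of a general adapter pattern: an iterator that may emit up to two items per input holds the overflow in a one-slot buffer. A hedged, self-contained sketch of the same pattern (the `DoubleX` type is invented for illustration and has nothing to do with the crate's replacement tables):

/// Illustrative only: emits every 'x' twice, holding the second
/// copy in a one-slot buffer exactly like `Replacements` does.
struct DoubleX<I> {
    iter: I,
    buffer: Option<char>,
}

impl<I: Iterator<Item = char>> Iterator for DoubleX<I> {
    type Item = char;
    fn next(&mut self) -> Option<char> {
        // Drain the buffered overflow first, then pull from the source.
        if let Some(c) = self.buffer.take() {
            return Some(c);
        }
        let ch = self.iter.next()?;
        if ch == 'x' {
            self.buffer = Some('x');
        }
        Some(ch)
    }
}

fn main() {
    let out: String = DoubleX { iter: "axb".chars(), buffer: None }.collect();
    assert_eq!(out, "axxb");
}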


@@ -0,0 +1,170 @@
use crate::lookups::{
canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;
pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
iter: I,
nonstarter_count: usize,
buffer: Option<char>,
}
impl<I> StreamSafe<I> {
pub(crate) fn new(iter: I) -> Self {
Self {
iter,
nonstarter_count: 0,
buffer: None,
}
}
}
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
type Item = char;
#[inline]
fn next(&mut self) -> Option<char> {
let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
None => return None,
Some(c) => c,
};
let d = classify_nonstarters(next_ch);
if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
// Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
// nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
// iterator (via `self.buffer`), and we'll reclassify it next iteration.
self.nonstarter_count = 0;
self.buffer = Some(next_ch);
return Some(COMBINING_GRAPHEME_JOINER);
}
// Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
// nonstarters in NFKD.
if d.leading_nonstarters == d.decomposition_len {
self.nonstarter_count += d.decomposition_len;
}
// Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
else {
self.nonstarter_count = d.trailing_nonstarters;
}
Some(next_ch)
}
}
#[derive(Debug)]
pub(crate) struct Decomposition {
pub(crate) leading_nonstarters: usize,
pub(crate) trailing_nonstarters: usize,
pub(crate) decomposition_len: usize,
}
#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
// As usual, fast path for ASCII (which is always a starter)
if c <= '\x7f' {
return Decomposition {
leading_nonstarters: 0,
trailing_nonstarters: 0,
decomposition_len: 1,
};
}
// Next, special case Hangul, since it's not handled by our tables.
if is_hangul_syllable(c) {
return Decomposition {
leading_nonstarters: 0,
trailing_nonstarters: 0,
decomposition_len: hangul_decomposition_length(c),
};
}
let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
match decomp {
Some(decomp) => Decomposition {
leading_nonstarters: stream_safe_leading_nonstarters(c),
trailing_nonstarters: stream_safe_trailing_nonstarters(c),
decomposition_len: decomp.len(),
},
None => {
let is_nonstarter = canonical_combining_class(c) != 0;
let nonstarter = if is_nonstarter { 1 } else { 0 };
Decomposition {
leading_nonstarters: nonstarter,
trailing_nonstarters: nonstarter,
decomposition_len: 1,
}
}
}
}
#[cfg(test)]
mod tests {
use super::{classify_nonstarters, StreamSafe};
use crate::lookups::canonical_combining_class;
use crate::normalize::decompose_compatible;
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
use core::char;
fn stream_safe(s: &str) -> String {
StreamSafe::new(s.chars()).collect()
}
#[test]
fn test_simple() {
let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
assert_eq!(stream_safe(technically_okay), technically_okay);
let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
assert_eq!(stream_safe(too_much), fixed_it);
let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
assert_eq!(stream_safe(woah_nelly), its_cool);
}
#[test]
fn test_all_nonstarters() {
let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
assert_eq!(stream_safe(s), expected);
}
#[test]
fn test_classify_nonstarters() {
// Highest character in the `compat_fully_decomp` table is 2FA1D
for ch in 0..0x2FA1E {
let ch = match char::from_u32(ch) {
Some(c) => c,
None => continue,
};
let c = classify_nonstarters(ch);
let mut s = Vec::new();
decompose_compatible(ch, |c| s.push(c));
assert_eq!(s.len(), c.decomposition_len);
let num_leading = s
.iter()
.take_while(|&c| canonical_combining_class(*c) != 0)
.count();
let num_trailing = s
.iter()
.rev()
.take_while(|&c| canonical_combining_class(*c) != 0)
.count();
assert_eq!(num_leading, c.leading_nonstarters);
assert_eq!(num_trailing, c.trailing_nonstarters);
}
}
}
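A usage sketch of the stream-safe transform through the public API: 31 consecutive nonstarters exceed `MAX_NONSTARTERS` (30), so a CGJ (U+034F) is emitted before the 31st:

use unicode_normalization::UnicodeNormalization;

fn main() {
    let input = format!("a{}", "\u{0300}".repeat(31));
    let safe: String = input.chars().stream_safe().collect();
    // "a", 30 grave accents, a CGJ, then the 31st accent.
    let expected = format!("a{}\u{034f}\u{0300}", "\u{0300}".repeat(30));
    assert_eq!(safe, expected);
}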

File diff suppressed because it is too large.


@@ -0,0 +1,125 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::char::is_combining_mark;
use super::UnicodeNormalization;
use core::char;
#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;
#[test]
fn test_nfd() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfd().to_string(), $expected);
// A dummy iterator that is not std::str::Chars directly;
// the identity `map` closure keeps the iterator `Clone`
assert_eq!(
$input.chars().map(|c| c).nfd().collect::<String>(),
$expected
);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
t!("\u{2026}", "\u{2026}");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
t!("a\u{301}", "a\u{301}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
t!("\u{ac1c}", "\u{1100}\u{1162}");
}
#[test]
fn test_nfkd() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkd().to_string(), $expected);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
t!("\u{2026}", "...");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
t!("a\u{301}", "a\u{301}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
t!("\u{ac1c}", "\u{1100}\u{1162}");
}
#[test]
fn test_nfc() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfc().to_string(), $expected);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
t!("\u{2026}", "\u{2026}");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
t!("a\u{301}", "\u{e1}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{d4db}");
t!("\u{ac1c}", "\u{ac1c}");
t!(
"a\u{300}\u{305}\u{315}\u{5ae}b",
"\u{e0}\u{5ae}\u{305}\u{315}b"
);
}
#[test]
fn test_nfkc() {
macro_rules! t {
($input: expr, $expected: expr) => {
assert_eq!($input.nfkc().to_string(), $expected);
};
}
t!("abc", "abc");
t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
t!("\u{2026}", "...");
t!("\u{2126}", "\u{3a9}");
t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
t!("a\u{301}", "\u{e1}");
t!("\u{301}a", "\u{301}a");
t!("\u{d4db}", "\u{d4db}");
t!("\u{ac1c}", "\u{ac1c}");
t!(
"a\u{300}\u{305}\u{315}\u{5ae}b",
"\u{e0}\u{5ae}\u{305}\u{315}b"
);
}
#[test]
fn test_is_combining_mark_ascii() {
for cp in 0..0x7f {
assert!(!is_combining_mark(char::from_u32(cp).unwrap()));
}
}
#[test]
fn test_is_combining_mark_misc() {
// https://github.com/unicode-rs/unicode-normalization/issues/16
// U+11C3A BHAIKSUKI VOWEL SIGN O
// Category: Mark, Nonspacing [Mn]
assert!(is_combining_mark('\u{11C3A}'));
// U+11C3F BHAIKSUKI SIGN VIRAMA
// Category: Mark, Nonspacing [Mn]
assert!(is_combining_mark('\u{11C3F}'));
}