Fix the RPM build (this reverts CI changes, which will need to be un-reverted or made conditional) and vendor the Rust dependencies to make builds much faster in any CI system.
zeroidc/vendor/unicode-normalization/.cargo-checksum.json (new vendored file, 1 line)
@@ -0,0 +1 @@
{"files":{"COPYRIGHT":"23860c2a7b5d96b21569afedf033469bab9fe14a1b24a35068b8641c578ce24d","Cargo.toml":"34370ae727c107ec51fd6809e01ff76220a1bcc2b849b8d277bf9c7bf1875abd","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"7b63ecd5f1902af1b63729947373683c32745c16a10e8e6292e2e2dcd7e90ae0","README.md":"80e4415e2f0941aac11b7e5c1db946d00139db2f1a67774fcd0c0bfde52217fe","benches/bench.rs":"827e5343b059a732904be29717c2797203bfd0a633edf08042afea65372a3e2c","scripts/unicode.py":"c00cb48507e4564a2dcf17a95a5fb1206830f748a8444d296f95b5d2dd09b72c","src/__test_api.rs":"78e21bfa0b98894f545c8ed3e31cec20d7a48951a7f3ed69a6130c4b3d463aee","src/decompose.rs":"c0eb774843a545356e63bbcd7fb926f80d3c97ef4601ca3701fc34154f2e9905","src/lib.rs":"3eaa16b8b4d2d8e15d38b56760fb432ec7665e22360fd4c587c9b724486ba90e","src/lookups.rs":"ca7022bf19a82108df1f5bd78c7fc30806f931d932a65538be818caaa5f7049d","src/no_std_prelude.rs":"602e81e67b8952b6571826f431e3b6787be3073bc10f38a0d3374278f81a6a1f","src/normalize.rs":"de2670b4437d335d42884af844a750f70e541467ecd34077dfe032103cb9b041","src/perfect_hash.rs":"400c84e2f467f61bd55d55d08672da6a9ad7a57c938ce5d0c701a6994b1b273b","src/quick_check.rs":"9756312d75fc31b67fca954e44a4812945a7e436b03ba18b9a2441f6de570f6f","src/recompose.rs":"a6228ad7561a5c7a1ef1d510159bdde1eea8a161007c80e470432e9b844d5536","src/replace.rs":"b24c904f3e00851a78820e30ddfa4ff10c795f8925fd0ee7f5870f31fdfa770b","src/stream_safe.rs":"383d71f0da401af8e735877e43855c7e16cb06deb2263539cdec2a407dbe257d","src/tables.rs":"d24cf5a2a6d5059543b39eec6806c93fa8c314b52b251ddd354affcf91ef7f0b","src/test.rs":"0def2cb0a013fba29938262b3cd3533fbb10eacaf6bcd82eef1f91759fe0a2eb"},"package":"d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9"}
zeroidc/vendor/unicode-normalization/COPYRIGHT (new vendored file, 7 lines)
@@ -0,0 +1,7 @@
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
http://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or http://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.
zeroidc/vendor/unicode-normalization/Cargo.toml (new vendored file, 32 lines)
@@ -0,0 +1,32 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)

[package]
edition = "2018"
name = "unicode-normalization"
version = "0.1.19"
authors = ["kwantam <kwantam@gmail.com>", "Manish Goregaokar <manishsmail@gmail.com>"]
exclude = ["target/*", "Cargo.lock", "scripts/tmp", "*.txt", "tests/*"]
description = "This crate provides functions for normalization of\nUnicode strings, including Canonical and Compatible\nDecomposition and Recomposition, as described in\nUnicode Standard Annex #15.\n"
homepage = "https://github.com/unicode-rs/unicode-normalization"
documentation = "https://docs.rs/unicode-normalization/"
readme = "README.md"
keywords = ["text", "unicode", "normalization", "decomposition", "recomposition"]
license = "MIT/Apache-2.0"
repository = "https://github.com/unicode-rs/unicode-normalization"
[dependencies.tinyvec]
version = "1"
features = ["alloc"]

[features]
default = ["std"]
std = []
zeroidc/vendor/unicode-normalization/LICENSE-APACHE (new vendored file, 201 lines)
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
zeroidc/vendor/unicode-normalization/LICENSE-MIT (new vendored file, 25 lines)
@@ -0,0 +1,25 @@
Copyright (c) 2015 The Rust Project Developers

Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
zeroidc/vendor/unicode-normalization/README.md (new vendored file, 39 lines)
@@ -0,0 +1,39 @@
# unicode-normalization

[Build Status](https://travis-ci.org/unicode-rs/unicode-normalization)
[Documentation](https://docs.rs/unicode-normalization/)

Unicode character composition and decomposition utilities
as described in
[Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).

This crate requires Rust 1.36+.

```rust
extern crate unicode_normalization;

use unicode_normalization::char::compose;
use unicode_normalization::UnicodeNormalization;

fn main() {
    assert_eq!(compose('A','\u{30a}'), Some('Å'));

    let s = "ÅΩ";
    let c = s.nfc().collect::<String>();
    assert_eq!(c, "ÅΩ");
}
```

## crates.io

You can use this package in your project by adding the following
to your `Cargo.toml`:

```toml
[dependencies]
unicode-normalization = "0.1.19"
```

## `no_std` + `alloc` support

This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`.
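Beyond the iterator adapters shown in the vendored README, the crate also exposes allocation-free quick checks. The sketch below is editorial, not part of the vendored files; it uses only `is_nfc`/`is_nfd`, which `src/lib.rs` re-exports (see the lib.rs hunk later in this diff).

```rust
// Editorial sketch (not a vendored file): the quick-check API classifies a
// string without allocating, complementing the nfc()/nfd() iterators.
extern crate unicode_normalization;

use unicode_normalization::{is_nfc, is_nfd, UnicodeNormalization};

fn main() {
    let decomposed = "A\u{30a}"; // 'A' followed by U+030A COMBINING RING ABOVE
    assert!(is_nfd(decomposed)); // already fully decomposed
    assert!(!is_nfc(decomposed)); // but not composed

    // nfc() composes the pair into the single code point U+00C5 ('Å').
    let composed: String = decomposed.nfc().collect();
    assert_eq!(composed, "\u{c5}");
    assert!(is_nfc(&composed));
}
```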
zeroidc/vendor/unicode-normalization/benches/bench.rs (new vendored file, 127 lines)
@@ -0,0 +1,127 @@
#![feature(test)]

extern crate test;
extern crate unicode_normalization;

use std::fs;
use test::Bencher;
use unicode_normalization::UnicodeNormalization;

const ASCII: &'static str = "all types of normalized";
const NFC: &'static str = "Introducci\u{00f3}n a Unicode.pdf";
const NFD: &'static str = "Introduccio\u{0301}n a Unicode.pdf";

#[bench]
fn bench_is_nfc_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc(ASCII));
}

#[bench]
fn bench_is_nfc_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc(NFC));
}

#[bench]
fn bench_is_nfc_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc(NFD));
}

#[bench]
fn bench_is_nfd_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd(ASCII));
}

#[bench]
fn bench_is_nfd_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd(NFD));
}

#[bench]
fn bench_is_nfd_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd(NFC));
}

#[bench]
fn bench_is_nfc_stream_safe_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc_stream_safe(ASCII));
}

#[bench]
fn bench_is_nfc_stream_safe_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc_stream_safe(NFC));
}

#[bench]
fn bench_is_nfc_stream_safe_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfc_stream_safe(NFD));
}

#[bench]
fn bench_is_nfd_stream_safe_ascii(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd_stream_safe(ASCII));
}

#[bench]
fn bench_is_nfd_stream_safe_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd_stream_safe(NFD));
}

#[bench]
fn bench_is_nfd_stream_safe_not_normalized(b: &mut Bencher) {
    b.iter(|| unicode_normalization::is_nfd_stream_safe(NFC));
}

#[bench]
fn bench_nfc_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfc().count());
}

#[bench]
fn bench_nfd_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfd().count());
}

#[bench]
fn bench_nfc_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfc().count());
}

#[bench]
fn bench_nfd_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfd().count());
}

#[bench]
fn bench_nfkc_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfkc().count());
}

#[bench]
fn bench_nfkd_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.nfkd().count());
}

#[bench]
fn bench_nfkc_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfkc().count());
}

#[bench]
fn bench_nfkd_long(b: &mut Bencher) {
    let long = fs::read_to_string("benches/long.txt").unwrap();
    b.iter(|| long.nfkd().count());
}

#[bench]
fn bench_streamsafe_ascii(b: &mut Bencher) {
    b.iter(|| ASCII.stream_safe().count());
}

#[bench]
fn bench_streamsafe_adversarial(b: &mut Bencher) {
    let s = "bo\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}oom";
    b.iter(|| s.stream_safe().count());
}
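The vendored benchmarks above rely on the nightly-only `#![feature(test)]` harness. A rough, hedged way to spot-check the same operations on stable Rust follows; the iteration count and timing approach are illustrative assumptions, not part of the vendored code.

```rust
// Editorial sketch: stable-Rust timing loop over the same NFD-form input
// used by the vendored benchmark's `NFD` constant.
use std::time::Instant;
use unicode_normalization::UnicodeNormalization;

fn main() {
    let input = "Introduccio\u{0301}n a Unicode.pdf";
    let iters = 100_000; // illustrative; tune for your machine

    let start = Instant::now();
    let mut total = 0usize;
    for _ in 0..iters {
        total += input.nfc().count(); // count() forces full normalization
    }
    println!("nfc: {} iters, {} chars, {:?}", iters, total, start.elapsed());
}
```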
zeroidc/vendor/unicode-normalization/scripts/unicode.py (new vendored file, 611 lines)
@@ -0,0 +1,611 @@
#!/usr/bin/env python
#
# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - DerivedNormalizationProps.txt
# - NormalizationTest.txt
# - UnicodeData.txt
# - StandardizedVariants.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs and normalization_tests.rs files into git.
import collections
import urllib.request

UNICODE_VERSION = "13.0.0"
UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION

PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs)]
"""

NormalizationTest = collections.namedtuple(
    "NormalizationTest",
    ["source", "nfc", "nfd", "nfkc", "nfkd"],
)

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}

# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT

class UnicodeData(object):
    def __init__(self):
        self._load_unicode_data()
        self.norm_props = self._load_norm_props()
        self.norm_tests = self._load_norm_tests()

        self.canon_comp = self._compute_canonical_comp()
        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()

        self.cjk_compat_variants_fully_decomp = {}
        self._load_cjk_compat_ideograph_variants()

        def stats(name, table):
            count = sum(len(v) for v in table.values())
            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))

        print("Decomposition table stats:")
        stats("Canonical decomp", self.canon_decomp)
        stats("Compatible decomp", self.compat_decomp)
        stats("Canonical fully decomp", self.canon_fully_decomp)
        stats("Compatible fully decomp", self.compat_fully_decomp)
        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)

        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()

    def _fetch(self, filename):
        resp = urllib.request.urlopen(UCD_URL + filename)
        return resp.read().decode('utf-8')

    def _load_unicode_data(self):
        self.name_to_char_int = {}
        self.combining_classes = {}
        self.compat_decomp = {}
        self.canon_decomp = {}
        self.general_category_mark = []
        self.general_category_public_assigned = []

        assigned_start = 0;
        prev_char_int = -1;
        prev_name = "";

        for line in self._fetch("UnicodeData.txt").splitlines():
            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
            pieces = line.split(';')
            assert len(pieces) == 15
            char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            name = pieces[1].strip()
            self.name_to_char_int[name] = char_int

            if cc != '0':
                self.combining_classes[char_int] = cc

            if decomp.startswith('<'):
                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
            elif decomp != '':
                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]

            if category == 'M' or 'M' in expanded_categories.get(category, []):
                self.general_category_mark.append(char_int)

            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
            if category not in ['Co', 'Cs']:
                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
                    assigned_start = char_int
                prev_char_int = char_int
                prev_name = name;

        self.general_category_public_assigned.append((assigned_start, prev_char_int))

    def _load_cjk_compat_ideograph_variants(self):
        for line in self._fetch("StandardizedVariants.txt").splitlines():
            strip_comments = line.split('#', 1)[0].strip()
            if not strip_comments:
                continue

            variation_sequence, description, differences = strip_comments.split(';')
            description = description.strip()

            # Don't use variations that only apply in particular shaping environments.
            if differences:
                continue

            # Look for entries where the description field is a codepoint name.
            if description not in self.name_to_char_int:
                continue

            # Only consider the CJK Compatibility Ideographs.
            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
                continue

            char_int = self.name_to_char_int[description]

            assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
            # If we ever need to handle Hangul here, we'll need to handle it separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
            for c in cjk_compat_variant_parts:
                assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts

    def _load_norm_props(self):
        props = collections.defaultdict(list)

        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
            (prop_data, _, _) = line.partition("#")
            prop_pieces = prop_data.split(";")

            if len(prop_pieces) < 2:
                continue

            assert len(prop_pieces) <= 3
            (low, _, high) = prop_pieces[0].strip().partition("..")

            prop = prop_pieces[1].strip()

            data = None
            if len(prop_pieces) == 3:
                data = prop_pieces[2].strip()

            props[prop].append((low, high, data))

        return props

    def _load_norm_tests(self):
        tests = []
        for line in self._fetch("NormalizationTest.txt").splitlines():
            (test_data, _, _) = line.partition("#")
            test_pieces = test_data.split(";")

            if len(test_pieces) < 5:
                continue

            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))

        return tests

    def _compute_canonical_comp(self):
        canon_comp = {}
        comp_exclusions = [
            (int(low, 16), int(high or low, 16))
            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
        ]
        for char_int, decomp in self.canon_decomp.items():
            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
                continue

            assert len(decomp) == 2
            assert (decomp[0], decomp[1]) not in canon_comp
            canon_comp[(decomp[0], decomp[1])] = char_int

        return canon_comp

    def _compute_fully_decomposed(self):
        """
        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time with modest
        increase to the table size. Then, for these precomputed tables, we
        note that 1) compatible decomposition is a subset of canonical
        decomposition and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatible table for
        characters that decompose the same way under canonical decomposition.

        Decomposition table stats:
        Canonical decomp: 2060 chars => 3085 decomposed chars
        Compatible decomp: 3662 chars => 5440 decomposed chars
        Canonical fully decomp: 2060 chars => 3404 decomposed chars
        Compatible fully decomp: 3678 chars => 5599 decomposed chars

        The upshot is that decomposition code is very simple and easy to inline
        at mild code size cost.
        """
        def _decompose(char_int, compatible):
            # 7-bit ASCII never decomposes
            if char_int <= 0x7f:
                yield char_int
                return

            # Assert that we're handling Hangul separately.
            assert not (S_BASE <= char_int < S_BASE + S_COUNT)

            decomp = self.canon_decomp.get(char_int)
            if decomp is not None:
                for decomposed_ch in decomp:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            if compatible and char_int in self.compat_decomp:
                for decomposed_ch in self.compat_decomp[char_int]:
                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
                        yield fully_decomposed_ch
                return

            yield char_int
            return

        end_codepoint = max(
            max(self.canon_decomp.keys()),
            max(self.compat_decomp.keys()),
        )

        canon_fully_decomp = {}
        compat_fully_decomp = {}

        for char_int in range(0, end_codepoint + 1):
            # Always skip Hangul, since it's more efficient to represent its
            # decomposition programmatically.
            if S_BASE <= char_int < S_BASE + S_COUNT:
                continue

            canon = list(_decompose(char_int, False))
            if not (len(canon) == 1 and canon[0] == char_int):
                canon_fully_decomp[char_int] = canon

            compat = list(_decompose(char_int, True))
            if not (len(compat) == 1 and compat[0] == char_int):
                compat_fully_decomp[char_int] = compat

        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
        # need to store their overlap when they agree. When they don't agree,
        # store the decomposition in the compatibility table since we'll check
        # that first when normalizing to NFKD.
        assert set(canon_fully_decomp) <= set(compat_fully_decomp)

        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
                del compat_fully_decomp[ch]

        return canon_fully_decomp, compat_fully_decomp

    def _compute_stream_safe_tables(self):
        """
        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to be able to know the number of contiguous non-starters *after*
        applying compatibility decomposition to each character.

        We can do this incrementally by computing the number of leading and
        trailing non-starters for each character's compatibility decomposition
        with the following rules:

        1) If a character is not affected by compatibility decomposition, look
           up its canonical combining class to find out if it's a non-starter.
        2) All Hangul characters are starters, even under decomposition.
        3) Otherwise, very few decomposing characters have a nonzero count
           of leading or trailing non-starters, so store these characters
           with their associated counts in a separate table.
        """
        leading_nonstarters = {}
        trailing_nonstarters = {}

        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]

            num_leading = 0
            for d in decomposed:
                if d not in self.combining_classes:
                    break
                num_leading += 1

            num_trailing = 0
            for d in reversed(decomposed):
                if d not in self.combining_classes:
                    break
                num_trailing += 1

            if num_leading > 0:
                leading_nonstarters[c] = num_leading
            if num_trailing > 0:
                trailing_nonstarters[c] = num_trailing

        return leading_nonstarters, trailing_nonstarters

hexify = lambda c: '{:04X}'.format(c)

# Test whether `first` and `last` are corresponding "<..., First>" and
# "<..., Last>" markers.
def is_first_and_last(first, last):
    if not first.startswith('<') or not first.endswith(', First>'):
        return False
    if not last.startswith('<') or not last.endswith(', Last>'):
        return False
    return first[1:-8] == last[1:-7]

def gen_mph_data(name, d, kv_type, kv_callback):
    (salt, keys) = minimal_perfect_hash(d)
    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
    for s in salt:
        out.write("    0x{:x},\n".format(s))
    out.write("];\n")
    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
    for k in keys:
        out.write("    {},\n".format(kv_callback(k)))
    out.write("];\n\n")

def gen_combining_class(combining_classes, out):
    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))

def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    (salt, keys) = minimal_perfect_hash(table)
    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))

    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
    out.write("    match (c1, c2) {\n")
    for (c1, c2), c3 in sorted(canon_comp.items()):
        if c1 >= 0x10000 and c2 >= 0x10000:
            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))

    out.write("        _ => None,\n")
    out.write("    }\n")
    out.write("}\n")

def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
    for table, name in tables:
        gen_mph_data(name + '_decomposed', table, "(u32, &'static [char])",
            lambda k: "(0x{:x}, &[{}])".format(k,
                ", ".join("'\\u{%s}'" % hexify(c) for c in table[k])))

def gen_qc_match(prop_table, out):
    out.write("    match c {\n")

    for low, high, data in prop_table:
        assert data in ('N', 'M')
        result = "No" if data == 'N' else "Maybe"
        if high:
            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
        else:
            out.write(r"        '\u{%s}' => %s," % (low, result))
        out.write("\n")

    out.write("        _ => Yes,\n")
    out.write("    }\n")

def gen_nfc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFC_QC'], out)
    out.write("}\n")

def gen_nfkc_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKC_QC'], out)
    out.write("}\n")

def gen_nfd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFD_QC'], out)
    out.write("}\n")

def gen_nfkd_qc(prop_tables, out):
    out.write("#[inline]\n")
    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
    gen_qc_match(prop_tables['NFKD_QC'], out)
    out.write("}\n")

def gen_combining_mark(general_category_mark, out):
    gen_mph_data('combining_mark', general_category_mark, 'u32',
        lambda k: '0x{:04x}'.format(k))

def gen_public_assigned(general_category_public_assigned, out):
    # This could be done as a hash but the table is somewhat small.
    out.write("#[inline]\n")
    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
    out.write("    match c {\n")

    start = True
    for first, last in general_category_public_assigned:
        if start:
            out.write("        ")
            start = False
        else:
            out.write("        | ")
        if first == last:
            out.write("'\\u{%s}'\n" % hexify(first))
        else:
            out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
    out.write("        => true,\n")

    out.write("        _ => false,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

def gen_stream_safe(leading, trailing, out):
    # This could be done as a hash but the table is very small.
    out.write("#[inline]\n")
    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
    out.write("    match c {\n")

    for char, num_leading in sorted(leading.items()):
        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))

    out.write("        _ => 0,\n")
    out.write("    }\n")
    out.write("}\n")
    out.write("\n")

    gen_mph_data('trailing_nonstarters', trailing, 'u32',
        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))

def gen_tests(tests, out):
    out.write("""#[derive(Debug)]
pub struct NormalizationTest {
    pub source: &'static str,
    pub nfc: &'static str,
    pub nfd: &'static str,
    pub nfkc: &'static str,
    pub nfkd: &'static str,
}

""")

    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)

    for test in tests:
        out.write("    NormalizationTest {\n")
        out.write("        source: %s,\n" % str_literal(test.source))
        out.write("        nfc: %s,\n" % str_literal(test.nfc))
        out.write("        nfd: %s,\n" % str_literal(test.nfd))
        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
        out.write("    },\n")

    out.write("];\n")

# Guaranteed to be less than n.
def my_hash(x, salt, n):
    # This is hash based on the theory that multiplication is efficient
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

# Compute minimal perfect hash function, d can be either a dict or list of keys.
def minimal_perfect_hash(d):
    n = len(d)
    buckets = dict((h, []) for h in range(n))
    for key in d:
        h = my_hash(key, 0, n)
        buckets[h].append(key)
    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse = True)
    claimed = [False] * n
    salts = [0] * n
    keys = [0] * n
    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. But we're not doing that so we can
        # avoid the branch.
        if bucket_size == 0:
            break
        else:
            for salt in range(1, 32768):
                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[hash] for hash in rehashes):
                    if len(set(rehashes)) < bucket_size:
                        continue
                    salts[h] = salt
                    for key in buckets[h]:
                        rehash = my_hash(key, salt, n)
                        claimed[rehash] = True
                        keys[rehash] = key
                    break
        if salts[h] == 0:
            print("minimal perfect hashing failed")
            # Note: if this happens (because of unfortunate data), then there are
            # a few things that could be done. First, the hash function could be
            # tweaked. Second, the bucket order could be scrambled (especially the
            # singletons). Right now, the buckets are sorted, which has the advantage
            # of being deterministic.
            #
            # As a more extreme approach, the singleton bucket optimization could be
            # applied (give the direct address for singleton buckets, rather than
            # relying on a rehash). That is definitely the more standard approach in
            # the minimal perfect hashing literature, but in testing the branch was a
            # significant slowdown.
            exit(1)
    return (salts, keys)

if __name__ == '__main__':
    data = UnicodeData()
    with open("tables.rs", "w", newline = "\n") as out:
        out.write(PREAMBLE)
        out.write("use crate::quick_check::IsNormalized;\n")
        out.write("use crate::quick_check::IsNormalized::*;\n")
        out.write("\n")

        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
        out.write("#[allow(unused)]\n")
        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)

        gen_combining_class(data.combining_classes, out)
        out.write("\n")

        gen_composition_table(data.canon_comp, out)
        out.write("\n")

        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)

        gen_combining_mark(data.general_category_mark, out)
        out.write("\n")

        gen_public_assigned(data.general_category_public_assigned, out)
        out.write("\n")

        gen_nfc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkc_qc(data.norm_props, out)
        out.write("\n")

        gen_nfd_qc(data.norm_props, out)
        out.write("\n")

        gen_nfkd_qc(data.norm_props, out)
        out.write("\n")

        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
        out.write("\n")

    with open("normalization_tests.rs", "w", newline = "\n") as out:
        out.write(PREAMBLE)
        gen_tests(data.norm_tests, out)
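For orientation, the generated `_SALT`/`_KV` tables are consumed at runtime through the same multiply-and-shift mixing that `my_hash` implements above. A hedged Rust rendering follows; the function name `mph_hash` is illustrative, and the crate's authoritative lookup code is the vendored `src/perfect_hash.rs`.

```rust
// Editorial sketch mirroring scripts/unicode.py's my_hash(); the result is
// always < n, so it can index the generated tables directly.
fn mph_hash(x: u32, salt: u32, n: usize) -> usize {
    let y = x.wrapping_add(salt).wrapping_mul(2654435769); // Fibonacci hashing
    let y = y ^ x.wrapping_mul(0x31415926);
    ((y as u64 * n as u64) >> 32) as usize
}

fn main() {
    let n = 1024; // illustrative table size
    let bucket = mph_hash(0x30A, 0, n); // first-level probe for U+030A
    assert!(bucket < n);
    println!("bucket for U+030A: {}", bucket);
}
```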
zeroidc/vendor/unicode-normalization/src/__test_api.rs (new vendored file, 18 lines)
@@ -0,0 +1,18 @@
// This crate comprises hacks and glue required to test private functions from tests/
//
// Keep this as slim as possible.
//
// If you're caught using this outside this crates tests/, you get to clean up the mess.

#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;

use crate::stream_safe::StreamSafe;

pub fn stream_safe(s: &str) -> String {
    StreamSafe::new(s.chars()).collect()
}

pub mod quick_check {
    pub use crate::quick_check::*;
}
zeroidc/vendor/unicode-normalization/src/decompose.rs (new vendored file, 161 lines)
@@ -0,0 +1,161 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use core::iter::Fuse;
use core::ops::Range;
use tinyvec::TinyVec;

#[derive(Clone)]
enum DecompositionType {
    Canonical,
    Compatible,
}

/// External iterator for a string decomposition's characters.
#[derive(Clone)]
pub struct Decompositions<I> {
    kind: DecompositionType,
    iter: Fuse<I>,

    // This buffer stores pairs of (canonical combining class, character),
    // pushed onto the end in text order.
    //
    // It's divided into up to three sections:
    // 1) A prefix that is free space;
    // 2) "Ready" characters which are sorted and ready to emit on demand;
    // 3) A "pending" block which stills needs more characters for us to be able
    //    to sort in canonical order and is not safe to emit.
    buffer: TinyVec<[(u8, char); 4]>,
    ready: Range<usize>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Canonical,
        iter: iter.fuse(),
        buffer: TinyVec::new(),
        ready: 0..0,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Decompositions<I> {
    Decompositions {
        kind: self::DecompositionType::Compatible,
        iter: iter.fuse(),
        buffer: TinyVec::new(),
        ready: 0..0,
    }
}

impl<I> Decompositions<I> {
    #[inline]
    fn push_back(&mut self, ch: char) {
        let class = super::char::canonical_combining_class(ch);

        if class == 0 {
            self.sort_pending();
            self.buffer.push((class, ch));
            self.ready.end = self.buffer.len();
        } else {
            self.buffer.push((class, ch));
        }
    }

    #[inline]
    fn sort_pending(&mut self) {
        // NB: `sort_by_key` is stable, so it will preserve the original text's
        // order within a combining class.
        self.buffer[self.ready.end..].sort_by_key(|k| k.0);
    }

    #[inline]
    fn reset_buffer(&mut self) {
        // Equivalent to `self.buffer.drain(0..self.ready.end)`
        // but faster than drain() if the buffer is a SmallVec or TinyVec
        let pending = self.buffer.len() - self.ready.end;
        for i in 0..pending {
            self.buffer[i] = self.buffer[i + self.ready.end];
        }
        self.buffer.truncate(pending);
        self.ready = 0..0;
    }

    #[inline]
    fn increment_next_ready(&mut self) {
        let next = self.ready.start + 1;
        if next == self.ready.end {
            self.reset_buffer();
        } else {
            self.ready.start = next;
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for Decompositions<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        while self.ready.end == 0 {
            match (self.iter.next(), &self.kind) {
                (Some(ch), &DecompositionType::Canonical) => {
                    super::char::decompose_canonical(ch, |d| self.push_back(d));
                }
                (Some(ch), &DecompositionType::Compatible) => {
                    super::char::decompose_compatible(ch, |d| self.push_back(d));
                }
                (None, _) => {
                    if self.buffer.is_empty() {
                        return None;
                    } else {
                        self.sort_pending();
                        self.ready.end = self.buffer.len();

                        // This implementation means that we can call `next`
                        // on an exhausted iterator; the last outer `next` call
                        // will result in an inner `next` call. To make this
                        // safe, we use `fuse`.
                        break;
                    }
                }
            }
        }

        // We can assume here that, if `self.ready.end` is greater than zero,
        // it's also greater than `self.ready.start`. That's because we only
        // increment `self.ready.start` inside `increment_next_ready`, and
        // whenever it reaches equality with `self.ready.end`, we reset both
        // to zero, maintaining the invariant that:
        //     self.ready.start < self.ready.end || self.ready.end == self.ready.start == 0
        //
        // This less-than-obviously-safe implementation is chosen for performance,
        // minimizing the number & complexity of branches in `next` in the common
        // case of buffering then unbuffering a single character with each call.
        let (_, ch) = self.buffer[self.ready.start];
        self.increment_next_ready();
        Some(ch)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Decompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
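To see the buffering logic above in action: `Decompositions<I>` emits decomposed scalars lazily, sorting pending combining marks by canonical combining class before releasing them. A small editorial sketch using the public `nfd()` entry point:

```rust
// Editorial sketch (not a vendored file): Decompositions<I> in use via nfd().
use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+00C5 ('Å') canonically decomposes to 'A' + U+030A.
    let mut it = "\u{c5}".nfd();
    assert_eq!(it.next(), Some('A'));
    assert_eq!(it.next(), Some('\u{30a}'));
    assert_eq!(it.next(), None);
}
```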
199
zeroidc/vendor/unicode-normalization/src/lib.rs
vendored
Normal file
@@ -0,0 +1,199 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode character composition and decomposition utilities
//! as described in
//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
//!
//! ```rust
//! extern crate unicode_normalization;
//!
//! use unicode_normalization::char::compose;
//! use unicode_normalization::UnicodeNormalization;
//!
//! fn main() {
//!     assert_eq!(compose('A','\u{30a}'), Some('Å'));
//!
//!     let s = "ÅΩ";
//!     let c = s.nfc().collect::<String>();
//!     assert_eq!(c, "ÅΩ");
//! }
//! ```
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-normalization = "0.1.19"
//! ```

#![deny(missing_docs, unsafe_code)]
#![doc(
    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
)]
#![cfg_attr(not(feature = "std"), no_std)]

#[cfg(not(feature = "std"))]
extern crate alloc;

#[cfg(feature = "std")]
extern crate core;

extern crate tinyvec;

pub use crate::decompose::Decompositions;
pub use crate::quick_check::{
    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
    IsNormalized,
};
pub use crate::recompose::Recompositions;
pub use crate::replace::Replacements;
pub use crate::stream_safe::StreamSafe;
pub use crate::tables::UNICODE_VERSION;
use core::str::Chars;

mod no_std_prelude;

mod decompose;
mod lookups;
mod normalize;
mod perfect_hash;
mod quick_check;
mod recompose;
mod replace;
mod stream_safe;

#[rustfmt::skip]
mod tables;

#[doc(hidden)]
pub mod __test_api;
#[cfg(test)]
mod test;

/// Methods for composing and decomposing characters.
pub mod char {
    pub use crate::normalize::{
        compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
    };

    pub use crate::lookups::{canonical_combining_class, is_combining_mark};

    /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
    /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
    /// of Unicode.
    pub use crate::tables::is_public_assigned;
}

/// Methods for iterating over strings while applying Unicode normalizations
/// as described in
/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
pub trait UnicodeNormalization<I: Iterator<Item = char>> {
    /// Returns an iterator over the string in Unicode Normalization Form D
    /// (canonical decomposition).
    fn nfd(self) -> Decompositions<I>;

    /// Returns an iterator over the string in Unicode Normalization Form KD
    /// (compatibility decomposition).
    fn nfkd(self) -> Decompositions<I>;

    /// An Iterator over the string in Unicode Normalization Form C
    /// (canonical decomposition followed by canonical composition).
    fn nfc(self) -> Recompositions<I>;

    /// An Iterator over the string in Unicode Normalization Form KC
    /// (compatibility decomposition followed by canonical composition).
    fn nfkc(self) -> Recompositions<I>;

    /// A transformation which replaces CJK Compatibility Ideograph codepoints
    /// with normal forms using Standardized Variation Sequences. This is not
    /// part of the canonical or compatibility decomposition algorithms, but
    /// performing it before those algorithms produces normalized output which
    /// better preserves the intent of the original text.
    ///
    /// Note that many systems today ignore variation selectors, so these
    /// may not immediately help text display as intended, but they at
    /// least preserve the information in a standardized form, giving
    /// implementations the option to recognize them.
    fn cjk_compat_variants(self) -> Replacements<I>;

    /// An Iterator over the string with Conjoining Grapheme Joiner characters
    /// inserted according to the Stream-Safe Text Process (UAX15-D4)
    fn stream_safe(self) -> StreamSafe<I>;
}

impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
    #[inline]
    fn nfd(self) -> Decompositions<Chars<'a>> {
        decompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkd(self) -> Decompositions<Chars<'a>> {
        decompose::new_compatible(self.chars())
    }

    #[inline]
    fn nfc(self) -> Recompositions<Chars<'a>> {
        recompose::new_canonical(self.chars())
    }

    #[inline]
    fn nfkc(self) -> Recompositions<Chars<'a>> {
        recompose::new_compatible(self.chars())
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
        replace::new_cjk_compat_variants(self.chars())
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
        StreamSafe::new(self.chars())
    }
}

impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
    #[inline]
    fn nfd(self) -> Decompositions<I> {
        decompose::new_canonical(self)
    }

    #[inline]
    fn nfkd(self) -> Decompositions<I> {
        decompose::new_compatible(self)
    }

    #[inline]
    fn nfc(self) -> Recompositions<I> {
        recompose::new_canonical(self)
    }

    #[inline]
    fn nfkc(self) -> Recompositions<I> {
        recompose::new_compatible(self)
    }

    #[inline]
    fn cjk_compat_variants(self) -> Replacements<I> {
        replace::new_cjk_compat_variants(self)
    }

    #[inline]
    fn stream_safe(self) -> StreamSafe<I> {
        StreamSafe::new(self)
    }
}
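Editor's note: the trait is implemented both for `&str` and for any `char` iterator, so normalization composes with other iterator adapters. A minimal usage sketch (not part of the vendored file; assumes the crate as a dependency):

use unicode_normalization::UnicodeNormalization;

fn main() {
    // Directly on a &str:
    assert_eq!("e\u{301}".nfc().collect::<String>(), "\u{e9}");

    // On an arbitrary char iterator, e.g. after filtering:
    let folded: String = "A\u{30a}B".chars().filter(|c| *c != 'B').nfc().collect();
    assert_eq!(folded, "\u{c5}"); // "Å"
}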
135
zeroidc/vendor/unicode-normalization/src/lookups.rs
vendored
Normal file
@@ -0,0 +1,135 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Lookups of unicode properties using minimal perfect hashing.

use crate::perfect_hash::mph_lookup;
use crate::tables::*;

/// Look up the canonical combining class for a codepoint.
///
/// The value returned is as defined in the Unicode Character Database.
pub fn canonical_combining_class(c: char) -> u8 {
    mph_lookup(
        c.into(),
        CANONICAL_COMBINING_CLASS_SALT,
        CANONICAL_COMBINING_CLASS_KV,
        u8_lookup_fk,
        u8_lookup_fv,
        0,
    )
}

pub(crate) fn composition_table(c1: char, c2: char) -> Option<char> {
    if c1 < '\u{10000}' && c2 < '\u{10000}' {
        mph_lookup(
            (c1 as u32) << 16 | (c2 as u32),
            COMPOSITION_TABLE_SALT,
            COMPOSITION_TABLE_KV,
            pair_lookup_fk,
            pair_lookup_fv_opt,
            None,
        )
    } else {
        composition_table_astral(c1, c2)
    }
}

pub(crate) fn canonical_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        CANONICAL_DECOMPOSED_SALT,
        CANONICAL_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
}

pub(crate) fn compatibility_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        COMPATIBILITY_DECOMPOSED_SALT,
        COMPATIBILITY_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
}

pub(crate) fn cjk_compat_variants_fully_decomposed(c: char) -> Option<&'static [char]> {
    mph_lookup(
        c.into(),
        CJK_COMPAT_VARIANTS_DECOMPOSED_SALT,
        CJK_COMPAT_VARIANTS_DECOMPOSED_KV,
        pair_lookup_fk,
        pair_lookup_fv_opt,
        None,
    )
}

/// Return whether the given character is a combining mark (`General_Category=Mark`)
pub fn is_combining_mark(c: char) -> bool {
    mph_lookup(
        c.into(),
        COMBINING_MARK_SALT,
        COMBINING_MARK_KV,
        bool_lookup_fk,
        bool_lookup_fv,
        false,
    )
}

pub fn stream_safe_trailing_nonstarters(c: char) -> usize {
    mph_lookup(
        c.into(),
        TRAILING_NONSTARTERS_SALT,
        TRAILING_NONSTARTERS_KV,
        u8_lookup_fk,
        u8_lookup_fv,
        0,
    ) as usize
}

/// Extract the key in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fk(kv: u32) -> u32 {
    kv >> 8
}

/// Extract the value in a 24 bit key and 8 bit value packed in a u32.
#[inline]
fn u8_lookup_fv(kv: u32) -> u8 {
    (kv & 0xff) as u8
}

/// Extract the key for a boolean lookup.
#[inline]
fn bool_lookup_fk(kv: u32) -> u32 {
    kv
}

/// Extract the value for a boolean lookup.
#[inline]
fn bool_lookup_fv(_kv: u32) -> bool {
    true
}

/// Extract the key in a pair.
#[inline]
fn pair_lookup_fk<T>(kv: (u32, T)) -> u32 {
    kv.0
}

/// Extract the value in a pair, returning an option.
#[inline]
fn pair_lookup_fv_opt<T>(kv: (u32, T)) -> Option<T> {
    Some(kv.1)
}
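Editor's note: of these lookups, `canonical_combining_class` and `is_combining_mark` are re-exported publicly via the crate's `char` module. A short sketch of what they return (not part of the vendored file; assumes the crate as a dependency):

use unicode_normalization::char::{canonical_combining_class, is_combining_mark};

fn main() {
    // U+0301 COMBINING ACUTE ACCENT: a nonspacing mark with combining class 230.
    assert_eq!(canonical_combining_class('\u{301}'), 230);
    assert!(is_combining_mark('\u{301}'));

    // Starters (e.g. ASCII letters) have combining class 0 and are not marks.
    assert_eq!(canonical_combining_class('a'), 0);
    assert!(!is_combining_mark('a'));
}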
6
zeroidc/vendor/unicode-normalization/src/no_std_prelude.rs
vendored
Normal file
@@ -0,0 +1,6 @@
#[cfg(not(feature = "std"))]
pub use alloc::{
    str::Chars,
    string::{String, ToString},
    vec::Vec,
};
201
zeroidc/vendor/unicode-normalization/src/normalize.rs
vendored
Normal file
@@ -0,0 +1,201 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Functions for computing canonical and compatible decompositions for Unicode characters.
use crate::lookups::{
    canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
    compatibility_fully_decomposed, composition_table,
};

use core::{char, ops::FnMut};

/// Compute canonical Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_canonical<F>(c: char, emit_char: F)
where
    F: FnMut(char),
{
    decompose(c, canonical_fully_decomposed, emit_char)
}

/// Compute canonical or compatible Unicode decomposition for character.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
#[inline]
pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
    let decompose_char =
        |c| compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    decompose(c, decompose_char, emit_char)
}

/// Compute standard-variation decomposition for character.
///
/// [Standardized Variation Sequences] are used instead of the standard canonical
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
/// to avoid losing information. See the
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
/// "Other Enhancements" section of the
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
/// for more information.
#[inline]
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
where
    F: FnMut(char),
{
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        emit_char(c);
        return;
    }

    // Don't perform decomposition for Hangul

    if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
        for &d in decomposed {
            emit_char(d);
        }
        return;
    }

    // Finally bottom out.
    emit_char(c);
}

#[inline]
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
where
    D: Fn(char) -> Option<&'static [char]>,
    F: FnMut(char),
{
    // 7-bit ASCII never decomposes
    if c <= '\x7f' {
        emit_char(c);
        return;
    }

    // Perform decomposition for Hangul
    if is_hangul_syllable(c) {
        decompose_hangul(c, emit_char);
        return;
    }

    if let Some(decomposed) = decompose_char(c) {
        for &d in decomposed {
            emit_char(d);
        }
        return;
    }

    // Finally bottom out.
    emit_char(c);
}

/// Compose two characters into a single character, if possible.
/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
/// for more information.
pub fn compose(a: char, b: char) -> Option<char> {
    compose_hangul(a, b).or_else(|| composition_table(a, b))
}

// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;

const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;

// Composition only occurs for `TPart`s in `U+11A8 ... U+11C2`,
// i.e. `T_BASE + 1 ... T_LAST`.
const T_FIRST: u32 = T_BASE + 1;

pub(crate) fn is_hangul_syllable(c: char) -> bool {
    (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
}

// Decompose a precomposed Hangul syllable
#[allow(unsafe_code)]
#[inline(always)]
fn decompose_hangul<F>(s: char, mut emit_char: F)
where
    F: FnMut(char),
{
    let s_index = s as u32 - S_BASE;
    let l_index = s_index / N_COUNT;
    unsafe {
        emit_char(char::from_u32_unchecked(L_BASE + l_index));

        let v_index = (s_index % N_COUNT) / T_COUNT;
        emit_char(char::from_u32_unchecked(V_BASE + v_index));

        let t_index = s_index % T_COUNT;
        if t_index > 0 {
            emit_char(char::from_u32_unchecked(T_BASE + t_index));
        }
    }
}

#[inline]
pub(crate) fn hangul_decomposition_length(s: char) -> usize {
    let si = s as u32 - S_BASE;
    let ti = si % T_COUNT;
    if ti > 0 {
        3
    } else {
        2
    }
}

// Compose a pair of Hangul Jamo
#[allow(unsafe_code)]
#[inline(always)]
#[allow(ellipsis_inclusive_range_patterns)]
fn compose_hangul(a: char, b: char) -> Option<char> {
    let (a, b) = (a as u32, b as u32);
    match (a, b) {
        // Compose a leading consonant and a vowel together into an LV_Syllable
        (L_BASE...L_LAST, V_BASE...V_LAST) => {
            let l_index = a - L_BASE;
            let v_index = b - V_BASE;
            let lv_index = l_index * N_COUNT + v_index * T_COUNT;
            let s = S_BASE + lv_index;
            Some(unsafe { char::from_u32_unchecked(s) })
        }
        // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
        (S_BASE...S_LAST, T_FIRST...T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
            Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
        }
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::compose_hangul;

    // Regression test from a bugfix where we were composing an LV_Syllable with
    // T_BASE directly. (We should only compose an LV_Syllable with a character
    // in the range `T_BASE + 1 ... T_LAST`.)
    #[test]
    fn test_hangul_composition() {
        assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
    }
}
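Editor's note: the Hangul arithmetic above is exercised through the crate's public `char` module. A minimal sketch (not part of the vendored file; assumes the crate as a dependency): decomposing the syllable U+AC01 yields its three conjoining jamo, and `compose` reassembles them pairwise:

use unicode_normalization::char::{compose, decompose_canonical};

fn main() {
    // U+AC01 = S_BASE + 1, i.e. L = U+1100, V = U+1161, T = U+11A8.
    let mut jamo = Vec::new();
    decompose_canonical('\u{ac01}', |c| jamo.push(c));
    assert_eq!(jamo, ['\u{1100}', '\u{1161}', '\u{11a8}']);

    // Composition is pairwise: L+V gives the LV syllable, LV+T gives LVT.
    let lv = compose('\u{1100}', '\u{1161}').unwrap(); // U+AC00
    assert_eq!(compose(lv, '\u{11a8}'), Some('\u{ac01}'));
}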
50
zeroidc/vendor/unicode-normalization/src/perfect_hash.rs
vendored
Normal file
@@ -0,0 +1,50 @@
// Copyright 2019 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Support for lookups based on minimal perfect hashing.

// This function is based on multiplication being fast and is "good enough". Also
// it can share some work between the unsalted and salted versions.
#[inline]
fn my_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

/// Do a lookup using minimal perfect hashing.
///
/// The table is stored as a sequence of "salt" values, then a sequence of
/// values that contain packed key/value pairs. The strategy is to hash twice.
/// The first hash retrieves a salt value that makes the second hash unique.
/// The hash function doesn't have to be very good, just good enough that the
/// resulting map is unique.
#[inline]
pub(crate) fn mph_lookup<KV, V, FK, FV>(
    x: u32,
    salt: &[u16],
    kv: &[KV],
    fk: FK,
    fv: FV,
    default: V,
) -> V
where
    KV: Copy,
    FK: Fn(KV) -> u32,
    FV: Fn(KV) -> V,
{
    let s = salt[my_hash(x, 0, salt.len())] as u32;
    let key_val = kv[my_hash(x, s, salt.len())];
    if x == fk(key_val) {
        fv(key_val)
    } else {
        default
    }
}
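Editor's note: a toy, hypothetical illustration of the two-level scheme described in the doc comment above (the crate's real salt tables are precomputed offline and shipped in tables.rs; this is not its table-generation code). Hash once with salt 0 to pick a per-bucket salt, then hash again with that salt to pick the final slot; a key comparison catches misses:

fn toy_hash(key: u32, salt: u32, n: usize) -> usize {
    let y = key.wrapping_add(salt).wrapping_mul(2654435769);
    let y = y ^ key.wrapping_mul(0x31415926);
    (((y as u64) * (n as u64)) >> 32) as usize
}

fn main() {
    let keys = [3u32, 17, 40, 95];
    let n = keys.len();

    // Build: brute-force a single salt that makes the second hash injective
    // over this key set (the real tables store one salt per first-level bucket).
    let salt = (1u32..)
        .find(|&s| {
            let mut seen = vec![false; n];
            keys.iter().all(|&k| {
                let slot = toy_hash(k, s, n);
                !std::mem::replace(&mut seen[slot], true)
            })
        })
        .unwrap();

    // Store packed (key, value) pairs at their salted slots.
    let mut table = vec![(0u32, 0u32); n];
    for &k in &keys {
        table[toy_hash(k, salt, n)] = (k, k * 10);
    }

    // Lookup: hash, then confirm the stored key matches (else a default).
    let probe = |x: u32| {
        let (k, v) = table[toy_hash(x, salt, n)];
        if k == x { Some(v) } else { None }
    };
    assert_eq!(probe(17), Some(170));
    assert_eq!(probe(18), None);
}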
187
zeroidc/vendor/unicode-normalization/src/quick_check.rs
vendored
Normal file
@@ -0,0 +1,187 @@
use crate::lookups::canonical_combining_class;
use crate::stream_safe;
use crate::tables;
use crate::UnicodeNormalization;

/// The QuickCheck algorithm can quickly determine if a text is or isn't
/// normalized without any allocations in many cases, but it has to be able to
/// return `Maybe` when a full decomposition and recomposition is necessary.
#[derive(Debug, Eq, PartialEq)]
pub enum IsNormalized {
    /// The text is definitely normalized.
    Yes,
    /// The text is definitely not normalized.
    No,
    /// The text may be normalized.
    Maybe,
}

// https://unicode.org/reports/tr15/#Detecting_Normalization_Forms
#[inline]
fn quick_check<F, I>(s: I, is_allowed: F, stream_safe: bool) -> IsNormalized
where
    I: Iterator<Item = char>,
    F: Fn(char) -> IsNormalized,
{
    let mut last_cc = 0u8;
    let mut nonstarter_count = 0;
    let mut result = IsNormalized::Yes;
    for ch in s {
        // For ASCII we know it's always allowed and a starter
        if ch <= '\x7f' {
            last_cc = 0;
            nonstarter_count = 0;
            continue;
        }

        // Otherwise, lookup the combining class and QC property
        let cc = canonical_combining_class(ch);
        if last_cc > cc && cc != 0 {
            return IsNormalized::No;
        }
        match is_allowed(ch) {
            IsNormalized::Yes => (),
            IsNormalized::No => return IsNormalized::No,
            IsNormalized::Maybe => {
                result = IsNormalized::Maybe;
            }
        }
        if stream_safe {
            let decomp = stream_safe::classify_nonstarters(ch);

            // If we're above `MAX_NONSTARTERS`, we're definitely *not*
            // stream-safe normalized.
            if nonstarter_count + decomp.leading_nonstarters > stream_safe::MAX_NONSTARTERS {
                return IsNormalized::No;
            }
            if decomp.leading_nonstarters == decomp.decomposition_len {
                nonstarter_count += decomp.decomposition_len;
            } else {
                nonstarter_count = decomp.trailing_nonstarters;
            }
        }
        last_cc = cc;
    }
    result
}

/// Quickly check if a string is in NFC, potentially returning
/// `IsNormalized::Maybe` if further checks are necessary. In this case a check
/// like `s.chars().nfc().eq(s.chars())` should suffice.
#[inline]
pub fn is_nfc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfc, false)
}

/// Quickly check if a string is in NFKC.
#[inline]
pub fn is_nfkc_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfkc, false)
}

/// Quickly check if a string is in NFD.
#[inline]
pub fn is_nfd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfd, false)
}

/// Quickly check if a string is in NFKD.
#[inline]
pub fn is_nfkd_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfkd, false)
}

/// Quickly check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfc, true)
}

/// Quickly check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe_quick<I: Iterator<Item = char>>(s: I) -> IsNormalized {
    quick_check(s, tables::qc_nfd, true)
}

/// Authoritatively check if a string is in NFC.
#[inline]
pub fn is_nfc(s: &str) -> bool {
    match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
    }
}

/// Authoritatively check if a string is in NFKC.
#[inline]
pub fn is_nfkc(s: &str) -> bool {
    match is_nfkc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfkc()),
    }
}

/// Authoritatively check if a string is in NFD.
#[inline]
pub fn is_nfd(s: &str) -> bool {
    match is_nfd_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfd()),
    }
}

/// Authoritatively check if a string is in NFKD.
#[inline]
pub fn is_nfkd(s: &str) -> bool {
    match is_nfkd_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfkd()),
    }
}

/// Authoritatively check if a string is Stream-Safe NFC.
#[inline]
pub fn is_nfc_stream_safe(s: &str) -> bool {
    match is_nfc_stream_safe_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfc()),
    }
}

/// Authoritatively check if a string is Stream-Safe NFD.
#[inline]
pub fn is_nfd_stream_safe(s: &str) -> bool {
    match is_nfd_stream_safe_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().stream_safe().nfd()),
    }
}

#[cfg(test)]
mod tests {
    use super::{is_nfc_stream_safe_quick, is_nfd_stream_safe_quick, IsNormalized};

    #[test]
    fn test_stream_safe_nfd() {
        let okay = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(okay.chars()), IsNormalized::Yes);

        let too_much = "Da\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}ngerzone";
        assert_eq!(is_nfd_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }

    #[test]
    fn test_stream_safe_nfc() {
        let okay = "ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(okay.chars()), IsNormalized::Maybe);

        let too_much = "not ok\u{e0}\u{031b}\u{0316}\u{0317}\u{0318}\u{0319}\u{031c}\u{031d}\u{031e}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{031a}y";
        assert_eq!(is_nfc_stream_safe_quick(too_much.chars()), IsNormalized::No);
    }
}
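Editor's note: a minimal sketch of the intended quick/authoritative split (not part of the vendored file; assumes the crate as a dependency). Use the quick check first and fall back to a full normalization comparison only on `Maybe`, which is exactly what the convenience wrappers above do:

use unicode_normalization::{is_nfc, is_nfc_quick, IsNormalized, UnicodeNormalization};

fn main() {
    // Pure ASCII is trivially NFC: the quick check answers Yes, no allocation.
    assert_eq!(is_nfc_quick("hello".chars()), IsNormalized::Yes);

    // U+2126 OHM SIGN canonically decomposes to U+03A9, so it can never be NFC.
    assert_eq!(is_nfc_quick("\u{2126}".chars()), IsNormalized::No);

    // A combining mark after a starter might recompose: the quick check says Maybe.
    assert_eq!(is_nfc_quick("e\u{301}".chars()), IsNormalized::Maybe);

    // The authoritative wrapper resolves Maybe by comparing against full NFC:
    assert!(is_nfc("\u{e9}"));
    assert!(!is_nfc("e\u{301}"));

    // Equivalent manual fallback:
    let s = "e\u{301}";
    let _ok = match is_nfc_quick(s.chars()) {
        IsNormalized::Yes => true,
        IsNormalized::No => false,
        IsNormalized::Maybe => s.chars().eq(s.chars().nfc()),
    };
}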
154
zeroidc/vendor/unicode-normalization/src/recompose.rs
vendored
Normal file
@@ -0,0 +1,154 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use crate::decompose::Decompositions;
use core::fmt::{self, Write};
use tinyvec::TinyVec;

#[derive(Clone)]
enum RecompositionState {
    Composing,
    Purging(usize),
    Finished(usize),
}

/// External iterator for a string recomposition's characters.
#[derive(Clone)]
pub struct Recompositions<I> {
    iter: Decompositions<I>,
    state: RecompositionState,
    buffer: TinyVec<[char; 4]>,
    composee: Option<char>,
    last_ccc: Option<u8>,
}

#[inline]
pub fn new_canonical<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
    Recompositions {
        iter: super::decompose::new_canonical(iter),
        state: self::RecompositionState::Composing,
        buffer: TinyVec::new(),
        composee: None,
        last_ccc: None,
    }
}

#[inline]
pub fn new_compatible<I: Iterator<Item = char>>(iter: I) -> Recompositions<I> {
    Recompositions {
        iter: super::decompose::new_compatible(iter),
        state: self::RecompositionState::Composing,
        buffer: TinyVec::new(),
        composee: None,
        last_ccc: None,
    }
}

impl<I: Iterator<Item = char>> Iterator for Recompositions<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        use self::RecompositionState::*;

        loop {
            match self.state {
                Composing => {
                    for ch in self.iter.by_ref() {
                        let ch_class = super::char::canonical_combining_class(ch);
                        let k = match self.composee {
                            None => {
                                if ch_class != 0 {
                                    return Some(ch);
                                }
                                self.composee = Some(ch);
                                continue;
                            }
                            Some(k) => k,
                        };
                        match self.last_ccc {
                            None => match super::char::compose(k, ch) {
                                Some(r) => {
                                    self.composee = Some(r);
                                    continue;
                                }
                                None => {
                                    if ch_class == 0 {
                                        self.composee = Some(ch);
                                        return Some(k);
                                    }
                                    self.buffer.push(ch);
                                    self.last_ccc = Some(ch_class);
                                }
                            },
                            Some(l_class) => {
                                if l_class >= ch_class {
                                    // `ch` is blocked from `composee`
                                    if ch_class == 0 {
                                        self.composee = Some(ch);
                                        self.last_ccc = None;
                                        self.state = Purging(0);
                                        return Some(k);
                                    }
                                    self.buffer.push(ch);
                                    self.last_ccc = Some(ch_class);
                                    continue;
                                }
                                match super::char::compose(k, ch) {
                                    Some(r) => {
                                        self.composee = Some(r);
                                        continue;
                                    }
                                    None => {
                                        self.buffer.push(ch);
                                        self.last_ccc = Some(ch_class);
                                    }
                                }
                            }
                        }
                    }
                    self.state = Finished(0);
                    if self.composee.is_some() {
                        return self.composee.take();
                    }
                }
                Purging(next) => match self.buffer.get(next).cloned() {
                    None => {
                        self.buffer.clear();
                        self.state = Composing;
                    }
                    s => {
                        self.state = Purging(next + 1);
                        return s;
                    }
                },
                Finished(next) => match self.buffer.get(next).cloned() {
                    None => {
                        self.buffer.clear();
                        return self.composee.take();
                    }
                    s => {
                        self.state = Finished(next + 1);
                        return s;
                    }
                },
            }
        }
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Recompositions<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
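Editor's note: a brief sketch of what the state machine above achieves (not part of the vendored file; assumes the crate as a dependency). The iterator greedily pairs each combining mark with the current composee; a mark that yields no composite is buffered and emitted after the starter:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // An unblocked combining acute recomposes with its starter:
    assert_eq!("e\u{301}".nfc().collect::<String>(), "\u{e9}");

    // U+0300 composes first (e + grave = è); U+0301 then has no composite
    // with è, so it remains as a trailing combining mark.
    assert_eq!(
        "e\u{300}\u{301}".nfc().collect::<String>(),
        "\u{e8}\u{301}"
    );
}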
61
zeroidc/vendor/unicode-normalization/src/replace.rs
vendored
Normal file
@@ -0,0 +1,61 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use core::fmt::{self, Write};
use tinyvec::ArrayVec;

/// External iterator for replacements for a string's characters.
#[derive(Clone)]
pub struct Replacements<I> {
    iter: I,
    // At this time, the longest replacement sequence has length 2, so we just
    // need buffer space for 1 codepoint.
    buffer: Option<char>,
}

#[inline]
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
    Replacements { iter, buffer: None }
}

impl<I: Iterator<Item = char>> Iterator for Replacements<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        if let Some(c) = self.buffer.take() {
            return Some(c);
        }

        match self.iter.next() {
            Some(ch) => {
                // At this time, the longest replacement sequence has length 2.
                let mut buffer = ArrayVec::<[char; 2]>::new();
                super::char::decompose_cjk_compat_variants(ch, |d| buffer.push(d));
                self.buffer = buffer.get(1).copied();
                Some(buffer[0])
            }
            None => None,
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}

impl<I: Iterator<Item = char> + Clone> fmt::Display for Replacements<I> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        for c in self.clone() {
            f.write_char(c)?;
        }
        Ok(())
    }
}
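Editor's note: a usage sketch (not part of the vendored file; assumes the crate as a dependency, and assumes U+FA10, whose canonical singleton decomposition is U+585A, is covered by the crate's variation table). The exact variation selector chosen comes from the vendored tables, so the sketch checks only the base ideograph and sequence length:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // Ordinary characters pass through unchanged.
    assert_eq!("abc".cjk_compat_variants().collect::<String>(), "abc");

    // A CJK Compatibility Ideograph is replaced by its unified base ideograph
    // plus a variation selector (two codepoints), instead of the lossy
    // singleton decomposition that NFC/NFD would apply.
    let replaced: Vec<char> = "\u{fa10}".cjk_compat_variants().collect();
    assert_eq!(replaced.len(), 2);
    assert_eq!(replaced[0], '\u{585a}');
}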
170
zeroidc/vendor/unicode-normalization/src/stream_safe.rs
vendored
Normal file
@@ -0,0 +1,170 @@
use crate::lookups::{
    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
    stream_safe_trailing_nonstarters,
};
use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
use crate::tables::stream_safe_leading_nonstarters;

pub(crate) const MAX_NONSTARTERS: usize = 30;
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';

/// UAX15-D4: This iterator keeps track of how many non-starters there have been
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
/// (U+034F) if the count exceeds 30.
pub struct StreamSafe<I> {
    iter: I,
    nonstarter_count: usize,
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    pub(crate) fn new(iter: I) -> Self {
        Self {
            iter,
            nonstarter_count: 0,
            buffer: None,
        }
    }
}

impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<char> {
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            None => return None,
            Some(c) => c,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
        // nonstarters in NFKD.
        if d.leading_nonstarters == d.decomposition_len {
            self.nonstarter_count += d.decomposition_len;
        }
        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
        else {
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}

#[derive(Debug)]
pub(crate) struct Decomposition {
    pub(crate) leading_nonstarters: usize,
    pub(crate) trailing_nonstarters: usize,
    pub(crate) decomposition_len: usize,
}

#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    // As usual, fast path for ASCII (which is always a starter)
    if c <= '\x7f' {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: 1,
        };
    }
    // Next, special case Hangul, since it's not handled by our tables.
    if is_hangul_syllable(c) {
        return Decomposition {
            leading_nonstarters: 0,
            trailing_nonstarters: 0,
            decomposition_len: hangul_decomposition_length(c),
        };
    }
    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
    match decomp {
        Some(decomp) => Decomposition {
            leading_nonstarters: stream_safe_leading_nonstarters(c),
            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
            decomposition_len: decomp.len(),
        },
        None => {
            let is_nonstarter = canonical_combining_class(c) != 0;
            let nonstarter = if is_nonstarter { 1 } else { 0 };
            Decomposition {
                leading_nonstarters: nonstarter,
                trailing_nonstarters: nonstarter,
                decomposition_len: 1,
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use crate::lookups::canonical_combining_class;
    use crate::normalize::decompose_compatible;

    #[cfg(not(feature = "std"))]
    use crate::no_std_prelude::*;

    use core::char;

    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
        assert_eq!(stream_safe(too_much), fixed_it);

        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
        assert_eq!(stream_safe(woah_nelly), its_cool);
    }

    #[test]
    fn test_all_nonstarters() {
        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
        assert_eq!(stream_safe(s), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for ch in 0..0x2FA1E {
            let ch = match char::from_u32(ch) {
                Some(c) => c,
                None => continue,
            };
            let c = classify_nonstarters(ch);
            let mut s = Vec::new();
            decompose_compatible(ch, |c| s.push(c));

            assert_eq!(s.len(), c.decomposition_len);

            let num_leading = s
                .iter()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();
            let num_trailing = s
                .iter()
                .rev()
                .take_while(|&c| canonical_combining_class(*c) != 0)
                .count();

            assert_eq!(num_leading, c.leading_nonstarters);
            assert_eq!(num_trailing, c.trailing_nonstarters);
        }
    }
}
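Editor's note: a sketch of the UAX15-D4 behavior from the outside (not part of the vendored file; assumes the crate as a dependency). After 30 consecutive nonstarters, a CGJ (U+034F) is inserted so downstream normalizers can work with bounded buffers:

use unicode_normalization::UnicodeNormalization;

fn main() {
    // 31 copies of U+0300 (a nonstarter): the 31st must be preceded by a CGJ.
    let input: String = std::iter::repeat('\u{300}').take(31).collect();
    let safe: String = input.chars().stream_safe().collect();

    let chars: Vec<char> = safe.chars().collect();
    assert_eq!(chars.len(), 32);
    assert_eq!(chars[30], '\u{034f}'); // Combining Grapheme Joiner
    assert_eq!(chars[31], '\u{300}');
}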
26020
zeroidc/vendor/unicode-normalization/src/tables.rs
vendored
Normal file
File diff suppressed because it is too large
125
zeroidc/vendor/unicode-normalization/src/test.rs
vendored
Normal file
@@ -0,0 +1,125 @@
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::char::is_combining_mark;
use super::UnicodeNormalization;
use core::char;

#[cfg(not(feature = "std"))]
use crate::no_std_prelude::*;

#[test]
fn test_nfd() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfd().to_string(), $expected);
            // A dummy iterator that is not std::str::Chars directly;
            // note that `id_func` is used to ensure `Clone` implementation
            assert_eq!(
                $input.chars().map(|c| c).nfd().collect::<String>(),
                $expected
            );
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "d\u{307}\u{1c4}");
    t!("\u{2026}", "\u{2026}");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
    t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
    t!("a\u{301}", "a\u{301}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
    t!("\u{ac1c}", "\u{1100}\u{1162}");
}

#[test]
fn test_nfkd() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfkd().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "d\u{307}DZ\u{30c}");
    t!("\u{2026}", "...");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "d\u{323}\u{307}");
    t!("\u{1e0d}\u{307}", "d\u{323}\u{307}");
    t!("a\u{301}", "a\u{301}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{1111}\u{1171}\u{11b6}");
    t!("\u{ac1c}", "\u{1100}\u{1162}");
}

#[test]
fn test_nfc() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfc().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "\u{1e0b}\u{1c4}");
    t!("\u{2026}", "\u{2026}");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
    t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
    t!("a\u{301}", "\u{e1}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{d4db}");
    t!("\u{ac1c}", "\u{ac1c}");
    t!(
        "a\u{300}\u{305}\u{315}\u{5ae}b",
        "\u{e0}\u{5ae}\u{305}\u{315}b"
    );
}

#[test]
fn test_nfkc() {
    macro_rules! t {
        ($input: expr, $expected: expr) => {
            assert_eq!($input.nfkc().to_string(), $expected);
        };
    }
    t!("abc", "abc");
    t!("\u{1e0b}\u{1c4}", "\u{1e0b}D\u{17d}");
    t!("\u{2026}", "...");
    t!("\u{2126}", "\u{3a9}");
    t!("\u{1e0b}\u{323}", "\u{1e0d}\u{307}");
    t!("\u{1e0d}\u{307}", "\u{1e0d}\u{307}");
    t!("a\u{301}", "\u{e1}");
    t!("\u{301}a", "\u{301}a");
    t!("\u{d4db}", "\u{d4db}");
    t!("\u{ac1c}", "\u{ac1c}");
    t!(
        "a\u{300}\u{305}\u{315}\u{5ae}b",
        "\u{e0}\u{5ae}\u{305}\u{315}b"
    );
}

#[test]
fn test_is_combining_mark_ascii() {
    for cp in 0..0x7f {
        assert!(!is_combining_mark(char::from_u32(cp).unwrap()));
    }
}

#[test]
fn test_is_combining_mark_misc() {
    // https://github.com/unicode-rs/unicode-normalization/issues/16
    // U+11C3A BHAIKSUKI VOWEL SIGN O
    // Category: Mark, Nonspacing [Mn]
    assert!(is_combining_mark('\u{11C3A}'));

    // U+11C3F BHAIKSUKI SIGN VIRAMA
    // Category: Mark, Nonspacing [Mn]
    assert!(is_combining_mark('\u{11C3F}'));
}