RPM build fix (reverted CI changes which will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.
This commit is contained in:
1
zeroidc/vendor/encoding_rs/.cargo-checksum.json
vendored
Normal file
1
zeroidc/vendor/encoding_rs/.cargo-checksum.json
vendored
Normal file
File diff suppressed because one or more lines are too long
48
zeroidc/vendor/encoding_rs/CONTRIBUTING.md
vendored
Normal file
48
zeroidc/vendor/encoding_rs/CONTRIBUTING.md
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
If you send a pull request / patch, please observe the following.
|
||||
|
||||
## Licensing
|
||||
|
||||
Since this crate is dual-licensed,
|
||||
[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions)
|
||||
is considered to apply in the sense of Contributions being automatically
|
||||
under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file).
|
||||
That is, by the act of offering a Contribution, you place your Contribution
|
||||
under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT`
|
||||
file. Please do not contribute if you aren't willing or allowed to license your
|
||||
contributions in this manner.
|
||||
|
||||
You are encouraged to dedicate test code that you contribute to the Public
|
||||
Domain using the CC0 dedication. If you contribute test code that is not
|
||||
dedicated to the Public Domain, please be sure not to put it in a part of
|
||||
source code that the comments designate as being dedicated to the Public
|
||||
Domain.
|
||||
|
||||
## Copyright Notices
|
||||
|
||||
If you require the addition of your copyright notice, it's up to you to edit in
|
||||
your notice as part of your Contribution. Not adding a copyright notice is
|
||||
taken as a waiver of copyright notice.
|
||||
|
||||
## No Encodings Beyond The Encoding Standard
|
||||
|
||||
Please do not contribute implementations of encodings that are not specified
|
||||
in the [Encoding Standard](https://encoding.spec.whatwg.org/).
|
||||
|
||||
For example, an implementation of UTF-7 is explicitly out of scope for this
|
||||
crate and is, therefore, provided by the [`charset`](https://crates.io/crates/charset)
|
||||
crate instead. For single-byte DOS encodings, please see the
|
||||
[`oem_cp`](https://crates.io/crates/oem_cp) crate.
|
||||
|
||||
## Compatibility with Stable Rust
|
||||
|
||||
Please ensure that your Contribution compiles with the latest stable-channel
|
||||
rustc.
|
||||
|
||||
## rustfmt
|
||||
|
||||
The `rustfmt` version used for this code is `rustfmt-nightly`. Please either
|
||||
use that version or avoid using `rustfmt` (so as not to reformat all the code).
|
||||
|
||||
## Unit tests
|
||||
|
||||
Please ensure that `cargo test` succeeds.
|
||||
17
zeroidc/vendor/encoding_rs/COPYRIGHT
vendored
Normal file
17
zeroidc/vendor/encoding_rs/COPYRIGHT
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
encoding_rs is copyright Mozilla Foundation.
|
||||
|
||||
Licensed under the Apache License, Version 2.0
|
||||
<LICENSE-APACHE or
|
||||
https://www.apache.org/licenses/LICENSE-2.0> or the MIT
|
||||
license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
|
||||
at your option. All files in the project carrying such
|
||||
notice may not be copied, modified, or distributed except
|
||||
according to those terms.
|
||||
|
||||
This crate includes data derived from the data files supplied
|
||||
with the WHATWG Encoding Standard, which, when incorporated into
|
||||
source code, are licensed under the BSD 3-Clause License
|
||||
<LICENSE-WHATWG>.
|
||||
|
||||
Test code within encoding_rs is dedicated to the Public Domain when so
|
||||
designated (see the individual files for PD/CC0-dedicated sections).
|
||||
84
zeroidc/vendor/encoding_rs/Cargo.toml
vendored
Normal file
84
zeroidc/vendor/encoding_rs/Cargo.toml
vendored
Normal file
@@ -0,0 +1,84 @@
|
||||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies.
|
||||
#
|
||||
# If you are reading this file be aware that the original Cargo.toml
|
||||
# will likely look very different (and much more reasonable).
|
||||
# See Cargo.toml.orig for the original contents.
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "encoding_rs"
|
||||
version = "0.8.31"
|
||||
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
|
||||
description = "A Gecko-oriented implementation of the Encoding Standard"
|
||||
homepage = "https://docs.rs/encoding_rs/"
|
||||
documentation = "https://docs.rs/encoding_rs/"
|
||||
readme = "README.md"
|
||||
keywords = [
|
||||
"encoding",
|
||||
"web",
|
||||
"unicode",
|
||||
"charset",
|
||||
]
|
||||
categories = [
|
||||
"text-processing",
|
||||
"encoding",
|
||||
"web-programming",
|
||||
"internationalization",
|
||||
]
|
||||
license = "(Apache-2.0 OR MIT) AND BSD-3-Clause"
|
||||
repository = "https://github.com/hsivonen/encoding_rs"
|
||||
|
||||
[profile.release]
|
||||
lto = true
|
||||
|
||||
[dependencies.cfg-if]
|
||||
version = "1.0"
|
||||
|
||||
[dependencies.packed_simd]
|
||||
version = "0.3.4"
|
||||
optional = true
|
||||
package = "packed_simd_2"
|
||||
|
||||
[dependencies.serde]
|
||||
version = "1.0"
|
||||
optional = true
|
||||
|
||||
[dev-dependencies.bincode]
|
||||
version = "1.0"
|
||||
|
||||
[dev-dependencies.serde_derive]
|
||||
version = "1.0"
|
||||
|
||||
[dev-dependencies.serde_json]
|
||||
version = "1.0"
|
||||
|
||||
[features]
|
||||
alloc = []
|
||||
default = ["alloc"]
|
||||
fast-big5-hanzi-encode = []
|
||||
fast-gb-hanzi-encode = []
|
||||
fast-hangul-encode = []
|
||||
fast-hanja-encode = []
|
||||
fast-kanji-encode = []
|
||||
fast-legacy-encode = [
|
||||
"fast-hangul-encode",
|
||||
"fast-hanja-encode",
|
||||
"fast-kanji-encode",
|
||||
"fast-gb-hanzi-encode",
|
||||
"fast-big5-hanzi-encode",
|
||||
]
|
||||
less-slow-big5-hanzi-encode = []
|
||||
less-slow-gb-hanzi-encode = []
|
||||
less-slow-kanji-encode = []
|
||||
simd-accel = [
|
||||
"packed_simd",
|
||||
"packed_simd/into_bits",
|
||||
]
|
||||
|
||||
[badges.travis-ci]
|
||||
repository = "hsivonen/encoding_rs"
|
||||
106
zeroidc/vendor/encoding_rs/Ideas.md
vendored
Normal file
106
zeroidc/vendor/encoding_rs/Ideas.md
vendored
Normal file
@@ -0,0 +1,106 @@
|
||||
This document contains notes about various ideas that for one reason or another
|
||||
are not being actively pursued.
|
||||
|
||||
## Next byte is non-ASCII after ASCII optimization
|
||||
|
||||
The current plan for a SIMD-accelerated inner loop for handling ASCII bytes
|
||||
makes no use of the bit of information that if the buffers didn't end but the
|
||||
ASCII loop exited, the next byte will not be an ASCII byte.
|
||||
|
||||
## Handling ASCII with table lookups when decoding single-byte to UTF-16
|
||||
|
||||
Both uconv and ICU outperform encoding_rs when decoding single-byte to UTF-16.
|
||||
uconv doesn't even do anything fancy to manually unroll the loop (see below).
|
||||
Both handle even the ASCII range using table lookup. That is, there's no branch
|
||||
for checking if we're in the lower or upper half of the encoding.
|
||||
|
||||
However, adding SIMD acceleration for the ASCII half will likely be a bigger
|
||||
win than eliminating the branch to decide ASCII vs. non-ASCII.
|
||||
|
||||
## Manual loop unrolling for single-byte encodings
|
||||
|
||||
ICU currently outperforms encoding_rs (by over x2!) when decoding a single-byte
|
||||
encoding to UTF-16. This appears to be thanks to manually unrolling the
|
||||
conversion loop by 16. See [ucnv_MBCSSingleToBMPWithOffsets][1].
|
||||
|
||||
[1]: https://ssl.icu-project.org/repos/icu/icu/tags/release-55-1/source/common/ucnvmbcs.cpp
|
||||
|
||||
Notably, none of the single-byte encodings have bytes that'd decode to the
|
||||
upper half of BMP. Therefore, if the unmappable marker has the highest bit set
|
||||
instead of being zero, the check for unmappables within a 16-character stride
|
||||
can be done either by ORing the BMP characters in the stride together and
|
||||
checking the high bit or by loading the upper halves of the BMP characters
|
||||
in a `u8x8` register and checking the high bits using the `_mm_movemask_epi8`
|
||||
/ `pmovmskb` SSE2 instruction.
|
||||
|
||||
## After non-ASCII, handle ASCII punctuation without SIMD
|
||||
|
||||
Since the failure mode of SIMD ASCII acceleration involves wasted alignment
|
||||
checks and a wasted SIMD read when the next code unit is non-ASCII and non-Latin
|
||||
scripts have runs of non-ASCII even if ASCII spaces and punctuation are used,
|
||||
consider handling the next two or three bytes following non-ASCII as non-SIMD
|
||||
before looping back to the SIMD mode. Maybe move back to SIMD ASCII faster if
|
||||
there's ASCII that's not space or punctuation. Maybe with the "space or
|
||||
punctuation" check in place, this code can be allowed to be in place even for
|
||||
UTF-8 and Latin single-byte (i.e. not having different code for Latin and
|
||||
non-Latin single-byte).
|
||||
|
||||
## Prefer maintaining alignment
|
||||
|
||||
Instead of returning to acceleration directly after non-ASCII, consider
|
||||
continuing to the alignment boundary without acceleration.
|
||||
|
||||
## Read from SIMD lanes instead of RAM (cache) when ASCII check fails
|
||||
|
||||
When the SIMD ASCII check fails, the data has already been read from memory.
|
||||
Test whether it's faster to read the data by lane from the SIMD register than
|
||||
to read it again from RAM (cache).
|
||||
|
||||
## Use Level 2 Hanzi and Level 2 Kanji ordering
|
||||
|
||||
These two are ordered by radical and then by stroke count, so in principle,
|
||||
they should be mostly Unicode-ordered, although at least Level 2 Hanzi isn't
|
||||
fully Unicode-ordered. Is "mostly" good enough for encode acceleration?
|
||||
|
||||
## Create a `divmod_94()` function
|
||||
|
||||
Experiment with a function that computes `(i / 94, i % 94)` more efficiently
|
||||
than generic code.
|
||||
|
||||
## Align writes on Aarch64
|
||||
|
||||
On [Cortex-A57](https://stackoverflow.com/questions/45714535/performance-of-unaligned-simd-load-store-on-aarch64/45938112#45938112),
|
||||
it might be a good idea to move the destination into 16-byte alignment.
|
||||
|
||||
## Unalign UTF-8 validation on Aarch64
|
||||
|
||||
Currently, Aarch64 runs the generic ALU UTF-8 validation code that aligns
|
||||
reads. That's probably unnecessary on Aarch64. (SIMD was slower than ALU!)
|
||||
|
||||
## Table-driven UTF-8 validation
|
||||
|
||||
When there are at least four bytes left, read all four. With each byte
|
||||
index into tables corresponding to magic values indexable by byte in
|
||||
each position.
|
||||
|
||||
In the value read from the table indexed by lead byte, encode the
|
||||
following in 16 bits: advance 2 bits (2, 3 or 4 bytes), 9 positional
|
||||
bits one of which is set to indicate the type of lead byte (8 valid
|
||||
types, in the 8 lowest bits, and invalid, ASCII would be tenth type),
|
||||
and the mask for extracting the payload bits from the lead byte
|
||||
(for conversion to UTF-16 or UTF-32).
|
||||
|
||||
In the tables indexable by the trail bytes, in each position
|
||||
corresponding to the lead byte type, store 1 if the trail is
|
||||
invalid given the lead and 0 if valid given the lead.
|
||||
|
||||
Use the low 8 bits of the 16 bits read from the first
|
||||
table to mask (bitwise AND) one positional bit from each of the
|
||||
three other values. Bitwise OR the results together with the
|
||||
bit that is 1 if the lead is invalid. If the result is zero,
|
||||
the sequence is valid. Otherwise it's invalid.
|
||||
|
||||
Use the advance to advance. In the conversion to UTF-16 or
|
||||
UTF-32 case, use the mask for extracting the meaningful
|
||||
bits from the lead byte to mask them from the lead. Shift
|
||||
left by 6 as many times as the advance indicates, etc.
|
||||
202
zeroidc/vendor/encoding_rs/LICENSE-APACHE
vendored
Normal file
202
zeroidc/vendor/encoding_rs/LICENSE-APACHE
vendored
Normal file
@@ -0,0 +1,202 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
25
zeroidc/vendor/encoding_rs/LICENSE-MIT
vendored
Normal file
25
zeroidc/vendor/encoding_rs/LICENSE-MIT
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
Copyright Mozilla Foundation
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
26
zeroidc/vendor/encoding_rs/LICENSE-WHATWG
vendored
Normal file
26
zeroidc/vendor/encoding_rs/LICENSE-WHATWG
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
Copyright © WHATWG (Apple, Google, Mozilla, Microsoft).
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
821
zeroidc/vendor/encoding_rs/README.md
vendored
Normal file
821
zeroidc/vendor/encoding_rs/README.md
vendored
Normal file
@@ -0,0 +1,821 @@
|
||||
# encoding_rs
|
||||
|
||||
[Build status](https://travis-ci.org/hsivonen/encoding_rs)
|
||||
[crates.io](https://crates.io/crates/encoding_rs)
|
||||
[docs.rs](https://docs.rs/encoding_rs/)
|
||||
|
||||
encoding_rs is an implementation of the (non-JavaScript parts of) the
|
||||
[Encoding Standard](https://encoding.spec.whatwg.org/) written in Rust.
|
||||
|
||||
The Encoding Standard defines the Web-compatible set of character encodings,
|
||||
which means this crate can be used to decode Web content. encoding_rs is
|
||||
used in Gecko starting with Firefox 56. Due to the notable overlap between
|
||||
the legacy encodings on the Web and the legacy encodings used on Windows,
|
||||
this crate may be of use for non-Web-related situations as well; see below
|
||||
for links to adjacent crates.
|
||||
|
||||
Additionally, the `mem` module provides various operations for dealing with
|
||||
in-RAM text (as opposed to data that's coming from or going to an IO boundary).
|
||||
The `mem` module is a module instead of a separate crate due to internal
|
||||
implementation detail efficiencies.
|
||||
|
||||
## Functionality
|
||||
|
||||
Due to the Gecko use case, encoding_rs supports decoding to and encoding from
|
||||
UTF-16 in addition to supporting the usual Rust use case of decoding to and
|
||||
encoding from UTF-8. Additionally, the API has been designed to be FFI-friendly
|
||||
to accommodate the C++ side of Gecko.
|
||||
|
||||
Specifically, encoding_rs does the following:
|
||||
|
||||
* Decodes a stream of bytes in an Encoding Standard-defined character encoding
|
||||
into valid aligned native-endian in-RAM UTF-16 (units of `u16` / `char16_t`).
|
||||
* Encodes a stream of potentially-invalid aligned native-endian in-RAM UTF-16
|
||||
(units of `u16` / `char16_t`) into a sequence of bytes in an Encoding
|
||||
Standard-defined character encoding as if the lone surrogates had been
|
||||
replaced with the REPLACEMENT CHARACTER before performing the encode.
|
||||
(Gecko's UTF-16 is potentially invalid.)
|
||||
* Decodes a stream of bytes in an Encoding Standard-defined character
|
||||
encoding into valid UTF-8.
|
||||
* Encodes a stream of valid UTF-8 into a sequence of bytes in an Encoding
|
||||
Standard-defined character encoding. (Rust's UTF-8 is guaranteed-valid.)
|
||||
* Does the above in streaming (input and output split across multiple
|
||||
buffers) and non-streaming (whole input in a single buffer and whole
|
||||
output in a single buffer) variants.
|
||||
* Avoids copying (borrows) when possible in the non-streaming cases when
|
||||
decoding to or encoding from UTF-8.
|
||||
* Resolves textual labels that identify character encodings in
|
||||
protocol text into type-safe objects representing those encodings
|
||||
conceptually.
|
||||
* Maps the type-safe encoding objects onto strings suitable for
|
||||
returning from `document.characterSet`.
|
||||
* Validates UTF-8 (in common instruction set scenarios a bit faster for Web
|
||||
workloads than the standard library; hopefully will get upstreamed some
|
||||
day) and ASCII.
|
||||
|
||||
Additionally, `encoding_rs::mem` does the following:
|
||||
|
||||
* Checks if a byte buffer contains only ASCII.
|
||||
* Checks if a potentially-invalid UTF-16 buffer contains only Basic Latin (ASCII).
|
||||
* Checks if a valid UTF-8, potentially-invalid UTF-8 or potentially-invalid UTF-16
|
||||
buffer contains only Latin1 code points (below U+0100).
|
||||
* Checks if a valid UTF-8, potentially-invalid UTF-8 or potentially-invalid UTF-16
|
||||
buffer or a code point or a UTF-16 code unit can trigger right-to-left behavior
|
||||
(suitable for checking if the Unicode Bidirectional Algorithm can be optimized
|
||||
out).
|
||||
* Combined versions of the above two checks.
|
||||
* Converts valid UTF-8, potentially-invalid UTF-8 and Latin1 to UTF-16.
|
||||
* Converts potentially-invalid UTF-16 and Latin1 to UTF-8.
|
||||
* Converts UTF-8 and UTF-16 to Latin1 (if in range).
|
||||
* Finds the first invalid code unit in a buffer of potentially-invalid UTF-16.
|
||||
* Makes a mutable buffer of potentially-invalid UTF-16 contain valid UTF-16.
|
||||
* Copies ASCII from one buffer to another up to the first non-ASCII byte.
|
||||
* Converts ASCII to UTF-16 up to the first non-ASCII byte.
|
||||
* Converts UTF-16 to ASCII up to the first non-Basic Latin code unit.
|
||||
|
||||
## Integration with `std::io`
|
||||
|
||||
Notably, the above feature list doesn't include the capability to wrap
|
||||
a `std::io::Read`, decode it into UTF-8 and present the result via
|
||||
`std::io::Read`. The [`encoding_rs_io`](https://crates.io/crates/encoding_rs_io)
|
||||
crate provides that capability.
|
||||
|
||||
## `no_std` Environment
|
||||
|
||||
The crate works in a `no_std` environment. By default, the `alloc` feature,
|
||||
which assumes that an allocator is present, is enabled. For a no-allocator
|
||||
environment, the default features (i.e. `alloc`) can be turned off. This
|
||||
makes the part of the API that returns `Vec`/`String`/`Cow` unavailable.
|
||||
|
||||
## Decoding Email
|
||||
|
||||
For decoding character encodings that occur in email, use the
|
||||
[`charset`](https://crates.io/crates/charset) crate instead of using this
|
||||
one directly. (It wraps this crate and adds UTF-7 decoding.)
|
||||
|
||||
## Windows Code Page Identifier Mappings
|
||||
|
||||
For mappings to and from Windows code page identifiers, use the
|
||||
[`codepage`](https://crates.io/crates/codepage) crate.
|
||||
|
||||
## DOS Encodings
|
||||
|
||||
This crate does not support single-byte DOS encodings that aren't required by
|
||||
the Web Platform, but the [`oem_cp`](https://crates.io/crates/oem_cp) crate does.
|
||||
|
||||
## Preparing Text for the Encoders
|
||||
|
||||
Normalizing text into Unicode Normalization Form C prior to encoding text into
|
||||
a legacy encoding minimizes unmappable characters. Text can be normalized to
|
||||
Unicode Normalization Form C using the
|
||||
[`unic-normal`](https://crates.io/crates/unic-normal) crate.
|
||||
|
||||
The exception is windows-1258, which after normalizing to Unicode Normalization
|
||||
Form C requires tone marks to be decomposed in order to minimize unmappable
|
||||
characters. Vietnamese tone marks can be decomposed using the
|
||||
[`detone`](https://crates.io/crates/detone) crate.
|
||||
|
||||
## Licensing
|
||||
|
||||
TL;DR: `(Apache-2.0 OR MIT) AND BSD-3-Clause` for the code and data combination.
|
||||
|
||||
Please see the file named
|
||||
[COPYRIGHT](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT).
|
||||
|
||||
The non-test code that isn't generated from the WHATWG data in this crate is
|
||||
under Apache-2.0 OR MIT. Test code is under CC0.
|
||||
|
||||
This crate contains code/data generated from WHATWG-supplied data. The WHATWG
|
||||
upstream changed its license for portions of specs incorporated into source code
|
||||
from CC0 to BSD-3-Clause between the initial release of this crate and the present
|
||||
version of this crate. The in-source licensing legends have been updated for the
|
||||
parts of the generated code that have changed since the upstream license change.
|
||||
|
||||
## Documentation
|
||||
|
||||
Generated [API documentation](https://docs.rs/encoding_rs/) is available
|
||||
online.
|
||||
|
||||
There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
|
||||
design and internals of the crate.
|
||||
|
||||
## C and C++ bindings
|
||||
|
||||
An FFI layer for encoding_rs is available as a
|
||||
[separate crate](https://github.com/hsivonen/encoding_c). The crate comes
|
||||
with a [demo C++ wrapper](https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h)
|
||||
using the C++ standard library and [GSL](https://github.com/Microsoft/GSL/) types.
|
||||
|
||||
The bindings for the `mem` module are in the
|
||||
[encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
|
||||
|
||||
For the Gecko context, there's a
|
||||
[C++ wrapper using the MFBT/XPCOM types](https://searchfox.org/mozilla-central/source/intl/Encoding.h#100).
|
||||
|
||||
There's a [write-up](https://hsivonen.fi/modern-cpp-in-rust/) about the C++
|
||||
wrappers.
|
||||
|
||||
## Sample programs
|
||||
|
||||
* [Rust](https://github.com/hsivonen/recode_rs)
|
||||
* [C](https://github.com/hsivonen/recode_c)
|
||||
* [C++](https://github.com/hsivonen/recode_cpp)
|
||||
|
||||
## Optional features
|
||||
|
||||
There are currently these optional cargo features:
|
||||
|
||||
### `simd-accel`
|
||||
|
||||
Enables SIMD acceleration using the nightly-dependent `packed_simd_2` crate.
|
||||
|
||||
This is an opt-in feature, because enabling this feature _opts out_ of Rust's
|
||||
guarantees of future compilers compiling old code (aka. "stability story").
|
||||
|
||||
Currently, this has not been tested to be an improvement except for these
|
||||
targets:
|
||||
|
||||
* x86_64
|
||||
* i686
|
||||
* aarch64
|
||||
* thumbv7neon
|
||||
|
||||
If you use nightly Rust, you use targets whose first component is one of the
|
||||
above, and you are prepared _to have to revise your configuration when updating
|
||||
Rust_, you should enable this feature. Otherwise, please _do not_ enable this
|
||||
feature.
|
||||
|
||||
_Note!_ If you are compiling for a target that does not have 128-bit SIMD
|
||||
enabled as part of the target definition and you are enabling 128-bit SIMD
|
||||
using `-C target_feature`, you need to enable the `core_arch` Cargo feature
|
||||
for `packed_simd_2` to compile a crates.io snapshot of `core_arch` instead of
|
||||
using the standard-library copy of `core::arch`, because the `core::arch`
|
||||
module of the pre-compiled standard library has been compiled with the
|
||||
assumption that the CPU doesn't have 128-bit SIMD. At present this applies
|
||||
mainly to 32-bit ARM targets whose first component does not include the
|
||||
substring `neon`.
|
||||
|
||||
The encoding_rs side of things has not been properly set up for POWER,
|
||||
PowerPC, MIPS, etc., SIMD at this time, so even if you were to follow
|
||||
the advice from the previous paragraph, you probably shouldn't use
|
||||
the `simd-accel` option on the less mainstream architectures at this
|
||||
time.
|
||||
|
||||
Used by Firefox.
|
||||
|
||||
### `serde`
|
||||
|
||||
Enables support for serializing and deserializing `&'static Encoding`-typed
|
||||
struct fields using [Serde][1].
|
||||
|
||||
[1]: https://serde.rs/
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-legacy-encode`
|
||||
|
||||
A catch-all option for enabling the fastest legacy encode options. _Does not
|
||||
affect decode speed or UTF-8 encode speed._
|
||||
|
||||
At present, this option is equivalent to enabling the following options:
|
||||
* `fast-hangul-encode`
|
||||
* `fast-hanja-encode`
|
||||
* `fast-kanji-encode`
|
||||
* `fast-gb-hanzi-encode`
|
||||
* `fast-big5-hanzi-encode`
|
||||
|
||||
Adds 176 KB to the binary size.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-hangul-encode`
|
||||
|
||||
Changes encoding precomposed Hangul syllables into EUC-KR from binary
|
||||
search over the decode-optimized tables to lookup by index making Korean
|
||||
plain-text encode about 4 times as fast as without this option.
|
||||
|
||||
Adds 20 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-hanja-encode`
|
||||
|
||||
Changes encoding of Hanja into EUC-KR from linear search over the
|
||||
decode-optimized table to lookup by index. Since Hanja is practically absent
|
||||
in modern Korean text, this option doesn't affect performance in the common
|
||||
case and mainly makes sense if you want to make your application resilient
|
||||
against denial of service by someone intentionally feeding it a lot of Hanja
|
||||
to encode into EUC-KR.
|
||||
|
||||
Adds 40 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-kanji-encode`
|
||||
|
||||
Changes encoding of Kanji into Shift_JIS, EUC-JP and ISO-2022-JP from linear
|
||||
search over the decode-optimized tables to lookup by index making Japanese
|
||||
plain-text encode to legacy encodings 30 to 50 times as fast as without this
|
||||
option (about 2 times as fast as with `less-slow-kanji-encode`).
|
||||
|
||||
Takes precedence over `less-slow-kanji-encode`.
|
||||
|
||||
Adds 36 KB to the binary size (24 KB compared to `less-slow-kanji-encode`).
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `less-slow-kanji-encode`
|
||||
|
||||
Makes JIS X 0208 Level 1 Kanji (the most common Kanji in Shift_JIS, EUC-JP and
|
||||
ISO-2022-JP) encode less slow (binary search instead of linear search) making
|
||||
Japanese plain-text encode to legacy encodings 14 to 23 times as fast as
|
||||
without this option.
|
||||
|
||||
Adds 12 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-gb-hanzi-encode`
|
||||
|
||||
Changes encoding of Hanzi in the CJK Unified Ideographs block into GBK and
|
||||
gb18030 from linear search over a part of the decode-optimized tables followed
|
||||
by a binary search over another part of the decode-optimized tables to lookup
|
||||
by index making Simplified Chinese plain-text encode to the legacy encodings
|
||||
100 to 110 times as fast as without this option (about 2.5 times as fast as
|
||||
with `less-slow-gb-hanzi-encode`).
|
||||
|
||||
Takes precedence over `less-slow-gb-hanzi-encode`.
|
||||
|
||||
Adds 36 KB to the binary size (24 KB compared to `less-slow-gb-hanzi-encode`).
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `less-slow-gb-hanzi-encode`
|
||||
|
||||
Makes GB2312 Level 1 Hanzi (the most common Hanzi in gb18030 and GBK) encode
|
||||
less slow (binary search instead of linear search) making Simplified Chinese
|
||||
plain-text encode to the legacy encodings about 40 times as fast as without
|
||||
this option.
|
||||
|
||||
Adds 12 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `fast-big5-hanzi-encode`
|
||||
|
||||
Changes encoding of Hanzi in the CJK Unified Ideographs block into Big5 from
|
||||
linear search over a part of the decode-optimized tables to lookup by index
|
||||
making Traditional Chinese plain-text encode to Big5 105 to 125 times as fast
|
||||
as without this option (about 3 times as fast as with
|
||||
`less-slow-big5-hanzi-encode`).
|
||||
|
||||
Takes precedence over `less-slow-big5-hanzi-encode`.
|
||||
|
||||
Adds 40 KB to the binary size (20 KB compared to `less-slow-big5-hanzi-encode`).
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
### `less-slow-big5-hanzi-encode`
|
||||
|
||||
Makes Big5 Level 1 Hanzi (the most common Hanzi in Big5) encode less slow
|
||||
(binary search instead of linear search) making Traditional Chinese
|
||||
plain-text encode to Big5 about 36 times as fast as without this option.
|
||||
|
||||
Adds 20 KB to the binary size.
|
||||
|
||||
Does _not_ affect decode speed.
|
||||
|
||||
Not used by Firefox.
|
||||
|
||||
## Performance goals
|
||||
|
||||
For decoding to UTF-16, the goal is to perform at least as well as Gecko's old
|
||||
uconv. For decoding to UTF-8, the goal is to perform at least as well as
|
||||
rust-encoding. These goals have been achieved.
|
||||
|
||||
Encoding to UTF-8 should be fast. (UTF-8 to UTF-8 encode should be equivalent
|
||||
to `memcpy` and UTF-16 to UTF-8 should be fast.)
|
||||
|
||||
Speed is a non-goal when encoding to legacy encodings. By default, encoding to
|
||||
legacy encodings should not be optimized for speed at the expense of code size
|
||||
as long as form submission and URL parsing in Gecko don't become noticeably
|
||||
too slow in real-world use.
|
||||
|
||||
In the interest of binary size, by default, encoding_rs does not have
|
||||
encode-specific data tables beyond 32 bits of encode-specific data for each
|
||||
single-byte encoding. Therefore, encoders search the decode-optimized data
|
||||
tables. This is a linear search in most cases. As a result, by default, encode
|
||||
to legacy encodings varies from slow to extremely slow relative to other
|
||||
libraries. Still, with realistic work loads, this seemed fast enough not to be
|
||||
user-visibly slow on Raspberry Pi 3 (which stood in for a phone for testing)
|
||||
in the Web-exposed encoder use cases.
|
||||
|
||||
See the cargo features above for optionally making CJK legacy encode fast.
|
||||
|
||||
A framework for measuring performance is [available separately][2].
|
||||
|
||||
[2]: https://github.com/hsivonen/encoding_bench/
|
||||
|
||||
## Rust Version Compatibility
|
||||
|
||||
It is a goal to support the latest stable Rust, the latest nightly Rust and
|
||||
the version of Rust that's used for Firefox Nightly.
|
||||
|
||||
At this time, there is no firm commitment to support a version older than
|
||||
what's required by Firefox, and there is no commitment to treat MSRV changes
|
||||
as semver-breaking, because this crate depends on `cfg-if`, which doesn't
|
||||
appear to treat MSRV changes as semver-breaking, so it would be useless for
|
||||
this crate to treat MSRV changes as semver-breaking.
|
||||
|
||||
As of 2021-02-04, MSRV appears to be Rust 1.36.0 for using the crate and
|
||||
1.42.0 for doc tests to pass without errors about the global allocator.
|
||||
|
||||
## Compatibility with rust-encoding
|
||||
|
||||
A compatibility layer that implements the rust-encoding API on top of
|
||||
encoding_rs is
|
||||
[provided as a separate crate](https://github.com/hsivonen/encoding_rs_compat)
|
||||
(cannot be uploaded to crates.io). The compatibility layer was originally
|
||||
written with the assumption that Firefox would need it, but it is not currently
|
||||
used in Firefox.
|
||||
|
||||
## Regenerating Generated Code
|
||||
|
||||
To regenerate the generated code:
|
||||
|
||||
* Have Python 2 installed.
|
||||
* Clone [`https://github.com/hsivonen/encoding_c`](https://github.com/hsivonen/encoding_c)
|
||||
next to the `encoding_rs` directory.
|
||||
* Clone [`https://github.com/hsivonen/codepage`](https://github.com/hsivonen/codepage)
|
||||
next to the `encoding_rs` directory.
|
||||
* Clone [`https://github.com/whatwg/encoding`](https://github.com/whatwg/encoding)
|
||||
next to the `encoding_rs` directory.
|
||||
* Checkout revision `be3337450e7df1c49dca7872153c4c4670dd8256` of the `encoding` repo.
|
||||
(Note: `f381389` was the revision of `encoding` used from before the `encoding` repo
|
||||
license change. So far, only output changed since then has been updated to
|
||||
the new license legend.)
|
||||
* With the `encoding_rs` directory as the working directory, run
|
||||
`python generate-encoding-data.py`.
|
||||
|
||||
## Roadmap
|
||||
|
||||
- [x] Design the low-level API.
|
||||
- [x] Provide Rust-only convenience features.
|
||||
- [x] Provide an stl/gsl-flavored C++ API.
|
||||
- [x] Implement all decoders and encoders.
|
||||
- [x] Add unit tests for all decoders and encoders.
|
||||
- [x] Finish BOM sniffing variants in Rust-only convenience features.
|
||||
- [x] Document the API.
|
||||
- [x] Publish the crate on crates.io.
|
||||
- [x] Create a solution for measuring performance.
|
||||
- [x] Accelerate ASCII conversions using SSE2 on x86.
|
||||
- [x] Accelerate ASCII conversions using ALU register-sized operations on
|
||||
non-x86 architectures (process an `usize` instead of `u8` at a time).
|
||||
- [x] Split FFI into a separate crate so that the FFI doesn't interfere with
|
||||
LTO in pure-Rust usage.
|
||||
- [x] Compress CJK indices by making use of sequential code points as well
|
||||
as Unicode-ordered parts of indices.
|
||||
- [x] Make lookups by label or name use binary search that searches from the
|
||||
end of the label/name to the start.
|
||||
- [x] Make labels with non-ASCII bytes fail fast.
|
||||
- [ ] ~Parallelize UTF-8 validation using [Rayon](https://github.com/nikomatsakis/rayon).~
|
||||
(This turned out to be a pessimization in the ASCII case due to memory bandwidth reasons.)
|
||||
- [x] Provide an XPCOM/MFBT-flavored C++ API.
|
||||
- [x] Investigate accelerating single-byte encode with a single fast-tracked
|
||||
range per encoding.
|
||||
- [x] Replace uconv with encoding_rs in Gecko.
|
||||
- [x] Implement the rust-encoding API in terms of encoding_rs.
|
||||
- [x] Add SIMD acceleration for Aarch64.
|
||||
- [x] Investigate the use of NEON on 32-bit ARM.
|
||||
- [ ] ~Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
|
||||
adapted to Rust in rust-encoding.~
|
||||
- [x] Add actually fast CJK encode options.
|
||||
- [ ] ~Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).~
|
||||
- [ ] Provide a build mode that works without `alloc` (with lesser API surface).
|
||||
- [ ] Migrate to `std::simd` once it is stable and declare 1.0.
|
||||
|
||||
## Release Notes
|
||||
|
||||
### 0.8.31
|
||||
|
||||
* Use SPDX with parentheses now that crates.io supports parentheses.
|
||||
|
||||
### 0.8.30
|
||||
|
||||
* Update the licensing information to take into account the WHATWG data license change.
|
||||
|
||||
### 0.8.29
|
||||
|
||||
* Make the parts that use an allocator optional.
|
||||
|
||||
### 0.8.28
|
||||
|
||||
* Fix error in Serde support introduced as part of `no_std` support.
|
||||
|
||||
### 0.8.27
|
||||
|
||||
* Make the crate work in a `no_std` environment (with `alloc`).
|
||||
|
||||
### 0.8.26
|
||||
|
||||
* Fix oversights in edition 2018 migration that broke the `simd-accel` feature.
|
||||
|
||||
### 0.8.25
|
||||
|
||||
* Do pointer alignment checks in a way where intermediate steps aren't defined to be Undefined Behavior.
|
||||
* Update the `packed_simd` dependency to `packed_simd_2`.
|
||||
* Update the `cfg-if` dependency to 1.0.
|
||||
* Address warnings that have been introduced by newer Rust versions along the way.
|
||||
* Update to edition 2018, since even prior to 1.0 `cfg-if` updated to edition 2018 without a semver break.
|
||||
|
||||
### 0.8.24
|
||||
|
||||
* Avoid computing an intermediate (not dereferenced) pointer value in a manner designated as Undefined Behavior when computing pointer alignment.
|
||||
|
||||
### 0.8.23
|
||||
|
||||
* Remove year from copyright notices. (No features or bug fixes.)
|
||||
|
||||
### 0.8.22
|
||||
|
||||
* Formatting fix and new unit test. (No features or bug fixes.)
|
||||
|
||||
### 0.8.21
|
||||
|
||||
* Fixed a panic with invalid UTF-16[BE|LE] input at the end of the stream.
|
||||
|
||||
### 0.8.20
|
||||
|
||||
* Make `Decoder::latin1_byte_compatible_up_to` return `None` in more
|
||||
cases to make the method actually useful. While this could be argued
|
||||
to be a breaking change due to the bug fix changing semantics, it does
|
||||
not break callers that had to handle the `None` case in a reasonable
|
||||
way anyway.
|
||||
|
||||
### 0.8.19
|
||||
|
||||
* Removed a bunch of bound checks in `convert_str_to_utf16`.
|
||||
* Added `mem::convert_utf8_to_utf16_without_replacement`.
|
||||
|
||||
### 0.8.18
|
||||
|
||||
* Added `mem::utf8_latin1_up_to` and `mem::str_latin1_up_to`.
|
||||
* Added `Decoder::latin1_byte_compatible_up_to`.
|
||||
|
||||
### 0.8.17
|
||||
|
||||
* Update `bincode` (dev dependency) version requirement to 1.0.
|
||||
|
||||
### 0.8.16
|
||||
|
||||
* Switch from the `simd` crate to `packed_simd`.
|
||||
|
||||
### 0.8.15
|
||||
|
||||
* Adjust documentation for `simd-accel` (README-only release).
|
||||
|
||||
### 0.8.14
|
||||
|
||||
* Made UTF-16 to UTF-8 encode conversion fill the output buffer as
|
||||
closely as possible.
|
||||
|
||||
### 0.8.13
|
||||
|
||||
* Made the UTF-8 to UTF-16 decoder compare the number of code units written
|
||||
with the length of the right slice (the output slice) to fix a panic
|
||||
introduced in 0.8.11.
|
||||
|
||||
### 0.8.12
|
||||
|
||||
* Removed the `clippy::` prefix from clippy lint names.
|
||||
|
||||
### 0.8.11
|
||||
|
||||
* Changed minimum Rust requirement to 1.29.0 (for the ability to refer
|
||||
to the interior of a `static` when defining another `static`).
|
||||
* Explicitly aligned the lookup tables for single-byte encodings and
|
||||
UTF-8 to cache lines in the hope of freeing up one cache line for
|
||||
other data. (Perhaps the tables were already aligned and this is
|
||||
placebo.)
|
||||
* Added 32 bits of encode-oriented data for each single-byte encoding.
|
||||
The change was performance-neutral for non-Latin1-ish Latin legacy
|
||||
encodings, improved Latin1-ish and Arabic legacy encode speed
|
||||
somewhat (new speed is 2.4x the old speed for German, 2.3x for
|
||||
Arabic, 1.7x for Portuguese and 1.4x for French) and improved
|
||||
non-Latin1, non-Arabic legacy single-byte encode a lot (7.2x for
|
||||
Thai, 6x for Greek, 5x for Russian, 4x for Hebrew).
|
||||
* Added compile-time options for fast CJK legacy encode options (at
|
||||
the cost of binary size (up to 176 KB) and run-time memory usage).
|
||||
These options still retain the overall code structure instead of
|
||||
rewriting the CJK encoders totally, so the speed isn't as good as
|
||||
what could be achieved by using even more memory / making the
|
||||
binary even larger.
|
||||
* Made UTF-8 decode and validation faster.
|
||||
* Added method `is_single_byte()` on `Encoding`.
|
||||
* Added `mem::decode_latin1()` and `mem::encode_latin1_lossy()`.
|
||||
|
||||
### 0.8.10
|
||||
|
||||
* Disabled a unit test that tests a panic condition when the assertion
|
||||
being tested is disabled.
|
||||
|
||||
### 0.8.9
|
||||
|
||||
* Made `--features simd-accel` work with stable-channel compiler to
|
||||
simplify the Firefox build system.
|
||||
|
||||
### 0.8.8
|
||||
|
||||
* Made the `is_foo_bidi()` functions not treat U+FEFF (ZERO WIDTH NO-BREAK SPACE
|
||||
aka. BYTE ORDER MARK) as right-to-left.
|
||||
* Made the `is_foo_bidi()` functions report `true` if the input contains
|
||||
Hebrew presentation forms (which are right-to-left but not in a
|
||||
right-to-left-roadmapped block).
|
||||
|
||||
### 0.8.7
|
||||
|
||||
* Fixed a panic in the UTF-16LE/UTF-16BE decoder when decoding to UTF-8.
|
||||
|
||||
### 0.8.6
|
||||
|
||||
* Temporarily removed the debug assertion added in version 0.8.5 from
|
||||
`convert_utf16_to_latin1_lossy`.
|
||||
|
||||
### 0.8.5
|
||||
|
||||
* If debug assertions are enabled but fuzzing isn't enabled, lossy conversions
|
||||
to Latin1 in the `mem` module assert that the input is in the range
|
||||
U+0000...U+00FF (inclusive).
|
||||
* In the `mem` module provide conversions from Latin1 and UTF-16 to UTF-8
|
||||
that can deal with insufficient output space. The idea is to use them
|
||||
first with an allocation rounded up to jemalloc bucket size and do the
|
||||
worst-case allocation only if the jemalloc rounding up was insufficient
|
||||
as the first guess.
|
||||
|
||||
### 0.8.4
|
||||
|
||||
* Fix SSE2-specific, `simd-accel`-specific memory corruption introduced in
|
||||
version 0.8.1 in conversions between UTF-16 and Latin1 in the `mem` module.
|
||||
|
||||
### 0.8.3
|
||||
|
||||
* Removed an `#[inline(never)]` annotation that was not meant for release.
|
||||
|
||||
### 0.8.2
|
||||
|
||||
* Made non-ASCII UTF-16 to UTF-8 encode faster by manually omitting bound
|
||||
checks and manually adding branch prediction annotations.
|
||||
|
||||
### 0.8.1
|
||||
|
||||
* Tweaked loop unrolling and memory alignment for SSE2 conversions between
|
||||
UTF-16 and Latin1 in the `mem` module to increase the performance when
|
||||
converting long buffers.
|
||||
|
||||
### 0.8.0
|
||||
|
||||
* Changed the minimum supported version of Rust to 1.21.0 (semver breaking
|
||||
change).
|
||||
* Flipped around the defaults vs. optional features for controlling the size
|
||||
vs. speed trade-off for Kanji and Hanzi legacy encode (semver breaking
|
||||
change).
|
||||
* Added NEON support on ARMv7.
|
||||
* SIMD-accelerated x-user-defined to UTF-16 decode.
|
||||
* Made UTF-16LE and UTF-16BE decode a lot faster (including SIMD
|
||||
acceleration).
|
||||
|
||||
### 0.7.2
|
||||
|
||||
* Add the `mem` module.
|
||||
* Refactor SIMD code which can affect performance outside the `mem`
|
||||
module.
|
||||
|
||||
### 0.7.1
|
||||
|
||||
* When encoding from invalid UTF-16, correctly handle U+DC00 followed by
|
||||
another low surrogate.
|
||||
|
||||
### 0.7.0
|
||||
|
||||
* [Make `replacement` a label of the replacement
|
||||
encoding.](https://github.com/whatwg/encoding/issues/70) (Spec change.)
|
||||
* Remove `Encoding::for_name()`. (`Encoding::for_label(foo).unwrap()` is
|
||||
now close enough after the above label change.)
|
||||
* Remove the `parallel-utf8` cargo feature.
|
||||
* Add optional Serde support for `&'static Encoding`.
|
||||
* Performance tweaks for ASCII handling.
|
||||
* Performance tweaks for UTF-8 validation.
|
||||
* SIMD support on aarch64.
|
||||
|
||||
### 0.6.11
|
||||
|
||||
* Make `Encoder::has_pending_state()` public.
|
||||
* Update the `simd` crate dependency to 0.2.0.
|
||||
|
||||
### 0.6.10
|
||||
|
||||
* Reserve enough space for NCRs when encoding to ISO-2022-JP.
|
||||
* Correct max length calculations for multibyte decoders.
|
||||
* Correct max length calculations before BOM sniffing has been
|
||||
performed.
|
||||
* Correctly calculate max length when encoding from UTF-16 to GBK.
|
||||
|
||||
### 0.6.9
|
||||
|
||||
* [Don't prepend anything when gb18030 range decode
|
||||
fails](https://github.com/whatwg/encoding/issues/110). (Spec change.)
|
||||
|
||||
### 0.6.8
|
||||
|
||||
* Correctly handle the case where the first buffer contains potentially
|
||||
partial BOM and the next buffer is the last buffer.
|
||||
* Decode byte `7F` correctly in ISO-2022-JP.
|
||||
* Make UTF-16 to UTF-8 encode write closer to the end of the buffer.
|
||||
* Implement `Hash` for `Encoding`.
|
||||
|
||||
### 0.6.7
|
||||
|
||||
* [Map half-width katakana to full-width katakana in ISO-2022-JP
|
||||
encoder](https://github.com/whatwg/encoding/issues/105). (Spec change.)
|
||||
* Give `InputEmpty` correct precedence over `OutputFull` when encoding
|
||||
with replacement and the output buffer passed in is too short or the
|
||||
remaining space in the output buffer is too small after a replacement.
|
||||
|
||||
### 0.6.6
|
||||
|
||||
* Correct max length calculation when a partial BOM prefix is part of
|
||||
the decoder's state.
|
||||
|
||||
### 0.6.5
|
||||
|
||||
* Correct max length calculation in various encoders.
|
||||
* Correct max length calculation in the UTF-16 decoder.
|
||||
* Derive `PartialEq` and `Eq` for the `CoderResult`, `DecoderResult`
|
||||
and `EncoderResult` types.
|
||||
|
||||
### 0.6.4
|
||||
|
||||
* Avoid panic when encoding with replacement and the destination buffer is
|
||||
too short to hold one numeric character reference.
|
||||
|
||||
### 0.6.3
|
||||
|
||||
* Add support for 32-bit big-endian hosts. (For real this time.)
|
||||
|
||||
### 0.6.2
|
||||
|
||||
* Fix a panic from subslicing with bad indices in
|
||||
`Encoder::encode_from_utf16`. (Due to an oversight, it lacked the fix that
|
||||
`Encoder::encode_from_utf8` already had.)
|
||||
* Micro-optimize error status accumulation in non-streaming case.
|
||||
|
||||
### 0.6.1
|
||||
|
||||
* Avoid panic near integer overflow in a case that's unlikely to actually
|
||||
happen.
|
||||
* Address Clippy lints.
|
||||
|
||||
### 0.6.0
|
||||
|
||||
* Make the methods for computing worst-case buffer size requirements check
|
||||
for integer overflow.
|
||||
* Upgrade rayon to 0.7.0.
|
||||
|
||||
### 0.5.1
|
||||
|
||||
* Reorder methods for better documentation readability.
|
||||
* Add support for big-endian hosts. (Only 64-bit case actually tested.)
|
||||
* Optimize the ALU (non-SIMD) case for 32-bit ARM instead of x86_64.
|
||||
|
||||
### 0.5.0
|
||||
|
||||
* Avoid allocating excessively long buffers in non-streaming decode.
|
||||
* Fix the behavior of ISO-2022-JP and replacement decoders near the end of the
|
||||
output buffer.
|
||||
* Annotate the result structs with `#[must_use]`.
|
||||
|
||||
### 0.4.0
|
||||
|
||||
* Split FFI into a separate crate.
|
||||
* Performance tweaks.
|
||||
* CJK binary size and encoding performance changes.
|
||||
* Parallelize UTF-8 validation in the case of long buffers (with optional
|
||||
feature `parallel-utf8`).
|
||||
* Borrow even with ISO-2022-JP when possible.
|
||||
|
||||
### 0.3.2
|
||||
|
||||
* Fix moving pointers to alignment in ALU-based ASCII acceleration.
|
||||
* Fix errors in documentation and improve documentation.
|
||||
|
||||
### 0.3.1
|
||||
|
||||
* Fix UTF-8 to UTF-16 decode for byte sequences beginning with 0xEE.
|
||||
* Make UTF-8 to UTF-8 decode SSE2-accelerated when feature `simd-accel` is used.
|
||||
* When decoding and encoding ASCII-only input from or to an ASCII-compatible
|
||||
encoding using the non-streaming API, return a borrow of the input.
|
||||
* Make encode from UTF-16 to UTF-8 faster.
|
||||
|
||||
### 0.3
|
||||
|
||||
* Change the references to the instances of `Encoding` from `const` to `static`
|
||||
to make the referents unique across crates that use the references.
|
||||
* Introduce non-reference-typed `FOO_INIT` instances of `Encoding` to allow
|
||||
foreign crates to initialize `static` arrays with references to `Encoding`
|
||||
instances even under Rust's constraints that prohibit the initialization of
|
||||
`&'static Encoding`-typed array items with `&'static Encoding`-typed
|
||||
`statics`.
|
||||
* Document that the above two points will be reverted if Rust changes `const`
|
||||
to work so that cross-crate usage keeps the referents unique.
|
||||
* Return `Cow`s from Rust-only non-streaming methods for encode and decode.
|
||||
* `Encoding::for_bom()` returns the length of the BOM.
|
||||
* ASCII-accelerated conversions for encodings other than UTF-16LE, UTF-16BE,
|
||||
ISO-2022-JP and x-user-defined.
|
||||
* Add SSE2 acceleration behind the `simd-accel` feature flag. (Requires
|
||||
nightly Rust.)
|
||||
* Fix panic with long bogus labels.
|
||||
* Map [0xCA to U+05BA in windows-1255](https://github.com/whatwg/encoding/issues/73).
|
||||
(Spec change.)
|
||||
* Correct the [end of the Shift_JIS EUDC range](https://github.com/whatwg/encoding/issues/53).
|
||||
(Spec change.)
|
||||
|
||||
### 0.2.4
|
||||
|
||||
* Polish FFI documentation.
|
||||
|
||||
### 0.2.3
|
||||
|
||||
* Fix UTF-16 to UTF-8 encode.
|
||||
|
||||
### 0.2.2
|
||||
|
||||
* Add `Encoder.encode_from_utf8_to_vec_without_replacement()`.
|
||||
|
||||
### 0.2.1
|
||||
|
||||
* Add `Encoding.is_ascii_compatible()`.
|
||||
|
||||
* Add `Encoding::for_bom()`.
|
||||
|
||||
* Make `==` for `Encoding` use name comparison instead of pointer comparison,
|
||||
because uses of the encoding constants in different crates result in
|
||||
different addresses and the constants cannot be turned into statics without
|
||||
breaking other things.
|
||||
|
||||
### 0.2.0
|
||||
|
||||
The initial release.
|
||||
12
zeroidc/vendor/encoding_rs/build.rs
vendored
Normal file
12
zeroidc/vendor/encoding_rs/build.rs
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
// Build script entry point: when the `simd-accel` Cargo feature is enabled,
// prints a `cargo:rustc-env` directive asking Cargo to set `RUSTC_BOOTSTRAP=1`
// for the compilation of this crate; with the feature off it prints nothing.
fn main() {
|
||||
// This does not enable `RUSTC_BOOTSTRAP=1` for `packed_simd`.
|
||||
// You still need to knowingly have a setup that makes
|
||||
// `packed_simd` compile. Therefore, having this file on
|
||||
// crates.io is harmless in terms of users of `encoding_rs`
|
||||
// accidentally depending on nightly features. Having this
|
||||
// here means that if you knowingly want this, you only
|
||||
// need to maintain a fork of `packed_simd` without _also_
|
||||
// having to maintain a fork of `encoding_rs`.
|
||||
#[cfg(feature = "simd-accel")]
|
||||
println!("cargo:rustc-env=RUSTC_BOOTSTRAP=1");
|
||||
}
|
||||
14
zeroidc/vendor/encoding_rs/ci/miri.sh
vendored
Normal file
14
zeroidc/vendor/encoding_rs/ci/miri.sh
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# CI helper: switches the default toolchain to a nightly that ships Miri,
# runs the test suite under Miri, then switches the default back to `nightly`.
# -e: stop at the first failing command; -x: echo each command as it runs.
set -ex
|
||||
|
||||
# Install Miri.
|
||||
# The text fetched by curl is appended to `nightly-` to form the toolchain name.
MIRI_NIGHTLY=nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/miri)
|
||||
echo "Installing latest nightly with Miri: $MIRI_NIGHTLY"
|
||||
rustup default "$MIRI_NIGHTLY"
|
||||
rustup component add miri
|
||||
|
||||
# Run tests.
|
||||
# Stacked Borrows is disabled as it costs too much RAM (due to our large tables).
|
||||
MIRIFLAGS="-Zmiri-disable-stacked-borrows" cargo miri test
|
||||
|
||||
# Restore old state in case Travis uses this cache for other jobs.
|
||||
rustup default nightly
|
||||
16
zeroidc/vendor/encoding_rs/doc/Big5.txt
vendored
Normal file
16
zeroidc/vendor/encoding_rs/doc/Big5.txt
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
|
||||
/// instead of the Private Use Area code points that have been used historically.
|
||||
/// It is believed to be able to decode existing Web content in a way that makes
|
||||
/// sense.
|
||||
///
|
||||
/// To avoid form submissions generating data that Web servers don't understand,
|
||||
/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
|
||||
/// Big5 in the lexical order.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
|
||||
///
|
||||
/// This encoding is designed to be suited for decoding the Windows code page 950
|
||||
/// and its HKSCS patched "951" variant such that the text makes sense, given
|
||||
/// assignments that Unicode has made after those encodings used Private Use
|
||||
/// Area characters.
|
||||
12
zeroidc/vendor/encoding_rs/doc/EUC-JP.txt
vendored
Normal file
12
zeroidc/vendor/encoding_rs/doc/EUC-JP.txt
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
/// This is the legacy Unix encoding for Japanese.
|
||||
///
|
||||
/// For compatibility with Web servers that don't expect three-byte sequences
|
||||
/// in form submissions, the encoder doesn't generate three-byte sequences.
|
||||
/// That is, the JIS X 0212 support is decode-only.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
|
||||
///
|
||||
/// This encoding roughly matches the Windows code page 20932. There are error
|
||||
/// handling differences and a handful of 2-byte sequences that decode differently.
|
||||
/// Additionally, Windows doesn't support 3-byte sequences.
|
||||
10
zeroidc/vendor/encoding_rs/doc/EUC-KR.txt
vendored
Normal file
10
zeroidc/vendor/encoding_rs/doc/EUC-KR.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
|
||||
/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
|
||||
/// Classic), with all the characters from the Hangul Syllables block of Unicode.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
|
||||
/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
|
||||
/// the question mark or the Private Use Area.
|
||||
16
zeroidc/vendor/encoding_rs/doc/GBK.txt
vendored
Normal file
16
zeroidc/vendor/encoding_rs/doc/GBK.txt
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
/// The decoder for this encoding is the same as the decoder for gb18030.
|
||||
/// The encoder side of this encoding is GBK with Windows code page 936 euro
|
||||
/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
|
||||
/// Unicode block as well as a handful of ideographs from the CJK Unified
|
||||
/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
|
||||
///
|
||||
/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder wasn't
|
||||
/// unified with the gb18030 encoder in the Encoding Standard out of concern
|
||||
/// that servers that expect GBK form submissions might not be able to handle
|
||||
/// the four-byte sequences.
|
||||
///
|
||||
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
|
||||
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
|
||||
///
|
||||
/// The encoder of this encoding roughly matches the Windows code page 936.
|
||||
/// The decoder side is a superset.
|
||||
8
zeroidc/vendor/encoding_rs/doc/IBM866.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/IBM866.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the most notable one of the DOS Cyrillic code pages. It has the same
|
||||
/// box drawing characters as code page 437, so it can be used for decoding
|
||||
/// DOS-era ASCII + box drawing data.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 866.
|
||||
10
zeroidc/vendor/encoding_rs/doc/ISO-2022-JP.txt
vendored
Normal file
10
zeroidc/vendor/encoding_rs/doc/ISO-2022-JP.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
|
||||
/// byte range to encode non-Basic Latin characters. It's the only encoding
|
||||
/// supported by this crate whose encoder is stateful.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
|
||||
///
|
||||
/// This encoding roughly matches the Windows code page 50220. Notably, Windows
|
||||
/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
|
||||
/// error handling.
|
||||
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-10.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-10.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
|
||||
/// is also known as Latin 6.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
|
||||
///
|
||||
/// The Windows code page number for this encoding is 28600, but kernel32.dll
|
||||
/// does not support this encoding.
|
||||
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-13.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-13.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
|
||||
/// is also known as Latin 7.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28603, except Windows decodes
|
||||
/// unassigned code points to the Private Use Area of Unicode.
|
||||
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-14.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-14.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
|
||||
/// is also known as Latin 8.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
|
||||
///
|
||||
/// The Windows code page number for this encoding is 28604, but kernel32.dll
|
||||
/// does not support this encoding.
|
||||
7
zeroidc/vendor/encoding_rs/doc/ISO-8859-15.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/ISO-8859-15.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the revised Western European part of the ISO/IEC 8859 encoding
|
||||
/// family. This encoding is also known as Latin 9.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28605.
|
||||
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-16.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/ISO-8859-16.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
|
||||
/// family. This encoding is also known as Latin 10.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
|
||||
///
|
||||
/// The Windows code page number for this encoding is 28606, but kernel32.dll
|
||||
/// does not support this encoding.
|
||||
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-2.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-2.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28592.
|
||||
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-3.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-3.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28593.
|
||||
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-4.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-4.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28594.
|
||||
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-5.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/ISO-8859-5.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28595.
|
||||
7
zeroidc/vendor/encoding_rs/doc/ISO-8859-6.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/ISO-8859-6.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 28596, except Windows decodes
|
||||
/// unassigned code points to the Private Use Area of Unicode.
|
||||
11
zeroidc/vendor/encoding_rs/doc/ISO-8859-7.txt
vendored
Normal file
11
zeroidc/vendor/encoding_rs/doc/ISO-8859-7.txt
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
/// This is the Greek part of the ISO/IEC 8859 encoding family.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
|
||||
///
|
||||
/// This encoding roughly matches the Windows code page 28597. Windows decodes
|
||||
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
|
||||
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
|
||||
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
|
||||
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
|
||||
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
|
||||
9
zeroidc/vendor/encoding_rs/doc/ISO-8859-8-I.txt
vendored
Normal file
9
zeroidc/vendor/encoding_rs/doc/ISO-8859-8-I.txt
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
|
||||
///
|
||||
/// This encoding roughly matches the Windows code page 38598. Windows decodes
|
||||
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
|
||||
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
|
||||
/// the Private Use Area.
|
||||
9
zeroidc/vendor/encoding_rs/doc/ISO-8859-8.txt
vendored
Normal file
9
zeroidc/vendor/encoding_rs/doc/ISO-8859-8.txt
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
|
||||
///
|
||||
/// This encoding roughly matches the Windows code page 28598. Windows decodes
|
||||
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
|
||||
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
|
||||
/// the Private Use Area.
|
||||
6
zeroidc/vendor/encoding_rs/doc/KOI8-R.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/KOI8-R.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 20866.
|
||||
6
zeroidc/vendor/encoding_rs/doc/KOI8-U.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/KOI8-U.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is an encoding for Ukrainian adapted from KOI8-R.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 21866.
|
||||
8
zeroidc/vendor/encoding_rs/doc/Shift_JIS.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/Shift_JIS.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the Japanese encoding for Windows.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 932, except Windows decodes some byte
|
||||
/// sequences that are errors per the Encoding Standard to the question mark or the
|
||||
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
|
||||
8
zeroidc/vendor/encoding_rs/doc/UTF-16BE.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/UTF-16BE.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This decode-only encoding uses 16-bit code units due to Unicode originally
|
||||
/// having been designed as a 16-bit repertoire. In the absence of a byte order
|
||||
/// mark the big endian byte order is assumed.
|
||||
///
|
||||
/// There is no corresponding encoder in this crate or in the Encoding
|
||||
/// Standard. The output encoding of this encoding is UTF-8.
|
||||
///
|
||||
/// This encoding matches the Windows code page 1201.
|
||||
8
zeroidc/vendor/encoding_rs/doc/UTF-16LE.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/UTF-16LE.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This decode-only encoding uses 16-bit code units due to Unicode originally
|
||||
/// having been designed as a 16-bit repertoire. In the absence of a byte order
|
||||
/// mark the little endian byte order is assumed.
|
||||
///
|
||||
/// There is no corresponding encoder in this crate or in the Encoding
|
||||
/// Standard. The output encoding of this encoding is UTF-8.
|
||||
///
|
||||
/// This encoding matches the Windows code page 1200.
|
||||
5
zeroidc/vendor/encoding_rs/doc/UTF-8.txt
vendored
Normal file
5
zeroidc/vendor/encoding_rs/doc/UTF-8.txt
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
/// This is the encoding that should be used for all new development, as it can
|
||||
/// represent all of Unicode.
|
||||
///
|
||||
/// This encoding matches the Windows code page 65001, except Windows differs
|
||||
/// in the number of errors generated for some erroneous byte sequences.
|
||||
9
zeroidc/vendor/encoding_rs/doc/gb18030.txt
vendored
Normal file
9
zeroidc/vendor/encoding_rs/doc/gb18030.txt
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
|
||||
/// maps to U+3000 for compatibility with existing Web content. As a result,
|
||||
/// this encoding can represent all of Unicode except for the private-use
|
||||
/// character U+E5E5.
|
||||
///
|
||||
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
|
||||
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 54936.
|
||||
7
zeroidc/vendor/encoding_rs/doc/macintosh.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/macintosh.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the MacRoman encoding from Mac OS Classic.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 10000, except Windows decodes
|
||||
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
|
||||
10
zeroidc/vendor/encoding_rs/doc/replacement.txt
vendored
Normal file
10
zeroidc/vendor/encoding_rs/doc/replacement.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
/// This decode-only encoding decodes all non-zero-length streams to a single
|
||||
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
|
||||
/// ASCII-compatible fallback encoding (typically windows-1252) for some
|
||||
/// encodings that are no longer supported by the Web Platform and that
|
||||
/// would be dangerous to treat as ASCII-compatible.
|
||||
///
|
||||
/// There is no corresponding encoder. The output encoding of this encoding
|
||||
/// is UTF-8.
|
||||
///
|
||||
/// This encoding does not have a Windows code page number.
|
||||
6
zeroidc/vendor/encoding_rs/doc/windows-1250.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/windows-1250.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the Central European encoding for Windows.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1250.
|
||||
6
zeroidc/vendor/encoding_rs/doc/windows-1251.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/windows-1251.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the Cyrillic encoding for Windows.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1251.
|
||||
7
zeroidc/vendor/encoding_rs/doc/windows-1252.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/windows-1252.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
|
||||
/// which is known as Latin 1.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1252.
|
||||
8
zeroidc/vendor/encoding_rs/doc/windows-1253.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/windows-1253.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the Greek encoding for Windows. It is mostly an extension of
|
||||
/// ISO-8859-7, but U+0386 is mapped to a different byte.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1253, except Windows decodes
|
||||
/// unassigned code points to the Private Use Area of Unicode.
|
||||
7
zeroidc/vendor/encoding_rs/doc/windows-1254.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/windows-1254.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
|
||||
/// which is known as Latin 5.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1254.
|
||||
8
zeroidc/vendor/encoding_rs/doc/windows-1255.txt
vendored
Normal file
8
zeroidc/vendor/encoding_rs/doc/windows-1255.txt
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
|
||||
/// except for a currency sign swap.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1255, except Windows decodes
|
||||
/// unassigned code points to the Private Use Area of Unicode.
|
||||
6
zeroidc/vendor/encoding_rs/doc/windows-1256.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/windows-1256.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the Arabic encoding for Windows.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1256.
|
||||
7
zeroidc/vendor/encoding_rs/doc/windows-1257.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/windows-1257.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the Baltic encoding for Windows.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1257, except Windows decodes
|
||||
/// unassigned code points to the Private Use Area of Unicode.
|
||||
11
zeroidc/vendor/encoding_rs/doc/windows-1258.txt
vendored
Normal file
11
zeroidc/vendor/encoding_rs/doc/windows-1258.txt
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
/// This is the Vietnamese encoding for Windows.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 1258 when used in the
|
||||
/// non-normalizing mode. Unlike with the other single-byte encodings, the
|
||||
/// result of decoding is not necessarily in Normalization Form C. On the
|
||||
/// other hand, input in the Normalization Form C is not encoded without
|
||||
/// replacement. In general, it's a bad idea to encode to encodings other
|
||||
/// than UTF-8, but this encoding is especially hazardous to encode to.
|
||||
7
zeroidc/vendor/encoding_rs/doc/windows-874.txt
vendored
Normal file
7
zeroidc/vendor/encoding_rs/doc/windows-874.txt
vendored
Normal file
@@ -0,0 +1,7 @@
|
||||
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 874, except Windows decodes
|
||||
/// unassigned code points to the Private Use Area of Unicode.
|
||||
6
zeroidc/vendor/encoding_rs/doc/x-mac-cyrillic.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/x-mac-cyrillic.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This is the MacUkrainian encoding from Mac OS Classic.
|
||||
///
|
||||
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
|
||||
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
|
||||
///
|
||||
/// This encoding matches the Windows code page 10017.
|
||||
6
zeroidc/vendor/encoding_rs/doc/x-user-defined.txt
vendored
Normal file
6
zeroidc/vendor/encoding_rs/doc/x-user-defined.txt
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
|
||||
/// them to the Private Use Area of Unicode. It was used for loading binary
|
||||
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
|
||||
/// the `"arraybuffer"` response type.
|
||||
///
|
||||
/// This encoding does not have a Windows code page number.
|
||||
2008
zeroidc/vendor/encoding_rs/generate-encoding-data.py
vendored
Normal file
2008
zeroidc/vendor/encoding_rs/generate-encoding-data.py
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1
zeroidc/vendor/encoding_rs/rustfmt.toml
vendored
Normal file
1
zeroidc/vendor/encoding_rs/rustfmt.toml
vendored
Normal file
@@ -0,0 +1 @@
|
||||
error_on_line_overflow = false
|
||||
1548
zeroidc/vendor/encoding_rs/src/ascii.rs
vendored
Normal file
1548
zeroidc/vendor/encoding_rs/src/ascii.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
427
zeroidc/vendor/encoding_rs/src/big5.rs
vendored
Normal file
427
zeroidc/vendor/encoding_rs/src/big5.rs
vendored
Normal file
@@ -0,0 +1,427 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range32;
|
||||
|
||||
pub struct Big5Decoder {
|
||||
lead: Option<u8>,
|
||||
}
|
||||
|
||||
impl Big5Decoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Big5(Big5Decoder { lead: None })
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.lead.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// If there is a lead but the next byte isn't a valid trail, an
|
||||
// error is generated for the lead (+1). Then another iteration checks
|
||||
// space, which needs +1 to account for the possibility of astral
|
||||
// output or combining pair.
|
||||
checked_add(1, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// No need to account for REPLACEMENT CHARACTERS.
|
||||
// Cases:
|
||||
// ASCII: 1 to 1
|
||||
// Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
|
||||
// lead set and first byte is trail: 1 to 4 worst case
|
||||
//
|
||||
// When checking for space for the last byte:
|
||||
// no lead: the last byte must be ASCII (or fatal error): 1 to 1
|
||||
// lead set: space for 4 bytes was already checked when reading the
|
||||
// lead, hence the last lead and the last trail together are worst
|
||||
// case 2 to 4.
|
||||
//
|
||||
// If lead set and the input is a single trail byte, the worst-case
|
||||
// output is 4, so we need to add one before multiplying if lead is
|
||||
// set.
|
||||
//
|
||||
// Finally, add two so that if input is non-zero, the output is at
|
||||
// least 4.
|
||||
checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// If there is a lead but the next byte isn't a valid trail, an
|
||||
// error is generated for the lead (+(1*3)). Then another iteration
|
||||
// checks space, which needs +3 to account for the possibility of astral
|
||||
// output or combining pair. In between start and end, the worst case
|
||||
// is that every byte is bad: *3.
|
||||
checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0x40 and 0x7E, inclusive,
|
||||
// subtract offset 0x40. Else if trail is
|
||||
// between 0xA1 and 0xFE, inclusive, subtract
|
||||
// offset 0x62.
|
||||
// TODO: Find out which range is more probable.
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start =
|
||||
byte.wrapping_sub(0xA1);
|
||||
if trail_minus_range_start >
|
||||
(0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = byte - 0x62;
|
||||
}
|
||||
let pointer = lead_minus_offset as usize *
|
||||
157usize +
|
||||
trail_minus_offset as usize;
|
||||
let rebased_pointer = pointer.wrapping_sub(942);
|
||||
let low_bits = big5_low_bits(rebased_pointer);
|
||||
if low_bits == 0 {
|
||||
match pointer {
|
||||
1133 => {
|
||||
handle.write_big5_combination(0x00CAu16,
|
||||
0x0304u16)
|
||||
}
|
||||
1135 => {
|
||||
handle.write_big5_combination(0x00CAu16,
|
||||
0x030Cu16)
|
||||
}
|
||||
1164 => {
|
||||
handle.write_big5_combination(0x00EAu16,
|
||||
0x0304u16)
|
||||
}
|
||||
1166 => {
|
||||
handle.write_big5_combination(0x00EAu16,
|
||||
0x030Cu16)
|
||||
}
|
||||
_ => {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
} else if big5_is_astral(rebased_pointer) {
|
||||
handle.write_astral(u32::from(low_bits) |
|
||||
0x20000u32)
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(low_bits)
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_astral,
|
||||
check_space_astral,
|
||||
false);
|
||||
}
|
||||
|
||||
pub struct Big5Encoder;
|
||||
|
||||
impl Big5Encoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
// Astral: 2 to 2
|
||||
// ASCII: 1 to 1
|
||||
// Other: 1 to 2
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
// Astral: 4 to 2
|
||||
// Upper BMP: 3 to 2
|
||||
// Lower BMP: 2 to 2
|
||||
// ASCII: 1 to 1
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_encoder_functions!(
|
||||
{
|
||||
// For simplicity, unified ideographs
|
||||
// in the pointer range 11206...11212 are handled
|
||||
// as Level 1 Hanzi.
|
||||
if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
let pointer = if let Some(pointer) = big5_box_encode(bmp) {
|
||||
pointer
|
||||
} else if let Some(pointer) = big5_other_encode(bmp) {
|
||||
pointer
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
let lead = pointer / 157 + 0x81;
|
||||
let remainder = pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
}
|
||||
},
|
||||
{
|
||||
if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
|
||||
if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
|
||||
// big5_astral_encode returns rebased pointer,
|
||||
// so adding 0x87 instead of 0x81.
|
||||
let lead = rebased_pointer / 157 + 0x87;
|
||||
let remainder = rebased_pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
astral,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
/// Test helper: forwards to the shared `decode` helper with the `BIG5` encoding.
fn decode_big5(bytes: &[u8], expect: &str) {
|
||||
decode(BIG5, bytes, expect);
|
||||
}
|
||||
|
||||
/// Test helper: forwards to the shared `encode` helper with the `BIG5` encoding.
fn encode_big5(string: &str, expect: &[u8]) {
|
||||
encode(BIG5, string, expect);
|
||||
}
|
||||
|
||||
#[test]
fn test_big5_decode() {
    // Empty input
    decode_big5(b"", "");

    // Plain ASCII
    decode_big5(b"\x61\x62", "\u{0061}\u{0062}");

    // Edge cases
    decode_big5(b"\x87\x40", "\u{43F0}");
    decode_big5(b"\xFE\xFE", "\u{79D4}");
    decode_big5(b"\xFE\xFD", "\u{2910D}");
    decode_big5(b"\x88\x62", "\u{00CA}\u{0304}");
    decode_big5(b"\x88\x64", "\u{00CA}\u{030C}");
    decode_big5(b"\x88\x66", "\u{00CA}");
    decode_big5(b"\x88\xA3", "\u{00EA}\u{0304}");
    decode_big5(b"\x88\xA5", "\u{00EA}\u{030C}");
    decode_big5(b"\x88\xA7", "\u{00EA}");
    decode_big5(b"\x99\xD4", "\u{8991}");
    decode_big5(b"\x99\xD5", "\u{27967}");
    decode_big5(b"\x99\xD6", "\u{8A29}");

    // The same edge cases with an ASCII byte on either side
    decode_big5(b"\x61\x87\x40\x62", "\u{0061}\u{43F0}\u{0062}");
    decode_big5(b"\x61\xFE\xFE\x62", "\u{0061}\u{79D4}\u{0062}");
    decode_big5(b"\x61\xFE\xFD\x62", "\u{0061}\u{2910D}\u{0062}");
    decode_big5(b"\x61\x88\x62\x62", "\u{0061}\u{00CA}\u{0304}\u{0062}");
    decode_big5(b"\x61\x88\x64\x62", "\u{0061}\u{00CA}\u{030C}\u{0062}");
    decode_big5(b"\x61\x88\x66\x62", "\u{0061}\u{00CA}\u{0062}");
    decode_big5(b"\x61\x88\xA3\x62", "\u{0061}\u{00EA}\u{0304}\u{0062}");
    decode_big5(b"\x61\x88\xA5\x62", "\u{0061}\u{00EA}\u{030C}\u{0062}");
    decode_big5(b"\x61\x88\xA7\x62", "\u{0061}\u{00EA}\u{0062}");
    decode_big5(b"\x61\x99\xD4\x62", "\u{0061}\u{8991}\u{0062}");
    decode_big5(b"\x61\x99\xD5\x62", "\u{0061}\u{27967}\u{0062}");
    decode_big5(b"\x61\x99\xD6\x62", "\u{0061}\u{8A29}\u{0062}");

    // Malformed sequences decode to U+FFFD
    decode_big5(b"\x80\x61", "\u{FFFD}\u{0061}");
    decode_big5(b"\xFF\x61", "\u{FFFD}\u{0061}");
    decode_big5(b"\xFE\x39", "\u{FFFD}\u{0039}");
    decode_big5(b"\x87\x66", "\u{FFFD}\u{0066}");
    decode_big5(b"\x81\x40", "\u{FFFD}\u{0040}");
    decode_big5(b"\x61\x81", "\u{0061}\u{FFFD}");
}
|
||||
|
||||
#[test]
fn test_big5_encode() {
    // Empty
    encode_big5("", b"");

    // ASCII
    encode_big5("\u{0061}\u{0062}", b"\x61\x62");

    if !cfg!(miri) {
        // Miri is too slow
        // Edge cases
        //
        // Unmappable code points come out as decimal numeric character
        // references, so the expectation is the ASCII text `&#N;`. A byte
        // string literal cannot contain the raw non-ASCII character.
        encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
        encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
        encode_big5("\u{3000}", b"\xA1\x40");
        encode_big5("\u{20AC}", b"\xA3\xE1");
        encode_big5("\u{4E00}", b"\xA4\x40");
        encode_big5("\u{27607}", b"\xC8\xA4");
        encode_big5("\u{FFE2}", b"\xC8\xCD");
        encode_big5("\u{79D4}", b"\xFE\xFE");

        // Not in index
        encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
    }

    // duplicate low bits
    encode_big5("\u{203B5}", b"\xFD\x6A");
    encode_big5("\u{25605}", b"\xFE\x46");

    // prefer last
    encode_big5("\u{2550}", b"\xF9\xF9");
}
|
||||
|
||||
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_big5_decode_all() {
    // Decode the whole fixture and compare it with the reference text.
    let raw = include_bytes!("test_data/big5_in.txt");
    let reference = include_str!("test_data/big5_in_ref.txt");
    let (decoded, saw_errors) = BIG5.decode_without_bom_handling(raw);
    assert!(saw_errors, "Should have had errors.");
    assert_eq!(&decoded[..], reference);
}
|
||||
|
||||
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_big5_encode_all() {
    // Encode the whole fixture and compare it with the reference bytes.
    let text = include_str!("test_data/big5_out.txt");
    let reference = include_bytes!("test_data/big5_out_ref.txt");
    let (encoded, used_encoding, saw_errors) = BIG5.encode(text);
    assert!(!saw_errors, "Should not have had errors.");
    assert_eq!(used_encoding, BIG5);
    assert_eq!(&encoded[..], &reference[..]);
}
|
||||
|
||||
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_big5_encode_from_two_low_surrogates() {
    // Two unpaired low surrogates are each expected to be written as the
    // numeric character reference for U+FFFD. The expectation must be
    // ASCII, because a byte string literal cannot hold the raw character.
    let expectation = b"&#65533;&#65533;";
    let mut output = [0u8; 40];
    let mut encoder = BIG5.new_encoder();
    let (result, read, written, had_errors) =
        encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, expectation.len());
    assert!(had_errors);
    assert_eq!(&output[..written], expectation);
}
|
||||
}
|
||||
114378
zeroidc/vendor/encoding_rs/src/data.rs
vendored
Normal file
114378
zeroidc/vendor/encoding_rs/src/data.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
469
zeroidc/vendor/encoding_rs/src/euc_jp.rs
vendored
Normal file
469
zeroidc/vendor/encoding_rs/src/euc_jp.rs
vendored
Normal file
@@ -0,0 +1,469 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// State stored in `EucJpDecoder::pending`.
///
/// NOTE(review): the variants presumably describe which bytes of a multi-byte
/// sequence have already been consumed - confirm against
/// `euc_jp_decoder_functions!`.
enum EucJpPending {
    None,
    Jis0208Lead(u8),
    Jis0212Shift,
    Jis0212Lead(u8),
    HalfWidthKatakana,
}

impl EucJpPending {
    /// `true` only for `EucJpPending::None`.
    fn is_none(&self) -> bool {
        if let EucJpPending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns 0 for `None`, 2 for `Jis0212Lead`, and 1 for every other
    /// variant.
    fn count(&self) -> usize {
        match *self {
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::HalfWidthKatakana
            | EucJpPending::Jis0212Shift
            | EucJpPending::Jis0208Lead(_) => 1,
            EucJpPending::None => 0,
        }
    }
}
|
||||
|
||||
/// EUC-JP decoder; its only state is the `pending` field.
pub struct EucJpDecoder {
|
||||
// `EucJpPending::None` when the decoder is in its neutral state.
pending: EucJpPending,
|
||||
}
|
||||
|
||||
impl EucJpDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::EucJp(EucJpDecoder {
|
||||
pending: EucJpPending::None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.pending.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
|
||||
}
|
||||
|
||||
/// Returns the result of `plus_one_if_lead(byte_length)` unchanged
/// (`None` on overflow).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 2 to 3
|
||||
let len = self.plus_one_if_lead(byte_length);
|
||||
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
euc_jp_decoder_functions!(
|
||||
{
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% according to Lunde).
|
||||
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
|
||||
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
|
||||
} else if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
} else {
|
||||
let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
|
||||
} else {
|
||||
let ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// If lead is between 0xA1 and 0xFE, inclusive,
|
||||
// subtract 0xA1.
|
||||
let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
|
||||
if jis0212_lead_minus_offset > (0xFE - 0xA1) {
|
||||
if lead < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
unread_handle_jis0212.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_jis0212.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
jis0212_lead_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0xA1 and 0xFE, inclusive,
|
||||
// subtract 0xA1.
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(3, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
|
||||
let pointer_minus_kanji = pointer.wrapping_sub(1410);
|
||||
if pointer_minus_kanji < JIS0212_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
|
||||
} else if let Some(bmp) = jis0212_accented_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
|
||||
if pointer_minus_upper_cyrillic <= (607 - 597) {
|
||||
handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
|
||||
} else {
|
||||
let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
|
||||
if pointer_minus_lower_cyrillic <= (655 - 645) {
|
||||
handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
|
||||
} else {
|
||||
return (
|
||||
DecoderResult::Malformed(3, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// If trail is between 0xA1 and 0xDF, inclusive,
|
||||
// subtract 0xA1 and map to half-width Katakana.
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset > (0xDF - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
jis0208_lead_minus_offset,
|
||||
byte,
|
||||
unread_handle_trail,
|
||||
jis0212_lead_minus_offset,
|
||||
lead,
|
||||
unread_handle_jis0212,
|
||||
source,
|
||||
handle
|
||||
);
|
||||
}
|
||||
|
||||
/// Kanji encode used when the `fast-kanji-encode` feature is enabled:
/// delegates directly to `jis0208_kanji_euc_jp_encode`.
#[cfg(feature = "fast-kanji-encode")]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
jis0208_kanji_euc_jp_encode(bmp)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
Some((0xA1, 0xB8))
|
||||
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
|
||||
Some((lead, trail))
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + 0xD0;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Field-less EUC-JP encoder; `EucJpEncoder::new` wraps it in `VariantEncoder::EucJp`.
pub struct EucJpEncoder;
|
||||
|
||||
impl EucJpEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
|
||||
} else if bmp == 0xA5 {
|
||||
handle.write_one(0x5Cu8)
|
||||
} else if bmp == 0x203E {
|
||||
handle.write_one(0x7Eu8)
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0xA1u8, 0xDDu8)
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|
||||
|| bmp == 0xF929
|
||||
|| bmp == 0xF9DC
|
||||
{
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pointer) = ibm_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
/// Test helper: forwards to the shared `decode` helper with the `EUC_JP` encoding.
fn decode_euc_jp(bytes: &[u8], expect: &str) {
|
||||
decode(EUC_JP, bytes, expect);
|
||||
}
|
||||
|
||||
/// Test helper: forwards to the shared `encode` helper with the `EUC_JP` encoding.
fn encode_euc_jp(string: &str, expect: &[u8]) {
|
||||
encode(EUC_JP, string, expect);
|
||||
}
|
||||
|
||||
#[test]
fn test_euc_jp_decode() {
    // Empty input
    decode_euc_jp(b"", "");

    // Plain ASCII
    decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");

    // Half-width Katakana (lead 0x8E)
    decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
    decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
    decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
    decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
    decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
    decode_euc_jp(b"\x8E", "\u{FFFD}");

    // JIS 0212 (lead 0x8F)
    decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
    decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
    decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
    decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
    decode_euc_jp(b"\x8F", "\u{FFFD}");

    // JIS 0208
    decode_euc_jp(b"\xA1\xA1", "\u{3000}");
    decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
    decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
    decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
    decode_euc_jp(b"\xA1", "\u{FFFD}");

    // Bad leads: each is replaced and the following pair still decodes
    decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
    decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
    for bad_lead in 0x80u8..=0x8Du8 {
        decode_euc_jp(&[bad_lead, 0xA1, 0xA1], "\u{FFFD}\u{3000}");
    }

    // Bad ASCII trail
    decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
}
|
||||
|
||||
#[test]
fn test_euc_jp_encode() {
    // Empty
    encode_euc_jp("", b"");

    // ASCII
    encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");

    // Exceptional code points
    encode_euc_jp("\u{00A5}", b"\x5C");
    encode_euc_jp("\u{203E}", b"\x7E");
    encode_euc_jp("\u{2212}", b"\xA1\xDD");

    // Half-width
    encode_euc_jp("\u{FF61}", b"\x8E\xA1");
    encode_euc_jp("\u{FF9F}", b"\x8E\xDF");

    // JIS 0212
    //
    // U+02D8 is expected to come out as the ASCII numeric character
    // reference, not as JIS 0212 bytes. A byte string literal cannot hold
    // the raw character.
    encode_euc_jp("\u{02D8}", b"&#728;");

    // JIS 0208
    encode_euc_jp("\u{3000}", b"\xA1\xA1");
    encode_euc_jp("\u{FF02}", b"\xFC\xFE");
}
|
||||
|
||||
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_jis0208_decode_all() {
    // Decode the whole JIS 0208 fixture and compare it with the reference text.
    let raw = include_bytes!("test_data/jis0208_in.txt");
    let reference = include_str!("test_data/jis0208_in_ref.txt");
    let (decoded, saw_errors) = EUC_JP.decode_without_bom_handling(raw);
    assert!(saw_errors, "Should have had errors.");
    assert_eq!(&decoded[..], reference);
}
|
||||
|
||||
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_jis0208_encode_all() {
    // Encode the whole JIS 0208 fixture and compare it with the reference bytes.
    let text = include_str!("test_data/jis0208_out.txt");
    let reference = include_bytes!("test_data/jis0208_out_ref.txt");
    let (encoded, used_encoding, saw_errors) = EUC_JP.encode(text);
    assert!(!saw_errors, "Should not have had errors.");
    assert_eq!(used_encoding, EUC_JP);
    assert_eq!(&encoded[..], &reference[..]);
}
|
||||
|
||||
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_jis0212_decode_all() {
    // Decode the whole JIS 0212 fixture and compare it with the reference text.
    let raw = include_bytes!("test_data/jis0212_in.txt");
    let reference = include_str!("test_data/jis0212_in_ref.txt");
    let (decoded, saw_errors) = EUC_JP.decode_without_bom_handling(raw);
    assert!(saw_errors, "Should have had errors.");
    assert_eq!(&decoded[..], reference);
}
|
||||
}
|
||||
442
zeroidc/vendor/encoding_rs/src/euc_kr.rs
vendored
Normal file
442
zeroidc/vendor/encoding_rs/src/euc_kr.rs
vendored
Normal file
@@ -0,0 +1,442 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
use super::in_range16;
|
||||
|
||||
/// EUC-KR decoder; its only state is the optional `lead` byte.
pub struct EucKrDecoder {
|
||||
// `None` when the decoder is in its neutral state.
lead: Option<u8>,
|
||||
}
|
||||
|
||||
impl EucKrDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::EucKr(EucKrDecoder { lead: None })
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.lead.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the result of `plus_one_if_lead(byte_length)` unchanged
/// (`None` on overflow).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 2 to 3
|
||||
let len = self.plus_one_if_lead(byte_length);
|
||||
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
if lead_minus_offset >= 0x20 {
|
||||
// Not the extension range above KS X 1001
|
||||
let trail_minus_offset =
|
||||
byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset <= (0xFE - 0xA1) {
|
||||
// KS X 1001
|
||||
let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
|
||||
let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
|
||||
if hangul_pointer < KSX1001_HANGUL.len() {
|
||||
let upper_bmp = KSX1001_HANGUL[hangul_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if ksx_pointer < KSX1001_SYMBOLS.len() {
|
||||
let bmp = KSX1001_SYMBOLS[ksx_pointer];
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
|
||||
if hanja_pointer < KSX1001_HANJA.len() {
|
||||
let upper_bmp = KSX1001_HANJA[hanja_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
|
||||
let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
|
||||
if mid_bmp == 0 {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_mid_bmp(mid_bmp)
|
||||
} else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
|
||||
let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
|
||||
handle.write_mid_bmp(mid_bmp)
|
||||
} else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
|
||||
let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else {
|
||||
let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
|
||||
if other_pointer < 0x039F {
|
||||
let bmp = ksx1001_other_decode(other_pointer as u16);
|
||||
// ASCII range means unassigned
|
||||
if bmp < 0x80 {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Extension range to the left of
|
||||
// KS X 1001
|
||||
let left_lead = lead_minus_offset - 0x20;
|
||||
let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
|
||||
byte - (12 + 0x41)
|
||||
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
|
||||
byte - (6 + 0x41)
|
||||
} else if byte.wrapping_sub(0x41) < 0x1A {
|
||||
byte - 0x41
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
|
||||
if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
|
||||
let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Extension range above KS X 1001
|
||||
let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
|
||||
byte - (12 + 0x41)
|
||||
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
|
||||
byte - (6 + 0x41)
|
||||
} else if byte.wrapping_sub(0x41) < 0x1A {
|
||||
byte - 0x41
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
|
||||
let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_bmp,
|
||||
check_space_bmp,
|
||||
true);
|
||||
}
|
||||
|
||||
fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
|
||||
if in_inclusive_range16(bmp, 0x3000, 0x3015) {
|
||||
if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
|
||||
return Some((0xA1, pos + 0xA1));
|
||||
}
|
||||
}
|
||||
if let Some(other_pointer) = ksx1001_other_encode(bmp) {
|
||||
let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
|
||||
let other_trail = ((other_pointer as usize) % 94) + 0xA1;
|
||||
return Some((other_lead, other_trail));
|
||||
}
|
||||
if in_range16(bmp, 0x00AA, 0x0168) {
|
||||
// Latin
|
||||
if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
|
||||
return Some((0x81 + 0x28, 0xA1 + pos));
|
||||
}
|
||||
if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
|
||||
return Some((0x81 + 0x27, 0xA1 + pos));
|
||||
}
|
||||
} else if in_range16(bmp, 0x2500, 0x254C) {
|
||||
if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
|
||||
return Some((0x81 + 0x25, 0xA1 + pos));
|
||||
}
|
||||
}
|
||||
if in_inclusive_range16(bmp, 0x2015, 0x266D)
|
||||
|| in_inclusive_range16(bmp, 0x321C, 0x33D8)
|
||||
|| in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
|
||||
|| in_inclusive_range16(bmp, 0x00A1, 0x00F7)
|
||||
|| in_inclusive_range16(bmp, 0x02C7, 0x02DD)
|
||||
{
|
||||
if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
|
||||
if pos < (94 - 3) {
|
||||
return Some((0xA1, pos + 0xA1 + 3));
|
||||
}
|
||||
return Some((0xA2, pos - (94 - 3) + 0xA1));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-hangul-encode"))]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
|
||||
match KSX1001_HANGUL.binary_search(&bmp) {
|
||||
Ok(ksx_hangul_pointer) => {
|
||||
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
|
||||
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
|
||||
(ksx_hangul_lead as u8, ksx_hangul_trail as u8)
|
||||
}
|
||||
Err(_) => {
|
||||
let (lead, cp949_trail) = if bmp < 0xC8A5 {
|
||||
// Above KS X 1001
|
||||
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
|
||||
let top_lead = (top_pointer / (190 - 12)) + 0x81;
|
||||
let top_trail = top_pointer % (190 - 12);
|
||||
(top_lead as u8, top_trail as u8)
|
||||
} else {
|
||||
// To the left of KS X 1001
|
||||
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
|
||||
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
|
||||
let left_trail = left_pointer % (190 - 94 - 12);
|
||||
(left_lead as u8, left_trail as u8)
|
||||
};
|
||||
let offset = if cp949_trail >= (0x40 - 12) {
|
||||
0x41 + 12
|
||||
} else if cp949_trail >= (0x20 - 6) {
|
||||
0x41 + 6
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
(lead as u8, (cp949_trail + offset) as u8)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Variant compiled when the `fast-hangul-encode` feature is enabled.
///
/// Ignores the raw code point (first argument) and forwards
/// `bmp_minus_hangul_start` unchanged to `cp949_hangul_encode`, returning the
/// `(lead, trail)` pair that function produces.
#[cfg(feature = "fast-hangul-encode")]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
|
||||
cp949_hangul_encode(bmp_minus_hangul_start)
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-hanja-encode"))]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
|
||||
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
|
||||
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
|
||||
let hanja_trail = (hanja_pointer % 94) + 0xA1;
|
||||
Some((hanja_lead as u8, hanja_trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a Hanja code point using the direct-lookup helpers (the build with
/// the `fast-hanja-encode` feature).
///
/// Code points from U+F900 upwards go to the compatibility helper, whose
/// result is always wrapped in `Some`; lower code points go to the unified
/// helper, which may itself return `None`.
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    if bmp >= 0xF900 {
        return Some(ksx1001_compatibility_hangul_encode(bmp));
    }
    ksx1001_unified_hangul_encode(bmp)
}
|
||||
|
||||
/// EUC-KR encoder. A unit struct: it carries no state of its own.
pub struct EucKrEncoder;
|
||||
|
||||
impl EucKrEncoder {
|
||||
/// Builds an `Encoder` for `encoding` that wraps an `EucKrEncoder` in the
/// `VariantEncoder::EucKr` variant.
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
|
||||
}
|
||||
|
||||
/// Output buffer bound for `u16_length` UTF-16 code units: twice the input
/// length, or `None` if that multiplication overflows `usize`.
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
/// Output buffer bound for `byte_length` UTF-8 bytes: the input length plus
/// one, or `None` if that addition overflows `usize`.
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
// The encode functions themselves come from this macro. The braced block is
// the per-code-point step for a non-ASCII `bmp` value; the identifiers after
// it name the bindings the block uses (presumably bound by the macro
// expansion, which is defined elsewhere).
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Wrapping subtraction lets a single unsigned comparison test
// membership in U+AC00..U+D7A4.
let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
|
||||
let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
|
||||
// Hangul
|
||||
ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
|
||||
} else if in_range16(bmp, 0x33DE, 0xFF01) {
|
||||
// Vast range that includes no other
|
||||
// mappables except Hangul (already
|
||||
// processed) and Hanja.
|
||||
// Narrow the range further to Unified and
|
||||
// Compatibility ranges of Hanja.
|
||||
if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
|
||||
if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
|
||||
(hanja_lead, hanja_trail)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
|
||||
// `ksx1001_encode_misc` returns `usize` values; narrow them to bytes.
(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
handle.write_two(lead, trail)
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as EUC-KR and checks the result against `expect`.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Encodes `string` as EUC-KR and checks the result against `expect`.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    #[test]
    fn test_euc_kr_decode() {
        // Empty
        decode_euc_kr(b"", "");

        // ASCII
        decode_euc_kr(b"\x61\x62", "\u{0061}\u{0062}");

        decode_euc_kr(b"\x81\x41", "\u{AC02}");
        decode_euc_kr(b"\x81\x5B", "\u{FFFD}\x5B");
        decode_euc_kr(b"\xFD\xFE", "\u{8A70}");
        decode_euc_kr(b"\xFE\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\xFF\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\x80\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\xA1\xFF", "\u{FFFD}");
        decode_euc_kr(b"\x81\xFF", "\u{FFFD}");
    }

    #[test]
    fn test_euc_kr_encode() {
        // Empty
        encode_euc_kr("", b"");

        // ASCII
        encode_euc_kr("\u{0061}\u{0062}", b"\x61\x62");

        encode_euc_kr("\u{AC02}", b"\x81\x41");
        encode_euc_kr("\u{8A70}", b"\xFD\xFE");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_decode_all() {
        let input = include_bytes!("test_data/euc_kr_in.txt");
        let expectation = include_str!("test_data/euc_kr_in_ref.txt");
        let (cow, had_errors) = EUC_KR.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_encode_all() {
        let input = include_str!("test_data/euc_kr_out.txt");
        let expectation = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_KR.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_KR);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_euc_kr_encode_from_two_low_surrogates() {
        // Each unpaired low surrogate is handled as U+FFFD, which EUC-KR
        // cannot represent, so the replacing encoder emits the decimal
        // numeric character reference for it. The expectation must be
        // spelled in ASCII: byte string literals cannot contain U+FFFD.
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = EUC_KR.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
|
||||
767
zeroidc/vendor/encoding_rs/src/gb18030.rs
vendored
Normal file
767
zeroidc/vendor/encoding_rs/src/gb18030.rs
vendored
Normal file
@@ -0,0 +1,767 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
use super::in_range16;
|
||||
|
||||
/// Zero to three bytes held by the gb18030 decoder (stored in its `pending`
/// field).
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for the empty `None` variant.
    fn is_none(&self) -> bool {
        if let Gb18030Pending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns how many bytes the current variant carries (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::Three(..) => 3,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::None => 0,
        }
    }
}
|
||||
|
||||
/// State carried by the gb18030 decoder between calls.
///
/// All fields start out empty (see `new`), and `in_neutral_state` reports
/// whether they are all empty again.
pub struct Gb18030Decoder {
|
||||
// `first`, `second`, `third`: presumably the already-consumed leading bytes
// of a multi-byte sequence that is still incomplete; they are maintained by
// the macro-generated decode functions (not visible here; confirm there).
first: Option<u8>,
|
||||
second: Option<u8>,
|
||||
third: Option<u8>,
|
||||
// Bytes set aside for reprocessing after a malformed four-byte sequence.
pending: Gb18030Pending,
|
||||
// An ASCII second byte set aside for reprocessing after a malformed
// four-byte sequence.
pending_ascii: Option<u8>,
|
||||
}
|
||||
|
||||
impl Gb18030Decoder {
|
||||
/// Creates a decoder with every state field empty, wrapped in
/// `VariantDecoder::Gb18030`.
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Gb18030(Gb18030Decoder {
|
||||
first: None,
|
||||
second: None,
|
||||
third: None,
|
||||
pending: Gb18030Pending::None,
|
||||
pending_ascii: None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns `true` when no state field holds a byte.
pub fn in_neutral_state(&self) -> bool {
|
||||
self.first.is_none()
|
||||
&& self.second.is_none()
|
||||
&& self.third.is_none()
|
||||
&& self.pending.is_none()
|
||||
&& self.pending_ascii.is_none()
|
||||
}
|
||||
|
||||
/// Adds the number of bytes currently held in decoder state to
/// `byte_length`: the count carried by `pending`, plus one for each of
/// `first`, `second`, `third` and `pending_ascii` that is `Some`.
/// Returns `None` if the final addition overflows `usize`.
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
self.pending.count()
|
||||
+ match self.first {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.second {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.third {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.pending_ascii {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// UTF-16 output bound for `byte_length` more input bytes: the
/// state-adjusted length from `extra_from_state` combined with 1 through
/// the crate's `checked_add` helper (defined elsewhere).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// ASCII: 1 to 1 (worst case)
|
||||
// gbk: 2 to 1
|
||||
// ranges: 4 to 1 or 4 to 2
|
||||
checked_add(1, self.extra_from_state(byte_length))
|
||||
}
|
||||
|
||||
/// UTF-8 output bound when decoding without replacement; simply delegates
/// to `max_utf8_buffer_length`.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// ASCII: 1 to 1
|
||||
// gbk: 2 to 2 or 2 to 3
|
||||
// ranges: 4 to 2, 4 to 3 or 4 to 4
|
||||
// 0x80: 1 to 3 (worst case)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
/// UTF-8 output bound for `byte_length` more input bytes: the
/// state-adjusted length from `extra_from_state` combined with 3 and then 1
/// through the crate's `checked_mul` / `checked_add` helpers (defined
/// elsewhere).
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
|
||||
}
|
||||
|
||||
// The decode functions themselves come from this macro (defined elsewhere).
// It receives four code blocks, one per byte position of a sequence,
// followed by the identifiers those blocks use.
gb18030_decoder_functions!(
|
||||
// Block 1: validate the first non-ASCII byte and yield it zero-based.
{
|
||||
// If first is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
// Out of the lead range: a lone 0x80 decodes to U+20AC, anything
// else is a one-byte malformed sequence.
if non_ascii == 0x80 {
|
||||
handle.write_upper_bmp(0x20ACu16);
|
||||
continue 'outermost;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
// Block 2: decode `first_minus_offset` and `second` as a two-byte sequence.
{
|
||||
// Two-byte (or error)
|
||||
if first_minus_offset >= 0x20 {
|
||||
// Not the gbk ideograph range above GB2312
|
||||
let trail_minus_offset = second.wrapping_sub(0xA1);
|
||||
if trail_minus_offset <= (0xFE - 0xA1) {
|
||||
// GB2312
|
||||
let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
|
||||
if hanzi_lead < (0x77 - 0x2F) {
|
||||
// Level 1 Hanzi, Level 2 Hanzi
|
||||
// or one of the 5 PUA code
|
||||
// points in between.
|
||||
let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
|
||||
let upper_bmp = GB2312_HANZI[hanzi_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if first_minus_offset == 0x20 {
|
||||
// Symbols (starting with ideographic space)
|
||||
let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
|
||||
handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
|
||||
} else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
|
||||
handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
|
||||
} else if first_minus_offset > 0x76 {
|
||||
// Bottom PUA
|
||||
let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
|
||||
handle.write_upper_bmp(pua)
|
||||
} else {
|
||||
let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
}
|
||||
} else {
|
||||
// gbk range on the left
|
||||
let mut trail_minus_offset = second.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start = second.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xA0 - 0x80) {
|
||||
// An invalid trail below 0x80 is unread so that it is
// processed again; otherwise both bytes are consumed as
// one malformed sequence.
if second < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_second.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_second.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
// Trails from 0x80 up are shifted by 0x41 rather than 0x40
// because 0x7F is not a valid trail byte.
trail_minus_offset = second - 0x41;
|
||||
}
|
||||
// Zero-base lead
|
||||
let left_lead = first_minus_offset - 0x20;
|
||||
let left_pointer = left_lead as usize * (190 - 94) +
|
||||
trail_minus_offset as usize;
|
||||
let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
|
||||
if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
|
||||
let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
|
||||
let bmp = gbk_other_decode(left_pointer as u16);
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
|
||||
let upper_bmp = GBK_BOTTOM[bottom_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// gbk ideograph range above GB2312
|
||||
let mut trail_minus_offset = second.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start = second.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xFE - 0x80) {
|
||||
if second < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_second.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_second.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = second - 0x41;
|
||||
}
|
||||
let pointer = first_minus_offset as usize * 190usize +
|
||||
trail_minus_offset as usize;
|
||||
let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
},
|
||||
// Block 3: validate the third byte of a four-byte sequence and yield it
// zero-based.
{
|
||||
// If third is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let third_minus_offset = third.wrapping_sub(0x81);
|
||||
if third_minus_offset > (0xFE - 0x81) {
|
||||
// We have an error. Let's inline what's going
|
||||
// to happen when `second` is
|
||||
// reprocessed. (`third` gets unread.)
|
||||
// `second` is guaranteed ASCII, so let's
|
||||
// put it in `pending_ascii`. Recompute
|
||||
// `second` from `second_minus_offset`.
|
||||
self.pending_ascii = Some(second_minus_offset + 0x30);
|
||||
// Now unread `third` and designate the previous
|
||||
// `first` as being in error.
|
||||
return (DecoderResult::Malformed(1, 1),
|
||||
unread_handle_third.unread(),
|
||||
handle.written());
|
||||
}
|
||||
third_minus_offset
|
||||
},
|
||||
// Block 4: validate the fourth byte and map the completed four-byte
// sequence to a code point.
{
|
||||
// If fourth is between 0x30 and 0x39, inclusive,
|
||||
// subtract offset 0x30.
|
||||
//
|
||||
// If we have an error, we'll inline what's going
|
||||
// to happen when `second` and `third` are
|
||||
// reprocessed. (`fourth` gets unread.)
|
||||
// `second` is guaranteed ASCII, so let's
|
||||
// put it in `pending_ascii`. Recompute
|
||||
// `second` from `second_minus_offset` to
|
||||
// make this block reusable when `second`
|
||||
// is not in scope.
|
||||
//
|
||||
// `third` is guaranteed to be in the range
|
||||
// that makes it become the new `self.first`.
|
||||
//
|
||||
// `fourth` gets unread and the previous
|
||||
// `first` gets designated as being in error.
|
||||
let fourth_minus_offset = fourth.wrapping_sub(0x30);
|
||||
if fourth_minus_offset > (0x39 - 0x30) {
|
||||
self.pending_ascii = Some(second_minus_offset + 0x30);
|
||||
self.pending = Gb18030Pending::One(third_minus_offset);
|
||||
return (DecoderResult::Malformed(1, 2),
|
||||
unread_handle_fourth.unread(),
|
||||
handle.written());
|
||||
}
|
||||
// Mixed-radix pointer: 10 values for the fourth byte, 126 for the
// third and 10 for the second.
let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
|
||||
(second_minus_offset as usize * (10 * 126)) +
|
||||
(third_minus_offset as usize * 10) +
|
||||
fourth_minus_offset as usize;
|
||||
if pointer <= 39419 {
|
||||
// BMP
|
||||
// Pointer 7457 is special-cased to U+E7C7 instead of going
// through the range table.
if pointer == 7457 {
|
||||
handle.write_upper_bmp(0xE7C7)
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
|
||||
}
|
||||
} else if pointer >= 189_000 && pointer <= 1_237_575 {
|
||||
// Astral
|
||||
// Pointer 189_000 corresponds to U+10000.
handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(4, 0),
|
||||
unread_handle_fourth.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
},
|
||||
// Identifiers used by the blocks above (presumably bound by the macro
// expansion).
self,
|
||||
non_ascii,
|
||||
first_minus_offset,
|
||||
second,
|
||||
second_minus_offset,
|
||||
unread_handle_second,
|
||||
third,
|
||||
third_minus_offset,
|
||||
unread_handle_third,
|
||||
fourth,
|
||||
fourth_minus_offset,
|
||||
unread_handle_fourth,
|
||||
source,
|
||||
handle,
|
||||
'outermost);
|
||||
}
|
||||
|
||||
// XXX Experiment with inline directives
|
||||
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
|
||||
// Try ideographic punctuation first as it's the most likely case.
|
||||
// Throwing in the check for full-width currencies and tilde is probably
|
||||
// more size-efficient here than elsewhere.
|
||||
if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
|
||||
return Some((0xA1, pos + 0xA1));
|
||||
}
|
||||
}
|
||||
// Ext A
|
||||
if in_range16(bmp, 0x3400, 0x4E00) {
|
||||
return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
|
||||
(
|
||||
0xFE,
|
||||
pos + if pos < (0x3F - 16) {
|
||||
0x40 + 16
|
||||
} else {
|
||||
0x41 + 16
|
||||
},
|
||||
)
|
||||
});
|
||||
}
|
||||
// Compatibility ideographs
|
||||
if in_range16(bmp, 0xF900, 0xFB00) {
|
||||
return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
|
||||
if pos < 5 {
|
||||
// end of second to last row
|
||||
(0xFD, pos + (190 - 94 - 5 + 0x41))
|
||||
} else {
|
||||
// last row
|
||||
(0xFE, pos + (0x40 - 5))
|
||||
}
|
||||
});
|
||||
}
|
||||
// Handle everything below U+02CA, which is in GBK_OTHER.
|
||||
if bmp < 0x02CA {
|
||||
if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
|
||||
// Pinyin except U+1E3F
|
||||
if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
|
||||
return Some((0xA8, pos + 0xA1));
|
||||
}
|
||||
} else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
|
||||
|| in_inclusive_range16(bmp, 0x02C7, 0x02C9)
|
||||
{
|
||||
// Diacritics and Latin 1 symbols
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
|
||||
return Some((0xA1, pos + 0xA1 + 3));
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
if bmp >= 0xE794 {
|
||||
// Various brackets, all in PUA or full-width regions
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
|
||||
return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
|
||||
}
|
||||
} else if bmp == 0x1E3F {
|
||||
// The one Pinyin placed elsewhere on the BMP
|
||||
return Some((0xA8, 0x7B - 0x60 + 0xA1));
|
||||
} else if in_range16(bmp, 0xA000, 0xD800) {
|
||||
// Since Korean has usage in China, let's spend a branch to fast-track
|
||||
// Hangul.
|
||||
return None;
|
||||
}
|
||||
// GB2312 other (except bottom PUA and PUA between Hanzi levels).
|
||||
if let Some(other_pointer) = gb2312_other_encode(bmp) {
|
||||
let other_lead = other_pointer as usize / 94;
|
||||
let other_trail = other_pointer as usize % 94;
|
||||
return Some((0xA2 + other_lead, 0xA1 + other_trail));
|
||||
}
|
||||
// At this point, we've handled all mappable characters above U+02D9 but
|
||||
// below U+2010. Let's check for that range in order to let lower BMP
|
||||
// characters used for minority languages in China avoid the subsequent
|
||||
// search that deals mainly with various symbols.
|
||||
if in_range16(bmp, 0x02DA, 0x2010) {
|
||||
return None;
|
||||
}
|
||||
// GBK other (except radicals and PUA in GBK_BOTTOM).
|
||||
if let Some(other_pointer) = gbk_other_encode(bmp) {
|
||||
let other_lead = other_pointer as usize / (190 - 94);
|
||||
let other_trail = other_pointer as usize % (190 - 94);
|
||||
let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
return Some((other_lead + (0x81 + 0x20), other_trail + offset));
|
||||
}
|
||||
// CJK Radicals Supplement or PUA in GBK_BOTTOM
|
||||
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
|
||||
if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
|
||||
let trail = pos + 16;
|
||||
let offset = if trail < 0x3F { 0x40 } else { 0x41 };
|
||||
return Some((0xFE, trail + offset));
|
||||
}
|
||||
}
|
||||
// GB2312 bottom PUA
|
||||
let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
|
||||
if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
|
||||
let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
|
||||
let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
|
||||
return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
|
||||
}
|
||||
// PUA between Hanzi Levels
|
||||
let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
|
||||
if bmp_minus_pua_between_hanzi < 5 {
|
||||
return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
|
||||
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
|
||||
(lead, trail)
|
||||
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
|
||||
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
|
||||
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
|
||||
(hanzi_lead as u8, hanzi_trail as u8)
|
||||
} else {
|
||||
let (lead, gbk_trail) = if bmp < 0x72DC {
|
||||
// Above GB2312
|
||||
let pointer = gbk_top_ideograph_encode(bmp) as usize;
|
||||
let lead = (pointer / 190) + 0x81;
|
||||
let gbk_trail = pointer % 190;
|
||||
(lead, gbk_trail)
|
||||
} else {
|
||||
// To the left of GB2312
|
||||
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
|
||||
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
|
||||
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
|
||||
(lead, gbk_trail)
|
||||
};
|
||||
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
(lead as u8, (gbk_trail + offset) as u8)
|
||||
}
|
||||
}
|
||||
|
||||
/// Variant compiled when the `fast-gb-hanzi-encode` feature is enabled.
///
/// Ignores the raw code point (first argument) and forwards
/// `bmp_minus_unified_start` unchanged to `gbk_hanzi_encode`, returning the
/// `(lead, trail)` pair that function produces.
#[cfg(feature = "fast-gb-hanzi-encode")]
|
||||
#[inline(always)]
|
||||
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
|
||||
gbk_hanzi_encode(bmp_minus_unified_start)
|
||||
}
|
||||
|
||||
/// Encoder shared by gb18030 and GBK; the `extended` flag selects between the
/// two behaviours.
pub struct Gb18030Encoder {
|
||||
// When `true`, code points without a two-byte form are written as four-byte
// sequences. When `false`, they are reported as unmappable and U+20AC is
// written as the single byte 0x80.
extended: bool,
|
||||
}
|
||||
|
||||
impl Gb18030Encoder {
|
||||
/// Builds an `Encoder` for `encoding` that wraps a `Gb18030Encoder` whose
/// `extended` flag is set to `extended_range`.
pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::Gb18030(Gb18030Encoder {
|
||||
extended: extended_range,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Output buffer bound for `u16_length` UTF-16 code units: four times the
/// input length when `extended`, otherwise twice the input length combined
/// with 2 through the crate's `checked_add` helper (defined elsewhere).
/// `None` signals overflow.
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
if self.extended {
|
||||
u16_length.checked_mul(4)
|
||||
} else {
|
||||
// Need to add, because space check is done with the four-byte
|
||||
// assumption.
|
||||
checked_add(2, u16_length.checked_mul(2))
|
||||
}
|
||||
}
|
||||
|
||||
/// Output buffer bound for `byte_length` UTF-8 bytes: twice the input
/// length combined with 2 (through the crate's `checked_add` helper) when
/// `extended`, otherwise the input length plus 3. `None` signals overflow.
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
if self.extended {
|
||||
// 1 to 1
|
||||
// 2 to 2
|
||||
// 3 to 2
|
||||
// 2 to 4 (worst)
|
||||
// 3 to 4
|
||||
// 4 to 4
|
||||
checked_add(2, byte_length.checked_mul(2))
|
||||
} else {
|
||||
// 1 to 1
|
||||
// 2 to 2
|
||||
// 3 to 2
|
||||
// Need to add, because space check is done with the four-byte
|
||||
// assumption.
|
||||
byte_length.checked_add(3)
|
||||
}
|
||||
}
|
||||
|
||||
// The encode functions themselves come from this macro (defined elsewhere).
// The first block handles a non-ASCII `bmp` code point, the second an
// `astral` code point; the identifiers after them name the bindings the
// blocks use.
ascii_compatible_encoder_functions!(
|
||||
{
|
||||
// Wrapping subtraction lets a single unsigned comparison test
// membership in U+4E00..U+9FA6.
let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
|
||||
if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
|
||||
// CJK Unified Ideographs
|
||||
// Can't fail now, since all are
|
||||
// mapped.
|
||||
let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
|
||||
handle.write_two(lead, trail)
|
||||
} else if bmp == 0xE5E5 {
|
||||
// It's not optimal to check for the unmappable
|
||||
// and for euro at this stage, but getting
|
||||
// them out of the way makes the rest of the
|
||||
// code less messy.
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
} else if bmp == 0x20AC && !self.extended {
|
||||
// Without the extended range the euro sign is the single byte 0x80.
handle.write_one(0x80u8)
|
||||
} else {
|
||||
match gbk_encode_non_unified(bmp) {
|
||||
Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
|
||||
None => {
|
||||
if !self.extended {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
// Split the range pointer into the four mixed-radix digits
// of a four-byte sequence (radices 10, 126 and 10 from the
// right).
let range_pointer = gb18030_range_encode(bmp);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
let rem_second = rem_first % (10 * 126);
|
||||
let third = rem_second / 10;
|
||||
let fourth = rem_second % 10;
|
||||
handle.write_four(
|
||||
(first + 0x81) as u8,
|
||||
(second + 0x30) as u8,
|
||||
(third + 0x81) as u8,
|
||||
(fourth + 0x30) as u8,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// Astral code points have no two-byte form, so without the extended
// range they are always unmappable.
if !self.extended {
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
// U+10000 corresponds to pointer 189_000; the digit split below is
// the same as in the BMP block.
let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
let rem_second = rem_first % (10 * 126);
|
||||
let third = rem_second / 10;
|
||||
let fourth = rem_second % 10;
|
||||
handle.write_four(
|
||||
(first + 0x81) as u8,
|
||||
(second + 0x30) as u8,
|
||||
(third + 0x81) as u8,
|
||||
(fourth + 0x30) as u8,
|
||||
)
|
||||
},
|
||||
bmp,
|
||||
astral,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_four,
|
||||
check_space_four,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
fn decode_gb18030(bytes: &[u8], expect: &str) {
|
||||
decode(GB18030, bytes, expect);
|
||||
}
|
||||
|
||||
fn encode_gb18030(string: &str, expect: &[u8]) {
|
||||
encode(GB18030, string, expect);
|
||||
}
|
||||
|
||||
fn encode_gbk(string: &str, expect: &[u8]) {
|
||||
encode(GBK, string, expect);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gb18030_decode() {
|
||||
// Empty
|
||||
decode_gb18030(b"", &"");
|
||||
|
||||
// ASCII
|
||||
decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");
|
||||
|
||||
// euro
|
||||
decode_gb18030(b"\x80", "\u{20AC}");
|
||||
decode_gb18030(b"\xA2\xE3", "\u{20AC}");
|
||||
|
||||
// two bytes
|
||||
decode_gb18030(b"\x81\x40", "\u{4E02}");
|
||||
decode_gb18030(b"\x81\x7E", "\u{4E8A}");
|
||||
decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
|
||||
decode_gb18030(b"\x81\x80", "\u{4E90}");
|
||||
decode_gb18030(b"\x81\xFE", "\u{4FA2}");
|
||||
decode_gb18030(b"\xFE\x40", "\u{FA0C}");
|
||||
decode_gb18030(b"\xFE\x7E", "\u{E843}");
|
||||
decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
|
||||
decode_gb18030(b"\xFE\x80", "\u{4723}");
|
||||
decode_gb18030(b"\xFE\xFE", "\u{E4C5}");
|
||||
|
||||
// The difference from the original GB18030
|
||||
decode_gb18030(b"\xA3\xA0", "\u{3000}");
|
||||
decode_gb18030(b"\xA1\xA1", "\u{3000}");
|
||||
|
||||
// 0xFF
|
||||
decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
|
||||
decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
|
||||
decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
|
||||
decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
|
||||
decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
|
||||
decode_gb18030(
|
||||
b"\xFF\x32\x9A\x33\x00",
|
||||
"\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
|
||||
);
|
||||
|
||||
// Four bytes
|
||||
decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
|
||||
decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
|
||||
decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
|
||||
decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
|
||||
decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
|
||||
decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
|
||||
decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
|
||||
decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
|
||||
decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_gb18030_encode() {
|
||||
// Empty
|
||||
encode_gb18030("", b"");
|
||||
|
||||
// ASCII
|
||||
encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");
|
||||
|
||||
// euro
|
||||
encode_gb18030("\u{20AC}", b"\xA2\xE3");
|
||||
|
||||
// two bytes
|
||||
encode_gb18030("\u{4E02}", b"\x81\x40");
|
||||
encode_gb18030("\u{4E8A}", b"\x81\x7E");
|
||||
if !cfg!(miri) {
|
||||
// Miri is too slow
|
||||
encode_gb18030("\u{4E90}", b"\x81\x80");
|
||||
encode_gb18030("\u{4FA2}", b"\x81\xFE");
|
||||
encode_gb18030("\u{FA0C}", b"\xFE\x40");
|
||||
encode_gb18030("\u{E843}", b"\xFE\x7E");
|
||||
encode_gb18030("\u{4723}", b"\xFE\x80");
|
||||
encode_gb18030("\u{E4C5}", b"\xFE\xFE");
|
||||
}
|
||||
|
||||
// The difference from the original GB18030
|
||||
encode_gb18030("\u{E5E5}", b"");
|
||||
encode_gb18030("\u{3000}", b"\xA1\xA1");
|
||||
|
||||
// Four bytes
|
||||
encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
|
||||
encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
|
||||
if !cfg!(miri) {
|
||||
// Miri is too slow
|
||||
encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
|
||||
encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
|
||||
encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
|
||||
}
|
||||
|
||||
// Edge cases
|
||||
encode_gb18030("\u{00F7}", b"\xA1\xC2");
|
||||
}
|
||||
|
||||
/// Spot-checks the GBK encoder against hand-picked inputs.
///
/// Inputs that GBK cannot represent are expected to come out as decimal
/// numeric character references (`&#NNN;`), where `NNN` is the decimal value
/// of the code point. The expectations are written in that ASCII form
/// because a Rust byte-string literal may only contain ASCII characters.
// NOTE(review): `encode_gbk` is defined outside this excerpt; presumably it
// forwards to the replacement-emitting `encode` test helper -- confirm.
#[test]
fn test_gbk_encode() {
    // Empty
    encode_gbk("", b"");

    // ASCII
    encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

    // Euro sign
    encode_gbk("\u{20AC}", b"\x80");

    // Two bytes
    encode_gbk("\u{4E02}", b"\x81\x40");
    encode_gbk("\u{4E8A}", b"\x81\x7E");
    if !cfg!(miri) {
        // Miri is too slow
        encode_gbk("\u{4E90}", b"\x81\x80");
        encode_gbk("\u{4FA2}", b"\x81\xFE");
        encode_gbk("\u{FA0C}", b"\xFE\x40");
        encode_gbk("\u{E843}", b"\xFE\x7E");
        encode_gbk("\u{4723}", b"\xFE\x80");
        encode_gbk("\u{E4C5}", b"\xFE\xFE");
    }

    // The difference from the original gb18030
    encode_gbk("\u{E5E5}", b"&#58853;");
    encode_gbk("\u{3000}", b"\xA1\xA1");

    // Code points that take four bytes in gb18030: expected as numeric
    // character references here.
    encode_gbk("\u{0080}", b"&#128;");
    encode_gbk("\u{E7C7}", b"&#59335;");
    if !cfg!(miri) {
        // Miri is too slow
        encode_gbk("\u{2603}", b"&#9731;");
        encode_gbk("\u{1F4A9}", b"&#128169;");
        encode_gbk("\u{10FFFF}", b"&#1114111;");
    }

    // Edge cases
    encode_gbk("\u{00F7}", b"\xA1\xC2");
}
|
||||
|
||||
/// Decodes the bundled gb18030 fixture in one call and compares the result
/// with the bundled reference text; the fixture must decode without errors.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_gb18030_decode_all() {
    let (decoded, had_errors) =
        GB18030.decode_without_bom_handling(include_bytes!("test_data/gb18030_in.txt"));
    assert!(!had_errors, "Should not have had errors.");

    let reference = include_str!("test_data/gb18030_in_ref.txt");
    assert_eq!(&decoded[..], reference);
}
|
||||
|
||||
/// Encodes the bundled fixture text as gb18030 and compares the bytes with
/// the bundled reference; the output encoding must stay `GB18030` and no
/// unmappable characters may be reported.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_gb18030_encode_all() {
    let (encoded, used_encoding, had_errors) =
        GB18030.encode(include_str!("test_data/gb18030_out.txt"));
    assert!(!had_errors, "Should not have had errors.");
    assert_eq!(used_encoding, GB18030);

    let reference = include_bytes!("test_data/gb18030_out_ref.txt");
    assert_eq!(&encoded[..], &reference[..]);
}
|
||||
|
||||
/// Encodes a single UTF-16 code unit (U+3000) into a buffer that is exactly
/// as long as `max_buffer_length_from_utf16_without_replacement(1)` reports,
/// and checks that the two-byte result fits.
#[test]
fn test_gb18030_encode_from_utf16_max_length() {
    let mut output = [0u8; 20];
    let mut encoder = GB18030.new_encoder();

    let buffer_len = encoder
        .max_buffer_length_from_utf16_without_replacement(1)
        .unwrap();
    let (result, read, written) = encoder.encode_from_utf16_without_replacement(
        &[0x3000],
        &mut output[..buffer_len],
        true,
    );

    assert_eq!(result, EncoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 2);
    assert_eq!(&output[..2], &[0xA1u8, 0xA1u8][..]);
}
|
||||
}
|
||||
1969
zeroidc/vendor/encoding_rs/src/handles.rs
vendored
Normal file
1969
zeroidc/vendor/encoding_rs/src/handles.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1068
zeroidc/vendor/encoding_rs/src/iso_2022_jp.rs
vendored
Normal file
1068
zeroidc/vendor/encoding_rs/src/iso_2022_jp.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
6113
zeroidc/vendor/encoding_rs/src/lib.rs
vendored
Normal file
6113
zeroidc/vendor/encoding_rs/src/lib.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1622
zeroidc/vendor/encoding_rs/src/macros.rs
vendored
Normal file
1622
zeroidc/vendor/encoding_rs/src/macros.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3356
zeroidc/vendor/encoding_rs/src/mem.rs
vendored
Normal file
3356
zeroidc/vendor/encoding_rs/src/mem.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
104
zeroidc/vendor/encoding_rs/src/replacement.rs
vendored
Normal file
104
zeroidc/vendor/encoding_rs/src/replacement.rs
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::variant::*;
|
||||
|
||||
/// Decoder state for the replacement encoding.
pub struct ReplacementDecoder {
    /// Set once the single `Malformed` result has been reported; afterwards
    /// all further input is consumed silently.
    emitted: bool,
}
|
||||
|
||||
impl ReplacementDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Replacement(ReplacementDecoder { emitted: false })
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, _u16_length: usize) -> Option<usize> {
|
||||
Some(1)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, _byte_length: usize) -> Option<usize> {
|
||||
Some(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, _byte_length: usize) -> Option<usize> {
|
||||
Some(3)
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
// Don't err if the input stream is empty. See
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.is_empty() {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
self.emitted = true;
|
||||
(DecoderResult::Malformed(1, 0), 1, 0)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
// Don't err if the input stream is empty. See
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.len() < 3 {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
self.emitted = true;
|
||||
(DecoderResult::Malformed(1, 0), 1, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` with the replacement encoding and compares with `expect`.
    fn decode_replacement(bytes: &[u8], expect: &str) {
        decode_without_padding(REPLACEMENT, bytes, expect);
    }

    /// Encodes `string` via `REPLACEMENT` and compares with `expect`.
    fn encode_replacement(string: &str, expect: &[u8]) {
        encode(REPLACEMENT, string, expect);
    }

    #[test]
    fn test_replacement_decode() {
        // Empty input decodes to nothing; any non-empty input decodes to a
        // single U+FFFD no matter how long it is.
        let cases: [(&[u8], &str); 3] = [
            (b"", ""),
            (b"A", "\u{FFFD}"),
            (b"AB", "\u{FFFD}"),
        ];
        for &(bytes, expect) in cases.iter() {
            decode_replacement(bytes, expect);
        }
    }

    #[test]
    fn test_replacement_encode() {
        // Empty
        encode_replacement("", b"");

        // The encoder handed out for the replacement encoding reports UTF-8
        // as its encoding, so the text round-trips as its own UTF-8 bytes.
        assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_replacement(text, text.as_bytes());
    }
}
|
||||
426
zeroidc/vendor/encoding_rs/src/shift_jis.rs
vendored
Normal file
426
zeroidc/vendor/encoding_rs/src/shift_jis.rs
vendored
Normal file
@@ -0,0 +1,426 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range;
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// Decoder state for Shift_JIS.
pub struct ShiftJisDecoder {
    /// Pending lead byte, if any; `None` means the decoder is in its
    /// neutral state.
    lead: Option<u8>,
}
|
||||
|
||||
impl ShiftJisDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.lead.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 1 to 3 (half-width katakana)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0x9F, inclusive,
|
||||
// subtract offset 0x81. Else if lead is
|
||||
// between 0xE0 and 0xFC, inclusive, subtract
|
||||
// offset 0xC1. Else if lead is between
|
||||
// 0xA1 and 0xDF, inclusive, map to half-width
|
||||
// Katakana. Else if lead is 0x80, pass through.
|
||||
let mut non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0x9F - 0x81) {
|
||||
let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0);
|
||||
if non_ascii_minus_range_start > (0xFC - 0xE0) {
|
||||
let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1);
|
||||
if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) {
|
||||
if non_ascii == 0x80 {
|
||||
handle.write_mid_bmp(0x80);
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
non_ascii_minus_offset = non_ascii - 0xC1;
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0x40 and 0x7E, inclusive,
|
||||
// subtract offset 0x40. Else if trail is
|
||||
// between 0x80 and 0xFC, inclusive, subtract
|
||||
// offset 0x41.
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
// Hiragana doesn't cross 0x7F, but Katakana does.
|
||||
// We can check for Hiragana before normalizing
|
||||
// trail.
|
||||
let trail_minus_hiragana = byte.wrapping_sub(0x9F);
|
||||
if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
|
||||
} else {
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start =
|
||||
byte.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xFC - 0x80) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = byte - 0x41;
|
||||
}
|
||||
if lead_minus_offset == 0x02 &&
|
||||
trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
|
||||
} else {
|
||||
let pointer = lead_minus_offset as usize *
|
||||
188usize +
|
||||
trail_minus_offset as usize;
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer <
|
||||
JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
|
||||
} else {
|
||||
let upper_ibm_pointer = pointer.wrapping_sub(10744);
|
||||
if upper_ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
|
||||
} else {
|
||||
let lower_ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if lower_ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
|
||||
} else if in_inclusive_range(pointer, 8836, 10715) {
|
||||
handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_bmp,
|
||||
check_space_bmp,
|
||||
false);
|
||||
}
|
||||
|
||||
/// Maps a BMP code point to its Shift_JIS `(lead, trail)` byte pair, or
/// `None` when there is no mapping. This variant is compiled when the
/// `fast-kanji-encode` feature is enabled and delegates directly to
/// `jis0208_kanji_shift_jis_encode`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    let encoded = jis0208_kanji_shift_jis_encode(bmp);
    encoded
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
return Some((lead, trail));
|
||||
}
|
||||
let pointer = if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
23
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
4418 + pos
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
10744 + pos
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
|
||||
}
|
||||
|
||||
/// Shift_JIS encoder. A unit struct: it carries no state of its own.
pub struct ShiftJisEncoder;
|
||||
|
||||
impl ShiftJisEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
let trail_offset = if bmp_minus_katakana < 0x3F {
|
||||
0x40
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8)
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0x81, 0x40 + bmp_minus_space as u8)
|
||||
} else if bmp == 0xA5 {
|
||||
handle.write_one(0x5Cu8)
|
||||
} else if bmp == 0x80 {
|
||||
handle.write_one(0x80u8)
|
||||
} else if bmp == 0x203E {
|
||||
handle.write_one(0x7Eu8)
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0x81u8, 0x7Cu8)
|
||||
} else {
|
||||
let bmp_minus_roman = bmp.wrapping_sub(0x2170);
|
||||
let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) {
|
||||
10716 + bmp_minus_roman as usize
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
pointer
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|
||||
|| bmp == 0xF929
|
||||
|| bmp == 0xF9DC
|
||||
{
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
10744 + pos
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
pointer
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Shift_JIS and compares with `expect`.
    fn decode_shift_jis(bytes: &[u8], expect: &str) {
        decode(SHIFT_JIS, bytes, expect);
    }

    /// Encodes `string` as Shift_JIS and compares with `expect`.
    fn encode_shift_jis(string: &str, expect: &[u8]) {
        encode(SHIFT_JIS, string, expect);
    }

    #[test]
    fn test_shift_jis_decode() {
        // Empty
        decode_shift_jis(b"", "");

        // ASCII
        decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_shift_jis(b"\xA1", "\u{FF61}");
        decode_shift_jis(b"\xDF", "\u{FF9F}");
        decode_shift_jis(b"\xA0", "\u{FFFD}");
        decode_shift_jis(b"\xE0", "\u{FFFD}");
        decode_shift_jis(b"\xA0+", "\u{FFFD}+");
        decode_shift_jis(b"\xE0+", "\u{FFFD}+");

        // EUDC
        decode_shift_jis(b"\xF0\x40", "\u{E000}");
        decode_shift_jis(b"\xF9\xFC", "\u{E757}");
        decode_shift_jis(b"\xEF\xFC", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");

        // JIS 0208
        decode_shift_jis(b"\x81\x40", "\u{3000}");
        decode_shift_jis(b"\x81\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xEE\xFC", "\u{FF02}");
        decode_shift_jis(b"\xEE\xFD", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");
        decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xFC\x4B", "\u{9ED1}");
        decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L");
    }

    #[test]
    fn test_shift_jis_encode() {
        // Empty
        encode_shift_jis("", b"");

        // ASCII
        encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_shift_jis("\u{0080}", b"\x80");
        encode_shift_jis("\u{00A5}", b"\x5C");
        encode_shift_jis("\u{203E}", b"\x7E");
        encode_shift_jis("\u{2212}", b"\x81\x7C");

        // Half-width
        encode_shift_jis("\u{FF61}", b"\xA1");
        encode_shift_jis("\u{FF9F}", b"\xDF");

        // EUDC: not encodable, so the expectation is the decimal numeric
        // character reference (byte-string literals must stay ASCII).
        encode_shift_jis("\u{E000}", b"&#57344;");
        encode_shift_jis("\u{E757}", b"&#59223;");

        // JIS 0212: not encodable either.
        encode_shift_jis("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_shift_jis("\u{3000}", b"\x81\x40");
        encode_shift_jis("\u{FF02}", b"\xFA\x57");
        encode_shift_jis("\u{2170}", b"\xFA\x40");
        encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_decode_all() {
        let input = include_bytes!("test_data/shift_jis_in.txt");
        let expectation = include_str!("test_data/shift_jis_in_ref.txt");
        let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
        // This fixture is expected to report errors.
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_encode_all() {
        let input = include_str!("test_data/shift_jis_out.txt");
        let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
        let (cow, encoding, had_errors) = SHIFT_JIS.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, SHIFT_JIS);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// A single half-width Katakana byte expands to three UTF-8 bytes; the
    /// buffer length reported for one input byte must be enough to hold it.
    #[test]
    fn test_shift_jis_half_width_katakana_length() {
        let mut output = [0u8; 20];
        let mut decoder = SHIFT_JIS.new_decoder();
        {
            let needed = decoder
                .max_utf8_buffer_length_without_replacement(1)
                .unwrap();
            let (result, read, written) =
                decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
            assert_eq!(result, DecoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            assert_eq!(output[0], 0xEF);
            assert_eq!(output[1], 0xBD);
            assert_eq!(output[2], 0xA1);
        }
    }
}
|
||||
455
zeroidc/vendor/encoding_rs/src/simd_funcs.rs
vendored
Normal file
455
zeroidc/vendor/encoding_rs/src/simd_funcs.rs
vendored
Normal file
@@ -0,0 +1,455 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use packed_simd::u16x8;
|
||||
use packed_simd::u8x16;
|
||||
use packed_simd::FromBits;
|
||||
|
||||
// TODO: Migrate unaligned access to stdlib code if/when the RFC
|
||||
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
|
||||
let mut simd = ::core::mem::uninitialized();
|
||||
::core::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16);
|
||||
simd
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
|
||||
*(ptr as *const u8x16)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
|
||||
::core::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
|
||||
*(ptr as *mut u8x16) = s;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
|
||||
let mut simd = ::core::mem::uninitialized();
|
||||
::core::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16);
|
||||
simd
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
|
||||
*(ptr as *const u16x8)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
|
||||
::core::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
|
||||
*(ptr as *mut u16x8) = s;
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] {
|
||||
use core::arch::x86_64::__m128i;
|
||||
use core::arch::x86_64::_mm_movemask_epi8;
|
||||
use core::arch::x86_64::_mm_packus_epi16;
|
||||
} else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] {
|
||||
use core::arch::x86::__m128i;
|
||||
use core::arch::x86::_mm_movemask_epi8;
|
||||
use core::arch::x86::_mm_packus_epi16;
|
||||
} else if #[cfg(target_arch = "aarch64")]{
|
||||
use core::arch::aarch64::uint8x16_t;
|
||||
use core::arch::aarch64::uint16x8_t;
|
||||
use core::arch::aarch64::vmaxvq_u8;
|
||||
use core::arch::aarch64::vmaxvq_u16;
|
||||
} else {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// #[inline(always)]
|
||||
// fn simd_byte_swap_u8(s: u8x16) -> u8x16 {
|
||||
// unsafe {
|
||||
// shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[inline(always)]
|
||||
// pub fn simd_byte_swap(s: u16x8) -> u16x8 {
|
||||
// to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s)))
|
||||
// }
|
||||
|
||||
#[inline(always)]
|
||||
pub fn simd_byte_swap(s: u16x8) -> u16x8 {
|
||||
let left = s << 8;
|
||||
let right = s >> 8;
|
||||
left | right
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn to_u16_lanes(s: u8x16) -> u16x8 {
|
||||
u16x8::from_bits(s)
|
||||
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {

        // Expose low-level mask instead of higher-level conclusion,
        // because the non-ASCII case would perform less well otherwise.
        /// Returns the `_mm_movemask_epi8` mask of `s`: one bit per byte
        /// lane, taken from that lane's most significant bit.
        #[inline(always)]
        pub fn mask_ascii(s: u8x16) -> i32 {
            unsafe {
                let raw = __m128i::from_bits(s);
                _mm_movemask_epi8(raw)
            }
        }

    } else {

    }
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Returns `true` when no byte lane of `s` has its high bit set.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            unsafe {
                let high_bits = _mm_movemask_epi8(__m128i::from_bits(s));
                high_bits == 0
            }
        }
    } else if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest byte lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            unsafe {
                let max_lane = vmaxvq_u8(uint8x16_t::from_bits(s));
                max_lane < 0x80
            }
        }
    } else {
        /// Returns `true` when no byte lane of `s` exceeds 0x7F.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            // This optimizes better on ARM than
            // the lt formulation.
            let highest_ascii = u8x16::splat(0x7F);
            let any_non_ascii = s.gt(highest_ascii).any();
            !any_non_ascii
        }
    }
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Returns `true` when every byte lane of `s` is below 0xC4, with an
        /// early exit for all-ASCII vectors.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            if simd_is_ascii(s) {
                return true;
            }
            let limit = u8x16::splat(0xC4);
            s.lt(limit).all()
        }
    } else if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest byte lane of `s` is below 0xC4.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            unsafe {
                let max_lane = vmaxvq_u8(uint8x16_t::from_bits(s));
                max_lane < 0xC4
            }
        }
    } else {
        /// Returns `true` when every byte lane of `s` is below 0xC4.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let limit = u8x16::splat(0xC4);
            s.lt(limit).all()
        }
    }
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest 16-bit lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            unsafe {
                let max_lane = vmaxvq_u16(uint16x8_t::from_bits(s));
                max_lane < 0x80
            }
        }

        /// Returns `true` when the largest 16-bit lane of `s` is below 0x100.
        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            unsafe {
                let max_lane = vmaxvq_u16(uint16x8_t::from_bits(s));
                max_lane < 0x100
            }
        }
    } else {
        /// Returns `true` when every 16-bit lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let limit = u16x8::splat(0x80);
            s.lt(limit).all()
        }

        /// Returns `true` when no 16-bit lane of `s` exceeds 0xFF.
        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            // For some reason, on SSE2 this formulation
            // seems faster in this case while the above
            // function is better the other way round...
            let highest_latin1 = u16x8::splat(0xFF);
            let any_above = s.gt(highest_latin1).any();
            !any_above
        }
    }
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn contains_surrogates(s: u16x8) -> bool {
|
||||
let mask = u16x8::splat(0xF800);
|
||||
let surrogate_bits = u16x8::splat(0xD800);
|
||||
(s & mask).eq(surrogate_bits).any()
|
||||
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        // On aarch64: return `false` from the enclosing function when the
        // largest lane of the vector is below 0x0590.
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ({
                unsafe {
                    if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 {
                        return false;
                    }
                }
            })
        }

        // On aarch64 this check expands to nothing.
        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ()
        }
    } else {
        // Off aarch64 this check expands to nothing.
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ()
        }

        // Off aarch64: return `false` from the enclosing function when all
        // lanes of the mask are set.
        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ({
                if $s.all() {
                    return false;
                }
            })
        }
    }
}
|
||||
|
||||
// Lane-wise test for `$start <= lane < $end`: the end bound is exclusive.
// Lanes below `$start` wrap around to large values and so fail the `lt`
// comparison against the range width.
macro_rules! in_range16x8 {
    ($s:ident, $start:expr, $end:expr) => {{
        // SIMD sub is wrapping
        ($s - u16x8::splat($start)).lt(u16x8::splat($end - $start))
    }};
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_u16x8_bidi(s: u16x8) -> bool {
|
||||
// We try to first quickly refute the RTLness of the vector. If that
|
||||
// fails, we do the real RTL check, so in that case we end up wasting
|
||||
// the work for the up-front quick checks. Even the quick-check is
|
||||
// two-fold in order to return `false` ASAP if everything is below
|
||||
// Hebrew.
|
||||
|
||||
aarch64_return_false_if_below_hebrew!(s);
|
||||
|
||||
let below_hebrew = s.lt(u16x8::splat(0x0590));
|
||||
|
||||
non_aarch64_return_false_if_all!(below_hebrew);
|
||||
|
||||
if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Quick refutation failed. Let's do the full check.
|
||||
|
||||
(in_range16x8!(s, 0x0590, 0x0900)
|
||||
| in_range16x8!(s, 0xFB1D, 0xFE00)
|
||||
| in_range16x8!(s, 0xFE70, 0xFEFF)
|
||||
| in_range16x8!(s, 0xD802, 0xD804)
|
||||
| in_range16x8!(s, 0xD83A, 0xD83C)
|
||||
| s.eq(u16x8::splat(0x200F))
|
||||
| s.eq(u16x8::splat(0x202B))
|
||||
| s.eq(u16x8::splat(0x202E))
|
||||
| s.eq(u16x8::splat(0x2067)))
|
||||
.any()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
|
||||
unsafe {
|
||||
let first: u8x16 = shuffle!(
|
||||
s,
|
||||
u8x16::splat(0),
|
||||
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
|
||||
);
|
||||
let second: u8x16 = shuffle!(
|
||||
s,
|
||||
u8x16::splat(0),
|
||||
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
|
||||
);
|
||||
(u16x8::from_bits(first), u16x8::from_bits(second))
|
||||
}
|
||||
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Narrows the sixteen 16-bit lanes of `a` and `b` into one `u8x16`
        /// using `_mm_packus_epi16`.
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let packed = _mm_packus_epi16(__m128i::from_bits(a), __m128i::from_bits(b));
                u8x16::from_bits(packed)
            }
        }
    } else {
        /// Narrows the sixteen 16-bit lanes of `a` and `b` into one `u8x16`
        /// by keeping every even-indexed byte of the two inputs.
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let a_bytes = u8x16::from_bits(a);
                let b_bytes = u8x16::from_bits(b);
                shuffle!(
                    a_bytes,
                    b_bytes,
                    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
                )
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use alloc::vec::Vec;
|
||||
|
||||
/// Unpacks 16 ASCII bytes into two `u16x8` halves and checks that storing
/// both halves back to back yields the same values as 16-bit units.
#[test]
fn test_unpack() {
    let ascii: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
        0x75, 0x76,
    ];
    let basic_latin: [u16; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
        0x75, 0x76,
    ];

    let packed = unsafe { load16_unaligned(ascii.as_ptr()) };
    let (low, high) = simd_unpack(packed);

    let mut unpacked = Vec::with_capacity(16);
    unpacked.resize(16, 0u16);
    let dst = unpacked.as_mut_ptr();
    unsafe {
        store8_unaligned(dst, low);
        store8_unaligned(dst.add(8), high);
    }

    assert_eq!(&unpacked[..], &basic_latin[..]);
}
|
||||
|
||||
#[test]
|
||||
fn test_simd_is_basic_latin_success() {
|
||||
let ascii: [u8; 16] = [
|
||||
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let basic_latin: [u16; 16] = [
|
||||
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) };
|
||||
let mut vec = Vec::with_capacity(16);
|
||||
vec.resize(16, 0u8);
|
||||
let ptr = vec.as_mut_ptr();
|
||||
assert!(simd_is_basic_latin(first | second));
|
||||
unsafe {
|
||||
store16_unaligned(ptr, simd_pack(first, second));
|
||||
}
|
||||
assert_eq!(&vec[..], &ascii[..]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simd_is_basic_latin_c0() {
|
||||
let input: [u16; 16] = [
|
||||
0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
|
||||
assert!(!simd_is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simd_is_basic_latin_0fff() {
|
||||
let input: [u16; 16] = [
|
||||
0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
|
||||
assert!(!simd_is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simd_is_basic_latin_ffff() {
|
||||
let input: [u16; 16] = [
|
||||
0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let first = unsafe { load8_unaligned(input.as_ptr()) };
|
||||
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
|
||||
assert!(!simd_is_basic_latin(first | second));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simd_is_ascii_success() {
|
||||
let ascii: [u8; 16] = [
|
||||
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
|
||||
assert!(simd_is_ascii(simd));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simd_is_ascii_failure() {
|
||||
let input: [u8; 16] = [
|
||||
0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let simd = unsafe { load16_unaligned(input.as_ptr()) };
|
||||
assert!(!simd_is_ascii(simd));
|
||||
}
|
||||
|
||||
#[cfg(target_feature = "sse2")]
|
||||
#[test]
|
||||
fn test_check_ascii() {
|
||||
let input: [u8; 16] = [
|
||||
0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let simd = unsafe { load16_unaligned(input.as_ptr()) };
|
||||
let mask = mask_ascii(simd);
|
||||
assert_ne!(mask, 0);
|
||||
assert_eq!(mask.trailing_zeros(), 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_alu() {
|
||||
let input: [u8; 16] = [
|
||||
0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
|
||||
0x75, 0x76,
|
||||
];
|
||||
let mut alu = 0u64;
|
||||
unsafe {
|
||||
::core::ptr::copy_nonoverlapping(input.as_ptr(), &mut alu as *mut u64 as *mut u8, 8);
|
||||
}
|
||||
let masked = alu & 0x8080808080808080;
|
||||
assert_eq!(masked.trailing_zeros(), 39);
|
||||
}
|
||||
}
|
||||
714
zeroidc/vendor/encoding_rs/src/single_byte.rs
vendored
Normal file
714
zeroidc/vendor/encoding_rs/src/single_byte.rs
vendored
Normal file
@@ -0,0 +1,714 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::ascii::*;
|
||||
use crate::data::position;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
|
||||
/// Decoder state for a table-driven single-byte encoding.
pub struct SingleByteDecoder {
    /// Lookup table for the bytes 0x80..=0xFF, indexed by `byte - 0x80`.
    /// Each entry is a UTF-16 code unit; zero marks an unmapped byte.
    table: &'static [u16; 128],
}
|
||||
|
||||
impl SingleByteDecoder {
|
||||
pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
|
||||
VariantDecoder::SingleByte(SingleByteDecoder { table: data })
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let mut source = ByteSource::new(src);
|
||||
let mut dest = Utf8Destination::new(dst);
|
||||
'outermost: loop {
|
||||
match dest.copy_ascii_from_check_space_bmp(&mut source) {
|
||||
CopyAsciiResult::Stop(ret) => return ret,
|
||||
CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
|
||||
// Start non-boilerplate
|
||||
//
|
||||
// Since the non-ASCIIness of `non_ascii` is hidden from
|
||||
// the optimizer, it can't figure out that it's OK to
|
||||
// statically omit the bound check when accessing
|
||||
// `[u16; 128]` with an index
|
||||
// `non_ascii as usize - 0x80usize`.
|
||||
let mapped =
|
||||
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
// let mapped = self.table[non_ascii as usize - 0x80usize];
|
||||
if mapped == 0u16 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let dest_again = handle.write_bmp_excl_ascii(mapped);
|
||||
// End non-boilerplate
|
||||
match source.check_available() {
|
||||
Space::Full(src_consumed) => {
|
||||
return (
|
||||
DecoderResult::InputEmpty,
|
||||
src_consumed,
|
||||
dest_again.written(),
|
||||
);
|
||||
}
|
||||
Space::Available(source_handle) => {
|
||||
match dest_again.check_space_bmp() {
|
||||
Space::Full(dst_written) => {
|
||||
return (
|
||||
DecoderResult::OutputFull,
|
||||
source_handle.consumed(),
|
||||
dst_written,
|
||||
);
|
||||
}
|
||||
Space::Available(mut destination_handle) => {
|
||||
let (mut b, unread_handle) = source_handle.read();
|
||||
let source_again = unread_handle.commit();
|
||||
'innermost: loop {
|
||||
if b > 127 {
|
||||
non_ascii = b;
|
||||
handle = destination_handle;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
let dest_again_again = destination_handle.write_ascii(b);
|
||||
if b < 60 {
|
||||
// We've got punctuation
|
||||
match source_again.check_available() {
|
||||
Space::Full(src_consumed_again) => {
|
||||
return (
|
||||
DecoderResult::InputEmpty,
|
||||
src_consumed_again,
|
||||
dest_again_again.written(),
|
||||
);
|
||||
}
|
||||
Space::Available(source_handle_again) => {
|
||||
match dest_again_again.check_space_bmp() {
|
||||
Space::Full(dst_written_again) => {
|
||||
return (
|
||||
DecoderResult::OutputFull,
|
||||
source_handle_again.consumed(),
|
||||
dst_written_again,
|
||||
);
|
||||
}
|
||||
Space::Available(
|
||||
destination_handle_again,
|
||||
) => {
|
||||
let (b_again, _unread_handle_again) =
|
||||
source_handle_again.read();
|
||||
b = b_again;
|
||||
destination_handle =
|
||||
destination_handle_again;
|
||||
continue 'innermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let mut converted = 0usize;
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
ascii_to_basic_latin(
|
||||
src.as_ptr().add(converted),
|
||||
dst.as_mut_ptr().add(converted),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
None => {
|
||||
return (pending, length, length);
|
||||
}
|
||||
Some((mut non_ascii, consumed)) => {
|
||||
converted += consumed;
|
||||
'middle: loop {
|
||||
// `converted` doesn't count the reading of `non_ascii` yet.
|
||||
// Since the non-ASCIIness of `non_ascii` is hidden from
|
||||
// the optimizer, it can't figure out that it's OK to
|
||||
// statically omit the bound check when accessing
|
||||
// `[u16; 128]` with an index
|
||||
// `non_ascii as usize - 0x80usize`.
|
||||
let mapped =
|
||||
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
// let mapped = self.table[non_ascii as usize - 0x80usize];
|
||||
if mapped == 0u16 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
unsafe {
|
||||
// The bound check has already been performed
|
||||
*(dst.get_unchecked_mut(converted)) = mapped;
|
||||
}
|
||||
converted += 1;
|
||||
// Next, handle ASCII punctuation and non-ASCII without
|
||||
// going back to ASCII acceleration. Non-ASCII scripts
|
||||
// use ASCII punctuation, so this avoid going to
|
||||
// acceleration just for punctuation/space and then
|
||||
// failing. This is a significant boost to non-ASCII
|
||||
// scripts.
|
||||
// TODO: Split out Latin converters without this part
|
||||
// this stuff makes Latin script-conversion slower.
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
let mut b = unsafe { *(src.get_unchecked(converted)) };
|
||||
'innermost: loop {
|
||||
if b > 127 {
|
||||
non_ascii = b;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = u16::from(b);
|
||||
}
|
||||
converted += 1;
|
||||
if b < 60 {
|
||||
// We've got punctuation
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
b = unsafe { *(src.get_unchecked(converted)) };
|
||||
continue 'innermost;
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
|
||||
let mut bytes = buffer;
|
||||
let mut total = 0;
|
||||
loop {
|
||||
if let Some((non_ascii, offset)) = validate_ascii(bytes) {
|
||||
total += offset;
|
||||
let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
if mapped != u16::from(non_ascii) {
|
||||
return total;
|
||||
}
|
||||
total += 1;
|
||||
bytes = &bytes[offset + 1..];
|
||||
} else {
|
||||
return total;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Encoder state for a table-driven single-byte encoding.
///
/// Besides the lookup table, it records one "run": a stretch of consecutive
/// table entries holding consecutive code units, which can be encoded by
/// arithmetic instead of a search.
pub struct SingleByteEncoder {
    /// Code units for the bytes 0x80..=0xFF, indexed by `byte - 0x80`.
    table: &'static [u16; 128],
    /// Code unit at which the run starts.
    run_bmp_offset: usize,
    /// Index into `table` (that is, `byte - 0x80`) at which the run starts.
    run_byte_offset: usize,
    /// Number of entries in the run.
    run_length: usize,
}
|
||||
|
||||
impl SingleByteEncoder {
|
||||
pub fn new(
|
||||
encoding: &'static Encoding,
|
||||
data: &'static [u16; 128],
|
||||
run_bmp_offset: u16,
|
||||
run_byte_offset: u8,
|
||||
run_length: u8,
|
||||
) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::SingleByte(SingleByteEncoder {
|
||||
table: data,
|
||||
run_bmp_offset: run_bmp_offset as usize,
|
||||
run_byte_offset: run_byte_offset as usize,
|
||||
run_length: run_length as usize,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
Some(u16_length)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
|
||||
// First, we see if the code unit falls into a run of consecutive
|
||||
// code units that can be mapped by offset. This is very efficient
|
||||
// for most non-Latin encodings as well as Latin1-ish encodings.
|
||||
//
|
||||
// For encodings that don't fit this pattern, the run (which may
|
||||
// have the length of just one) just establishes the starting point
|
||||
// for the next rule.
|
||||
//
|
||||
// Next, we do a forward linear search in the part of the index
|
||||
// after the run. Even in non-Latin1-ish Latin encodings (except
|
||||
// macintosh), the lower case letters are here.
|
||||
//
|
||||
// Next, we search the third quadrant up to the start of the run
|
||||
// (upper case letters in Latin encodings except macintosh, in
|
||||
// Greek and in KOI encodings) and then the second quadrant,
|
||||
// except if the run stared before the third quadrant, we search
|
||||
// the second quadrant up to the run.
|
||||
//
|
||||
// Last, we search the first quadrant, which has unused controls
|
||||
// or punctuation in most encodings. This is bad for macintosh
|
||||
// and IBM866, but those are rare.
|
||||
|
||||
// Run of consecutive units
|
||||
let unit_as_usize = code_unit as usize;
|
||||
let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
|
||||
if offset < self.run_length {
|
||||
return Some((128 + self.run_byte_offset + offset) as u8);
|
||||
}
|
||||
|
||||
// Search after the run
|
||||
let tail_start = self.run_byte_offset + self.run_length;
|
||||
if let Some(pos) = position(&self.table[tail_start..], code_unit) {
|
||||
return Some((128 + tail_start + pos) as u8);
|
||||
}
|
||||
|
||||
if self.run_byte_offset >= 64 {
|
||||
// Search third quadrant before the run
|
||||
if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
|
||||
return Some(((128 + 64) + pos) as u8);
|
||||
}
|
||||
|
||||
// Search second quadrant
|
||||
if let Some(pos) = position(&self.table[32..64], code_unit) {
|
||||
return Some(((128 + 32) + pos) as u8);
|
||||
}
|
||||
} else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
|
||||
// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
|
||||
// Search second quadrant before the run
|
||||
return Some(((128 + 32) + pos) as u8);
|
||||
}
|
||||
|
||||
// Search first quadrant
|
||||
if let Some(pos) = position(&self.table[..32], code_unit) {
|
||||
return Some((128 + pos) as u8);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
// Generates `encode_from_utf8_raw` from the shared ASCII-compatible BMP
// encoder template (the macro is defined elsewhere in the crate). The block
// below is the per-code-unit step: encode `bmp` with `encode_u16` and write
// the byte, or stop and report the code unit as unmappable.
ascii_compatible_bmp_encoder_function!(
    {
        match self.encode_u16(bmp) {
            Some(byte) => handle.write_one(byte),
            None => {
                return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                );
            }
        }
    },
    bmp,
    self,
    source,
    handle,
    copy_ascii_to_check_space_one,
    check_space_one,
    encode_from_utf8_raw,
    str,
    Utf8Source,
    true
);
|
||||
|
||||
pub fn encode_from_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u16],
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(EncoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(EncoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let mut converted = 0usize;
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
basic_latin_to_ascii(
|
||||
src.as_ptr().add(converted),
|
||||
dst.as_mut_ptr().add(converted),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
None => {
|
||||
return (pending, length, length);
|
||||
}
|
||||
Some((mut non_ascii, consumed)) => {
|
||||
converted += consumed;
|
||||
'middle: loop {
|
||||
// `converted` doesn't count the reading of `non_ascii` yet.
|
||||
match self.encode_u16(non_ascii) {
|
||||
Some(byte) => {
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = byte;
|
||||
}
|
||||
converted += 1;
|
||||
}
|
||||
None => {
|
||||
// At this point, we need to know if we
|
||||
// have a surrogate.
|
||||
let high_bits = non_ascii & 0xFC00u16;
|
||||
if high_bits == 0xD800u16 {
|
||||
// high surrogate
|
||||
if converted + 1 == length {
|
||||
// End of buffer. This surrogate is unpaired.
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
let second =
|
||||
u32::from(unsafe { *src.get_unchecked(converted + 1) });
|
||||
if second & 0xFC00u32 != 0xDC00u32 {
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
// The next code unit is a low surrogate.
|
||||
let astral: char = unsafe {
|
||||
::core::char::from_u32_unchecked(
|
||||
(u32::from(non_ascii) << 10) + second
|
||||
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
|
||||
)
|
||||
};
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
converted + 2, // +2 `for non_ascii` and `second`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
if high_bits == 0xDC00u16 {
|
||||
// Unpaired low surrogate
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(non_ascii),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
}
|
||||
// Next, handle ASCII punctuation and non-ASCII without
|
||||
// going back to ASCII acceleration. Non-ASCII scripts
|
||||
// use ASCII punctuation, so this avoid going to
|
||||
// acceleration just for punctuation/space and then
|
||||
// failing. This is a significant boost to non-ASCII
|
||||
// scripts.
|
||||
// TODO: Split out Latin converters without this part
|
||||
// this stuff makes Latin script-conversion slower.
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
let mut unit = unsafe { *(src.get_unchecked(converted)) };
|
||||
'innermost: loop {
|
||||
if unit > 127 {
|
||||
non_ascii = unit;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = unit as u8;
|
||||
}
|
||||
converted += 1;
|
||||
if unit < 60 {
|
||||
// We've got punctuation
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
unit = unsafe { *(src.get_unchecked(converted)) };
|
||||
continue 'innermost;
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    #[test]
    fn test_windows_1255_ca() {
        decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
        encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
    }

    #[test]
    fn test_ascii_punctuation() {
        let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
        let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
        decode(WINDOWS_1253, bytes, characters);
        encode(WINDOWS_1253, characters, bytes);
    }

    #[test]
    fn test_decode_malformed() {
        decode(
            WINDOWS_1253,
            b"\xC1\xF5\xD2\xF4\xFC",
            "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
        );
    }

    // In the expectations below, unmappable characters appear as decimal
    // numeric character references. They must be spelled out in ASCII: a
    // byte-string literal cannot contain a raw non-ASCII character.

    #[test]
    fn test_encode_unmappables() {
        encode(
            WINDOWS_1253,
            "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
            b"\xC1\xF5&#9731;\xF4\xFC",
        );
        encode(
            WINDOWS_1253,
            "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
            b"\xC1\xF5&#128169;\xF4\xFC",
        );
    }

    #[test]
    fn test_encode_unpaired_surrogates() {
        // Unpaired low surrogate in the middle.
        encode_from_utf16(
            WINDOWS_1253,
            &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
            b"\xC1\xF5&#65533;\xF4\xFC",
        );
        // Unpaired high surrogate in the middle.
        encode_from_utf16(
            WINDOWS_1253,
            &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
            b"\xC1\xF5&#65533;\xF4\xFC",
        );
        // Unpaired high surrogate at the end of the input.
        encode_from_utf16(
            WINDOWS_1253,
            &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
            b"\xC1\xF5\xF4\xFC&#65533;",
        );
    }

    /// Every byte from 0x80 through 0xFF in ascending order.
    pub const HIGH_BYTES: &[u8; 128] = &[
        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
        0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
        0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
        0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
        0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
        0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
        0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
        0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    ];

    /// Decodes all 128 high bytes with `encoding` and expects the table
    /// `data`, with U+FFFD substituted for every zero (unmapped) entry.
    fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
        let mut with_replacement = [0u16; 128];
        for (slot, &code_point) in with_replacement.iter_mut().zip(data.iter()) {
            *slot = if code_point == 0 { 0xFFFD } else { code_point };
        }

        decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
    }

    /// Encodes the table `data` with `encoding` and expects each mapped
    /// entry to produce its high byte and each zero entry to produce 0.
    fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
        let mut with_zeros = [0u8; 128];
        for (i, &code_point) in data.iter().enumerate() {
            with_zeros[i] = if code_point == 0 { 0 } else { HIGH_BYTES[i] };
        }

        encode_from_utf16(encoding, data, &with_zeros[..]);
    }

    #[test]
    fn test_single_byte_from_two_low_surrogates() {
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = WINDOWS_1253.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }

    // These tests are so self-referential that they are pretty useless.

    // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
    // Instead, please regenerate using generate-encoding-data.py

    #[test]
    fn test_single_byte_decode() {
        decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow
            return;
        }
        decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }

    #[test]
    fn test_single_byte_encode() {
        encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow
            return;
        }
        encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
    // END GENERATED CODE
}
|
||||
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in.txt
vendored
Normal file
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in_ref.txt
vendored
Normal file
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out.txt
vendored
Normal file
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out_ref.txt
vendored
Normal file
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in_ref.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out.txt
vendored
Normal file
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out_ref.txt
vendored
Normal file
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in_ref.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out.txt
vendored
Normal file
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out_ref.txt
vendored
Normal file
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in_ref.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out.txt
vendored
Normal file
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out_ref.txt
vendored
Normal file
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in_ref.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out.txt
vendored
Normal file
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out_ref.txt
vendored
Normal file
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in_ref.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in.txt
vendored
Normal file
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in_ref.txt
vendored
Normal file
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out.txt
vendored
Normal file
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out_ref.txt
vendored
Normal file
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
242
zeroidc/vendor/encoding_rs/src/test_labels_names.rs
vendored
Normal file
242
zeroidc/vendor/encoding_rs/src/test_labels_names.rs
vendored
Normal file
@@ -0,0 +1,242 @@
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
fn test_all_labels() {
    // Every (label, expected encoding) pair, in the order the generator
    // emitted them. `Encoding::for_label` must resolve each label to the
    // listed encoding.
    let cases: &[(&[u8], &'static Encoding)] = &[
        (b"l1", WINDOWS_1252),
        (b"l2", ISO_8859_2),
        (b"l3", ISO_8859_3),
        (b"l4", ISO_8859_4),
        (b"l5", WINDOWS_1254),
        (b"l6", ISO_8859_10),
        (b"l9", ISO_8859_15),
        (b"866", IBM866),
        (b"mac", MACINTOSH),
        (b"koi", KOI8_R),
        (b"gbk", GBK),
        (b"big5", BIG5),
        (b"utf8", UTF_8),
        (b"koi8", KOI8_R),
        (b"sjis", SHIFT_JIS),
        (b"ucs-2", UTF_16LE),
        (b"ms932", SHIFT_JIS),
        (b"cp866", IBM866),
        (b"utf-8", UTF_8),
        (b"cp819", WINDOWS_1252),
        (b"ascii", WINDOWS_1252),
        (b"x-gbk", GBK),
        (b"greek", ISO_8859_7),
        (b"cp1250", WINDOWS_1250),
        (b"cp1251", WINDOWS_1251),
        (b"latin1", WINDOWS_1252),
        (b"gb2312", GBK),
        (b"cp1252", WINDOWS_1252),
        (b"latin2", ISO_8859_2),
        (b"cp1253", WINDOWS_1253),
        (b"latin3", ISO_8859_3),
        (b"cp1254", WINDOWS_1254),
        (b"latin4", ISO_8859_4),
        (b"cp1255", WINDOWS_1255),
        (b"csbig5", BIG5),
        (b"latin5", WINDOWS_1254),
        (b"utf-16", UTF_16LE),
        (b"cp1256", WINDOWS_1256),
        (b"ibm866", IBM866),
        (b"latin6", ISO_8859_10),
        (b"cp1257", WINDOWS_1257),
        (b"cp1258", WINDOWS_1258),
        (b"greek8", ISO_8859_7),
        (b"ibm819", WINDOWS_1252),
        (b"arabic", ISO_8859_6),
        (b"visual", ISO_8859_8),
        (b"korean", EUC_KR),
        (b"euc-jp", EUC_JP),
        (b"koi8-r", KOI8_R),
        (b"koi8_r", KOI8_R),
        (b"euc-kr", EUC_KR),
        (b"x-sjis", SHIFT_JIS),
        (b"koi8-u", KOI8_U),
        (b"hebrew", ISO_8859_8),
        (b"tis-620", WINDOWS_874),
        (b"gb18030", GB18030),
        (b"ksc5601", EUC_KR),
        (b"gb_2312", GBK),
        (b"dos-874", WINDOWS_874),
        (b"cn-big5", BIG5),
        (b"unicode", UTF_16LE),
        (b"chinese", GBK),
        (b"logical", ISO_8859_8_I),
        (b"cskoi8r", KOI8_R),
        (b"cseuckr", EUC_KR),
        (b"koi8-ru", KOI8_U),
        (b"x-cp1250", WINDOWS_1250),
        (b"ksc_5601", EUC_KR),
        (b"x-cp1251", WINDOWS_1251),
        (b"iso88591", WINDOWS_1252),
        (b"csgb2312", GBK),
        (b"x-cp1252", WINDOWS_1252),
        (b"iso88592", ISO_8859_2),
        (b"x-cp1253", WINDOWS_1253),
        (b"iso88593", ISO_8859_3),
        (b"ecma-114", ISO_8859_6),
        (b"x-cp1254", WINDOWS_1254),
        (b"iso88594", ISO_8859_4),
        (b"x-cp1255", WINDOWS_1255),
        (b"iso88595", ISO_8859_5),
        (b"x-x-big5", BIG5),
        (b"x-cp1256", WINDOWS_1256),
        (b"csibm866", IBM866),
        (b"iso88596", ISO_8859_6),
        (b"x-cp1257", WINDOWS_1257),
        (b"iso88597", ISO_8859_7),
        (b"asmo-708", ISO_8859_6),
        (b"ecma-118", ISO_8859_7),
        (b"elot_928", ISO_8859_7),
        (b"x-cp1258", WINDOWS_1258),
        (b"iso88598", ISO_8859_8),
        (b"iso88599", WINDOWS_1254),
        (b"cyrillic", ISO_8859_5),
        (b"utf-16be", UTF_16BE),
        (b"utf-16le", UTF_16LE),
        (b"us-ascii", WINDOWS_1252),
        (b"ms_kanji", SHIFT_JIS),
        (b"x-euc-jp", EUC_JP),
        (b"iso885910", ISO_8859_10),
        (b"iso8859-1", WINDOWS_1252),
        (b"iso885911", WINDOWS_874),
        (b"iso8859-2", ISO_8859_2),
        (b"iso8859-3", ISO_8859_3),
        (b"iso885913", ISO_8859_13),
        (b"iso8859-4", ISO_8859_4),
        (b"iso885914", ISO_8859_14),
        (b"iso8859-5", ISO_8859_5),
        (b"iso885915", ISO_8859_15),
        (b"iso8859-6", ISO_8859_6),
        (b"iso8859-7", ISO_8859_7),
        (b"iso8859-8", ISO_8859_8),
        (b"iso-ir-58", GBK),
        (b"iso8859-9", WINDOWS_1254),
        (b"csunicode", UTF_16LE),
        (b"macintosh", MACINTOSH),
        (b"shift-jis", SHIFT_JIS),
        (b"shift_jis", SHIFT_JIS),
        (b"iso-ir-100", WINDOWS_1252),
        (b"iso8859-10", ISO_8859_10),
        (b"iso-ir-110", ISO_8859_4),
        (b"gb_2312-80", GBK),
        (b"iso-8859-1", WINDOWS_1252),
        (b"iso_8859-1", WINDOWS_1252),
        (b"iso-ir-101", ISO_8859_2),
        (b"iso8859-11", WINDOWS_874),
        (b"iso-8859-2", ISO_8859_2),
        (b"iso_8859-2", ISO_8859_2),
        (b"hz-gb-2312", REPLACEMENT),
        (b"iso-8859-3", ISO_8859_3),
        (b"iso_8859-3", ISO_8859_3),
        (b"iso8859-13", ISO_8859_13),
        (b"iso-8859-4", ISO_8859_4),
        (b"iso_8859-4", ISO_8859_4),
        (b"iso8859-14", ISO_8859_14),
        (b"iso-ir-144", ISO_8859_5),
        (b"iso-8859-5", ISO_8859_5),
        (b"iso_8859-5", ISO_8859_5),
        (b"iso8859-15", ISO_8859_15),
        (b"iso-8859-6", ISO_8859_6),
        (b"iso_8859-6", ISO_8859_6),
        (b"iso-ir-126", ISO_8859_7),
        (b"iso-8859-7", ISO_8859_7),
        (b"iso_8859-7", ISO_8859_7),
        (b"iso-ir-127", ISO_8859_6),
        (b"iso-ir-157", ISO_8859_10),
        (b"iso-8859-8", ISO_8859_8),
        (b"iso_8859-8", ISO_8859_8),
        (b"iso-ir-138", ISO_8859_8),
        (b"iso-ir-148", WINDOWS_1254),
        (b"iso-8859-9", WINDOWS_1254),
        (b"iso_8859-9", WINDOWS_1254),
        (b"iso-ir-109", ISO_8859_3),
        (b"iso-ir-149", EUC_KR),
        (b"big5-hkscs", BIG5),
        (b"csshiftjis", SHIFT_JIS),
        (b"iso-8859-10", ISO_8859_10),
        (b"iso-8859-11", WINDOWS_874),
        (b"csisolatin1", WINDOWS_1252),
        (b"csisolatin2", ISO_8859_2),
        (b"iso-8859-13", ISO_8859_13),
        (b"csisolatin3", ISO_8859_3),
        (b"iso-8859-14", ISO_8859_14),
        (b"windows-874", WINDOWS_874),
        (b"csisolatin4", ISO_8859_4),
        (b"iso-8859-15", ISO_8859_15),
        (b"iso_8859-15", ISO_8859_15),
        (b"csisolatin5", WINDOWS_1254),
        (b"iso-8859-16", ISO_8859_16),
        (b"csisolatin6", ISO_8859_10),
        (b"windows-949", EUC_KR),
        (b"csisolatin9", ISO_8859_15),
        (b"csiso88596e", ISO_8859_6),
        (b"csiso88598e", ISO_8859_8),
        (b"unicodefffe", UTF_16BE),
        (b"unicodefeff", UTF_16LE),
        (b"csmacintosh", MACINTOSH),
        (b"csiso88596i", ISO_8859_6),
        (b"csiso88598i", ISO_8859_8_I),
        (b"windows-31j", SHIFT_JIS),
        (b"x-mac-roman", MACINTOSH),
        (b"iso-2022-cn", REPLACEMENT),
        (b"iso-2022-jp", ISO_2022_JP),
        (b"csiso2022jp", ISO_2022_JP),
        (b"iso-2022-kr", REPLACEMENT),
        (b"csiso2022kr", REPLACEMENT),
        (b"replacement", REPLACEMENT),
        (b"windows-1250", WINDOWS_1250),
        (b"windows-1251", WINDOWS_1251),
        (b"windows-1252", WINDOWS_1252),
        (b"windows-1253", WINDOWS_1253),
        (b"windows-1254", WINDOWS_1254),
        (b"windows-1255", WINDOWS_1255),
        (b"windows-1256", WINDOWS_1256),
        (b"windows-1257", WINDOWS_1257),
        (b"windows-1258", WINDOWS_1258),
        (b"iso-8859-6-e", ISO_8859_6),
        (b"iso-8859-8-e", ISO_8859_8),
        (b"iso-8859-6-i", ISO_8859_6),
        (b"iso-8859-8-i", ISO_8859_8_I),
        (b"sun_eu_greek", ISO_8859_7),
        (b"csksc56011987", EUC_KR),
        (b"unicode20utf8", UTF_8),
        (b"unicode11utf8", UTF_8),
        (b"ks_c_5601-1987", EUC_KR),
        (b"ansi_x3.4-1968", WINDOWS_1252),
        (b"ks_c_5601-1989", EUC_KR),
        (b"x-mac-cyrillic", X_MAC_CYRILLIC),
        (b"x-user-defined", X_USER_DEFINED),
        (b"csiso58gb231280", GBK),
        (b"iso-10646-ucs-2", UTF_16LE),
        (b"iso_8859-1:1987", WINDOWS_1252),
        (b"iso_8859-2:1987", ISO_8859_2),
        (b"iso_8859-6:1987", ISO_8859_6),
        (b"iso_8859-7:1987", ISO_8859_7),
        (b"iso_8859-3:1988", ISO_8859_3),
        (b"iso_8859-4:1988", ISO_8859_4),
        (b"iso_8859-5:1988", ISO_8859_5),
        (b"iso_8859-8:1988", ISO_8859_8),
        (b"x-unicode20utf8", UTF_8),
        (b"iso_8859-9:1989", WINDOWS_1254),
        (b"csisolatingreek", ISO_8859_7),
        (b"x-mac-ukrainian", X_MAC_CYRILLIC),
        (b"iso-2022-cn-ext", REPLACEMENT),
        (b"csisolatinarabic", ISO_8859_6),
        (b"csisolatinhebrew", ISO_8859_8),
        (b"unicode-1-1-utf-8", UTF_8),
        (b"csisolatincyrillic", ISO_8859_5),
        (b"cseucpkdfmtjapanese", EUC_JP),
    ];

    // Walk the table in order so the first mismatch is the one reported,
    // exactly as with one assertion per label.
    for &(label, expected) in cases.iter() {
        assert_eq!(Encoding::for_label(label), Some(expected));
    }
}
|
||||
262
zeroidc/vendor/encoding_rs/src/testing.rs
vendored
Normal file
262
zeroidc/vendor/encoding_rs/src/testing.rs
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
|
||||
pub fn decode(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let mut vec = Vec::with_capacity(bytes.len() + 32);
|
||||
let mut string = String::with_capacity(expect.len() + 32);
|
||||
let range = if cfg!(miri) {
|
||||
0usize..4usize
|
||||
} else {
|
||||
0usize..32usize
|
||||
};
|
||||
for i in range {
|
||||
vec.clear();
|
||||
string.clear();
|
||||
for j in 0usize..i {
|
||||
let c = 0x40u8 + (j as u8);
|
||||
vec.push(c);
|
||||
string.push(c as char);
|
||||
}
|
||||
vec.extend_from_slice(bytes);
|
||||
string.push_str(expect);
|
||||
decode_without_padding_impl(encoding, &vec[..], &string[..], i);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_without_padding(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
decode_without_padding_impl(encoding, bytes, expect, 0);
|
||||
}
|
||||
|
||||
fn decode_without_padding_impl(
|
||||
encoding: &'static Encoding,
|
||||
bytes: &[u8],
|
||||
expect: &str,
|
||||
padding: usize,
|
||||
) {
|
||||
decode_to_utf8_impl(encoding, bytes, expect, padding);
|
||||
decode_to_utf16_impl(encoding, bytes, &utf16_from_utf8(expect)[..], padding);
|
||||
decode_to_string(encoding, bytes, expect);
|
||||
}
|
||||
|
||||
pub fn encode(encoding: &'static Encoding, str: &str, expect: &[u8]) {
|
||||
let mut vec = Vec::with_capacity(expect.len() + 32);
|
||||
let mut string = String::with_capacity(str.len() + 32);
|
||||
let range = if cfg!(miri) {
|
||||
0usize..4usize
|
||||
} else {
|
||||
0usize..32usize
|
||||
};
|
||||
for i in range {
|
||||
vec.clear();
|
||||
string.clear();
|
||||
for j in 0usize..i {
|
||||
let c = 0x40u8 + (j as u8);
|
||||
vec.push(c);
|
||||
string.push(c as char);
|
||||
}
|
||||
vec.extend_from_slice(expect);
|
||||
string.push_str(str);
|
||||
encode_without_padding(encoding, &string[..], &vec[..]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_without_padding(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
encode_from_utf8(encoding, string, expect);
|
||||
encode_from_utf16(encoding, &utf16_from_utf8(string)[..], expect);
|
||||
encode_to_vec(encoding, string, expect);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16(encoding: &'static Encoding, bytes: &[u8], expect: &[u16]) {
|
||||
decode_to_utf16_impl(encoding, bytes, expect, 0);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_impl(
|
||||
encoding: &'static Encoding,
|
||||
bytes: &[u8],
|
||||
expect: &[u16],
|
||||
padding: usize,
|
||||
) {
|
||||
for i in padding..bytes.len() {
|
||||
let (head, tail) = bytes.split_at(i);
|
||||
decode_to_utf16_with_boundary(encoding, head, tail, expect);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_with_boundary(
|
||||
encoding: &'static Encoding,
|
||||
head: &[u8],
|
||||
tail: &[u8],
|
||||
expect: &[u16],
|
||||
) {
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let mut dest: Vec<u16> = Vec::with_capacity(
|
||||
decoder
|
||||
.max_utf16_buffer_length(head.len() + tail.len())
|
||||
.unwrap(),
|
||||
);
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u16);
|
||||
let mut total_read = 0;
|
||||
let mut total_written = 0;
|
||||
{
|
||||
let (complete, read, written, _) = decoder.decode_to_utf16(head, &mut dest, false);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
{
|
||||
let (complete, read, written, _) =
|
||||
decoder.decode_to_utf16(tail, &mut dest[total_written..], true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
assert_eq!(total_read, head.len() + tail.len());
|
||||
assert_eq!(total_written, expect.len());
|
||||
dest.truncate(total_written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
decode_to_utf8_impl(encoding, bytes, expect, 0);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_impl(
|
||||
encoding: &'static Encoding,
|
||||
bytes: &[u8],
|
||||
expect: &str,
|
||||
padding: usize,
|
||||
) {
|
||||
for i in padding..bytes.len() {
|
||||
let (head, tail) = bytes.split_at(i);
|
||||
decode_to_utf8_with_boundary(encoding, head, tail, expect);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_with_boundary(
|
||||
encoding: &'static Encoding,
|
||||
head: &[u8],
|
||||
tail: &[u8],
|
||||
expect: &str,
|
||||
) {
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(
|
||||
decoder
|
||||
.max_utf8_buffer_length(head.len() + tail.len())
|
||||
.unwrap(),
|
||||
);
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let mut total_read = 0;
|
||||
let mut total_written = 0;
|
||||
{
|
||||
let (complete, read, written, _) = decoder.decode_to_utf8(head, &mut dest, false);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
{
|
||||
let (complete, read, written, _) =
|
||||
decoder.decode_to_utf8(tail, &mut dest[total_written..], true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
assert_eq!(total_read, head.len() + tail.len());
|
||||
assert_eq!(total_written, expect.len());
|
||||
dest.truncate(total_written);
|
||||
assert_eq!(&dest[..], expect.as_bytes());
|
||||
}
|
||||
|
||||
pub fn decode_to_string(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let (cow, _, _) = encoding.decode(bytes);
|
||||
assert_eq!(&cow[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = encoder.encode_from_utf8(string, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, string.len());
|
||||
assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_from_utf16(encoding: &'static Encoding, string: &[u16], expect: &[u8]) {
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = encoder.encode_from_utf16(string, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, string.len());
|
||||
// assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_to_vec(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
let (cow, _, _) = encoding.encode(string);
|
||||
assert_eq!(&cow[..], expect);
|
||||
}
|
||||
|
||||
pub fn utf16_from_utf8(string: &str) -> Vec<u16> {
|
||||
let mut decoder = UTF_8.new_decoder_without_bom_handling();
|
||||
let mut vec = Vec::with_capacity(decoder.max_utf16_buffer_length(string.len()).unwrap());
|
||||
let capacity = vec.capacity();
|
||||
vec.resize(capacity, 0);
|
||||
|
||||
let (result, read, written) =
|
||||
decoder.decode_to_utf16_without_replacement(string.as_bytes(), &mut vec[..], true);
|
||||
match result {
|
||||
DecoderResult::InputEmpty => {
|
||||
debug_assert_eq!(read, string.len());
|
||||
vec.resize(written, 0);
|
||||
vec
|
||||
}
|
||||
DecoderResult::Malformed(_, _) => unreachable!("Malformed"),
|
||||
DecoderResult::OutputFull => unreachable!("Output full"),
|
||||
}
|
||||
}
|
||||
472
zeroidc/vendor/encoding_rs/src/utf_16.rs
vendored
Normal file
472
zeroidc/vendor/encoding_rs/src/utf_16.rs
vendored
Normal file
@@ -0,0 +1,472 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
|
||||
/// State for decoding a UTF-16 byte stream (either byte order) one byte at a
/// time, carrying partial code units and unpaired surrogates between calls.
pub struct Utf16Decoder {
    lead_surrogate: u16, // If non-zero and pending_bmp == false, a pending lead surrogate
    // First byte of a two-byte code unit whose second byte has not been seen yet.
    lead_byte: Option<u8>,
    // true: byte pairs are combined big-endian; false: little-endian.
    be: bool,
    pending_bmp: bool, // if true, lead_surrogate is actually pending BMP
}
|
||||
|
||||
impl Utf16Decoder {
|
||||
pub fn new(big_endian: bool) -> VariantDecoder {
|
||||
VariantDecoder::Utf16(Utf16Decoder {
|
||||
lead_surrogate: 0,
|
||||
lead_byte: None,
|
||||
be: big_endian,
|
||||
pending_bmp: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn additional_from_state(&self) -> usize {
|
||||
1 + if self.lead_byte.is_some() { 1 } else { 0 }
|
||||
+ if self.lead_surrogate == 0 { 0 } else { 2 }
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_mul(
|
||||
3,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_mul(
|
||||
3,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
decoder_functions!(
|
||||
{
|
||||
if self.pending_bmp {
|
||||
match dest.check_space_bmp() {
|
||||
Space::Full(_) => {
|
||||
return (DecoderResult::OutputFull, 0, 0);
|
||||
}
|
||||
Space::Available(destination_handle) => {
|
||||
destination_handle.write_bmp(self.lead_surrogate);
|
||||
self.pending_bmp = false;
|
||||
self.lead_surrogate = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// This is the fast path. The rest runs only at the
|
||||
// start and end for partial sequences.
|
||||
if self.lead_byte.is_none() && self.lead_surrogate == 0 {
|
||||
if let Some((read, written)) = if self.be {
|
||||
dest.copy_utf16_from::<BigEndian>(&mut source)
|
||||
} else {
|
||||
dest.copy_utf16_from::<LittleEndian>(&mut source)
|
||||
} {
|
||||
return (DecoderResult::Malformed(2, 0), read, written);
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
debug_assert!(!self.pending_bmp);
|
||||
if self.lead_surrogate != 0 || self.lead_byte.is_some() {
|
||||
// We need to check space without intent to write in order to
|
||||
// make sure that there is space for the replacement character.
|
||||
match dest.check_space_bmp() {
|
||||
Space::Full(_) => {
|
||||
return (DecoderResult::OutputFull, 0, 0);
|
||||
}
|
||||
Space::Available(_) => {
|
||||
if self.lead_surrogate != 0 {
|
||||
self.lead_surrogate = 0;
|
||||
match self.lead_byte {
|
||||
None => {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
src_consumed,
|
||||
dest.written(),
|
||||
);
|
||||
}
|
||||
Some(_) => {
|
||||
self.lead_byte = None;
|
||||
return (
|
||||
DecoderResult::Malformed(3, 0),
|
||||
src_consumed,
|
||||
dest.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
debug_assert!(self.lead_byte.is_some());
|
||||
self.lead_byte = None;
|
||||
return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
match self.lead_byte {
|
||||
None => {
|
||||
self.lead_byte = Some(b);
|
||||
continue;
|
||||
}
|
||||
Some(lead) => {
|
||||
self.lead_byte = None;
|
||||
let code_unit = if self.be {
|
||||
u16::from(lead) << 8 | u16::from(b)
|
||||
} else {
|
||||
u16::from(b) << 8 | u16::from(lead)
|
||||
};
|
||||
let high_bits = code_unit & 0xFC00u16;
|
||||
if high_bits == 0xD800u16 {
|
||||
// high surrogate
|
||||
if self.lead_surrogate != 0 {
|
||||
// The previous high surrogate was in
|
||||
// error and this one becomes the new
|
||||
// pending one.
|
||||
self.lead_surrogate = code_unit as u16;
|
||||
return (
|
||||
DecoderResult::Malformed(2, 2),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
self.lead_surrogate = code_unit;
|
||||
continue;
|
||||
}
|
||||
if high_bits == 0xDC00u16 {
|
||||
// low surrogate
|
||||
if self.lead_surrogate == 0 {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
|
||||
self.lead_surrogate = 0;
|
||||
continue;
|
||||
}
|
||||
// bmp
|
||||
if self.lead_surrogate != 0 {
|
||||
// The previous high surrogate was in
|
||||
// error and this code unit becomes a
|
||||
// pending BMP character.
|
||||
self.lead_surrogate = code_unit;
|
||||
self.pending_bmp = true;
|
||||
return (
|
||||
DecoderResult::Malformed(2, 2),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_bmp(code_unit);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
dest,
|
||||
source,
|
||||
b,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_astral
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Forwards to the shared `decode_without_padding` harness with `UTF_16LE`.
    fn decode_utf_16le(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16LE, bytes, expect);
    }

    /// Forwards to the shared `decode_without_padding` harness with `UTF_16BE`.
    fn decode_utf_16be(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16BE, bytes, expect);
    }

    /// Forwards to the shared `encode` harness with `UTF_16LE`.
    fn encode_utf_16le(string: &str, expect: &[u8]) {
        encode(UTF_16LE, string, expect);
    }

    /// Forwards to the shared `encode` harness with `UTF_16BE`.
    fn encode_utf_16be(string: &str, expect: &[u8]) {
        encode(UTF_16BE, string, expect);
    }

    /// Streams `input` through a fresh decoder for `encoding` in pieces of
    /// exactly `chunk_len` bytes (never signalling end of stream) and checks
    /// that every call consumes its whole chunk without reporting an error.
    fn decode_in_chunks(encoding: &'static Encoding, input: &[u8], chunk_len: usize) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();
        for chunk in input.chunks(chunk_len) {
            // The inputs are chosen so that no short trailing chunk exists.
            assert_eq!(chunk.len(), chunk_len);
            let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, replaced) =
                decoder.decode_to_utf16(chunk, &mut output[..needed], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, chunk_len);
            assert!(!replaced);
        }
    }

    /// Feeds `first` (not last) and then `second` (last) as separate one-byte
    /// inputs and checks that together they yield the single code unit
    /// `expected` with no error, i.e. that the first byte was buffered rather
    /// than treated as a byte order mark.
    fn decode_split_pair(encoding: &'static Encoding, first: u8, second: u8, expected: u16) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();

        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(&[first], &mut output[..needed], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!replaced);

        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(&[second], &mut output[..needed], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(!replaced);
        assert_eq!(output[0], expected);
    }

    /// Feeds the single byte `lone` as the complete (last) input and checks
    /// that it is reported as an error and replaced by U+FFFD.
    fn decode_lone_bom_byte(encoding: &'static Encoding, lone: u8) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();

        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf16(&[lone], &mut output[..needed], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(replaced);
        assert_eq!(output[0], 0xFFFD);
    }

    /// Decodes to UTF-8 into a four-byte buffer: `first` must be swallowed
    /// without output, and `second` must then produce the three bytes of
    /// U+2603 after consuming one byte and stop with `OutputFull`.
    fn decode_near_end(encoding: &'static Encoding, first: &[u8], second: &[u8]) {
        let mut output = [0u8; 4];
        let mut decoder = encoding.new_decoder();

        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf8(first, &mut output[..], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!replaced);
        assert_eq!(output[0], 0x0);

        let (status, consumed, produced, replaced) =
            decoder.decode_to_utf8(second, &mut output[..], false);
        assert_eq!(status, CoderResult::OutputFull);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 3);
        assert!(!replaced);
        // E2 98 83 is U+2603 in UTF-8; the fourth byte must stay untouched.
        assert_eq!(output, [0xE2u8, 0x98, 0x83, 0x00]);
    }

    #[test]
    fn test_utf_16_decode() {
        // Checks the little-endian input first and then its big-endian twin
        // against the same expected text.
        fn check(le: &[u8], be: &[u8], expect: &str) {
            decode_utf_16le(le, expect);
            decode_utf_16be(be, expect);
        }

        check(b"", b"", "");

        check(b"\x61\x00\x62\x00", b"\x00\x61\x00\x62", "\u{0061}\u{0062}");

        check(
            b"\xFE\xFF\x00\x61\x00\x62",
            b"\xFF\xFE\x61\x00\x62\x00",
            "\u{0061}\u{0062}",
        );

        check(b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}");

        check(b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}");

        check(
            b"\x3D\xD8\xA9\xDC\x03\x26",
            b"\xD8\x3D\xDC\xA9\x26\x03",
            "\u{1F4A9}\u{2603}",
        );

        check(b"\xA9\xDC\x03\x26", b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");

        check(b"\x3D\xD8\x03\x26", b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");

        // The \xFF makes sure that the parts before and after have different alignment
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
        let le_mid = long_le.len() / 2;
        let be_mid = long_be.len() / 2;
        // Everything before the \xFF separator, then everything after it.
        check(&long_le[..le_mid], &long_be[..be_mid], long_expect);
        check(&long_le[le_mid + 1..], &long_be[be_mid + 1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        let sample = "\u{1F4A9}\u{2603}";
        encode_utf_16le(sample, sample.as_bytes());
        encode_utf_16be(sample, sample.as_bytes());
    }

    #[test]
    fn test_utf_16be_decode_one_by_one() {
        decode_in_chunks(UTF_16BE, b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9", 1);
    }

    #[test]
    fn test_utf_16le_decode_one_by_one() {
        decode_in_chunks(UTF_16LE, b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC", 1);
    }

    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        decode_in_chunks(
            UTF_16BE,
            b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4",
            3,
        );
    }

    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        decode_in_chunks(
            UTF_16LE,
            b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00",
            3,
        );
    }

    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        decode_split_pair(UTF_16LE, 0xFF, 0xFD, 0xFDFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        decode_split_pair(UTF_16BE, 0xFE, 0xFD, 0xFEFD);
    }

    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        decode_lone_bom_byte(UTF_16LE, 0xFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        decode_lone_bom_byte(UTF_16BE, 0xFE);
    }

    #[test]
    fn test_utf_16le_decode_near_end() {
        decode_near_end(UTF_16LE, &[0x03], &[0x26, 0x03, 0x26]);
    }

    #[test]
    fn test_utf_16be_decode_near_end() {
        decode_near_end(UTF_16BE, &[0x26], &[0x03, 0x26, 0x03]);
    }
}
|
||||
1631
zeroidc/vendor/encoding_rs/src/utf_8.rs
vendored
Normal file
1631
zeroidc/vendor/encoding_rs/src/utf_8.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
400
zeroidc/vendor/encoding_rs/src/variant.rs
vendored
Normal file
400
zeroidc/vendor/encoding_rs/src/variant.rs
vendored
Normal file
@@ -0,0 +1,400 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
//! This module provides enums that wrap the various decoders and encoders.
|
||||
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
|
||||
//! dispatch explicitly for a finite set of specialized decoders and encoders.
|
||||
//! Unfortunately, this means the compiler doesn't generate the dispatch code
|
||||
//! and it has to be written here instead.
|
||||
//!
|
||||
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
|
||||
//! allocation in Rust code, including the convenience methods on `Encoding`.
|
||||
|
||||
use super::*;
|
||||
use big5::*;
|
||||
use euc_jp::*;
|
||||
use euc_kr::*;
|
||||
use gb18030::*;
|
||||
use iso_2022_jp::*;
|
||||
use replacement::*;
|
||||
use shift_jis::*;
|
||||
use single_byte::*;
|
||||
use utf_16::*;
|
||||
use utf_8::*;
|
||||
use x_user_defined::*;
|
||||
|
||||
pub enum VariantDecoder {
|
||||
SingleByte(SingleByteDecoder),
|
||||
Utf8(Utf8Decoder),
|
||||
Gb18030(Gb18030Decoder),
|
||||
Big5(Big5Decoder),
|
||||
EucJp(EucJpDecoder),
|
||||
Iso2022Jp(Iso2022JpDecoder),
|
||||
ShiftJis(ShiftJisDecoder),
|
||||
EucKr(EucKrDecoder),
|
||||
Replacement(ReplacementDecoder),
|
||||
UserDefined(UserDefinedDecoder),
|
||||
Utf16(Utf16Decoder),
|
||||
}
|
||||
|
||||
impl VariantDecoder {
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Utf8(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Gb18030(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Big5(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::EucJp(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Iso2022Jp(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::ShiftJis(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::EucKr(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Replacement(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::UserDefined(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Utf16(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Utf8(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Gb18030(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Big5(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::EucJp(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Iso2022Jp(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::ShiftJis(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::EucKr(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Replacement(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::UserDefined(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Utf16(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Utf8(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Gb18030(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Big5(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::EucJp(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Iso2022Jp(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::ShiftJis(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::EucKr(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Replacement(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::UserDefined(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Utf16(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Big5(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Big5(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => {
|
||||
return Some(v.latin1_byte_compatible_up_to(buffer));
|
||||
}
|
||||
VariantDecoder::Utf8(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::Gb18030(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::Big5(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::EucJp(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::Iso2022Jp(ref v) => {
|
||||
if v.in_neutral_state() {
|
||||
return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
|
||||
}
|
||||
return None;
|
||||
}
|
||||
VariantDecoder::ShiftJis(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::EucKr(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::UserDefined(_) => {}
|
||||
VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
Some(Encoding::ascii_valid_up_to(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
pub enum VariantEncoder {
|
||||
SingleByte(SingleByteEncoder),
|
||||
Utf8(Utf8Encoder),
|
||||
Gb18030(Gb18030Encoder),
|
||||
Big5(Big5Encoder),
|
||||
EucJp(EucJpEncoder),
|
||||
Iso2022Jp(Iso2022JpEncoder),
|
||||
ShiftJis(ShiftJisEncoder),
|
||||
EucKr(EucKrEncoder),
|
||||
UserDefined(UserDefinedEncoder),
|
||||
}
|
||||
|
||||
impl VariantEncoder {
|
||||
pub fn has_pending_state(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoder::Iso2022Jp(ref v) => v.has_pending_state(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Utf8(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Gb18030(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Big5(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::EucJp(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Iso2022Jp(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::ShiftJis(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::EucKr(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::UserDefined(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Utf8(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Gb18030(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Big5(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::EucJp(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Iso2022Jp(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::ShiftJis(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::EucKr(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::UserDefined(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_from_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u16],
|
||||
dst: &mut [u8],
|
||||
last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Big5(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8_raw(
|
||||
&mut self,
|
||||
src: &str,
|
||||
dst: &mut [u8],
|
||||
last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Big5(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum VariantEncoding {
|
||||
SingleByte(&'static [u16; 128], u16, u8, u8),
|
||||
Utf8,
|
||||
Gbk,
|
||||
Gb18030,
|
||||
Big5,
|
||||
EucJp,
|
||||
Iso2022Jp,
|
||||
ShiftJis,
|
||||
EucKr,
|
||||
Replacement,
|
||||
Utf16Be,
|
||||
Utf16Le,
|
||||
UserDefined,
|
||||
}
|
||||
|
||||
impl VariantEncoding {
|
||||
pub fn new_variant_decoder(&self) -> VariantDecoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::Utf8 => Utf8Decoder::new(),
|
||||
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
|
||||
VariantEncoding::Big5 => Big5Decoder::new(),
|
||||
VariantEncoding::EucJp => EucJpDecoder::new(),
|
||||
VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
|
||||
VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
|
||||
VariantEncoding::EucKr => EucKrDecoder::new(),
|
||||
VariantEncoding::Replacement => ReplacementDecoder::new(),
|
||||
VariantEncoding::UserDefined => UserDefinedDecoder::new(),
|
||||
VariantEncoding::Utf16Be => Utf16Decoder::new(true),
|
||||
VariantEncoding::Utf16Le => Utf16Decoder::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => {
|
||||
SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length)
|
||||
}
|
||||
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
|
||||
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
|
||||
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
|
||||
VariantEncoding::Big5 => Big5Encoder::new(encoding),
|
||||
VariantEncoding::EucJp => EucJpEncoder::new(encoding),
|
||||
VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
|
||||
VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
|
||||
VariantEncoding::EucKr => EucKrEncoder::new(encoding),
|
||||
VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
|
||||
VariantEncoding::Utf16Be | VariantEncoding::Replacement | VariantEncoding::Utf16Le => {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_single_byte(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
249
zeroidc/vendor/encoding_rs/src/x_user_defined.rs
vendored
Normal file
249
zeroidc/vendor/encoding_rs/src/x_user_defined.rs
vendored
Normal file
@@ -0,0 +1,249 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
|
||||
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use simd_funcs::*;
        use packed_simd::u16x8;

        /// Adds 0xF700 to every lane whose value is above 0x7F and leaves
        /// the remaining lanes unchanged.
        #[inline(always)]
        fn shift_upper(unpacked: u16x8) -> u16x8 {
            let above_ascii = unpacked.gt(u16x8::splat(0x7F));
            let offset = above_ascii.select(u16x8::splat(0xF700), u16x8::splat(0));
            unpacked + offset
        }
    } else {
    }
}
|
||||
|
||||
pub struct UserDefinedDecoder;
|
||||
|
||||
impl UserDefinedDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::UserDefined(UserDefinedDecoder)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
decoder_function!(
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{
|
||||
if b < 0x80 {
|
||||
// ASCII run not optimized, because binary data expected
|
||||
destination_handle.write_ascii(b);
|
||||
continue;
|
||||
}
|
||||
destination_handle.write_upper_bmp(u16::from(b) + 0xF700);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
dest,
|
||||
source,
|
||||
b,
|
||||
destination_handle,
|
||||
_unread_handle,
|
||||
check_space_bmp,
|
||||
decode_to_utf8_raw,
|
||||
u8,
|
||||
Utf8Destination
|
||||
);
|
||||
|
||||
#[cfg(not(feature = "simd-accel"))]
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let src_trim = &src[..length];
|
||||
let dst_trim = &mut dst[..length];
|
||||
src_trim
|
||||
.iter()
|
||||
.zip(dst_trim.iter_mut())
|
||||
.for_each(|(from, to)| {
|
||||
*to = {
|
||||
let unit = *from;
|
||||
if unit < 0x80 {
|
||||
u16::from(unit)
|
||||
} else {
|
||||
u16::from(unit) + 0xF700
|
||||
}
|
||||
}
|
||||
});
|
||||
(pending, length, length)
|
||||
}
|
||||
|
||||
#[cfg(feature = "simd-accel")]
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
// Not bothering with alignment
|
||||
let tail_start = length & !0xF;
|
||||
let simd_iterations = length >> 4;
|
||||
let src_ptr = src.as_ptr();
|
||||
let dst_ptr = dst.as_mut_ptr();
|
||||
for i in 0..simd_iterations {
|
||||
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
|
||||
let (first, second) = simd_unpack(input);
|
||||
unsafe {
|
||||
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
|
||||
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
|
||||
}
|
||||
}
|
||||
let src_tail = &src[tail_start..length];
|
||||
let dst_tail = &mut dst[tail_start..length];
|
||||
src_tail
|
||||
.iter()
|
||||
.zip(dst_tail.iter_mut())
|
||||
.for_each(|(from, to)| {
|
||||
*to = {
|
||||
let unit = *from;
|
||||
if unit < 0x80 {
|
||||
u16::from(unit)
|
||||
} else {
|
||||
u16::from(unit) + 0xF700
|
||||
}
|
||||
}
|
||||
});
|
||||
(pending, length, length)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct UserDefinedEncoder;
|
||||
|
||||
impl UserDefinedEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::UserDefined(UserDefinedEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
Some(u16_length)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
encoder_functions!(
|
||||
{},
|
||||
{
|
||||
if c <= '\u{7F}' {
|
||||
// TODO optimize ASCII run
|
||||
destination_handle.write_one(c as u8);
|
||||
continue;
|
||||
}
|
||||
if c < '\u{F780}' || c > '\u{F7FF}' {
|
||||
return (
|
||||
EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_one((u32::from(c) - 0xF700) as u8);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
source,
|
||||
dest,
|
||||
c,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_one
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper against `X_USER_DEFINED`.
    fn decode_x_user_defined(bytes: &[u8], expect: &str) {
        decode(X_USER_DEFINED, bytes, expect);
    }

    /// Runs the shared `encode` test helper against `X_USER_DEFINED`.
    fn encode_x_user_defined(string: &str, expect: &[u8]) {
        encode(X_USER_DEFINED, string, expect);
    }

    #[test]
    fn test_x_user_defined_decode() {
        // Empty
        decode_x_user_defined(b"", "");

        // ASCII
        decode_x_user_defined(b"\x61\x62", "\u{0061}\u{0062}");

        // Bytes 0x80..=0xFF land on U+F780..=U+F7FF.
        decode_x_user_defined(b"\x80\xFF", "\u{F780}\u{F7FF}");
        // 20 bytes: longer than one 16-byte SIMD stride, so a leftover tail
        // is exercised as well when the SIMD path is enabled.
        decode_x_user_defined(b"\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62", "\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}");
    }

    #[test]
    fn test_x_user_defined_encode() {
        // Empty
        encode_x_user_defined("", b"");

        // ASCII
        encode_x_user_defined("\u{0061}\u{0062}", b"\x61\x62");

        // Both ends of the mappable private-use range.
        encode_x_user_defined("\u{F780}\u{F7FF}", b"\x80\xFF");
        // U+F77F (63359) and U+F800 (63488) sit just outside the mappable
        // range, so the expected output is their decimal numeric character
        // references, spelled out as ASCII bytes.
        encode_x_user_defined("\u{F77F}\u{F800}", b"&#63359;&#63488;");
    }

    #[test]
    fn test_x_user_defined_from_two_low_surrogates() {
        // Each unpaired low surrogate is expected to be treated as U+FFFD
        // (65533), which is unmappable here and therefore comes out as a
        // numeric character reference. Byte-string literals must be ASCII,
        // so the references are written out literally.
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = X_USER_DEFINED.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
|
||||
Reference in New Issue
Block a user