RPM build fix (reverted CI changes which will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.

This commit is contained in:
Adam Ierymenko
2022-06-08 07:32:16 -04:00
parent 373ca30269
commit d5ca4e5f52
12611 changed files with 2898014 additions and 284 deletions

File diff suppressed because one or more lines are too long


@@ -0,0 +1,48 @@
If you send a pull request / patch, please observe the following.
## Licensing
Since this crate is dual-licensed,
[section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions)
is considered to apply in the sense of Contributions being automatically
under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file).
That is, by the act of offering a Contribution, you place your Contribution
under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT`
file. Please do not contribute if you aren't willing or allowed to license your
contributions in this manner.
You are encouraged to dedicate test code that you contribute to the Public
Domain using the CC0 dedication. If you contribute test code that is not
dedicated to the Public Domain, please be sure not to put it in a part of
source code that the comments designate as being dedicated to the Public
Domain.
## Copyright Notices
If you require the addition of your copyright notice, it's up to you to edit in
your notice as part of your Contribution. Not adding a copyright notice is
taken as a waiver of copyright notice.
## No Encodings Beyond The Encoding Standard
Please do not contribute implementations of encodings that are not specified
in the [Encoding Standard](https://encoding.spec.whatwg.org/).
For example, an implementation of UTF-7 is explicitly out of scope for this
crate and is, therefore, provided by the [`charset`](https://crates.io/crates/charset)
crate instead. For single-byte DOS encodings, please see the
[`oem_cp`](https://crates.io/crates/oem_cp) crate.
## Compatibility with Stable Rust
Please ensure that your Contribution compiles with the latest stable-channel
rustc.
## rustfmt
The `rustfmt` version used for this code is `rustfmt-nightly`. Please either
use that version or avoid using `rustfmt` (so as not to reformat all the code).
## Unit tests
Please ensure that `cargo test` succeeds.

17
zeroidc/vendor/encoding_rs/COPYRIGHT vendored Normal file

@@ -0,0 +1,17 @@
encoding_rs is copyright Mozilla Foundation.
Licensed under the Apache License, Version 2.0
<LICENSE-APACHE or
https://www.apache.org/licenses/LICENSE-2.0> or the MIT
license <LICENSE-MIT or https://opensource.org/licenses/MIT>,
at your option. All files in the project carrying such
notice may not be copied, modified, or distributed except
according to those terms.
This crate includes data derived from the data files supplied
with the WHATWG Encoding Standard, which, when incorporated into
source code, are licensed under the BSD 3-Clause License
<LICENSE-WHATWG>.
Test code within encoding_rs is dedicated to the Public Domain when so
designated (see the individual files for PD/CC0-dedicated sections).

84
zeroidc/vendor/encoding_rs/Cargo.toml vendored Normal file

@@ -0,0 +1,84 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "encoding_rs"
version = "0.8.31"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "A Gecko-oriented implementation of the Encoding Standard"
homepage = "https://docs.rs/encoding_rs/"
documentation = "https://docs.rs/encoding_rs/"
readme = "README.md"
keywords = [
"encoding",
"web",
"unicode",
"charset",
]
categories = [
"text-processing",
"encoding",
"web-programming",
"internationalization",
]
license = "(Apache-2.0 OR MIT) AND BSD-3-Clause"
repository = "https://github.com/hsivonen/encoding_rs"
[profile.release]
lto = true
[dependencies.cfg-if]
version = "1.0"
[dependencies.packed_simd]
version = "0.3.4"
optional = true
package = "packed_simd_2"
[dependencies.serde]
version = "1.0"
optional = true
[dev-dependencies.bincode]
version = "1.0"
[dev-dependencies.serde_derive]
version = "1.0"
[dev-dependencies.serde_json]
version = "1.0"
[features]
alloc = []
default = ["alloc"]
fast-big5-hanzi-encode = []
fast-gb-hanzi-encode = []
fast-hangul-encode = []
fast-hanja-encode = []
fast-kanji-encode = []
fast-legacy-encode = [
"fast-hangul-encode",
"fast-hanja-encode",
"fast-kanji-encode",
"fast-gb-hanzi-encode",
"fast-big5-hanzi-encode",
]
less-slow-big5-hanzi-encode = []
less-slow-gb-hanzi-encode = []
less-slow-kanji-encode = []
simd-accel = [
"packed_simd",
"packed_simd/into_bits",
]
[badges.travis-ci]
repository = "hsivonen/encoding_rs"

106
zeroidc/vendor/encoding_rs/Ideas.md vendored Normal file

@@ -0,0 +1,106 @@
This document contains notes about various ideas that for one reason or another
are not being actively pursued.
## Next byte is non-ASCII after ASCII optimization
The current plan for a SIMD-accelerated inner loop for handling ASCII bytes
makes no use of the bit of information that if the buffers didn't end but the
ASCII loop exited, the next byte will not be an ASCII byte.
## Handling ASCII with table lookups when decoding single-byte to UTF-16
Both uconv and ICU outperform encoding_rs when decoding single-byte to UTF-16.
uconv doesn't even do anything fancy to manually unroll the loop (see below).
Both handle even the ASCII range using table lookup. That is, there's no branch
for checking if we're in the lower or upper half of the encoding.
However, adding SIMD acceleration for the ASCII half will likely be a bigger
win than eliminating the branch to decide ASCII vs. non-ASCII.
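A minimal sketch of the branchless table lookup described above (not the actual encoding_rs decoder; the table contents and function shape are assumed for illustration). A single 256-entry table covers the whole byte range, so the ASCII half goes through the same lookup as the upper half instead of a separate branch:

```rust
/// Hypothetical branchless single-byte decode: `table` maps every byte,
/// including the ASCII range, directly to a UTF-16 code unit.
fn decode_single_byte_to_utf16(table: &[u16; 256], src: &[u8], dst: &mut [u16]) {
    for (&byte, out) in src.iter().zip(dst.iter_mut()) {
        *out = table[byte as usize];
    }
}
```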
## Manual loop unrolling for single-byte encodings
ICU currently outperforms encoding_rs (by over x2!) when decoding a single-byte
encoding to UTF-16. This appears to be thanks to manually unrolling the
conversion loop by 16. See [ucnv_MBCSSingleToBMPWithOffsets][1].
[1]: https://ssl.icu-project.org/repos/icu/icu/tags/release-55-1/source/common/ucnvmbcs.cpp
Notably, none of the single-byte encodings have bytes that'd decode to the
upper half of BMP. Therefore, if the unmappable marker has the highest bit set
instead of being zero, the check for unmappables within a 16-character stride
can be done either by ORing the BMP characters in the stride together and
checking the high bit or by loading the upper halves of the BMP characters
in a `u8x8` register and checking the high bits using the `_mm_movemask_epi8`
/ `pmovmskb` SSE2 instruction.
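A rough sketch of the OR-based stride check described above, assuming a hypothetical encode table whose unmappable marker has the high bit set rather than being zero:

```rust
/// Returns true if any of the 16 BMP code units looked up for one unrolled
/// stride carries the (hypothetical) unmappable marker in the high bit.
fn stride_has_unmappable(stride: &[u16; 16]) -> bool {
    let mut acc = 0u16;
    for &unit in stride {
        acc |= unit;
    }
    acc & 0x8000 != 0
}
```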
## After non-ASCII, handle ASCII punctuation without SIMD
Since the failure mode of SIMD ASCII acceleration involves wasted alignment
checks and a wasted SIMD read when the next code unit is non-ASCII, and non-Latin
scripts have runs of non-ASCII even if ASCII spaces and punctuation are used,
consider handling the next two or three bytes following non-ASCII as non-SIMD
before looping back to the SIMD mode. Maybe move back to SIMD ASCII faster if
there's ASCII that's not space or punctuation. Maybe with the "space or
punctuation" check in place, this code can be allowed to be in place even for
UTF-8 and Latin single-byte (i.e. not having different code for Latin and
non-Latin single-byte).
## Prefer maintaining alignment
Instead of returning to acceleration directly after non-ASCII, consider
continuing to the alignment boundary without acceleration.
## Read from SIMD lanes instead of RAM (cache) when ASCII check fails
When the SIMD ASCII check fails, the data has already been read from memory.
Test whether it's faster to read the data by lane from the SIMD register than
to read it again from RAM (cache).
## Use Level 2 Hanzi and Level 2 Kanji ordering
These two are ordered by radical and then by stroke count, so in principle,
they should be mostly Unicode-ordered, although at least Level 2 Hanzi isn't
fully Unicode-ordered. Is "mostly" good enough for encode acceleration?
## Create a `divmod_94()` function
Experiment with a function that computes `(i / 94, i % 94)` more efficiently
than generic code.
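The obvious baseline for such an experiment would be the plain formulation below; the idea is to check whether a hand-tuned variant (e.g. multiplication by a precomputed reciprocal) beats whatever the compiler emits for it:

```rust
/// Baseline divmod by 94; the experiment would replace this body with
/// something hopefully faster than the compiler-generated code.
#[inline(always)]
fn divmod_94(i: usize) -> (usize, usize) {
    (i / 94, i % 94)
}
```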
## Align writes on Aarch64
On [Cortex-A57](https://stackoverflow.com/questions/45714535/performance-of-unaligned-simd-load-store-on-aarch64/45938112#45938112), it might be a good idea to move the destination into 16-byte alignment.
## Unalign UTF-8 validation on Aarch64
Currently, Aarch64 runs the generic ALU UTF-8 validation code that aligns
reads. That's probably unnecessary on Aarch64. (SIMD was slower than ALU!)
## Table-driven UTF-8 validation
When there are at least four bytes left, read all four. With each byte
index into tables corresponding to magic values indexable by byte in
each position.
In the value read from the table indexed by lead byte, encode the
following in 16 bits: advance 2 bits (2, 3 or 4 bytes), 9 positional
bits, one of which is set to indicate the type of lead byte (8 valid
types in the 8 lowest bits, plus invalid; ASCII would be a tenth type),
and the mask for extracting the payload bits from the lead byte
(for conversion to UTF-16 or UTF-32).
In the tables indexable by the trail bytes, in each position corresponding
to a lead byte type, store 1 if the trail is invalid given that lead and
0 if it is valid given that lead.
Use the low 8 bits of the 16 bits read from the first
table to mask (bitwise AND) one positional bit from each of the
three other values. Bitwise OR the results together with the
bit that is 1 if the lead is invalid. If the result is zero,
the sequence is valid. Otherwise it's invalid.
Use the advance to advance. In the conversion to UTF-16 or
UTF-32 case, use the mask for extracting the meaningful
bits from the lead byte to mask them from the lead. Shift
left by 6 as many times as the advance indicates, etc.


@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

25
zeroidc/vendor/encoding_rs/LICENSE-MIT vendored Normal file

@@ -0,0 +1,25 @@
Copyright Mozilla Foundation
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.


@@ -0,0 +1,26 @@
Copyright © WHATWG (Apple, Google, Mozilla, Microsoft).
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

821
zeroidc/vendor/encoding_rs/README.md vendored Normal file

@@ -0,0 +1,821 @@
# encoding_rs
[![Build Status](https://travis-ci.org/hsivonen/encoding_rs.svg?branch=master)](https://travis-ci.org/hsivonen/encoding_rs)
[![crates.io](https://img.shields.io/crates/v/encoding_rs.svg)](https://crates.io/crates/encoding_rs)
[![docs.rs](https://docs.rs/encoding_rs/badge.svg)](https://docs.rs/encoding_rs/)
encoding_rs is an implementation of the (non-JavaScript parts of the)
[Encoding Standard](https://encoding.spec.whatwg.org/) written in Rust.
The Encoding Standard defines the Web-compatible set of character encodings,
which means this crate can be used to decode Web content. encoding_rs is
used in Gecko starting with Firefox 56. Due to the notable overlap between
the legacy encodings on the Web and the legacy encodings used on Windows,
this crate may be of use for non-Web-related situations as well; see below
for links to adjacent crates.
Additionally, the `mem` module provides various operations for dealing with
in-RAM text (as opposed to data that's coming from or going to an IO boundary).
The `mem` module is a module instead of a separate crate due to internal
implementation detail efficiencies.
## Functionality
Due to the Gecko use case, encoding_rs supports decoding to and encoding from
UTF-16 in addition to supporting the usual Rust use case of decoding to and
encoding from UTF-8. Additionally, the API has been designed to be FFI-friendly
to accommodate the C++ side of Gecko.
Specifically, encoding_rs does the following:
* Decodes a stream of bytes in an Encoding Standard-defined character encoding
into valid aligned native-endian in-RAM UTF-16 (units of `u16` / `char16_t`).
* Encodes a stream of potentially-invalid aligned native-endian in-RAM UTF-16
(units of `u16` / `char16_t`) into a sequence of bytes in an Encoding
Standard-defined character encoding as if the lone surrogates had been
replaced with the REPLACEMENT CHARACTER before performing the encode.
(Gecko's UTF-16 is potentially invalid.)
* Decodes a stream of bytes in an Encoding Standard-defined character
encoding into valid UTF-8.
* Encodes a stream of valid UTF-8 into a sequence of bytes in an Encoding
Standard-defined character encoding. (Rust's UTF-8 is guaranteed-valid.)
* Does the above in streaming (input and output split across multiple
buffers) and non-streaming (whole input in a single buffer and whole
output in a single buffer) variants.
* Avoids copying (borrows) when possible in the non-streaming cases when
decoding to or encoding from UTF-8.
Resolves textual labels that identify character encodings in
protocol text into type-safe objects representing those encodings
conceptually (see the sketch after this list).
* Maps the type-safe encoding objects onto strings suitable for
returning from `document.characterSet`.
* Validates UTF-8 (in common instruction set scenarios a bit faster for Web
workloads than the standard library; hopefully will get upstreamed some
day) and ASCII.
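For a taste of the non-streaming API from the list above, here is a minimal sketch (the Shift_JIS byte literals are example data chosen for illustration):

```rust
use encoding_rs::{Encoding, SHIFT_JIS};

fn main() {
    // Resolve a protocol label into a type-safe encoding object.
    let encoding = Encoding::for_label(b"shift_jis").unwrap();
    assert_eq!(encoding, SHIFT_JIS);

    // Non-streaming decode to UTF-8: returns the text (borrowed when possible),
    // the encoding actually used after BOM sniffing, and whether errors occurred.
    let (text, used, had_errors) = encoding.decode(b"\x83\x6e\x83\x8d\x81\x5b");
    assert_eq!(&text[..], "ハロー");
    assert_eq!(used, SHIFT_JIS);
    assert!(!had_errors);

    // Non-streaming encode from guaranteed-valid UTF-8.
    let (bytes, _, _) = SHIFT_JIS.encode("ハロー");
    assert_eq!(&bytes[..], &b"\x83\x6e\x83\x8d\x81\x5b"[..]);
}
```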
Additionally, `encoding_rs::mem` does the following (see the sketch after this list):
* Checks if a byte buffer contains only ASCII.
* Checks if a potentially-invalid UTF-16 buffer contains only Basic Latin (ASCII).
* Checks if a valid UTF-8, potentially-invalid UTF-8 or potentially-invalid UTF-16
buffer contains only Latin1 code points (below U+0100).
* Checks if a valid UTF-8, potentially-invalid UTF-8 or potentially-invalid UTF-16
buffer or a code point or a UTF-16 code unit can trigger right-to-left behavior
(suitable for checking if the Unicode Bidirectional Algorithm can be optimized
out).
* Combined versions of the above two checks.
* Converts valid UTF-8, potentially-invalid UTF-8 and Latin1 to UTF-16.
* Converts potentially-invalid UTF-16 and Latin1 to UTF-8.
* Converts UTF-8 and UTF-16 to Latin1 (if in range).
* Finds the first invalid code unit in a buffer of potentially-invalid UTF-16.
Makes a mutable buffer of potentially-invalid UTF-16 contain valid UTF-16.
* Copies ASCII from one buffer to another up to the first non-ASCII byte.
* Converts ASCII to UTF-16 up to the first non-ASCII byte.
* Converts UTF-16 to ASCII up to the first non-Basic Latin code unit.
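And a couple of the `encoding_rs::mem` operations, again as a sketch:

```rust
use encoding_rs::mem;

fn main() {
    // Latin1 (bytes interpreted as U+0000...U+00FF) to UTF-8.
    let text = mem::decode_latin1(b"caf\xe9");
    assert_eq!(&text[..], "café");

    // Back to Latin1; code points above U+00FF would be replaced lossily.
    let bytes = mem::encode_latin1_lossy("café");
    assert_eq!(&bytes[..], &b"caf\xe9"[..]);

    // ASCII check on a byte buffer.
    assert!(mem::is_ascii(b"plain ASCII"));
}
```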
## Integration with `std::io`
Notably, the above feature list doesn't include the capability to wrap
a `std::io::Read`, decode it into UTF-8 and present the result via
`std::io::Read`. The [`encoding_rs_io`](https://crates.io/crates/encoding_rs_io)
crate provides that capability.
## `no_std` Environment
The crate works in a `no_std` environment. By default, the `alloc` feature,
which assumes that an allocator is present, is enabled. For a no-allocator
environment, the default features (i.e. `alloc`) can be turned off. This
makes the part of the API that returns `Vec`/`String`/`Cow` unavailable.
## Decoding Email
For decoding character encodings that occur in email, use the
[`charset`](https://crates.io/crates/charset) crate instead of using this
one directly. (It wraps this crate and adds UTF-7 decoding.)
## Windows Code Page Identifier Mappings
For mappings to and from Windows code page identifiers, use the
[`codepage`](https://crates.io/crates/codepage) crate.
## DOS Encodings
This crate does not support single-byte DOS encodings that aren't required by
the Web Platform, but the [`oem_cp`](https://crates.io/crates/oem_cp) crate does.
## Preparing Text for the Encoders
Normalizing text into Unicode Normalization Form C prior to encoding text into
a legacy encoding minimizes unmappable characters. Text can be normalized to
Unicode Normalization Form C using the
[`unic-normal`](https://crates.io/crates/unic-normal) crate.
The exception is windows-1258, which after normalizing to Unicode Normalization
Form C requires tone marks to be decomposed in order to minimize unmappable
characters. Vietnamese tone marks can be decomposed using the
[`detone`](https://crates.io/crates/detone) crate.
## Licensing
TL;DR: `(Apache-2.0 OR MIT) AND BSD-3-Clause` for the code and data combination.
Please see the file named
[COPYRIGHT](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT).
The non-test code that isn't generated from the WHATWG data in this crate is
under Apache-2.0 OR MIT. Test code is under CC0.
This crate contains code/data generated from WHATWG-supplied data. The WHATWG
upstream changed its license for portions of specs incorporated into source code
from CC0 to BSD-3-Clause between the initial release of this crate and the present
version of this crate. The in-source licensing legends have been updated for the
parts of the generated code that have changed since the upstream license change.
## Documentation
Generated [API documentation](https://docs.rs/encoding_rs/) is available
online.
There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
design and internals of the crate.
## C and C++ bindings
An FFI layer for encoding_rs is available as a
[separate crate](https://github.com/hsivonen/encoding_c). The crate comes
with a [demo C++ wrapper](https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h)
using the C++ standard library and [GSL](https://github.com/Microsoft/GSL/) types.
The bindings for the `mem` module are in the
[encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
For the Gecko context, there's a
[C++ wrapper using the MFBT/XPCOM types](https://searchfox.org/mozilla-central/source/intl/Encoding.h#100).
There's a [write-up](https://hsivonen.fi/modern-cpp-in-rust/) about the C++
wrappers.
## Sample programs
* [Rust](https://github.com/hsivonen/recode_rs)
* [C](https://github.com/hsivonen/recode_c)
* [C++](https://github.com/hsivonen/recode_cpp)
## Optional features
There are currently these optional cargo features:
### `simd-accel`
Enables SIMD acceleration using the nightly-dependent `packed_simd_2` crate.
This is an opt-in feature, because enabling this feature _opts out_ of Rust's
guarantees of future compilers compiling old code (aka. "stability story").
Currently, this has not been tested to be an improvement except for these
targets:
* x86_64
* i686
* aarch64
* thumbv7neon
If you use nightly Rust, you use targets whose first component is one of the
above, and you are prepared _to have to revise your configuration when updating
Rust_, you should enable this feature. Otherwise, please _do not_ enable this
feature.
_Note!_ If you are compiling for a target that does not have 128-bit SIMD
enabled as part of the target definition and you are enabling 128-bit SIMD
using `-C target_feature`, you need to enable the `core_arch` Cargo feature
for `packed_simd_2` to compile a crates.io snapshot of `core_arch` instead of
using the standard-library copy of `core::arch`, because the `core::arch`
module of the pre-compiled standard library has been compiled with the
assumption that the CPU doesn't have 128-bit SIMD. At present this applies
mainly to 32-bit ARM targets whose first component does not include the
substring `neon`.
The encoding_rs side of things has not been properly set up for POWER,
PowerPC, MIPS, etc., SIMD at this time, so even if you were to follow
the advice from the previous paragraph, you probably shouldn't use
the `simd-accel` option on the less mainstream architectures at this
time.
Used by Firefox.
### `serde`
Enables support for serializing and deserializing `&'static Encoding`-typed
struct fields using [Serde][1].
[1]: https://serde.rs/
Not used by Firefox.
### `fast-legacy-encode`
A catch-all option for enabling the fastest legacy encode options. _Does not
affect decode speed or UTF-8 encode speed._
At present, this option is equivalent to enabling the following options:
* `fast-hangul-encode`
* `fast-hanja-encode`
* `fast-kanji-encode`
* `fast-gb-hanzi-encode`
* `fast-big5-hanzi-encode`
Adds 176 KB to the binary size.
Not used by Firefox.
### `fast-hangul-encode`
Changes encoding precomposed Hangul syllables into EUC-KR from binary
search over the decode-optimized tables to lookup by index making Korean
plain-text encode about 4 times as fast as without this option.
Adds 20 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-hanja-encode`
Changes encoding of Hanja into EUC-KR from linear search over the
decode-optimized table to lookup by index. Since Hanja is practically absent
in modern Korean text, this option doesn't affect performance in the common
case and mainly makes sense if you want to make your application resilient
against denial of service by someone intentionally feeding it a lot of Hanja
to encode into EUC-KR.
Adds 40 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-kanji-encode`
Changes encoding of Kanji into Shift_JIS, EUC-JP and ISO-2022-JP from linear
search over the decode-optimized tables to lookup by index making Japanese
plain-text encode to legacy encodings 30 to 50 times as fast as without this
option (about 2 times as fast as with `less-slow-kanji-encode`).
Takes precedence over `less-slow-kanji-encode`.
Adds 36 KB to the binary size (24 KB compared to `less-slow-kanji-encode`).
Does _not_ affect decode speed.
Not used by Firefox.
### `less-slow-kanji-encode`
Makes JIS X 0208 Level 1 Kanji (the most common Kanji in Shift_JIS, EUC-JP and
ISO-2022-JP) encode less slow (binary search instead of linear search) making
Japanese plain-text encode to legacy encodings 14 to 23 times as fast as
without this option.
Adds 12 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-gb-hanzi-encode`
Changes encoding of Hanzi in the CJK Unified Ideographs block into GBK and
gb18030 from linear search over a part of the decode-optimized tables followed
by a binary search over another part of the decode-optimized tables to lookup
by index making Simplified Chinese plain-text encode to the legacy encodings
100 to 110 times as fast as without this option (about 2.5 times as fast as
with `less-slow-gb-hanzi-encode`).
Takes precedence over `less-slow-gb-hanzi-encode`.
Adds 36 KB to the binary size (24 KB compared to `less-slow-gb-hanzi-encode`).
Does _not_ affect decode speed.
Not used by Firefox.
### `less-slow-gb-hanzi-encode`
Makes GB2312 Level 1 Hanzi (the most common Hanzi in gb18030 and GBK) encode
less slow (binary search instead of linear search) making Simplified Chinese
plain-text encode to the legacy encodings about 40 times as fast as without
this option.
Adds 12 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
### `fast-big5-hanzi-encode`
Changes encoding of Hanzi in the CJK Unified Ideographs block into Big5 from
linear search over a part of the decode-optimized tables to lookup by index
making Traditional Chinese plain-text encode to Big5 105 to 125 times as fast
as without this option (about 3 times as fast as with
`less-slow-big5-hanzi-encode`).
Takes precedence over `less-slow-big5-hanzi-encode`.
Adds 40 KB to the binary size (20 KB compared to `less-slow-big5-hanzi-encode`).
Does _not_ affect decode speed.
Not used by Firefox.
### `less-slow-big5-hanzi-encode`
Makes Big5 Level 1 Hanzi (the most common Hanzi in Big5) encode less slow
(binary search instead of linear search) making Traditional Chinese
plain-text encode to Big5 about 36 times as fast as without this option.
Adds 20 KB to the binary size.
Does _not_ affect decode speed.
Not used by Firefox.
## Performance goals
For decoding to UTF-16, the goal is to perform at least as well as Gecko's old
uconv. For decoding to UTF-8, the goal is to perform at least as well as
rust-encoding. These goals have been achieved.
Encoding to UTF-8 should be fast. (UTF-8 to UTF-8 encode should be equivalent
to `memcpy` and UTF-16 to UTF-8 should be fast.)
Speed is a non-goal when encoding to legacy encodings. By default, encoding to
legacy encodings should not be optimized for speed at the expense of code size
as long as form submission and URL parsing in Gecko don't become noticeably
too slow in real-world use.
In the interest of binary size, by default, encoding_rs does not have
encode-specific data tables beyond 32 bits of encode-specific data for each
single-byte encoding. Therefore, encoders search the decode-optimized data
tables. This is a linear search in most cases. As a result, by default, encode
to legacy encodings varies from slow to extremely slow relative to other
libraries. Still, with realistic workloads, this seemed fast enough not to be
user-visibly slow on Raspberry Pi 3 (which stood in for a phone for testing)
in the Web-exposed encoder use cases.
See the cargo features above for optionally making CJK legacy encode fast.
A framework for measuring performance is [available separately][2].
[2]: https://github.com/hsivonen/encoding_bench/
## Rust Version Compatibility
It is a goal to support the latest stable Rust, the latest nightly Rust and
the version of Rust that's used for Firefox Nightly.
At this time, there is no firm commitment to support a version older than
what's required by Firefox, and there is no commitment to treat MSRV changes
as semver-breaking, because this crate depends on `cfg-if`, which doesn't
appear to treat MSRV changes as semver-breaking, so it would be useless for
this crate to treat MSRV changes as semver-breaking.
As of 2021-02-04, MSRV appears to be Rust 1.36.0 for using the crate and
1.42.0 for doc tests to pass without errors about the global allocator.
## Compatibility with rust-encoding
A compatibility layer that implements the rust-encoding API on top of
encoding_rs is
[provided as a separate crate](https://github.com/hsivonen/encoding_rs_compat)
(cannot be uploaded to crates.io). The compatibility layer was originally
written with the assumption that Firefox would need it, but it is not currently
used in Firefox.
## Regenerating Generated Code
To regenerate the generated code:
* Have Python 2 installed.
* Clone [`https://github.com/hsivonen/encoding_c`](https://github.com/hsivonen/encoding_c)
next to the `encoding_rs` directory.
* Clone [`https://github.com/hsivonen/codepage`](https://github.com/hsivonen/codepage)
next to the `encoding_rs` directory.
* Clone [`https://github.com/whatwg/encoding`](https://github.com/whatwg/encoding)
next to the `encoding_rs` directory.
* Checkout revision `be3337450e7df1c49dca7872153c4c4670dd8256` of the `encoding` repo.
(Note: `f381389` was the revision of `encoding` used from before the `encoding` repo
license change. So far, only output changed since then has been updated to
the new license legend.)
* With the `encoding_rs` directory as the working directory, run
`python generate-encoding-data.py`.
## Roadmap
- [x] Design the low-level API.
- [x] Provide Rust-only convenience features.
- [x] Provide an stl/gsl-flavored C++ API.
- [x] Implement all decoders and encoders.
- [x] Add unit tests for all decoders and encoders.
- [x] Finish BOM sniffing variants in Rust-only convenience features.
- [x] Document the API.
- [x] Publish the crate on crates.io.
- [x] Create a solution for measuring performance.
- [x] Accelerate ASCII conversions using SSE2 on x86.
- [x] Accelerate ASCII conversions using ALU register-sized operations on
non-x86 architectures (process an `usize` instead of `u8` at a time).
- [x] Split FFI into a separate crate so that the FFI doesn't interfere with
LTO in pure-Rust usage.
- [x] Compress CJK indices by making use of sequential code points as well
as Unicode-ordered parts of indices.
- [x] Make lookups by label or name use binary search that searches from the
end of the label/name to the start.
- [x] Make labels with non-ASCII bytes fail fast.
- [ ] ~Parallelize UTF-8 validation using [Rayon](https://github.com/nikomatsakis/rayon).~
(This turned out to be a pessimization in the ASCII case due to memory bandwidth reasons.)
- [x] Provide an XPCOM/MFBT-flavored C++ API.
- [x] Investigate accelerating single-byte encode with a single fast-tracked
range per encoding.
- [x] Replace uconv with encoding_rs in Gecko.
- [x] Implement the rust-encoding API in terms of encoding_rs.
- [x] Add SIMD acceleration for Aarch64.
- [x] Investigate the use of NEON on 32-bit ARM.
- [ ] ~Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
adapted to Rust in rust-encoding.~
- [x] Add actually fast CJK encode options.
- [ ] ~Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).~
- [ ] Provide a build mode that works without `alloc` (with lesser API surface).
- [ ] Migrate to `std::simd` once it is stable and declare 1.0.
## Release Notes
### 0.8.31
* Use SPDX with parentheses now that crates.io supports parentheses.
### 0.8.30
* Update the licensing information to take into account the WHATWG data license change.
### 0.8.29
* Make the parts that use an allocator optional.
### 0.8.28
* Fix error in Serde support introduced as part of `no_std` support.
### 0.8.27
Make the crate work in a `no_std` environment (with `alloc`).
### 0.8.26
* Fix oversights in edition 2018 migration that broke the `simd-accel` feature.
### 0.8.25
* Do pointer alignment checks in a way where intermediate steps aren't defined to be Undefined Behavior.
* Update the `packed_simd` dependency to `packed_simd_2`.
* Update the `cfg-if` dependency to 1.0.
* Address warnings that have been introduced by newer Rust versions along the way.
* Update to edition 2018, since even prior to 1.0 `cfg-if` updated to edition 2018 without a semver break.
### 0.8.24
* Avoid computing an intermediate (not dereferenced) pointer value in a manner designated as Undefined Behavior when computing pointer alignment.
### 0.8.23
* Remove year from copyright notices. (No features or bug fixes.)
### 0.8.22
* Formatting fix and new unit test. (No features or bug fixes.)
### 0.8.21
* Fixed a panic with invalid UTF-16[BE|LE] input at the end of the stream.
### 0.8.20
* Make `Decoder::latin1_byte_compatible_up_to` return `None` in more
cases to make the method actually useful. While this could be argued
to be a breaking change due to the bug fix changing semantics, it does
not break callers that had to handle the `None` case in a reasonable
way anyway.
### 0.8.19
* Removed a bunch of bound checks in `convert_str_to_utf16`.
* Added `mem::convert_utf8_to_utf16_without_replacement`.
### 0.8.18
* Added `mem::utf8_latin1_up_to` and `mem::str_latin1_up_to`.
* Added `Decoder::latin1_byte_compatible_up_to`.
### 0.8.17
* Update `bincode` (dev dependency) version requirement to 1.0.
### 0.8.16
* Switch from the `simd` crate to `packed_simd`.
### 0.8.15
* Adjust documentation for `simd-accel` (README-only release).
### 0.8.14
* Made UTF-16 to UTF-8 encode conversion fill the output buffer as
closely as possible.
### 0.8.13
* Made the UTF-8 to UTF-16 decoder compare the number of code units written
with the length of the right slice (the output slice) to fix a panic
introduced in 0.8.11.
### 0.8.12
* Removed the `clippy::` prefix from clippy lint names.
### 0.8.11
* Changed minimum Rust requirement to 1.29.0 (for the ability to refer
to the interior of a `static` when defining another `static`).
* Explicitly aligned the lookup tables for single-byte encodings and
UTF-8 to cache lines in the hope of freeing up one cache line for
other data. (Perhaps the tables were already aligned and this is
placebo.)
* Added 32 bits of encode-oriented data for each single-byte encoding.
The change was performance-neutral for non-Latin1-ish Latin legacy
encodings, improved Latin1-ish and Arabic legacy encode speed
somewhat (new speed is 2.4x the old speed for German, 2.3x for
Arabic, 1.7x for Portuguese and 1.4x for French) and improved
non-Latin1, non-Arabic legacy single-byte encode a lot (7.2x for
Thai, 6x for Greek, 5x for Russian, 4x for Hebrew).
* Added compile-time options for fast CJK legacy encode options (at
the cost of binary size (up to 176 KB) and run-time memory usage).
These options still retain the overall code structure instead of
rewriting the CJK encoders totally, so the speed isn't as good as
what could be achieved by using even more memory / making the
binary even larger.
* Made UTF-8 decode and validation faster.
* Added method `is_single_byte()` on `Encoding`.
* Added `mem::decode_latin1()` and `mem::encode_latin1_lossy()`.
### 0.8.10
* Disabled a unit test that tests a panic condition when the assertion
being tested is disabled.
### 0.8.9
* Made `--features simd-accel` work with stable-channel compiler to
simplify the Firefox build system.
### 0.8.8
* Made the `is_foo_bidi()` not treat U+FEFF (ZERO WIDTH NO-BREAK SPACE
aka. BYTE ORDER MARK) as right-to-left.
* Made the `is_foo_bidi()` functions report `true` if the input contains
Hebrew presentation forms (which are right-to-left but not in a
right-to-left-roadmapped block).
### 0.8.7
* Fixed a panic in the UTF-16LE/UTF-16BE decoder when decoding to UTF-8.
### 0.8.6
* Temporarily removed the debug assertion added in version 0.8.5 from
`convert_utf16_to_latin1_lossy`.
### 0.8.5
* If debug assertions are enabled but fuzzing isn't enabled, lossy conversions
to Latin1 in the `mem` module assert that the input is in the range
U+0000...U+00FF (inclusive).
* In the `mem` module provide conversions from Latin1 and UTF-16 to UTF-8
that can deal with insufficient output space. The idea is to use them
first with an allocation rounded up to jemalloc bucket size and do the
worst-case allocation only if the jemalloc rounding up was insufficient
as the first guess.
### 0.8.4
* Fix SSE2-specific, `simd-accel`-specific memory corruption introduced in
version 0.8.1 in conversions between UTF-16 and Latin1 in the `mem` module.
### 0.8.3
* Removed an `#[inline(never)]` annotation that was not meant for release.
### 0.8.2
* Made non-ASCII UTF-16 to UTF-8 encode faster by manually omitting bound
checks and manually adding branch prediction annotations.
### 0.8.1
* Tweaked loop unrolling and memory alignment for SSE2 conversions between
UTF-16 and Latin1 in the `mem` module to increase the performance when
converting long buffers.
### 0.8.0
* Changed the minimum supported version of Rust to 1.21.0 (semver breaking
change).
* Flipped around the defaults vs. optional features for controlling the size
vs. speed trade-off for Kanji and Hanzi legacy encode (semver breaking
change).
* Added NEON support on ARMv7.
* SIMD-accelerated x-user-defined to UTF-16 decode.
* Made UTF-16LE and UTF-16BE decode a lot faster (including SIMD
acceleration).
### 0.7.2
* Add the `mem` module.
* Refactor SIMD code which can affect performance outside the `mem`
module.
### 0.7.1
* When encoding from invalid UTF-16, correctly handle U+DC00 followed by
another low surrogate.
### 0.7.0
* [Make `replacement` a label of the replacement
encoding.](https://github.com/whatwg/encoding/issues/70) (Spec change.)
* Remove `Encoding::for_name()`. (`Encoding::for_label(foo).unwrap()` is
now close enough after the above label change.)
* Remove the `parallel-utf8` cargo feature.
* Add optional Serde support for `&'static Encoding`.
* Performance tweaks for ASCII handling.
* Performance tweaks for UTF-8 validation.
* SIMD support on aarch64.
### 0.6.11
* Make `Encoder::has_pending_state()` public.
* Update the `simd` crate dependency to 0.2.0.
### 0.6.10
* Reserve enough space for NCRs when encoding to ISO-2022-JP.
* Correct max length calculations for multibyte decoders.
* Correct max length calculations before BOM sniffing has been
performed.
* Correctly calculate max length when encoding from UTF-16 to GBK.
### 0.6.9
* [Don't prepend anything when gb18030 range decode
fails](https://github.com/whatwg/encoding/issues/110). (Spec change.)
### 0.6.8
Correctly handle the case where the first buffer contains a potentially
partial BOM and the next buffer is the last buffer.
* Decode byte `7F` correctly in ISO-2022-JP.
* Make UTF-16 to UTF-8 encode write closer to the end of the buffer.
* Implement `Hash` for `Encoding`.
### 0.6.7
[Map half-width katakana to full-width katakana in ISO-2022-JP
encoder](https://github.com/whatwg/encoding/issues/105). (Spec change.)
* Give `InputEmpty` correct precedence over `OutputFull` when encoding
with replacement and the output buffer passed in is too short or the
remaining space in the output buffer is too small after a replacement.
### 0.6.6
* Correct max length calculation when a partial BOM prefix is part of
the decoder's state.
### 0.6.5
* Correct max length calculation in various encoders.
* Correct max length calculation in the UTF-16 decoder.
* Derive `PartialEq` and `Eq` for the `CoderResult`, `DecoderResult`
and `EncoderResult` types.
### 0.6.4
* Avoid panic when encoding with replacement and the destination buffer is
too short to hold one numeric character reference.
### 0.6.3
* Add support for 32-bit big-endian hosts. (For real this time.)
### 0.6.2
* Fix a panic from subslicing with bad indices in
`Encoder::encode_from_utf16`. (Due to an oversight, it lacked the fix that
`Encoder::encode_from_utf8` already had.)
* Micro-optimize error status accumulation in non-streaming case.
### 0.6.1
* Avoid panic near integer overflow in a case that's unlikely to actually
happen.
* Address Clippy lints.
### 0.6.0
* Make the methods for computing worst-case buffer size requirements check
for integer overflow.
* Upgrade rayon to 0.7.0.
### 0.5.1
* Reorder methods for better documentation readability.
* Add support for big-endian hosts. (Only 64-bit case actually tested.)
* Optimize the ALU (non-SIMD) case for 32-bit ARM instead of x86_64.
### 0.5.0
Avoid allocating excessively long buffers in non-streaming decode.
* Fix the behavior of ISO-2022-JP and replacement decoders near the end of the
output buffer.
* Annotate the result structs with `#[must_use]`.
### 0.4.0
* Split FFI into a separate crate.
* Performance tweaks.
* CJK binary size and encoding performance changes.
* Parallelize UTF-8 validation in the case of long buffers (with optional
feature `parallel-utf8`).
* Borrow even with ISO-2022-JP when possible.
### 0.3.2
* Fix moving pointers to alignment in ALU-based ASCII acceleration.
* Fix errors in documentation and improve documentation.
### 0.3.1
* Fix UTF-8 to UTF-16 decode for byte sequences beginning with 0xEE.
* Make UTF-8 to UTF-8 decode SSE2-accelerated when feature `simd-accel` is used.
* When decoding and encoding ASCII-only input from or to an ASCII-compatible
encoding using the non-streaming API, return a borrow of the input.
* Make encode from UTF-16 to UTF-8 faster.
### 0.3
* Change the references to the instances of `Encoding` from `const` to `static`
to make the referents unique across crates that use the references.
* Introduce non-reference-typed `FOO_INIT` instances of `Encoding` to allow
foreign crates to initialize `static` arrays with references to `Encoding`
instances even under Rust's constraints that prohibit the initialization of
`&'static Encoding`-typed array items with `&'static Encoding`-typed
`statics`.
* Document that the above two points will be reverted if Rust changes `const`
to work so that cross-crate usage keeps the referents unique.
* Return `Cow`s from Rust-only non-streaming methods for encode and decode.
* `Encoding::for_bom()` returns the length of the BOM.
* ASCII-accelerated conversions for encodings other than UTF-16LE, UTF-16BE,
ISO-2022-JP and x-user-defined.
* Add SSE2 acceleration behind the `simd-accel` feature flag. (Requires
nightly Rust.)
* Fix panic with long bogus labels.
* Map [0xCA to U+05BA in windows-1255](https://github.com/whatwg/encoding/issues/73).
(Spec change.)
* Correct the [end of the Shift_JIS EUDC range](https://github.com/whatwg/encoding/issues/53).
(Spec change.)
### 0.2.4
* Polish FFI documentation.
### 0.2.3
* Fix UTF-16 to UTF-8 encode.
### 0.2.2
* Add `Encoder.encode_from_utf8_to_vec_without_replacement()`.
### 0.2.1
* Add `Encoding.is_ascii_compatible()`.
* Add `Encoding::for_bom()`.
* Make `==` for `Encoding` use name comparison instead of pointer comparison,
because uses of the encoding constants in different crates result in
different addresses and the constant cannot be turned into statics without
breaking other things.
### 0.2.0
The initial release.

12
zeroidc/vendor/encoding_rs/build.rs vendored Normal file

@@ -0,0 +1,12 @@
fn main() {
// This does not enable `RUSTC_BOOTSTRAP=1` for `packed_simd`.
// You still need to knowingly have a setup that makes
// `packed_simd` compile. Therefore, having this file on
// crates.io is harmless in terms of users of `encoding_rs`
// accidentally depending on nightly features. Having this
// here means that if you knowingly want this, you only
// need to maintain a fork of `packed_simd` without _also_
// having to maintain a fork of `encoding_rs`.
#[cfg(feature = "simd-accel")]
println!("cargo:rustc-env=RUSTC_BOOTSTRAP=1");
}

14
zeroidc/vendor/encoding_rs/ci/miri.sh vendored Normal file

@@ -0,0 +1,14 @@
set -ex
# Install Miri.
MIRI_NIGHTLY=nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/miri)
echo "Installing latest nightly with Miri: $MIRI_NIGHTLY"
rustup default "$MIRI_NIGHTLY"
rustup component add miri
# Run tests.
# Stacked Borrows is disabled as it costs too much RAM (due to our large tables).
MIRIFLAGS="-Zmiri-disable-stacked-borrows" cargo miri test
# Restore old state in case Travis uses this cache for other jobs.
rustup default nightly

16
zeroidc/vendor/encoding_rs/doc/Big5.txt vendored Normal file

@@ -0,0 +1,16 @@
/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
/// instead of the Private Use Area code points that have been used historically.
/// It is believed to be able to decode existing Web content in a way that makes
/// sense.
///
/// To avoid form submissions generating data that Web servers don't understand,
/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
/// Big5 in the lexical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
///
/// This encoding is designed to be suited for decoding the Windows code page 950
/// and its HKSCS patched "951" variant such that the text makes sense, given
/// assignments that Unicode has made after those encodings used Private Use
/// Area characters.


@@ -0,0 +1,12 @@
/// This is the legacy Unix encoding for Japanese.
///
/// For compatibility with Web servers that don't expect three-byte sequences
/// in form submissions, the encoder doesn't generate three-byte sequences.
/// That is, the JIS X 0212 support is decode-only.
///
/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
///
/// This encoding roughly matches the Windows code page 20932. There are error
/// handling differences and a handful of 2-byte sequences that decode differently.
/// Additionally, Windows doesn't support 3-byte sequences.


@@ -0,0 +1,10 @@
/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
/// Classic), with all the characters from the Hangul Syllables block of Unicode.
///
/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
///
/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
/// the question mark or the Private Use Area.

16
zeroidc/vendor/encoding_rs/doc/GBK.txt vendored Normal file
View File

@@ -0,0 +1,16 @@
/// The decoder for this encoding is the same as the decoder for gb18030.
/// The encoder side of this encoding is GBK with Windows code page 936 euro
/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
/// Unicode block as well as a handful of ideographs from the CJK Unified
/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
///
/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
/// unified with the gb18030 encoder in the Encoding Standard out of concern
/// that servers that expect GBK form submissions might not be able to handle
/// the four-byte sequences.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// The encoder of this encoding roughly matches the Windows code page 936.
/// The decoder side is a superset.
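A hedged sketch of the encoder-side difference described above (the astral character chosen here is an arbitrary illustration; assuming the public `encoding_rs` API and its numeric-character-reference replacement behavior):

use encoding_rs::{GB18030, GBK};

fn gbk_vs_gb18030_sketch() {
    // gb18030 can represent the astral character with a four-byte sequence...
    let (_bytes, _encoding, had_errors) = GB18030.encode("\u{1F600}");
    assert!(!had_errors);
    // ...while the GBK encoder avoids four-byte sequences and falls back to a
    // numeric character reference (0x1F600 == 128512).
    let (bytes, _encoding, had_errors) = GBK.encode("\u{1F600}");
    assert_eq!(&bytes[..], &b"&#128512;"[..]);
    assert!(had_errors);
}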

View File

@@ -0,0 +1,8 @@
/// This is the most notable of the DOS Cyrillic code pages. It has the same
/// box drawing characters as code page 437, so it can be used for decoding
/// DOS-era ASCII + box drawing data.
///
/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
///
/// This encoding matches the Windows code page 866.

View File

@@ -0,0 +1,10 @@
/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
/// byte range to encode non-Basic Latin characters. It's the only encoding
/// supported by this crate whose encoder is stateful.
///
/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
///
/// This encoding roughly matches the Windows code page 50220. Notably, Windows
/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
/// error handling.
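Because the encoder is stateful, streaming callers drive an `Encoder` rather than the one-shot convenience methods. A sketch (assuming the `Encoder::encode_from_utf8` entry point of the public `encoding_rs` API; U+3042 is an arbitrary JIS X 0208 character):

use encoding_rs::{CoderResult, ISO_2022_JP};

fn iso_2022_jp_stateful_sketch() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut output = [0u8; 40];
    // Passing `true` for the final call lets the encoder append the escape
    // sequence that returns to the ASCII state.
    let (result, read, written, had_errors) =
        encoder.encode_from_utf8("\u{3042}", &mut output[..], true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, "\u{3042}".len());
    assert!(!had_errors);
    // The output includes the escape into JIS X 0208, the two-byte character
    // and the escape back to ASCII, so it is longer than two bytes.
    assert!(written > 2);
}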

View File

@@ -0,0 +1,8 @@
/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 6.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
///
/// The Windows code page number for this encoding is 28600, but kernel32.dll
/// does not support this encoding.

View File

@@ -0,0 +1,8 @@
/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 7.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
///
/// This encoding matches the Windows code page 28603, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.

View File

@@ -0,0 +1,8 @@
/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
/// is also known as Latin 8.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
///
/// The Windows code page number for this encoding is 28604, but kernel32.dll
/// does not support this encoding.

View File

@@ -0,0 +1,7 @@
/// This is the revised Western European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 9.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
///
/// This encoding matches the Windows code page 28605.

View File

@@ -0,0 +1,8 @@
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
/// family. This encoding is also known as Latin 10.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
///
/// The Windows code page number for this encoding is 28606, but kernel32.dll
/// does not support this encoding.

View File

@@ -0,0 +1,6 @@
/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
///
/// This encoding matches the Windows code page 28592.

View File

@@ -0,0 +1,6 @@
/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
///
/// This encoding matches the Windows code page 28593.

View File

@@ -0,0 +1,6 @@
/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
///
/// This encoding matches the Windows code page 28594.

View File

@@ -0,0 +1,6 @@
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
///
/// This encoding matches the Windows code page 28595.

View File

@@ -0,0 +1,7 @@
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
///
/// This encoding matches the Windows code page 28596, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.

View File

@@ -0,0 +1,11 @@
/// This is the Greek part of the ISO/IEC 8859 encoding family.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
///
/// This encoding roughly matches the Windows code page 28597. Windows decodes
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.

View File

@@ -0,0 +1,9 @@
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 38598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the private use area.

View File

@@ -0,0 +1,9 @@
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
///
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
///
/// This encoding roughly matches the Windows code page 28598. Windows decodes
/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
/// the private use area.

View File

@@ -0,0 +1,6 @@
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
///
/// This encoding matches the Windows code page 20866.

View File

@@ -0,0 +1,6 @@
/// This is an encoding for Ukrainian adapted from KOI8-R.
///
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
///
/// This encoding matches the Windows code page 21866.

View File

@@ -0,0 +1,8 @@
/// This is the Japanese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
///
/// This encoding matches the Windows code page 932, except Windows decodes some byte
/// sequences that are error per the Encoding Standard to the question mark or the
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
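A minimal round-trip sketch (0x82 0xA0 is the well-known Shift_JIS encoding of U+3042; assuming the public `encoding_rs` API):

use encoding_rs::SHIFT_JIS;

fn shift_jis_sketch() {
    let (text, had_errors) = SHIFT_JIS.decode_without_bom_handling(b"\x82\xA0");
    assert_eq!(&text[..], "\u{3042}");
    assert!(!had_errors);
    let (bytes, _encoding, had_errors) = SHIFT_JIS.encode("\u{3042}");
    assert_eq!(&bytes[..], &b"\x82\xA0"[..]);
    assert!(!had_errors);
}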

View File

@@ -0,0 +1,8 @@
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the big endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1201.

View File

@@ -0,0 +1,8 @@
/// This decode-only encoding uses 16-bit code units due to Unicode originally
/// having been designed as a 16-bit repertoire. In the absence of a byte order
/// mark, the little endian byte order is assumed.
///
/// There is no corresponding encoder in this crate or in the Encoding
/// Standard. The output encoding of this encoding is UTF-8.
///
/// This encoding matches the Windows code page 1200.

View File

@@ -0,0 +1,5 @@
/// This is the encoding that should be used for all new development; it can
/// represent all of Unicode.
///
/// This encoding matches the Windows code page 65001, except Windows differs
/// in the number of errors generated for some erroneous byte sequences.
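A small sketch of the replacement behavior (a lone 0xFF byte is never valid UTF-8 and decodes to a single U+FFFD; assuming the public `encoding_rs` API):

use encoding_rs::UTF_8;

fn utf_8_sketch() {
    let (text, had_errors) = UTF_8.decode_without_bom_handling(b"a\xFFb");
    assert_eq!(&text[..], "a\u{FFFD}b");
    assert!(had_errors);
}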

View File

@@ -0,0 +1,9 @@
/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
/// maps to U+3000 for compatibility with existing Web content. As a result,
/// this encoding can represent all of Unicode except for the private-use
/// character U+E5E5.
///
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
///
/// This encoding matches the Windows code page 54936.
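A sketch of the two properties stated above (assuming the public `encoding_rs` API; 0xE5E5 is 58853 in decimal):

use encoding_rs::GB18030;

fn gb18030_sketch() {
    // 0xA3 0xA0 decodes to U+3000 for Web compatibility.
    let (text, had_errors) = GB18030.decode_without_bom_handling(b"\xA3\xA0");
    assert_eq!(&text[..], "\u{3000}");
    assert!(!had_errors);
    // U+E5E5 is the single code point the encoder cannot represent, so it is
    // replaced with a numeric character reference.
    let (bytes, _encoding, had_errors) = GB18030.encode("\u{E5E5}");
    assert_eq!(&bytes[..], &b"&#58853;"[..]);
    assert!(had_errors);
}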

View File

@@ -0,0 +1,7 @@
/// This is the MacRoman encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
///
/// This encoding matches the Windows code page 10000, except Windows decodes
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.

View File

@@ -0,0 +1,10 @@
/// This decode-only encoding decodes all non-zero-length streams to a single
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
/// ASCII-compatible fallback encoding (typically windows-1252) for some
/// encodings that are no longer supported by the Web Platform and that
/// would be dangerous to treat as ASCII-compatible.
///
/// There is no corresponding encoder. The output encoding of this encoding
/// is UTF-8.
///
/// This encoding does not have a Windows code page number.
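A sketch of the behavior described above (assuming the public `encoding_rs` API, which exposes this encoding as the `REPLACEMENT` static):

use encoding_rs::REPLACEMENT;

fn replacement_sketch() {
    // Any non-empty input decodes to exactly one U+FFFD and reports an error.
    let (text, had_errors) = REPLACEMENT.decode_without_bom_handling(b"arbitrary bytes");
    assert_eq!(&text[..], "\u{FFFD}");
    assert!(had_errors);
}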

View File

@@ -0,0 +1,6 @@
/// This is the Central European encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
///
/// This encoding matches the Windows code page 1250.

View File

@@ -0,0 +1,6 @@
/// This is the Cyrillic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
///
/// This encoding matches the Windows code page 1251.

View File

@@ -0,0 +1,7 @@
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
/// which is known as Latin 1.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
///
/// This encoding matches the Windows code page 1252.
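A short sketch (assuming the public `encoding_rs` API, including `Encoding::for_label`; per the Encoding Standard the "iso-8859-1" label resolves to windows-1252, and 0x80 decodes to the euro sign):

use encoding_rs::{Encoding, WINDOWS_1252};

fn windows_1252_sketch() {
    assert_eq!(Encoding::for_label(b"iso-8859-1"), Some(WINDOWS_1252));
    let (text, had_errors) = WINDOWS_1252.decode_without_bom_handling(b"\x80");
    assert_eq!(&text[..], "\u{20AC}");
    assert!(!had_errors);
}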

View File

@@ -0,0 +1,8 @@
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.

View File

@@ -0,0 +1,7 @@
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.

View File

@@ -0,0 +1,8 @@
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.

View File

@@ -0,0 +1,6 @@
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.

View File

@@ -0,0 +1,7 @@
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.

View File

@@ -0,0 +1,11 @@
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.

View File

@@ -0,0 +1,7 @@
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.

View File

@@ -0,0 +1,6 @@
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.

View File

@@ -0,0 +1,6 @@
/// This encoding offsets the non-ASCII bytes by `0xF700`, thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
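A sketch of the offset behavior described above (assuming the public `encoding_rs` API):

use encoding_rs::X_USER_DEFINED;

fn x_user_defined_sketch() {
    // ASCII passes through; byte 0x80 is shifted by 0xF700 into the
    // Private Use Area as U+F780.
    let (text, had_errors) = X_USER_DEFINED.decode_without_bom_handling(b"a\x80");
    assert_eq!(&text[..], "a\u{F780}");
    assert!(!had_errors);
}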

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
error_on_line_overflow = false

1548
zeroidc/vendor/encoding_rs/src/ascii.rs vendored Normal file

File diff suppressed because it is too large Load Diff

427
zeroidc/vendor/encoding_rs/src/big5.rs vendored Normal file
View File

@@ -0,0 +1,427 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range32;
pub struct Big5Decoder {
lead: Option<u8>,
}
impl Big5Decoder {
pub fn new() -> VariantDecoder {
VariantDecoder::Big5(Big5Decoder { lead: None })
}
pub fn in_neutral_state(&self) -> bool {
self.lead.is_none()
}
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(match self.lead {
None => 0,
Some(_) => 1,
})
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
// If there is a lead but the next byte isn't a valid trail, an
// error is generated for the lead (+1). Then another iteration checks
// space, which needs +1 to account for the possibility of astral
// output or combining pair.
checked_add(1, self.plus_one_if_lead(byte_length))
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// No need to account for REPLACEMENT CHARACTERS.
// Cases:
// ASCII: 1 to 1
// Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
// lead set and first byte is trail: 1 to 4 worst case
//
// When checking for space for the last byte:
// no lead: the last byte must be ASCII (or fatal error): 1 to 1
// lead set: space for 4 bytes was already checked when reading the
// lead, hence the last lead and the last trail together are worst
// case 2 to 4.
//
// If lead set and the input is a single trail byte, the worst-case
// output is 4, so we need to add one before multiplying if lead is
// set.
//
// Finally, add two so that if input is non-zero, the output is at
// least 4.
checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
// If there is a lead but the next byte isn't a valid trail, an
// error is generated for the lead (+(1*3)). Then another iteration
// checks space, which needs +3 to account for the possibility of astral
// output or combining pair. In between start and end, the worst case
// is that every byte is bad: *3.
checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
}
ascii_compatible_two_byte_decoder_functions!(
{
// If lead is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let non_ascii_minus_offset =
non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0xFE - 0x81) {
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
non_ascii_minus_offset
},
{
// If trail is between 0x40 and 0x7E, inclusive,
// subtract offset 0x40. Else if trail is
// between 0xA1 and 0xFE, inclusive, subtract
// offset 0x62.
// TODO: Find out which range is more probable.
let mut trail_minus_offset =
byte.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start =
byte.wrapping_sub(0xA1);
if trail_minus_range_start >
(0xFE - 0xA1) {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
trail_minus_offset = byte - 0x62;
}
let pointer = lead_minus_offset as usize *
157usize +
trail_minus_offset as usize;
let rebased_pointer = pointer.wrapping_sub(942);
let low_bits = big5_low_bits(rebased_pointer);
if low_bits == 0 {
match pointer {
1133 => {
handle.write_big5_combination(0x00CAu16,
0x0304u16)
}
1135 => {
handle.write_big5_combination(0x00CAu16,
0x030Cu16)
}
1164 => {
handle.write_big5_combination(0x00EAu16,
0x0304u16)
}
1166 => {
handle.write_big5_combination(0x00EAu16,
0x030Cu16)
}
_ => {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
} else if big5_is_astral(rebased_pointer) {
handle.write_astral(u32::from(low_bits) |
0x20000u32)
} else {
handle.write_bmp_excl_ascii(low_bits)
}
},
self,
non_ascii,
byte,
lead_minus_offset,
unread_handle_trail,
source,
handle,
'outermost,
copy_ascii_from_check_space_astral,
check_space_astral,
false);
}
pub struct Big5Encoder;
impl Big5Encoder {
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
// Astral: 2 to 2
// ASCII: 1 to 1
// Other: 1 to 2
u16_length.checked_mul(2)
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
// Astral: 4 to 2
// Upper BMP: 3 to 2
// Lower BMP: 2 to 2
// ASCII: 1 to 1
byte_length.checked_add(1)
}
ascii_compatible_encoder_functions!(
{
// For simplicity, unified ideographs
// in the pointer range 11206...11212 are handled
// as Level 1 Hanzi.
if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
handle.write_two(lead, trail)
} else {
let pointer = if let Some(pointer) = big5_box_encode(bmp) {
pointer
} else if let Some(pointer) = big5_other_encode(bmp) {
pointer
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
};
let lead = pointer / 157 + 0x81;
let remainder = pointer % 157;
let trail = if remainder < 0x3F {
remainder + 0x40
} else {
remainder + 0x62
};
handle.write_two(lead as u8, trail as u8)
}
},
{
if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
// big5_astral_encode returns rebased pointer,
// so adding 0x87 instead of 0x81.
let lead = rebased_pointer / 157 + 0x87;
let remainder = rebased_pointer % 157;
let trail = if remainder < 0x3F {
remainder + 0x40
} else {
remainder + 0x62
};
handle.write_two(lead as u8, trail as u8)
} else {
return (
EncoderResult::Unmappable(astral),
source.consumed(),
handle.written(),
);
}
} else {
return (
EncoderResult::Unmappable(astral),
source.consumed(),
handle.written(),
);
}
},
bmp,
astral,
self,
source,
handle,
copy_ascii_to_check_space_two,
check_space_two,
false
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_big5(bytes: &[u8], expect: &str) {
decode(BIG5, bytes, expect);
}
fn encode_big5(string: &str, expect: &[u8]) {
encode(BIG5, string, expect);
}
#[test]
fn test_big5_decode() {
// Empty
decode_big5(b"", &"");
// ASCII
decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}");
// Edge cases
decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}");
decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}");
decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}");
decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}");
decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}");
decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}");
decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}");
decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}");
decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}");
decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}");
decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}");
decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}");
// Edge cases surrounded with ASCII
decode_big5(
&[0x61u8, 0x87u8, 0x40u8, 0x62u8],
&"\u{0061}\u{43F0}\u{0062}",
);
decode_big5(
&[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
&"\u{0061}\u{79D4}\u{0062}",
);
decode_big5(
&[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
&"\u{0061}\u{2910D}\u{0062}",
);
decode_big5(
&[0x61u8, 0x88u8, 0x62u8, 0x62u8],
&"\u{0061}\u{00CA}\u{0304}\u{0062}",
);
decode_big5(
&[0x61u8, 0x88u8, 0x64u8, 0x62u8],
&"\u{0061}\u{00CA}\u{030C}\u{0062}",
);
decode_big5(
&[0x61u8, 0x88u8, 0x66u8, 0x62u8],
&"\u{0061}\u{00CA}\u{0062}",
);
decode_big5(
&[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
&"\u{0061}\u{00EA}\u{0304}\u{0062}",
);
decode_big5(
&[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
&"\u{0061}\u{00EA}\u{030C}\u{0062}",
);
decode_big5(
&[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
&"\u{0061}\u{00EA}\u{0062}",
);
decode_big5(
&[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
&"\u{0061}\u{8991}\u{0062}",
);
decode_big5(
&[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
&"\u{0061}\u{27967}\u{0062}",
);
decode_big5(
&[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
&"\u{0061}\u{8A29}\u{0062}",
);
// Bad sequences
decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}");
decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}");
decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}");
decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}");
decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}");
decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}");
}
#[test]
fn test_big5_encode() {
// Empty
encode_big5("", b"");
// ASCII
encode_big5("\u{0061}\u{0062}", b"\x61\x62");
if !cfg!(miri) {
// Miri is too slow
// Edge cases
encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
encode_big5("\u{3000}", b"\xA1\x40");
encode_big5("\u{20AC}", b"\xA3\xE1");
encode_big5("\u{4E00}", b"\xA4\x40");
encode_big5("\u{27607}", b"\xC8\xA4");
encode_big5("\u{FFE2}", b"\xC8\xCD");
encode_big5("\u{79D4}", b"\xFE\xFE");
// Not in index
encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
}
// duplicate low bits
encode_big5("\u{203B5}", b"\xFD\x6A");
encode_big5("\u{25605}", b"\xFE\x46");
// prefer last
encode_big5("\u{2550}", b"\xF9\xF9");
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_big5_decode_all() {
let input = include_bytes!("test_data/big5_in.txt");
let expectation = include_str!("test_data/big5_in_ref.txt");
let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
assert!(had_errors, "Should have had errors.");
assert_eq!(&cow[..], expectation);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_big5_encode_all() {
let input = include_str!("test_data/big5_out.txt");
let expectation = include_bytes!("test_data/big5_out_ref.txt");
let (cow, encoding, had_errors) = BIG5.encode(input);
assert!(!had_errors, "Should not have had errors.");
assert_eq!(encoding, BIG5);
assert_eq!(&cow[..], &expectation[..]);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_big5_encode_from_two_low_surrogates() {
let expectation = b"&#65533;&#65533;";
let mut output = [0u8; 40];
let mut encoder = BIG5.new_encoder();
let (result, read, written, had_errors) =
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 2);
assert_eq!(written, expectation.len());
assert!(had_errors);
assert_eq!(&output[..written], expectation);
}
}

114378
zeroidc/vendor/encoding_rs/src/data.rs vendored Normal file

File diff suppressed because it is too large Load Diff

469
zeroidc/vendor/encoding_rs/src/euc_jp.rs vendored Normal file
View File

@@ -0,0 +1,469 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
enum EucJpPending {
None,
Jis0208Lead(u8),
Jis0212Shift,
Jis0212Lead(u8),
HalfWidthKatakana,
}
impl EucJpPending {
fn is_none(&self) -> bool {
match *self {
EucJpPending::None => true,
_ => false,
}
}
fn count(&self) -> usize {
match *self {
EucJpPending::None => 0,
EucJpPending::Jis0208Lead(_)
| EucJpPending::Jis0212Shift
| EucJpPending::HalfWidthKatakana => 1,
EucJpPending::Jis0212Lead(_) => 2,
}
}
}
pub struct EucJpDecoder {
pending: EucJpPending,
}
impl EucJpDecoder {
pub fn new() -> VariantDecoder {
VariantDecoder::EucJp(EucJpDecoder {
pending: EucJpPending::None,
})
}
pub fn in_neutral_state(&self) -> bool {
self.pending.is_none()
}
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
self.plus_one_if_lead(byte_length)
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// worst case: 2 to 3
let len = self.plus_one_if_lead(byte_length);
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_mul(3, self.plus_one_if_lead(byte_length))
}
euc_jp_decoder_functions!(
{
let trail_minus_offset = byte.wrapping_sub(0xA1);
// Fast-track Hiragana (60% according to Lunde)
// and Katakana (10% according to Lunde).
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
// Hiragana
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
// Katakana
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
} else if trail_minus_offset > (0xFE - 0xA1) {
if byte < 0x80 {
return (
DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written(),
);
} else {
let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
let level1_pointer = pointer.wrapping_sub(1410);
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
} else {
let level2_pointer = pointer.wrapping_sub(4418);
if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
} else {
let ibm_pointer = pointer.wrapping_sub(8272);
if ibm_pointer < IBM_KANJI.len() {
handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else if let Some(bmp) = jis0208_range_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else {
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
}
}
}
},
{
// If lead is between 0xA1 and 0xFE, inclusive,
// subtract 0xA1.
let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
if jis0212_lead_minus_offset > (0xFE - 0xA1) {
if lead < 0x80 {
return (
DecoderResult::Malformed(1, 0),
unread_handle_jis0212.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(2, 0),
unread_handle_jis0212.consumed(),
handle.written(),
);
}
jis0212_lead_minus_offset
},
{
// If trail is between 0xA1 and 0xFE, inclusive,
// subtract 0xA1.
let trail_minus_offset = byte.wrapping_sub(0xA1);
if trail_minus_offset > (0xFE - 0xA1) {
if byte < 0x80 {
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(3, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
let pointer_minus_kanji = pointer.wrapping_sub(1410);
if pointer_minus_kanji < JIS0212_KANJI.len() {
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
} else if let Some(bmp) = jis0212_accented_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else {
let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
if pointer_minus_upper_cyrillic <= (607 - 597) {
handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
} else {
let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
if pointer_minus_lower_cyrillic <= (655 - 645) {
handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
} else {
return (
DecoderResult::Malformed(3, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
}
}
},
{
// If trail is between 0xA1 and 0xDF, inclusive,
// subtract 0xA1 and map to half-width Katakana.
let trail_minus_offset = byte.wrapping_sub(0xA1);
if trail_minus_offset > (0xDF - 0xA1) {
if byte < 0x80 {
return (
DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
},
self,
non_ascii,
jis0208_lead_minus_offset,
byte,
unread_handle_trail,
jis0212_lead_minus_offset,
lead,
unread_handle_jis0212,
source,
handle
);
}
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
jis0208_kanji_euc_jp_encode(bmp)
}
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
if 0x4EDD == bmp {
// Ideograph on the symbol row!
Some((0xA1, 0xB8))
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
Some((lead, trail))
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
let lead = (pos / 94) + 0xD0;
let trail = (pos % 94) + 0xA1;
Some((lead as u8, trail as u8))
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
let lead = (pos / 94) + 0xF9;
let trail = (pos % 94) + 0xA1;
Some((lead as u8, trail as u8))
} else {
None
}
}
pub struct EucJpEncoder;
impl EucJpEncoder {
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
u16_length.checked_mul(2)
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
byte_length.checked_add(1)
}
ascii_compatible_bmp_encoder_functions!(
{
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
if bmp_minus_hiragana < 0x53 {
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
if let Some((lead, trail)) = encode_kanji(bmp) {
handle.write_two(lead, trail)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
} else {
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
if bmp_minus_katakana < 0x56 {
handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
} else {
let bmp_minus_space = bmp.wrapping_sub(0x3000);
if bmp_minus_space < 3 {
// fast-track common punctuation
handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
} else if bmp == 0xA5 {
handle.write_one(0x5Cu8)
} else if bmp == 0x203E {
handle.write_one(0x7Eu8)
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
} else if bmp == 0x2212 {
handle.write_two(0xA1u8, 0xDDu8)
} else if let Some(pointer) = jis0208_range_encode(bmp) {
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|| bmp == 0xF929
|| bmp == 0xF9DC
{
// Guaranteed to be found in IBM_KANJI
let pos = position(&IBM_KANJI[..], bmp).unwrap();
let lead = (pos / 94) + 0xF9;
let trail = (pos % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if let Some(pointer) = ibm_symbol_encode(bmp) {
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
}
}
},
bmp,
self,
source,
handle,
copy_ascii_to_check_space_two,
check_space_two,
false
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_euc_jp(bytes: &[u8], expect: &str) {
decode(EUC_JP, bytes, expect);
}
fn encode_euc_jp(string: &str, expect: &[u8]) {
encode(EUC_JP, string, expect);
}
#[test]
fn test_euc_jp_decode() {
// Empty
decode_euc_jp(b"", &"");
// ASCII
decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");
// Half-width
decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
decode_euc_jp(b"\x8E", "\u{FFFD}");
// JIS 0212
decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
decode_euc_jp(b"\x8F", "\u{FFFD}");
// JIS 0208
decode_euc_jp(b"\xA1\xA1", "\u{3000}");
decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
decode_euc_jp(b"\xA1", "\u{FFFD}");
// Bad leads
decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x80\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x81\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x82\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x83\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x84\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x85\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x86\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x87\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x88\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x89\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x8A\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x8B\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x8C\xA1\xA1", "\u{FFFD}\u{3000}");
decode_euc_jp(b"\x8D\xA1\xA1", "\u{FFFD}\u{3000}");
// Bad ASCII trail
decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
}
#[test]
fn test_euc_jp_encode() {
// Empty
encode_euc_jp("", b"");
// ASCII
encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");
// Exceptional code points
encode_euc_jp("\u{00A5}", b"\x5C");
encode_euc_jp("\u{203E}", b"\x7E");
encode_euc_jp("\u{2212}", b"\xA1\xDD");
// Half-width
encode_euc_jp("\u{FF61}", b"\x8E\xA1");
encode_euc_jp("\u{FF9F}", b"\x8E\xDF");
// JIS 0212
encode_euc_jp("\u{02D8}", b"&#728;");
// JIS 0208
encode_euc_jp("\u{3000}", b"\xA1\xA1");
encode_euc_jp("\u{FF02}", b"\xFC\xFE");
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_jis0208_decode_all() {
let input = include_bytes!("test_data/jis0208_in.txt");
let expectation = include_str!("test_data/jis0208_in_ref.txt");
let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
assert!(had_errors, "Should have had errors.");
assert_eq!(&cow[..], expectation);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_jis0208_encode_all() {
let input = include_str!("test_data/jis0208_out.txt");
let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
let (cow, encoding, had_errors) = EUC_JP.encode(input);
assert!(!had_errors, "Should not have had errors.");
assert_eq!(encoding, EUC_JP);
assert_eq!(&cow[..], &expectation[..]);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_jis0212_decode_all() {
let input = include_bytes!("test_data/jis0212_in.txt");
let expectation = include_str!("test_data/jis0212_in_ref.txt");
let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
assert!(had_errors, "Should have had errors.");
assert_eq!(&cow[..], expectation);
}
}

442
zeroidc/vendor/encoding_rs/src/euc_kr.rs vendored Normal file
View File

@@ -0,0 +1,442 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
use super::in_range16;
pub struct EucKrDecoder {
lead: Option<u8>,
}
impl EucKrDecoder {
pub fn new() -> VariantDecoder {
VariantDecoder::EucKr(EucKrDecoder { lead: None })
}
pub fn in_neutral_state(&self) -> bool {
self.lead.is_none()
}
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(match self.lead {
None => 0,
Some(_) => 1,
})
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
self.plus_one_if_lead(byte_length)
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// worst case: 2 to 3
let len = self.plus_one_if_lead(byte_length);
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_mul(3, self.plus_one_if_lead(byte_length))
}
ascii_compatible_two_byte_decoder_functions!(
{
// If lead is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let non_ascii_minus_offset =
non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0xFE - 0x81) {
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
non_ascii_minus_offset
},
{
if lead_minus_offset >= 0x20 {
// Not the extension range above KS X 1001
let trail_minus_offset =
byte.wrapping_sub(0xA1);
if trail_minus_offset <= (0xFE - 0xA1) {
// KS X 1001
let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
if hangul_pointer < KSX1001_HANGUL.len() {
let upper_bmp = KSX1001_HANGUL[hangul_pointer];
handle.write_upper_bmp(upper_bmp)
} else if ksx_pointer < KSX1001_SYMBOLS.len() {
let bmp = KSX1001_SYMBOLS[ksx_pointer];
handle.write_bmp_excl_ascii(bmp)
} else {
let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
if hanja_pointer < KSX1001_HANJA.len() {
let upper_bmp = KSX1001_HANJA[hanja_pointer];
handle.write_upper_bmp(upper_bmp)
} else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
if mid_bmp == 0 {
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
handle.write_mid_bmp(mid_bmp)
} else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
handle.write_mid_bmp(mid_bmp)
} else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
handle.write_upper_bmp(upper_bmp)
} else {
let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
if other_pointer < 0x039F {
let bmp = ksx1001_other_decode(other_pointer as u16);
// ASCII range means unassigned
if bmp < 0x80 {
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
handle.write_bmp_excl_ascii(bmp)
} else {
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
}
} else {
// Extension range to the left of
// KS X 1001
let left_lead = lead_minus_offset - 0x20;
let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
byte - (12 + 0x41)
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
byte - (6 + 0x41)
} else if byte.wrapping_sub(0x41) < 0x1A {
byte - 0x41
} else {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
};
let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
handle.write_upper_bmp(upper_bmp)
} else {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
} else {
// Extension range above KS X 1001
let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
byte - (12 + 0x41)
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
byte - (6 + 0x41)
} else if byte.wrapping_sub(0x41) < 0x1A {
byte - 0x41
} else {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
};
let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
handle.write_upper_bmp(upper_bmp)
}
},
self,
non_ascii,
byte,
lead_minus_offset,
unread_handle_trail,
source,
handle,
'outermost,
copy_ascii_from_check_space_bmp,
check_space_bmp,
true);
}
fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
if in_inclusive_range16(bmp, 0x3000, 0x3015) {
if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
return Some((0xA1, pos + 0xA1));
}
}
if let Some(other_pointer) = ksx1001_other_encode(bmp) {
let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
let other_trail = ((other_pointer as usize) % 94) + 0xA1;
return Some((other_lead, other_trail));
}
if in_range16(bmp, 0x00AA, 0x0168) {
// Latin
if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
return Some((0x81 + 0x28, 0xA1 + pos));
}
if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
return Some((0x81 + 0x27, 0xA1 + pos));
}
} else if in_range16(bmp, 0x2500, 0x254C) {
if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
return Some((0x81 + 0x25, 0xA1 + pos));
}
}
if in_inclusive_range16(bmp, 0x2015, 0x266D)
|| in_inclusive_range16(bmp, 0x321C, 0x33D8)
|| in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
|| in_inclusive_range16(bmp, 0x00A1, 0x00F7)
|| in_inclusive_range16(bmp, 0x02C7, 0x02DD)
{
if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
if pos < (94 - 3) {
return Some((0xA1, pos + 0xA1 + 3));
}
return Some((0xA2, pos - (94 - 3) + 0xA1));
}
}
None
}
#[cfg(not(feature = "fast-hangul-encode"))]
#[inline(always)]
fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
match KSX1001_HANGUL.binary_search(&bmp) {
Ok(ksx_hangul_pointer) => {
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
(ksx_hangul_lead as u8, ksx_hangul_trail as u8)
}
Err(_) => {
let (lead, cp949_trail) = if bmp < 0xC8A5 {
// Above KS X 1001
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
let top_lead = (top_pointer / (190 - 12)) + 0x81;
let top_trail = top_pointer % (190 - 12);
(top_lead as u8, top_trail as u8)
} else {
// To the left of KS X 1001
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
let left_trail = left_pointer % (190 - 94 - 12);
(left_lead as u8, left_trail as u8)
};
let offset = if cp949_trail >= (0x40 - 12) {
0x41 + 12
} else if cp949_trail >= (0x20 - 6) {
0x41 + 6
} else {
0x41
};
(lead as u8, (cp949_trail + offset) as u8)
}
}
}
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
cp949_hangul_encode(bmp_minus_hangul_start)
}
#[cfg(not(feature = "fast-hanja-encode"))]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
let hanja_trail = (hanja_pointer % 94) + 0xA1;
Some((hanja_lead as u8, hanja_trail as u8))
} else {
None
}
}
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
if bmp < 0xF900 {
ksx1001_unified_hangul_encode(bmp)
} else {
Some(ksx1001_compatibility_hangul_encode(bmp))
}
}
pub struct EucKrEncoder;
impl EucKrEncoder {
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
u16_length.checked_mul(2)
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
byte_length.checked_add(1)
}
ascii_compatible_bmp_encoder_functions!(
{
let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
// Hangul
ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
} else if in_range16(bmp, 0x33DE, 0xFF01) {
// Vast range that includes no other
// mappables except Hangul (already
// processed) and Hanja.
// Narrow the range further to Unified and
// Compatibility ranges of Hanja.
if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
(hanja_lead, hanja_trail)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
} else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
(lead as u8, trail as u8)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
};
handle.write_two(lead, trail)
},
bmp,
self,
source,
handle,
copy_ascii_to_check_space_two,
check_space_two,
true
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_euc_kr(bytes: &[u8], expect: &str) {
decode(EUC_KR, bytes, expect);
}
fn encode_euc_kr(string: &str, expect: &[u8]) {
encode(EUC_KR, string, expect);
}
#[test]
fn test_euc_kr_decode() {
// Empty
decode_euc_kr(b"", &"");
// ASCII
decode_euc_kr(b"\x61\x62", "\u{0061}\u{0062}");
decode_euc_kr(b"\x81\x41", "\u{AC02}");
decode_euc_kr(b"\x81\x5B", "\u{FFFD}\x5B");
decode_euc_kr(b"\xFD\xFE", "\u{8A70}");
decode_euc_kr(b"\xFE\x41", "\u{FFFD}\x41");
decode_euc_kr(b"\xFF\x41", "\u{FFFD}\x41");
decode_euc_kr(b"\x80\x41", "\u{FFFD}\x41");
decode_euc_kr(b"\xA1\xFF", "\u{FFFD}");
decode_euc_kr(b"\x81\xFF", "\u{FFFD}");
}
#[test]
fn test_euc_kr_encode() {
// Empty
encode_euc_kr("", b"");
// ASCII
encode_euc_kr("\u{0061}\u{0062}", b"\x61\x62");
encode_euc_kr("\u{AC02}", b"\x81\x41");
encode_euc_kr("\u{8A70}", b"\xFD\xFE");
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_euc_kr_decode_all() {
let input = include_bytes!("test_data/euc_kr_in.txt");
let expectation = include_str!("test_data/euc_kr_in_ref.txt");
let (cow, had_errors) = EUC_KR.decode_without_bom_handling(input);
assert!(had_errors, "Should have had errors.");
assert_eq!(&cow[..], expectation);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_euc_kr_encode_all() {
let input = include_str!("test_data/euc_kr_out.txt");
let expectation = include_bytes!("test_data/euc_kr_out_ref.txt");
let (cow, encoding, had_errors) = EUC_KR.encode(input);
assert!(!had_errors, "Should not have had errors.");
assert_eq!(encoding, EUC_KR);
assert_eq!(&cow[..], &expectation[..]);
}
#[test]
fn test_euc_kr_encode_from_two_low_surrogates() {
let expectation = b"&#65533;&#65533;";
let mut output = [0u8; 40];
let mut encoder = EUC_KR.new_encoder();
let (result, read, written, had_errors) =
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 2);
assert_eq!(written, expectation.len());
assert!(had_errors);
assert_eq!(&output[..written], expectation);
}
}

View File

@@ -0,0 +1,767 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
use super::in_range16;
enum Gb18030Pending {
None,
One(u8),
Two(u8, u8),
Three(u8, u8, u8),
}
impl Gb18030Pending {
fn is_none(&self) -> bool {
match *self {
Gb18030Pending::None => true,
_ => false,
}
}
fn count(&self) -> usize {
match *self {
Gb18030Pending::None => 0,
Gb18030Pending::One(_) => 1,
Gb18030Pending::Two(_, _) => 2,
Gb18030Pending::Three(_, _, _) => 3,
}
}
}
pub struct Gb18030Decoder {
first: Option<u8>,
second: Option<u8>,
third: Option<u8>,
pending: Gb18030Pending,
pending_ascii: Option<u8>,
}
impl Gb18030Decoder {
pub fn new() -> VariantDecoder {
VariantDecoder::Gb18030(Gb18030Decoder {
first: None,
second: None,
third: None,
pending: Gb18030Pending::None,
pending_ascii: None,
})
}
pub fn in_neutral_state(&self) -> bool {
self.first.is_none()
&& self.second.is_none()
&& self.third.is_none()
&& self.pending.is_none()
&& self.pending_ascii.is_none()
}
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(
self.pending.count()
+ match self.first {
None => 0,
Some(_) => 1,
}
+ match self.second {
None => 0,
Some(_) => 1,
}
+ match self.third {
None => 0,
Some(_) => 1,
}
+ match self.pending_ascii {
None => 0,
Some(_) => 1,
},
)
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
// ASCII: 1 to 1 (worst case)
// gbk: 2 to 1
// ranges: 4 to 1 or 4 to 2
checked_add(1, self.extra_from_state(byte_length))
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// ASCII: 1 to 1
// gbk: 2 to 2 or 2 to 3
// ranges: 4 to 2, 4 to 3 or 4 to 4
// 0x80: 1 to 3 (worst case)
self.max_utf8_buffer_length(byte_length)
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
}
gb18030_decoder_functions!(
{
// If first is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0xFE - 0x81) {
if non_ascii == 0x80 {
handle.write_upper_bmp(0x20ACu16);
continue 'outermost;
}
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
non_ascii_minus_offset
},
{
// Two-byte (or error)
if first_minus_offset >= 0x20 {
// Not the gbk ideograph range above GB2312
let trail_minus_offset = second.wrapping_sub(0xA1);
if trail_minus_offset <= (0xFE - 0xA1) {
// GB2312
let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
if hanzi_lead < (0x77 - 0x2F) {
// Level 1 Hanzi, Level 2 Hanzi
// or one of the 5 PUA code
// points in between.
let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
let upper_bmp = GB2312_HANZI[hanzi_pointer];
handle.write_upper_bmp(upper_bmp)
} else if first_minus_offset == 0x20 {
// Symbols (starting with ideographic space)
let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
handle.write_bmp_excl_ascii(bmp)
} else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
} else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
} else if first_minus_offset > 0x76 {
// Bottom PUA
let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
handle.write_upper_bmp(pua)
} else {
let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
handle.write_bmp_excl_ascii(bmp)
}
} else {
// gbk range on the left
let mut trail_minus_offset = second.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start = second.wrapping_sub(0x80);
if trail_minus_range_start > (0xA0 - 0x80) {
if second < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_second.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_second.consumed(),
handle.written());
}
trail_minus_offset = second - 0x41;
}
// Zero-base lead
let left_lead = first_minus_offset - 0x20;
let left_pointer = left_lead as usize * (190 - 94) +
trail_minus_offset as usize;
let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
handle.write_upper_bmp(upper_bmp)
} else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
let bmp = gbk_other_decode(left_pointer as u16);
handle.write_bmp_excl_ascii(bmp)
} else {
let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
let upper_bmp = GBK_BOTTOM[bottom_pointer];
handle.write_upper_bmp(upper_bmp)
}
}
} else {
// gbk ideograph range above GB2312
let mut trail_minus_offset = second.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start = second.wrapping_sub(0x80);
if trail_minus_range_start > (0xFE - 0x80) {
if second < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_second.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_second.consumed(),
handle.written());
}
trail_minus_offset = second - 0x41;
}
let pointer = first_minus_offset as usize * 190usize +
trail_minus_offset as usize;
let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
handle.write_upper_bmp(upper_bmp)
}
},
{
// If third is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let third_minus_offset = third.wrapping_sub(0x81);
if third_minus_offset > (0xFE - 0x81) {
// We have an error. Let's inline what's going
// to happen when `second` is
// reprocessed. (`third` gets unread.)
// `second` is guaranteed ASCII, so let's
// put it in `pending_ascii`. Recompute
// `second` from `second_minus_offset`.
self.pending_ascii = Some(second_minus_offset + 0x30);
// Now unread `third` and designate the previous
// `first` as being in error.
return (DecoderResult::Malformed(1, 1),
unread_handle_third.unread(),
handle.written());
}
third_minus_offset
},
{
// If fourth is between 0x30 and 0x39, inclusive,
// subtract offset 0x30.
//
// If we have an error, we'll inline what's going
// to happen when `second` and `third` are
// reprocessed. (`fourth` gets unread.)
// `second` is guaranteed ASCII, so let's
// put it in `pending_ascii`. Recompute
// `second` from `second_minus_offset` to
// make this block reusable when `second`
// is not in scope.
//
// `third` is guaranteed to be in the range
// that makes it become the new `self.first`.
//
// `fourth` gets unread and the previous
// `first` gets designated as being in error.
let fourth_minus_offset = fourth.wrapping_sub(0x30);
if fourth_minus_offset > (0x39 - 0x30) {
self.pending_ascii = Some(second_minus_offset + 0x30);
self.pending = Gb18030Pending::One(third_minus_offset);
return (DecoderResult::Malformed(1, 2),
unread_handle_fourth.unread(),
handle.written());
}
let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
(second_minus_offset as usize * (10 * 126)) +
(third_minus_offset as usize * 10) +
fourth_minus_offset as usize;
if pointer <= 39419 {
// BMP
if pointer == 7457 {
handle.write_upper_bmp(0xE7C7)
} else {
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
}
} else if pointer >= 189_000 && pointer <= 1_237_575 {
// Astral
handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
} else {
return (DecoderResult::Malformed(4, 0),
unread_handle_fourth.consumed(),
handle.written());
}
},
self,
non_ascii,
first_minus_offset,
second,
second_minus_offset,
unread_handle_second,
third,
third_minus_offset,
unread_handle_third,
fourth,
fourth_minus_offset,
unread_handle_fourth,
source,
handle,
'outermost);
}
// XXX Experiment with inline directives
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
// Try ideographic punctuation first as it's the most likely case.
// Throwing in the check for full-width currencies and tilde is probably
// more size-efficient here than elsewhere.
if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
return Some((0xA1, pos + 0xA1));
}
}
// Ext A
if in_range16(bmp, 0x3400, 0x4E00) {
return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
(
0xFE,
pos + if pos < (0x3F - 16) {
0x40 + 16
} else {
0x41 + 16
},
)
});
}
// Compatibility ideographs
if in_range16(bmp, 0xF900, 0xFB00) {
return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
if pos < 5 {
// end of second to last row
(0xFD, pos + (190 - 94 - 5 + 0x41))
} else {
// last row
(0xFE, pos + (0x40 - 5))
}
});
}
// Handle everything below U+02CA, which is in GBK_OTHER.
if bmp < 0x02CA {
if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
// Pinyin except U+1E3F
if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
return Some((0xA8, pos + 0xA1));
}
} else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
|| in_inclusive_range16(bmp, 0x02C7, 0x02C9)
{
// Diacritics and Latin 1 symbols
if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
return Some((0xA1, pos + 0xA1 + 3));
}
}
return None;
}
if bmp >= 0xE794 {
// Various brackets, all in PUA or full-width regions
if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
}
} else if bmp == 0x1E3F {
// The one Pinyin placed elsewhere on the BMP
return Some((0xA8, 0x7B - 0x60 + 0xA1));
} else if in_range16(bmp, 0xA000, 0xD800) {
// Since Korean has usage in China, let's spend a branch to fast-track
// Hangul.
return None;
}
// GB2312 other (except bottom PUA and PUA between Hanzi levels).
if let Some(other_pointer) = gb2312_other_encode(bmp) {
let other_lead = other_pointer as usize / 94;
let other_trail = other_pointer as usize % 94;
return Some((0xA2 + other_lead, 0xA1 + other_trail));
}
// At this point, we've handled all mappable characters above U+02D9 but
// below U+2010. Let's check for that range in order to let lower BMP
// characters used for minority languages in China avoid the subsequent
// search that deals mainly with various symbols.
if in_range16(bmp, 0x02DA, 0x2010) {
return None;
}
// GBK other (except radicals and PUA in GBK_BOTTOM).
if let Some(other_pointer) = gbk_other_encode(bmp) {
let other_lead = other_pointer as usize / (190 - 94);
let other_trail = other_pointer as usize % (190 - 94);
let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
return Some((other_lead + (0x81 + 0x20), other_trail + offset));
}
// CJK Radicals Supplement or PUA in GBK_BOTTOM
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
let trail = pos + 16;
let offset = if trail < 0x3F { 0x40 } else { 0x41 };
return Some((0xFE, trail + offset));
}
}
// GB2312 bottom PUA
let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
}
// PUA between Hanzi Levels
let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
if bmp_minus_pua_between_hanzi < 5 {
return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
}
None
}
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
#[inline(always)]
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
(lead, trail)
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
(hanzi_lead as u8, hanzi_trail as u8)
} else {
let (lead, gbk_trail) = if bmp < 0x72DC {
// Above GB2312
let pointer = gbk_top_ideograph_encode(bmp) as usize;
let lead = (pointer / 190) + 0x81;
let gbk_trail = pointer % 190;
(lead, gbk_trail)
} else {
// To the left of GB2312
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
(lead, gbk_trail)
};
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
(lead as u8, (gbk_trail + offset) as u8)
}
}
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
gbk_hanzi_encode(bmp_minus_unified_start)
}
pub struct Gb18030Encoder {
extended: bool,
}
impl Gb18030Encoder {
pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
Encoder::new(
encoding,
VariantEncoder::Gb18030(Gb18030Encoder {
extended: extended_range,
}),
)
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
if self.extended {
u16_length.checked_mul(4)
} else {
// Need to add, because space check is done with the four-byte
// assumption.
checked_add(2, u16_length.checked_mul(2))
}
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
if self.extended {
// 1 to 1
// 2 to 2
// 3 to 2
// 2 to 4 (worst)
// 3 to 4
// 4 to 4
checked_add(2, byte_length.checked_mul(2))
} else {
// 1 to 1
// 2 to 2
// 3 to 2
// Need to add, because space check is done with the four-byte
// assumption.
byte_length.checked_add(3)
}
}
ascii_compatible_encoder_functions!(
{
let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
// CJK Unified Ideographs
// Can't fail now, since all are
// mapped.
let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
handle.write_two(lead, trail)
} else if bmp == 0xE5E5 {
// It's not optimal to check for the unmappable
// and for euro at this stage, but getting
// them out of the way makes the rest of the
// code less messy.
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
} else if bmp == 0x20AC && !self.extended {
handle.write_one(0x80u8)
} else {
match gbk_encode_non_unified(bmp) {
Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
None => {
if !self.extended {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
let range_pointer = gb18030_range_encode(bmp);
let first = range_pointer / (10 * 126 * 10);
let rem_first = range_pointer % (10 * 126 * 10);
let second = rem_first / (10 * 126);
let rem_second = rem_first % (10 * 126);
let third = rem_second / 10;
let fourth = rem_second % 10;
handle.write_four(
(first + 0x81) as u8,
(second + 0x30) as u8,
(third + 0x81) as u8,
(fourth + 0x30) as u8,
)
}
}
}
},
{
if !self.extended {
return (
EncoderResult::Unmappable(astral),
source.consumed(),
handle.written(),
);
}
let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
let first = range_pointer / (10 * 126 * 10);
let rem_first = range_pointer % (10 * 126 * 10);
let second = rem_first / (10 * 126);
let rem_second = rem_first % (10 * 126);
let third = rem_second / 10;
let fourth = rem_second % 10;
handle.write_four(
(first + 0x81) as u8,
(second + 0x30) as u8,
(third + 0x81) as u8,
(fourth + 0x30) as u8,
)
},
bmp,
astral,
self,
source,
handle,
copy_ascii_to_check_space_four,
check_space_four,
false
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_gb18030(bytes: &[u8], expect: &str) {
decode(GB18030, bytes, expect);
}
fn encode_gb18030(string: &str, expect: &[u8]) {
encode(GB18030, string, expect);
}
fn encode_gbk(string: &str, expect: &[u8]) {
encode(GBK, string, expect);
}
#[test]
fn test_gb18030_decode() {
// Empty
decode_gb18030(b"", &"");
// ASCII
decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");
// euro
decode_gb18030(b"\x80", "\u{20AC}");
decode_gb18030(b"\xA2\xE3", "\u{20AC}");
// two bytes
decode_gb18030(b"\x81\x40", "\u{4E02}");
decode_gb18030(b"\x81\x7E", "\u{4E8A}");
decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
decode_gb18030(b"\x81\x80", "\u{4E90}");
decode_gb18030(b"\x81\xFE", "\u{4FA2}");
decode_gb18030(b"\xFE\x40", "\u{FA0C}");
decode_gb18030(b"\xFE\x7E", "\u{E843}");
decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
decode_gb18030(b"\xFE\x80", "\u{4723}");
decode_gb18030(b"\xFE\xFE", "\u{E4C5}");
// The difference from the original GB18030
decode_gb18030(b"\xA3\xA0", "\u{3000}");
decode_gb18030(b"\xA1\xA1", "\u{3000}");
// 0xFF
decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
decode_gb18030(
b"\xFF\x32\x9A\x33\x00",
"\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
);
// Four bytes
decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
}
#[test]
fn test_gb18030_encode() {
// Empty
encode_gb18030("", b"");
// ASCII
encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");
// euro
encode_gb18030("\u{20AC}", b"\xA2\xE3");
// two bytes
encode_gb18030("\u{4E02}", b"\x81\x40");
encode_gb18030("\u{4E8A}", b"\x81\x7E");
if !cfg!(miri) {
// Miri is too slow
encode_gb18030("\u{4E90}", b"\x81\x80");
encode_gb18030("\u{4FA2}", b"\x81\xFE");
encode_gb18030("\u{FA0C}", b"\xFE\x40");
encode_gb18030("\u{E843}", b"\xFE\x7E");
encode_gb18030("\u{4723}", b"\xFE\x80");
encode_gb18030("\u{E4C5}", b"\xFE\xFE");
}
// The difference from the original GB18030
encode_gb18030("\u{E5E5}", b"&#58853;");
encode_gb18030("\u{3000}", b"\xA1\xA1");
// Four bytes
encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
if !cfg!(miri) {
// Miri is too slow
encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
}
// Edge cases
encode_gb18030("\u{00F7}", b"\xA1\xC2");
}
#[test]
fn test_gbk_encode() {
// Empty
encode_gbk("", b"");
// ASCII
encode_gbk("\u{0061}\u{0062}", b"\x61\x62");
// euro
encode_gbk("\u{20AC}", b"\x80");
// two bytes
encode_gbk("\u{4E02}", b"\x81\x40");
encode_gbk("\u{4E8A}", b"\x81\x7E");
if !cfg!(miri) {
// Miri is too slow
encode_gbk("\u{4E90}", b"\x81\x80");
encode_gbk("\u{4FA2}", b"\x81\xFE");
encode_gbk("\u{FA0C}", b"\xFE\x40");
encode_gbk("\u{E843}", b"\xFE\x7E");
encode_gbk("\u{4723}", b"\xFE\x80");
encode_gbk("\u{E4C5}", b"\xFE\xFE");
}
// The difference from the original gb18030
encode_gbk("\u{E5E5}", b"&#58853;");
encode_gbk("\u{3000}", b"\xA1\xA1");
// Four bytes
encode_gbk("\u{0080}", b"&#128;");
encode_gbk("\u{E7C7}", b"&#59335;");
if !cfg!(miri) {
// Miri is too slow
encode_gbk("\u{2603}", b"&#9731;");
encode_gbk("\u{1F4A9}", b"&#128169;");
encode_gbk("\u{10FFFF}", b"&#1114111;");
}
// Edge cases
encode_gbk("\u{00F7}", b"\xA1\xC2");
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_gb18030_decode_all() {
let input = include_bytes!("test_data/gb18030_in.txt");
let expectation = include_str!("test_data/gb18030_in_ref.txt");
let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
assert!(!had_errors, "Should not have had errors.");
assert_eq!(&cow[..], expectation);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_gb18030_encode_all() {
let input = include_str!("test_data/gb18030_out.txt");
let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
let (cow, encoding, had_errors) = GB18030.encode(input);
assert!(!had_errors, "Should not have had errors.");
assert_eq!(encoding, GB18030);
assert_eq!(&cow[..], &expectation[..]);
}
#[test]
fn test_gb18030_encode_from_utf16_max_length() {
let mut output = [0u8; 20];
let mut encoder = GB18030.new_encoder();
{
let needed = encoder
.max_buffer_length_from_utf16_without_replacement(1)
.unwrap();
let (result, read, written) = encoder.encode_from_utf16_without_replacement(
&[0x3000],
&mut output[..needed],
true,
);
assert_eq!(result, EncoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 2);
assert_eq!(output[0], 0xA1);
assert_eq!(output[1], 0xA1);
}
}
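    // The test below is an illustrative sketch added for clarity (not from the
    // upstream crate): it re-derives the four-byte GB18030 split arithmetic used
    // by the encoder above (digit bases 12600, 1260 and 10, offsets 0x81/0x30)
    // and checks it against byte sequences already asserted in the tests above.
    #[test]
    fn test_gb18030_four_byte_split_arithmetic() {
        fn split(range_pointer: usize) -> [u8; 4] {
            let first = range_pointer / (10 * 126 * 10);
            let rem_first = range_pointer % (10 * 126 * 10);
            let second = rem_first / (10 * 126);
            let rem_second = rem_first % (10 * 126);
            let third = rem_second / 10;
            let fourth = rem_second % 10;
            [
                (first + 0x81) as u8,
                (second + 0x30) as u8,
                (third + 0x81) as u8,
                (fourth + 0x30) as u8,
            ]
        }
        // Astral range pointers are the scalar value plus (189_000 - 0x1_0000).
        assert_eq!(
            split(0x1F4A9 + (189_000 - 0x1_0000)),
            [0x94, 0x39, 0xDA, 0x33]
        );
        assert_eq!(
            split(0x10_FFFF + (189_000 - 0x1_0000)),
            [0xE3, 0x32, 0x9A, 0x35]
        );
        // U+0080 is BMP range pointer 0.
        assert_eq!(split(0), [0x81, 0x30, 0x81, 0x30]);
    }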
}

1969
zeroidc/vendor/encoding_rs/src/handles.rs vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

6113
zeroidc/vendor/encoding_rs/src/lib.rs vendored Normal file

File diff suppressed because it is too large

1622
zeroidc/vendor/encoding_rs/src/macros.rs vendored Normal file

File diff suppressed because it is too large

3356
zeroidc/vendor/encoding_rs/src/mem.rs vendored Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,104 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::variant::*;
pub struct ReplacementDecoder {
emitted: bool,
}
impl ReplacementDecoder {
pub fn new() -> VariantDecoder {
VariantDecoder::Replacement(ReplacementDecoder { emitted: false })
}
pub fn max_utf16_buffer_length(&self, _u16_length: usize) -> Option<usize> {
Some(1)
}
pub fn max_utf8_buffer_length_without_replacement(&self, _byte_length: usize) -> Option<usize> {
Some(3)
}
pub fn max_utf8_buffer_length(&self, _byte_length: usize) -> Option<usize> {
Some(3)
}
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
_last: bool,
) -> (DecoderResult, usize, usize) {
// Don't err if the input stream is empty. See
// https://github.com/whatwg/encoding/issues/33
if self.emitted || src.is_empty() {
(DecoderResult::InputEmpty, src.len(), 0)
} else if dst.is_empty() {
// Make sure there's room for the replacement character.
(DecoderResult::OutputFull, 0, 0)
} else {
self.emitted = true;
(DecoderResult::Malformed(1, 0), 1, 0)
}
}
pub fn decode_to_utf8_raw(
&mut self,
src: &[u8],
dst: &mut [u8],
_last: bool,
) -> (DecoderResult, usize, usize) {
// Don't err if the input stream is empty. See
// https://github.com/whatwg/encoding/issues/33
if self.emitted || src.is_empty() {
(DecoderResult::InputEmpty, src.len(), 0)
} else if dst.len() < 3 {
// Make sure there's room for the replacement character.
(DecoderResult::OutputFull, 0, 0)
} else {
self.emitted = true;
(DecoderResult::Malformed(1, 0), 1, 0)
}
}
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_replacement(bytes: &[u8], expect: &str) {
decode_without_padding(REPLACEMENT, bytes, expect);
}
fn encode_replacement(string: &str, expect: &[u8]) {
encode(REPLACEMENT, string, expect);
}
#[test]
fn test_replacement_decode() {
decode_replacement(b"", "");
decode_replacement(b"A", "\u{FFFD}");
decode_replacement(b"AB", "\u{FFFD}");
}
#[test]
fn test_replacement_encode() {
// Empty
encode_replacement("", b"");
assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
encode_replacement("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
}
}

View File

@@ -0,0 +1,426 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range;
use super::in_inclusive_range16;
pub struct ShiftJisDecoder {
lead: Option<u8>,
}
impl ShiftJisDecoder {
pub fn new() -> VariantDecoder {
VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
}
pub fn in_neutral_state(&self) -> bool {
self.lead.is_none()
}
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(match self.lead {
None => 0,
Some(_) => 1,
})
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
self.plus_one_if_lead(byte_length)
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// worst case: 1 to 3 (half-width katakana)
self.max_utf8_buffer_length(byte_length)
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_mul(3, self.plus_one_if_lead(byte_length))
}
ascii_compatible_two_byte_decoder_functions!(
{
// If lead is between 0x81 and 0x9F, inclusive,
// subtract offset 0x81. Else if lead is
// between 0xE0 and 0xFC, inclusive, subtract
// offset 0xC1. Else if lead is between
// 0xA1 and 0xDF, inclusive, map to half-width
// Katakana. Else if lead is 0x80, pass through.
let mut non_ascii_minus_offset =
non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0x9F - 0x81) {
let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0);
if non_ascii_minus_range_start > (0xFC - 0xE0) {
let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1);
if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) {
if non_ascii == 0x80 {
handle.write_mid_bmp(0x80);
// Not caring about optimizing subsequent non-ASCII
continue 'outermost;
}
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
// Not caring about optimizing subsequent non-ASCII
continue 'outermost;
}
non_ascii_minus_offset = non_ascii - 0xC1;
}
non_ascii_minus_offset
},
{
// If trail is between 0x40 and 0x7E, inclusive,
// subtract offset 0x40. Else if trail is
// between 0x80 and 0xFC, inclusive, subtract
// offset 0x41.
// Fast-track Hiragana (60% according to Lunde)
// and Katakana (10% according to Lunde).
// Hiragana doesn't cross 0x7F, but Katakana does.
// We can check for Hiragana before normalizing
// trail.
let trail_minus_hiragana = byte.wrapping_sub(0x9F);
if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
// Hiragana
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
} else {
let mut trail_minus_offset =
byte.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start =
byte.wrapping_sub(0x80);
if trail_minus_range_start > (0xFC - 0x80) {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
trail_minus_offset = byte - 0x41;
}
if lead_minus_offset == 0x02 &&
trail_minus_offset < 0x56 {
// Katakana
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
} else {
let pointer = lead_minus_offset as usize *
188usize +
trail_minus_offset as usize;
let level1_pointer = pointer.wrapping_sub(1410);
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
} else {
let level2_pointer = pointer.wrapping_sub(4418);
if level2_pointer <
JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
} else {
let upper_ibm_pointer = pointer.wrapping_sub(10744);
if upper_ibm_pointer < IBM_KANJI.len() {
handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
} else {
let lower_ibm_pointer = pointer.wrapping_sub(8272);
if lower_ibm_pointer < IBM_KANJI.len() {
handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
} else if in_inclusive_range(pointer, 8836, 10715) {
handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else if let Some(bmp) = jis0208_range_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
}
}
}
}
},
self,
non_ascii,
byte,
lead_minus_offset,
unread_handle_trail,
source,
handle,
'outermost,
copy_ascii_from_check_space_bmp,
check_space_bmp,
false);
}
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
jis0208_kanji_shift_jis_encode(bmp)
}
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
return Some((lead, trail));
}
let pointer = if 0x4EDD == bmp {
// Ideograph on the symbol row!
23
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
4418 + pos
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
10744 + pos
} else {
return None;
};
let lead = pointer / 188;
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
let trail = pointer % 188;
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
}
pub struct ShiftJisEncoder;
impl ShiftJisEncoder {
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
u16_length.checked_mul(2)
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
byte_length.checked_add(1)
}
ascii_compatible_bmp_encoder_functions!(
{
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
if bmp_minus_hiragana < 0x53 {
handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
if let Some((lead, trail)) = encode_kanji(bmp) {
handle.write_two(lead, trail)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
} else {
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
if bmp_minus_katakana < 0x56 {
let trail_offset = if bmp_minus_katakana < 0x3F {
0x40
} else {
0x41
};
handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8)
} else {
let bmp_minus_space = bmp.wrapping_sub(0x3000);
if bmp_minus_space < 3 {
// fast-track common punctuation
handle.write_two(0x81, 0x40 + bmp_minus_space as u8)
} else if bmp == 0xA5 {
handle.write_one(0x5Cu8)
} else if bmp == 0x80 {
handle.write_one(0x80u8)
} else if bmp == 0x203E {
handle.write_one(0x7Eu8)
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
} else if bmp == 0x2212 {
handle.write_two(0x81u8, 0x7Cu8)
} else {
let bmp_minus_roman = bmp.wrapping_sub(0x2170);
let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) {
10716 + bmp_minus_roman as usize
} else if let Some(pointer) = jis0208_range_encode(bmp) {
pointer
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|| bmp == 0xF929
|| bmp == 0xF9DC
{
// Guaranteed to be found in IBM_KANJI
let pos = position(&IBM_KANJI[..], bmp).unwrap();
10744 + pos
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
pointer
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
};
let lead = pointer / 188;
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
let trail = pointer % 188;
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
}
}
}
},
bmp,
self,
source,
handle,
copy_ascii_to_check_space_two,
check_space_two,
false
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_shift_jis(bytes: &[u8], expect: &str) {
decode(SHIFT_JIS, bytes, expect);
}
fn encode_shift_jis(string: &str, expect: &[u8]) {
encode(SHIFT_JIS, string, expect);
}
#[test]
fn test_shift_jis_decode() {
// Empty
decode_shift_jis(b"", &"");
// ASCII
decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}");
// Half-width
decode_shift_jis(b"\xA1", "\u{FF61}");
decode_shift_jis(b"\xDF", "\u{FF9F}");
decode_shift_jis(b"\xA0", "\u{FFFD}");
decode_shift_jis(b"\xE0", "\u{FFFD}");
decode_shift_jis(b"\xA0+", "\u{FFFD}+");
decode_shift_jis(b"\xE0+", "\u{FFFD}+");
// EUDC
decode_shift_jis(b"\xF0\x40", "\u{E000}");
decode_shift_jis(b"\xF9\xFC", "\u{E757}");
decode_shift_jis(b"\xEF\xFC", "\u{FFFD}");
decode_shift_jis(b"\xFA\x40", "\u{2170}");
// JIS 0208
decode_shift_jis(b"\x81\x40", "\u{3000}");
decode_shift_jis(b"\x81\x3F", "\u{FFFD}?");
decode_shift_jis(b"\xEE\xFC", "\u{FF02}");
decode_shift_jis(b"\xEE\xFD", "\u{FFFD}");
decode_shift_jis(b"\xFA\x40", "\u{2170}");
decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?");
decode_shift_jis(b"\xFC\x4B", "\u{9ED1}");
decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L");
//
}
#[test]
fn test_shift_jis_encode() {
// Empty
encode_shift_jis("", b"");
// ASCII
encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");
// Exceptional code points
encode_shift_jis("\u{0080}", b"\x80");
encode_shift_jis("\u{00A5}", b"\x5C");
encode_shift_jis("\u{203E}", b"\x7E");
encode_shift_jis("\u{2212}", b"\x81\x7C");
// Half-width
encode_shift_jis("\u{FF61}", b"\xA1");
encode_shift_jis("\u{FF9F}", b"\xDF");
// EUDC
encode_shift_jis("\u{E000}", b"&#57344;");
encode_shift_jis("\u{E757}", b"&#59223;");
// JIS 0212
encode_shift_jis("\u{02D8}", b"&#728;");
// JIS 0208
encode_shift_jis("\u{3000}", b"\x81\x40");
encode_shift_jis("\u{FF02}", b"\xFA\x57");
encode_shift_jis("\u{2170}", b"\xFA\x40");
encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_shift_jis_decode_all() {
let input = include_bytes!("test_data/shift_jis_in.txt");
let expectation = include_str!("test_data/shift_jis_in_ref.txt");
let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
assert!(had_errors, "Should have had errors.");
assert_eq!(&cow[..], expectation);
}
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_shift_jis_encode_all() {
let input = include_str!("test_data/shift_jis_out.txt");
let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
let (cow, encoding, had_errors) = SHIFT_JIS.encode(input);
assert!(!had_errors, "Should not have had errors.");
assert_eq!(encoding, SHIFT_JIS);
assert_eq!(&cow[..], &expectation[..]);
}
#[test]
fn test_shift_jis_half_width_katakana_length() {
let mut output = [0u8; 20];
let mut decoder = SHIFT_JIS.new_decoder();
{
let needed = decoder
.max_utf8_buffer_length_without_replacement(1)
.unwrap();
let (result, read, written) =
decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
assert_eq!(result, DecoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 3);
assert_eq!(output[0], 0xEF);
assert_eq!(output[1], 0xBD);
assert_eq!(output[2], 0xA1);
}
}
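    // The test below is an illustrative sketch added for clarity (not from the
    // upstream crate): it re-derives the lead/trail split arithmetic used by the
    // Shift_JIS encoder above (base 188, lead offset 0x81 or 0xC1, trail offset
    // 0x40 or 0x41) and checks it against byte pairs already asserted above.
    #[test]
    fn test_shift_jis_pointer_split_arithmetic() {
        fn split(pointer: usize) -> (u8, u8) {
            let lead = pointer / 188;
            let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
            let trail = pointer % 188;
            let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
            ((lead + lead_offset) as u8, (trail + trail_offset) as u8)
        }
        // U+2170 (small Roman numeral one) is encoded via pointer 10716.
        assert_eq!(split(10716), (0xFA, 0x40));
        // Pointer 0 corresponds to the ideographic space U+3000.
        assert_eq!(split(0), (0x81, 0x40));
    }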
}

View File

@@ -0,0 +1,455 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use packed_simd::u16x8;
use packed_simd::u8x16;
use packed_simd::FromBits;
// TODO: Migrate unaligned access to stdlib code if/when the RFC
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.
#[inline(always)]
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
let mut simd = ::core::mem::uninitialized();
::core::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16);
simd
}
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
*(ptr as *const u8x16)
}
#[inline(always)]
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
::core::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
}
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
*(ptr as *mut u8x16) = s;
}
#[inline(always)]
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
let mut simd = ::core::mem::uninitialized();
::core::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16);
simd
}
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
*(ptr as *const u16x8)
}
#[inline(always)]
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
::core::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
}
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
*(ptr as *mut u16x8) = s;
}
cfg_if! {
if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] {
use core::arch::x86_64::__m128i;
use core::arch::x86_64::_mm_movemask_epi8;
use core::arch::x86_64::_mm_packus_epi16;
} else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] {
use core::arch::x86::__m128i;
use core::arch::x86::_mm_movemask_epi8;
use core::arch::x86::_mm_packus_epi16;
} else if #[cfg(target_arch = "aarch64")]{
use core::arch::aarch64::uint8x16_t;
use core::arch::aarch64::uint16x8_t;
use core::arch::aarch64::vmaxvq_u8;
use core::arch::aarch64::vmaxvq_u16;
} else {
}
}
// #[inline(always)]
// fn simd_byte_swap_u8(s: u8x16) -> u8x16 {
// unsafe {
// shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
// }
// }
// #[inline(always)]
// pub fn simd_byte_swap(s: u16x8) -> u16x8 {
// to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s)))
// }
#[inline(always)]
pub fn simd_byte_swap(s: u16x8) -> u16x8 {
let left = s << 8;
let right = s >> 8;
left | right
}
#[inline(always)]
pub fn to_u16_lanes(s: u8x16) -> u16x8 {
u16x8::from_bits(s)
}
cfg_if! {
if #[cfg(target_feature = "sse2")] {
// Expose low-level mask instead of higher-level conclusion,
// because the non-ASCII case would perform less well otherwise.
#[inline(always)]
pub fn mask_ascii(s: u8x16) -> i32 {
unsafe {
_mm_movemask_epi8(__m128i::from_bits(s))
}
}
} else {
}
}
cfg_if! {
if #[cfg(target_feature = "sse2")] {
#[inline(always)]
pub fn simd_is_ascii(s: u8x16) -> bool {
unsafe {
_mm_movemask_epi8(__m128i::from_bits(s)) == 0
}
}
} else if #[cfg(target_arch = "aarch64")]{
#[inline(always)]
pub fn simd_is_ascii(s: u8x16) -> bool {
unsafe {
vmaxvq_u8(uint8x16_t::from_bits(s)) < 0x80
}
}
} else {
#[inline(always)]
pub fn simd_is_ascii(s: u8x16) -> bool {
// This optimizes better on ARM than
// the lt formulation.
let highest_ascii = u8x16::splat(0x7F);
!s.gt(highest_ascii).any()
}
}
}
cfg_if! {
if #[cfg(target_feature = "sse2")] {
#[inline(always)]
pub fn simd_is_str_latin1(s: u8x16) -> bool {
if simd_is_ascii(s) {
return true;
}
let above_str_latin1 = u8x16::splat(0xC4);
s.lt(above_str_latin1).all()
}
} else if #[cfg(target_arch = "aarch64")]{
#[inline(always)]
pub fn simd_is_str_latin1(s: u8x16) -> bool {
unsafe {
vmaxvq_u8(uint8x16_t::from_bits(s)) < 0xC4
}
}
} else {
#[inline(always)]
pub fn simd_is_str_latin1(s: u8x16) -> bool {
let above_str_latin1 = u8x16::splat(0xC4);
s.lt(above_str_latin1).all()
}
}
}
cfg_if! {
if #[cfg(target_arch = "aarch64")]{
#[inline(always)]
pub fn simd_is_basic_latin(s: u16x8) -> bool {
unsafe {
vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x80
}
}
#[inline(always)]
pub fn simd_is_latin1(s: u16x8) -> bool {
unsafe {
vmaxvq_u16(uint16x8_t::from_bits(s)) < 0x100
}
}
} else {
#[inline(always)]
pub fn simd_is_basic_latin(s: u16x8) -> bool {
let above_ascii = u16x8::splat(0x80);
s.lt(above_ascii).all()
}
#[inline(always)]
pub fn simd_is_latin1(s: u16x8) -> bool {
// For some reason, on SSE2 this formulation
// seems faster in this case while the above
// function is better the other way round...
let highest_latin1 = u16x8::splat(0xFF);
!s.gt(highest_latin1).any()
}
}
}
#[inline(always)]
pub fn contains_surrogates(s: u16x8) -> bool {
let mask = u16x8::splat(0xF800);
let surrogate_bits = u16x8::splat(0xD800);
(s & mask).eq(surrogate_bits).any()
}
cfg_if! {
if #[cfg(target_arch = "aarch64")]{
macro_rules! aarch64_return_false_if_below_hebrew {
($s:ident) => ({
unsafe {
if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 {
return false;
}
}
})
}
macro_rules! non_aarch64_return_false_if_all {
($s:ident) => ()
}
} else {
macro_rules! aarch64_return_false_if_below_hebrew {
($s:ident) => ()
}
macro_rules! non_aarch64_return_false_if_all {
($s:ident) => ({
if $s.all() {
return false;
}
})
}
}
}
macro_rules! in_range16x8 {
($s:ident, $start:expr, $end:expr) => {{
// SIMD sub is wrapping
($s - u16x8::splat($start)).lt(u16x8::splat($end - $start))
}};
}
#[inline(always)]
pub fn is_u16x8_bidi(s: u16x8) -> bool {
// We try to first quickly refute the RTLness of the vector. If that
// fails, we do the real RTL check, so in that case we end up wasting
// the work for the up-front quick checks. Even the quick-check is
// two-fold in order to return `false` ASAP if everything is below
// Hebrew.
aarch64_return_false_if_below_hebrew!(s);
let below_hebrew = s.lt(u16x8::splat(0x0590));
non_aarch64_return_false_if_all!(below_hebrew);
if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() {
return false;
}
// Quick refutation failed. Let's do the full check.
(in_range16x8!(s, 0x0590, 0x0900)
| in_range16x8!(s, 0xFB1D, 0xFE00)
| in_range16x8!(s, 0xFE70, 0xFEFF)
| in_range16x8!(s, 0xD802, 0xD804)
| in_range16x8!(s, 0xD83A, 0xD83C)
| s.eq(u16x8::splat(0x200F))
| s.eq(u16x8::splat(0x202B))
| s.eq(u16x8::splat(0x202E))
| s.eq(u16x8::splat(0x2067)))
.any()
}
#[inline(always)]
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
unsafe {
let first: u8x16 = shuffle!(
s,
u8x16::splat(0),
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
);
let second: u8x16 = shuffle!(
s,
u8x16::splat(0),
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
);
(u16x8::from_bits(first), u16x8::from_bits(second))
}
}
cfg_if! {
if #[cfg(target_feature = "sse2")] {
#[inline(always)]
pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
unsafe {
u8x16::from_bits(_mm_packus_epi16(__m128i::from_bits(a), __m128i::from_bits(b)))
}
}
} else {
#[inline(always)]
pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
unsafe {
let first = u8x16::from_bits(a);
let second = u8x16::from_bits(b);
shuffle!(
first,
second,
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
)
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::vec::Vec;
#[test]
fn test_unpack() {
let ascii: [u8; 16] = [
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let basic_latin: [u16; 16] = [
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
let mut vec = Vec::with_capacity(16);
vec.resize(16, 0u16);
let (first, second) = simd_unpack(simd);
let ptr = vec.as_mut_ptr();
unsafe {
store8_unaligned(ptr, first);
store8_unaligned(ptr.add(8), second);
}
assert_eq!(&vec[..], &basic_latin[..]);
}
#[test]
fn test_simd_is_basic_latin_success() {
let ascii: [u8; 16] = [
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let basic_latin: [u16; 16] = [
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let first = unsafe { load8_unaligned(basic_latin.as_ptr()) };
let second = unsafe { load8_unaligned(basic_latin.as_ptr().add(8)) };
let mut vec = Vec::with_capacity(16);
vec.resize(16, 0u8);
let ptr = vec.as_mut_ptr();
assert!(simd_is_basic_latin(first | second));
unsafe {
store16_unaligned(ptr, simd_pack(first, second));
}
assert_eq!(&vec[..], &ascii[..]);
}
#[test]
fn test_simd_is_basic_latin_c0() {
let input: [u16; 16] = [
0x61, 0x62, 0x63, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let first = unsafe { load8_unaligned(input.as_ptr()) };
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
assert!(!simd_is_basic_latin(first | second));
}
#[test]
fn test_simd_is_basic_latin_0fff() {
let input: [u16; 16] = [
0x61, 0x62, 0x63, 0x0FFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let first = unsafe { load8_unaligned(input.as_ptr()) };
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
assert!(!simd_is_basic_latin(first | second));
}
#[test]
fn test_simd_is_basic_latin_ffff() {
let input: [u16; 16] = [
0x61, 0x62, 0x63, 0xFFFF, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let first = unsafe { load8_unaligned(input.as_ptr()) };
let second = unsafe { load8_unaligned(input.as_ptr().add(8)) };
assert!(!simd_is_basic_latin(first | second));
}
#[test]
fn test_simd_is_ascii_success() {
let ascii: [u8; 16] = [
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let simd = unsafe { load16_unaligned(ascii.as_ptr()) };
assert!(simd_is_ascii(simd));
}
#[test]
fn test_simd_is_ascii_failure() {
let input: [u8; 16] = [
0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let simd = unsafe { load16_unaligned(input.as_ptr()) };
assert!(!simd_is_ascii(simd));
}
#[cfg(target_feature = "sse2")]
#[test]
fn test_check_ascii() {
let input: [u8; 16] = [
0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let simd = unsafe { load16_unaligned(input.as_ptr()) };
let mask = mask_ascii(simd);
assert_ne!(mask, 0);
assert_eq!(mask.trailing_zeros(), 4);
}
#[test]
fn test_alu() {
let input: [u8; 16] = [
0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76,
];
let mut alu = 0u64;
unsafe {
::core::ptr::copy_nonoverlapping(input.as_ptr(), &mut alu as *mut u64 as *mut u8, 8);
}
let masked = alu & 0x8080808080808080;
assert_eq!(masked.trailing_zeros(), 39);
}
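    // The tests below are illustrative sketches added for clarity (not from the
    // upstream crate); they only use helpers already defined in this file.
    #[test]
    fn test_simd_byte_swap() {
        // simd_byte_swap swaps the two bytes within each 16-bit lane.
        let input: [u16; 8] = [
            0x1234, 0x00FF, 0xFF00, 0xABCD, 0x0001, 0x0100, 0x7F80, 0x807F,
        ];
        let expected: [u16; 8] = [
            0x3412, 0xFF00, 0x00FF, 0xCDAB, 0x0100, 0x0001, 0x807F, 0x7F80,
        ];
        let simd = unsafe { load8_unaligned(input.as_ptr()) };
        let swapped = simd_byte_swap(simd);
        let mut vec = Vec::with_capacity(8);
        vec.resize(8, 0u16);
        unsafe {
            store8_unaligned(vec.as_mut_ptr(), swapped);
        }
        assert_eq!(&vec[..], &expected[..]);
    }
    #[test]
    fn test_is_u16x8_bidi() {
        // All lanes below U+0590: the quick refutation reports non-bidi.
        let latin: [u16; 8] = [0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68];
        // One Hebrew letter (U+05D0) makes the vector bidi.
        let hebrew: [u16; 8] = [0x61, 0x62, 0x05D0, 0x64, 0x65, 0x66, 0x67, 0x68];
        let latin_simd = unsafe { load8_unaligned(latin.as_ptr()) };
        let hebrew_simd = unsafe { load8_unaligned(hebrew.as_ptr()) };
        assert!(!is_u16x8_bidi(latin_simd));
        assert!(is_u16x8_bidi(hebrew_simd));
    }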
}

View File

@@ -0,0 +1,714 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::ascii::*;
use crate::data::position;
use crate::handles::*;
use crate::variant::*;
pub struct SingleByteDecoder {
table: &'static [u16; 128],
}
impl SingleByteDecoder {
pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
VariantDecoder::SingleByte(SingleByteDecoder { table: data })
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
Some(byte_length)
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
pub fn decode_to_utf8_raw(
&mut self,
src: &[u8],
dst: &mut [u8],
_last: bool,
) -> (DecoderResult, usize, usize) {
let mut source = ByteSource::new(src);
let mut dest = Utf8Destination::new(dst);
'outermost: loop {
match dest.copy_ascii_from_check_space_bmp(&mut source) {
CopyAsciiResult::Stop(ret) => return ret,
CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
// Start non-boilerplate
//
// Since the non-ASCIIness of `non_ascii` is hidden from
// the optimizer, it can't figure out that it's OK to
// statically omit the bound check when accessing
// `[u16; 128]` with an index
// `non_ascii as usize - 0x80usize`.
let mapped =
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
// let mapped = self.table[non_ascii as usize - 0x80usize];
if mapped == 0u16 {
return (
DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written(),
);
}
let dest_again = handle.write_bmp_excl_ascii(mapped);
// End non-boilerplate
match source.check_available() {
Space::Full(src_consumed) => {
return (
DecoderResult::InputEmpty,
src_consumed,
dest_again.written(),
);
}
Space::Available(source_handle) => {
match dest_again.check_space_bmp() {
Space::Full(dst_written) => {
return (
DecoderResult::OutputFull,
source_handle.consumed(),
dst_written,
);
}
Space::Available(mut destination_handle) => {
let (mut b, unread_handle) = source_handle.read();
let source_again = unread_handle.commit();
'innermost: loop {
if b > 127 {
non_ascii = b;
handle = destination_handle;
continue 'middle;
}
// Testing on Haswell says that we should write the
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
let dest_again_again = destination_handle.write_ascii(b);
if b < 60 {
// We've got punctuation
match source_again.check_available() {
Space::Full(src_consumed_again) => {
return (
DecoderResult::InputEmpty,
src_consumed_again,
dest_again_again.written(),
);
}
Space::Available(source_handle_again) => {
match dest_again_again.check_space_bmp() {
Space::Full(dst_written_again) => {
return (
DecoderResult::OutputFull,
source_handle_again.consumed(),
dst_written_again,
);
}
Space::Available(
destination_handle_again,
) => {
let (b_again, _unread_handle_again) =
source_handle_again.read();
b = b_again;
destination_handle =
destination_handle_again;
continue 'innermost;
}
}
}
}
}
// We've got markup or ASCII text
continue 'outermost;
}
}
}
}
}
},
}
}
}
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
_last: bool,
) -> (DecoderResult, usize, usize) {
let (pending, length) = if dst.len() < src.len() {
(DecoderResult::OutputFull, dst.len())
} else {
(DecoderResult::InputEmpty, src.len())
};
let mut converted = 0usize;
'outermost: loop {
match unsafe {
ascii_to_basic_latin(
src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted),
length - converted,
)
} {
None => {
return (pending, length, length);
}
Some((mut non_ascii, consumed)) => {
converted += consumed;
'middle: loop {
// `converted` doesn't count the reading of `non_ascii` yet.
// Since the non-ASCIIness of `non_ascii` is hidden from
// the optimizer, it can't figure out that it's OK to
// statically omit the bound check when accessing
// `[u16; 128]` with an index
// `non_ascii as usize - 0x80usize`.
let mapped =
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
// let mapped = self.table[non_ascii as usize - 0x80usize];
if mapped == 0u16 {
return (
DecoderResult::Malformed(1, 0),
converted + 1, // +1 for `non_ascii`
converted,
);
}
unsafe {
// The bound check has already been performed
*(dst.get_unchecked_mut(converted)) = mapped;
}
converted += 1;
// Next, handle ASCII punctuation and non-ASCII without
// going back to ASCII acceleration. Non-ASCII scripts
// use ASCII punctuation, so this avoids going to
// acceleration just for punctuation/space and then
// failing. This is a significant boost to non-ASCII
// scripts.
// TODO: Split out Latin converters without this part;
// this stuff makes Latin script-conversion slower.
if converted == length {
return (pending, length, length);
}
let mut b = unsafe { *(src.get_unchecked(converted)) };
'innermost: loop {
if b > 127 {
non_ascii = b;
continue 'middle;
}
// Testing on Haswell says that we should write the
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
unsafe {
*(dst.get_unchecked_mut(converted)) = u16::from(b);
}
converted += 1;
if b < 60 {
// We've got punctuation
if converted == length {
return (pending, length, length);
}
b = unsafe { *(src.get_unchecked(converted)) };
continue 'innermost;
}
// We've got markup or ASCII text
continue 'outermost;
}
}
}
}
}
}
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
let mut bytes = buffer;
let mut total = 0;
loop {
if let Some((non_ascii, offset)) = validate_ascii(bytes) {
total += offset;
let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
if mapped != u16::from(non_ascii) {
return total;
}
total += 1;
bytes = &bytes[offset + 1..];
} else {
return total;
}
}
}
}
pub struct SingleByteEncoder {
table: &'static [u16; 128],
run_bmp_offset: usize,
run_byte_offset: usize,
run_length: usize,
}
impl SingleByteEncoder {
pub fn new(
encoding: &'static Encoding,
data: &'static [u16; 128],
run_bmp_offset: u16,
run_byte_offset: u8,
run_length: u8,
) -> Encoder {
Encoder::new(
encoding,
VariantEncoder::SingleByte(SingleByteEncoder {
table: data,
run_bmp_offset: run_bmp_offset as usize,
run_byte_offset: run_byte_offset as usize,
run_length: run_length as usize,
}),
)
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
Some(u16_length)
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
Some(byte_length)
}
#[inline(always)]
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
// First, we see if the code unit falls into a run of consecutive
// code units that can be mapped by offset. This is very efficient
// for most non-Latin encodings as well as Latin1-ish encodings.
//
// For encodings that don't fit this pattern, the run (which may
// have a length of just one) just establishes the starting point
// for the next rule.
//
// Next, we do a forward linear search in the part of the index
// after the run. Even in non-Latin1-ish Latin encodings (except
// macintosh), the lower case letters are here.
//
// Next, we search the third quadrant up to the start of the run
// (upper case letters in Latin encodings except macintosh, in
// Greek and in KOI encodings) and then the second quadrant,
// except if the run started before the third quadrant, we search
// the second quadrant up to the run.
//
// Last, we search the first quadrant, which has unused controls
// or punctuation in most encodings. This is bad for macintosh
// and IBM866, but those are rare.
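        //
        // Illustrative example (an assumption added here, not generated data):
        // in windows-1251 the bytes 0xC0..=0xFF map to U+0410..=U+044F by a
        // constant offset, so run_bmp_offset would be 0x0410, run_byte_offset
        // 0x40 and run_length 64, letting the Cyrillic letters hit the run
        // check below without any search.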
// Run of consecutive units
let unit_as_usize = code_unit as usize;
let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
if offset < self.run_length {
return Some((128 + self.run_byte_offset + offset) as u8);
}
// Search after the run
let tail_start = self.run_byte_offset + self.run_length;
if let Some(pos) = position(&self.table[tail_start..], code_unit) {
return Some((128 + tail_start + pos) as u8);
}
if self.run_byte_offset >= 64 {
// Search third quadrant before the run
if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
return Some(((128 + 64) + pos) as u8);
}
// Search second quadrant
if let Some(pos) = position(&self.table[32..64], code_unit) {
return Some(((128 + 32) + pos) as u8);
}
} else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
// Search second quadrant before the run
return Some(((128 + 32) + pos) as u8);
}
// Search first quadrant
if let Some(pos) = position(&self.table[..32], code_unit) {
return Some((128 + pos) as u8);
}
None
}
ascii_compatible_bmp_encoder_function!(
{
match self.encode_u16(bmp) {
Some(byte) => handle.write_one(byte),
None => {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
}
},
bmp,
self,
source,
handle,
copy_ascii_to_check_space_one,
check_space_one,
encode_from_utf8_raw,
str,
Utf8Source,
true
);
pub fn encode_from_utf16_raw(
&mut self,
src: &[u16],
dst: &mut [u8],
_last: bool,
) -> (EncoderResult, usize, usize) {
let (pending, length) = if dst.len() < src.len() {
(EncoderResult::OutputFull, dst.len())
} else {
(EncoderResult::InputEmpty, src.len())
};
let mut converted = 0usize;
'outermost: loop {
match unsafe {
basic_latin_to_ascii(
src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted),
length - converted,
)
} {
None => {
return (pending, length, length);
}
Some((mut non_ascii, consumed)) => {
converted += consumed;
'middle: loop {
// `converted` doesn't count the reading of `non_ascii` yet.
match self.encode_u16(non_ascii) {
Some(byte) => {
unsafe {
*(dst.get_unchecked_mut(converted)) = byte;
}
converted += 1;
}
None => {
// At this point, we need to know if we
// have a surrogate.
let high_bits = non_ascii & 0xFC00u16;
if high_bits == 0xD800u16 {
// high surrogate
if converted + 1 == length {
// End of buffer. This surrogate is unpaired.
return (
EncoderResult::Unmappable('\u{FFFD}'),
converted + 1, // +1 for `non_ascii`
converted,
);
}
let second =
u32::from(unsafe { *src.get_unchecked(converted + 1) });
if second & 0xFC00u32 != 0xDC00u32 {
return (
EncoderResult::Unmappable('\u{FFFD}'),
converted + 1, // +1 for `non_ascii`
converted,
);
}
// The next code unit is a low surrogate.
let astral: char = unsafe {
::core::char::from_u32_unchecked(
(u32::from(non_ascii) << 10) + second
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
)
};
return (
EncoderResult::Unmappable(astral),
converted + 2, // +2 for `non_ascii` and `second`
converted,
);
}
if high_bits == 0xDC00u16 {
// Unpaired low surrogate
return (
EncoderResult::Unmappable('\u{FFFD}'),
converted + 1, // +1 for `non_ascii`
converted,
);
}
return (
EncoderResult::unmappable_from_bmp(non_ascii),
converted + 1, // +1 for `non_ascii`
converted,
);
}
}
// Next, handle ASCII punctuation and non-ASCII without
// going back to ASCII acceleration. Non-ASCII scripts
// use ASCII punctuation, so this avoids going to
// acceleration just for punctuation/space and then
// failing. This is a significant boost to non-ASCII
// scripts.
// TODO: Split out Latin converters without this part;
// this stuff makes Latin script-conversion slower.
if converted == length {
return (pending, length, length);
}
let mut unit = unsafe { *(src.get_unchecked(converted)) };
'innermost: loop {
if unit > 127 {
non_ascii = unit;
continue 'middle;
}
// Testing on Haswell says that we should write the
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
unsafe {
*(dst.get_unchecked_mut(converted)) = unit as u8;
}
converted += 1;
if unit < 60 {
// We've got punctuation
if converted == length {
return (pending, length, length);
}
unit = unsafe { *(src.get_unchecked(converted)) };
continue 'innermost;
}
// We've got markup or ASCII text
continue 'outermost;
}
}
}
}
}
}
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
#[test]
fn test_windows_1255_ca() {
decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
}
#[test]
fn test_ascii_punctuation() {
let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
\u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
\u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
\u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
\u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
decode(WINDOWS_1253, bytes, characters);
encode(WINDOWS_1253, characters, bytes);
}
#[test]
fn test_decode_malformed() {
decode(
WINDOWS_1253,
b"\xC1\xF5\xD2\xF4\xFC",
"\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
);
}
#[test]
fn test_encode_unmappables() {
encode(
WINDOWS_1253,
"\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
b"\xC1\xF5&#9731;\xF4\xFC",
);
encode(
WINDOWS_1253,
"\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
b"\xC1\xF5&#128169;\xF4\xFC",
);
}
#[test]
fn test_encode_unpaired_surrogates() {
encode_from_utf16(
WINDOWS_1253,
&[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
b"\xC1\xF5&#65533;\xF4\xFC",
);
encode_from_utf16(
WINDOWS_1253,
&[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
b"\xC1\xF5&#65533;\xF4\xFC",
);
encode_from_utf16(
WINDOWS_1253,
&[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
b"\xC1\xF5\xF4\xFC&#65533;",
);
}
pub const HIGH_BYTES: &'static [u8; 128] = &[
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
];
fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
let mut with_replacement = [0u16; 128];
        for (i, code_point) in data.iter().enumerate() {
            if *code_point == 0 {
                with_replacement[i] = 0xFFFD;
            } else {
                with_replacement[i] = *code_point;
            }
        }
decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
}
fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
let mut with_zeros = [0u8; 128];
        for (i, code_point) in data.iter().enumerate() {
            if *code_point == 0 {
                with_zeros[i] = 0;
            } else {
                with_zeros[i] = HIGH_BYTES[i];
            }
        }
encode_from_utf16(encoding, data, &with_zeros[..]);
}
#[test]
fn test_single_byte_from_two_low_surrogates() {
let expectation = b"&#65533;&#65533;";
let mut output = [0u8; 40];
let mut encoder = WINDOWS_1253.new_encoder();
let (result, read, written, had_errors) =
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 2);
assert_eq!(written, expectation.len());
assert!(had_errors);
assert_eq!(&output[..written], expectation);
}
// These tests are so self-referential that they are pretty useless.
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_single_byte_decode() {
decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
if cfg!(miri) {
// Miri is too slow
return;
}
decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
#[test]
fn test_single_byte_encode() {
encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
if cfg!(miri) {
// Miri is too slow
return;
}
encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
// END GENERATED CODE
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,242 @@
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
use super::*;
#[test]
fn test_all_labels() {
assert_eq!(Encoding::for_label(b"l1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"l2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"l3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"l4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"l5"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"l6"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"l9"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"mac"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"koi"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"gbk"), Some(GBK));
assert_eq!(Encoding::for_label(b"big5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"koi8"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"sjis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"ucs-2"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"ms932"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"cp866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"cp819"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"ascii"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"x-gbk"), Some(GBK));
assert_eq!(Encoding::for_label(b"greek"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"cp1250"), Some(WINDOWS_1250));
assert_eq!(Encoding::for_label(b"cp1251"), Some(WINDOWS_1251));
assert_eq!(Encoding::for_label(b"latin1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"gb2312"), Some(GBK));
assert_eq!(Encoding::for_label(b"cp1252"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"latin2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"cp1253"), Some(WINDOWS_1253));
assert_eq!(Encoding::for_label(b"latin3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"cp1254"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"latin4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"cp1255"), Some(WINDOWS_1255));
assert_eq!(Encoding::for_label(b"csbig5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"latin5"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"utf-16"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"cp1256"), Some(WINDOWS_1256));
assert_eq!(Encoding::for_label(b"ibm866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"latin6"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"cp1257"), Some(WINDOWS_1257));
assert_eq!(Encoding::for_label(b"cp1258"), Some(WINDOWS_1258));
assert_eq!(Encoding::for_label(b"greek8"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"ibm819"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"arabic"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"visual"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"korean"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"euc-jp"), Some(EUC_JP));
assert_eq!(Encoding::for_label(b"koi8-r"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"koi8_r"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"euc-kr"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"x-sjis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"koi8-u"), Some(KOI8_U));
assert_eq!(Encoding::for_label(b"hebrew"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"tis-620"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"gb18030"), Some(GB18030));
assert_eq!(Encoding::for_label(b"ksc5601"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"gb_2312"), Some(GBK));
assert_eq!(Encoding::for_label(b"dos-874"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"cn-big5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"unicode"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"chinese"), Some(GBK));
assert_eq!(Encoding::for_label(b"logical"), Some(ISO_8859_8_I));
assert_eq!(Encoding::for_label(b"cskoi8r"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"cseuckr"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"koi8-ru"), Some(KOI8_U));
assert_eq!(Encoding::for_label(b"x-cp1250"), Some(WINDOWS_1250));
assert_eq!(Encoding::for_label(b"ksc_5601"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"x-cp1251"), Some(WINDOWS_1251));
assert_eq!(Encoding::for_label(b"iso88591"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"csgb2312"), Some(GBK));
assert_eq!(Encoding::for_label(b"x-cp1252"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso88592"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"x-cp1253"), Some(WINDOWS_1253));
assert_eq!(Encoding::for_label(b"iso88593"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"ecma-114"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"x-cp1254"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso88594"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"x-cp1255"), Some(WINDOWS_1255));
assert_eq!(Encoding::for_label(b"iso88595"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"x-x-big5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"x-cp1256"), Some(WINDOWS_1256));
assert_eq!(Encoding::for_label(b"csibm866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"iso88596"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"x-cp1257"), Some(WINDOWS_1257));
assert_eq!(Encoding::for_label(b"iso88597"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"asmo-708"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"ecma-118"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"elot_928"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"x-cp1258"), Some(WINDOWS_1258));
assert_eq!(Encoding::for_label(b"iso88598"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso88599"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"cyrillic"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"utf-16be"), Some(UTF_16BE));
assert_eq!(Encoding::for_label(b"utf-16le"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"us-ascii"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"ms_kanji"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"x-euc-jp"), Some(EUC_JP));
assert_eq!(Encoding::for_label(b"iso885910"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso8859-1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso885911"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"iso8859-2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso8859-3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso885913"), Some(ISO_8859_13));
assert_eq!(Encoding::for_label(b"iso8859-4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso885914"), Some(ISO_8859_14));
assert_eq!(Encoding::for_label(b"iso8859-5"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso885915"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"iso8859-6"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso8859-7"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso8859-8"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-ir-58"), Some(GBK));
assert_eq!(Encoding::for_label(b"iso8859-9"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"csunicode"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"macintosh"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"shift-jis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"shift_jis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"iso-ir-100"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso8859-10"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso-ir-110"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"gb_2312-80"), Some(GBK));
assert_eq!(Encoding::for_label(b"iso-8859-1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso_8859-1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso-ir-101"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso8859-11"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"iso-8859-2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso_8859-2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"hz-gb-2312"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"iso-8859-3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso_8859-3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso8859-13"), Some(ISO_8859_13));
assert_eq!(Encoding::for_label(b"iso-8859-4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso_8859-4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso8859-14"), Some(ISO_8859_14));
assert_eq!(Encoding::for_label(b"iso-ir-144"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso-8859-5"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso_8859-5"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso8859-15"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"iso-8859-6"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso_8859-6"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-ir-126"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso-8859-7"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso_8859-7"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso-ir-127"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-ir-157"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso-8859-8"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso_8859-8"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-ir-138"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-ir-148"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso-8859-9"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso_8859-9"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso-ir-109"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso-ir-149"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"big5-hkscs"), Some(BIG5));
assert_eq!(Encoding::for_label(b"csshiftjis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"iso-8859-10"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso-8859-11"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"csisolatin1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"csisolatin2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso-8859-13"), Some(ISO_8859_13));
assert_eq!(Encoding::for_label(b"csisolatin3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso-8859-14"), Some(ISO_8859_14));
assert_eq!(Encoding::for_label(b"windows-874"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"csisolatin4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso-8859-15"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"iso_8859-15"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"csisolatin5"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso-8859-16"), Some(ISO_8859_16));
assert_eq!(Encoding::for_label(b"csisolatin6"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"windows-949"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"csisolatin9"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"csiso88596e"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"csiso88598e"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"unicodefffe"), Some(UTF_16BE));
assert_eq!(Encoding::for_label(b"unicodefeff"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"csmacintosh"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"csiso88596i"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"csiso88598i"), Some(ISO_8859_8_I));
assert_eq!(Encoding::for_label(b"windows-31j"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"x-mac-roman"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"iso-2022-cn"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"iso-2022-jp"), Some(ISO_2022_JP));
assert_eq!(Encoding::for_label(b"csiso2022jp"), Some(ISO_2022_JP));
assert_eq!(Encoding::for_label(b"iso-2022-kr"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"csiso2022kr"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"replacement"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"windows-1250"), Some(WINDOWS_1250));
assert_eq!(Encoding::for_label(b"windows-1251"), Some(WINDOWS_1251));
assert_eq!(Encoding::for_label(b"windows-1252"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"windows-1253"), Some(WINDOWS_1253));
assert_eq!(Encoding::for_label(b"windows-1254"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"windows-1255"), Some(WINDOWS_1255));
assert_eq!(Encoding::for_label(b"windows-1256"), Some(WINDOWS_1256));
assert_eq!(Encoding::for_label(b"windows-1257"), Some(WINDOWS_1257));
assert_eq!(Encoding::for_label(b"windows-1258"), Some(WINDOWS_1258));
assert_eq!(Encoding::for_label(b"iso-8859-6-e"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-8859-8-e"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-8859-6-i"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-8859-8-i"), Some(ISO_8859_8_I));
assert_eq!(Encoding::for_label(b"sun_eu_greek"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"csksc56011987"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"unicode20utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"unicode11utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"ks_c_5601-1987"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"ansi_x3.4-1968"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"ks_c_5601-1989"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"x-mac-cyrillic"), Some(X_MAC_CYRILLIC));
assert_eq!(Encoding::for_label(b"x-user-defined"), Some(X_USER_DEFINED));
assert_eq!(Encoding::for_label(b"csiso58gb231280"), Some(GBK));
assert_eq!(Encoding::for_label(b"iso-10646-ucs-2"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"iso_8859-1:1987"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso_8859-2:1987"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso_8859-6:1987"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso_8859-7:1987"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso_8859-3:1988"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso_8859-4:1988"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso_8859-5:1988"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso_8859-8:1988"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"x-unicode20utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"iso_8859-9:1989"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"csisolatingreek"), Some(ISO_8859_7));
assert_eq!(
Encoding::for_label(b"x-mac-ukrainian"),
Some(X_MAC_CYRILLIC)
);
assert_eq!(Encoding::for_label(b"iso-2022-cn-ext"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"csisolatinarabic"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"csisolatinhebrew"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"unicode-1-1-utf-8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"csisolatincyrillic"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"cseucpkdfmtjapanese"), Some(EUC_JP));
}
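// Note (illustrative, beyond the generated assertions above): label lookup is
// ASCII-case-insensitive and ignores leading/trailing ASCII whitespace, so e.g.
// `Encoding::for_label(b" UTF-8 ")` would also yield `Some(UTF_8)`.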

View File

@@ -0,0 +1,262 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
pub fn decode(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
let mut vec = Vec::with_capacity(bytes.len() + 32);
let mut string = String::with_capacity(expect.len() + 32);
let range = if cfg!(miri) {
0usize..4usize
} else {
0usize..32usize
};
for i in range {
vec.clear();
string.clear();
for j in 0usize..i {
let c = 0x40u8 + (j as u8);
vec.push(c);
string.push(c as char);
}
vec.extend_from_slice(bytes);
string.push_str(expect);
decode_without_padding_impl(encoding, &vec[..], &string[..], i);
}
}
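// For illustration: with `i == 3` the loop above prefixes both the input and the
// expectation with the ASCII bytes 0x40..0x43 ("@AB") and passes `padding == 3`, so the
// boundary splits used later never land inside the prefix while the real payload is
// decoded starting at varying alignments.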
pub fn decode_without_padding(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
decode_without_padding_impl(encoding, bytes, expect, 0);
}
fn decode_without_padding_impl(
encoding: &'static Encoding,
bytes: &[u8],
expect: &str,
padding: usize,
) {
decode_to_utf8_impl(encoding, bytes, expect, padding);
decode_to_utf16_impl(encoding, bytes, &utf16_from_utf8(expect)[..], padding);
decode_to_string(encoding, bytes, expect);
}
pub fn encode(encoding: &'static Encoding, str: &str, expect: &[u8]) {
let mut vec = Vec::with_capacity(expect.len() + 32);
let mut string = String::with_capacity(str.len() + 32);
let range = if cfg!(miri) {
0usize..4usize
} else {
0usize..32usize
};
for i in range {
vec.clear();
string.clear();
for j in 0usize..i {
let c = 0x40u8 + (j as u8);
vec.push(c);
string.push(c as char);
}
vec.extend_from_slice(expect);
string.push_str(str);
encode_without_padding(encoding, &string[..], &vec[..]);
}
}
pub fn encode_without_padding(encoding: &'static Encoding, string: &str, expect: &[u8]) {
encode_from_utf8(encoding, string, expect);
encode_from_utf16(encoding, &utf16_from_utf8(string)[..], expect);
encode_to_vec(encoding, string, expect);
}
pub fn decode_to_utf16(encoding: &'static Encoding, bytes: &[u8], expect: &[u16]) {
decode_to_utf16_impl(encoding, bytes, expect, 0);
}
pub fn decode_to_utf16_impl(
encoding: &'static Encoding,
bytes: &[u8],
expect: &[u16],
padding: usize,
) {
for i in padding..bytes.len() {
let (head, tail) = bytes.split_at(i);
decode_to_utf16_with_boundary(encoding, head, tail, expect);
}
}
pub fn decode_to_utf16_with_boundary(
encoding: &'static Encoding,
head: &[u8],
tail: &[u8],
expect: &[u16],
) {
let mut decoder = encoding.new_decoder();
let mut dest: Vec<u16> = Vec::with_capacity(
decoder
.max_utf16_buffer_length(head.len() + tail.len())
.unwrap(),
);
let capacity = dest.capacity();
dest.resize(capacity, 0u16);
let mut total_read = 0;
let mut total_written = 0;
{
let (complete, read, written, _) = decoder.decode_to_utf16(head, &mut dest, false);
match complete {
CoderResult::InputEmpty => {}
CoderResult::OutputFull => {
unreachable!();
}
}
total_read += read;
total_written += written;
}
{
let (complete, read, written, _) =
decoder.decode_to_utf16(tail, &mut dest[total_written..], true);
match complete {
CoderResult::InputEmpty => {}
CoderResult::OutputFull => {
unreachable!();
}
}
total_read += read;
total_written += written;
}
assert_eq!(total_read, head.len() + tail.len());
assert_eq!(total_written, expect.len());
dest.truncate(total_written);
assert_eq!(&dest[..], expect);
}
pub fn decode_to_utf8(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
decode_to_utf8_impl(encoding, bytes, expect, 0);
}
pub fn decode_to_utf8_impl(
encoding: &'static Encoding,
bytes: &[u8],
expect: &str,
padding: usize,
) {
for i in padding..bytes.len() {
let (head, tail) = bytes.split_at(i);
decode_to_utf8_with_boundary(encoding, head, tail, expect);
}
}
pub fn decode_to_utf8_with_boundary(
encoding: &'static Encoding,
head: &[u8],
tail: &[u8],
expect: &str,
) {
let mut decoder = encoding.new_decoder();
let mut dest: Vec<u8> = Vec::with_capacity(
decoder
.max_utf8_buffer_length(head.len() + tail.len())
.unwrap(),
);
let capacity = dest.capacity();
dest.resize(capacity, 0u8);
let mut total_read = 0;
let mut total_written = 0;
{
let (complete, read, written, _) = decoder.decode_to_utf8(head, &mut dest, false);
match complete {
CoderResult::InputEmpty => {}
CoderResult::OutputFull => {
unreachable!();
}
}
total_read += read;
total_written += written;
}
{
let (complete, read, written, _) =
decoder.decode_to_utf8(tail, &mut dest[total_written..], true);
match complete {
CoderResult::InputEmpty => {}
CoderResult::OutputFull => {
unreachable!();
}
}
total_read += read;
total_written += written;
}
assert_eq!(total_read, head.len() + tail.len());
assert_eq!(total_written, expect.len());
dest.truncate(total_written);
assert_eq!(&dest[..], expect.as_bytes());
}
pub fn decode_to_string(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
let (cow, _, _) = encoding.decode(bytes);
assert_eq!(&cow[..], expect);
}
pub fn encode_from_utf8(encoding: &'static Encoding, string: &str, expect: &[u8]) {
let mut encoder = encoding.new_encoder();
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
let capacity = dest.capacity();
dest.resize(capacity, 0u8);
let (complete, read, written, _) = encoder.encode_from_utf8(string, &mut dest, true);
match complete {
CoderResult::InputEmpty => {}
CoderResult::OutputFull => {
unreachable!();
}
}
assert_eq!(read, string.len());
assert_eq!(written, expect.len());
dest.truncate(written);
assert_eq!(&dest[..], expect);
}
pub fn encode_from_utf16(encoding: &'static Encoding, string: &[u16], expect: &[u8]) {
let mut encoder = encoding.new_encoder();
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
let capacity = dest.capacity();
dest.resize(capacity, 0u8);
let (complete, read, written, _) = encoder.encode_from_utf16(string, &mut dest, true);
match complete {
CoderResult::InputEmpty => {}
CoderResult::OutputFull => {
unreachable!();
}
}
assert_eq!(read, string.len());
// assert_eq!(written, expect.len());
dest.truncate(written);
assert_eq!(&dest[..], expect);
}
pub fn encode_to_vec(encoding: &'static Encoding, string: &str, expect: &[u8]) {
let (cow, _, _) = encoding.encode(string);
assert_eq!(&cow[..], expect);
}
pub fn utf16_from_utf8(string: &str) -> Vec<u16> {
let mut decoder = UTF_8.new_decoder_without_bom_handling();
let mut vec = Vec::with_capacity(decoder.max_utf16_buffer_length(string.len()).unwrap());
let capacity = vec.capacity();
vec.resize(capacity, 0);
let (result, read, written) =
decoder.decode_to_utf16_without_replacement(string.as_bytes(), &mut vec[..], true);
match result {
DecoderResult::InputEmpty => {
debug_assert_eq!(read, string.len());
vec.resize(written, 0);
vec
}
DecoderResult::Malformed(_, _) => unreachable!("Malformed"),
DecoderResult::OutputFull => unreachable!("Output full"),
}
}
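// Worked example (illustrative): `utf16_from_utf8("a\u{1F4A9}")` yields
// `[0x0061, 0xD83D, 0xDCA9]`, since U+1F4A9 - 0x10000 = 0xF4A9 splits into a high
// surrogate 0xD800 + (0xF4A9 >> 10) = 0xD83D and a low surrogate
// 0xDC00 + (0xF4A9 & 0x3FF) = 0xDCA9.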

472
zeroidc/vendor/encoding_rs/src/utf_16.rs vendored Normal file
View File

@@ -0,0 +1,472 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::handles::*;
use crate::variant::*;
pub struct Utf16Decoder {
lead_surrogate: u16, // If non-zero and pending_bmp == false, a pending lead surrogate
lead_byte: Option<u8>,
be: bool,
pending_bmp: bool, // if true, lead_surrogate is actually pending BMP
}
impl Utf16Decoder {
pub fn new(big_endian: bool) -> VariantDecoder {
VariantDecoder::Utf16(Utf16Decoder {
lead_surrogate: 0,
lead_byte: None,
be: big_endian,
pending_bmp: false,
})
}
pub fn additional_from_state(&self) -> usize {
1 + if self.lead_byte.is_some() { 1 } else { 0 }
+ if self.lead_surrogate == 0 { 0 } else { 2 }
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_add(
1,
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
)
}
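    // Worked example (illustrative): with both a pending lead byte and a pending lead
    // surrogate, `additional_from_state()` is 1 + 1 + 2 = 4, so
    // `max_utf16_buffer_length(10)` is 1 + (10 + 4) / 2 = 8 code units.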
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
checked_add(
1,
checked_mul(
3,
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
),
)
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_add(
1,
checked_mul(
3,
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
),
)
}
decoder_functions!(
{
if self.pending_bmp {
match dest.check_space_bmp() {
Space::Full(_) => {
return (DecoderResult::OutputFull, 0, 0);
}
Space::Available(destination_handle) => {
destination_handle.write_bmp(self.lead_surrogate);
self.pending_bmp = false;
self.lead_surrogate = 0;
}
}
}
},
{
// This is the fast path. The rest runs only at the
// start and end for partial sequences.
if self.lead_byte.is_none() && self.lead_surrogate == 0 {
if let Some((read, written)) = if self.be {
dest.copy_utf16_from::<BigEndian>(&mut source)
} else {
dest.copy_utf16_from::<LittleEndian>(&mut source)
} {
return (DecoderResult::Malformed(2, 0), read, written);
}
}
},
{
debug_assert!(!self.pending_bmp);
if self.lead_surrogate != 0 || self.lead_byte.is_some() {
// We need to check space without intent to write in order to
// make sure that there is space for the replacement character.
match dest.check_space_bmp() {
Space::Full(_) => {
return (DecoderResult::OutputFull, 0, 0);
}
Space::Available(_) => {
if self.lead_surrogate != 0 {
self.lead_surrogate = 0;
match self.lead_byte {
None => {
return (
DecoderResult::Malformed(2, 0),
src_consumed,
dest.written(),
);
}
Some(_) => {
self.lead_byte = None;
return (
DecoderResult::Malformed(3, 0),
src_consumed,
dest.written(),
);
}
}
}
debug_assert!(self.lead_byte.is_some());
self.lead_byte = None;
return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
}
}
}
},
{
match self.lead_byte {
None => {
self.lead_byte = Some(b);
continue;
}
Some(lead) => {
self.lead_byte = None;
let code_unit = if self.be {
u16::from(lead) << 8 | u16::from(b)
} else {
u16::from(b) << 8 | u16::from(lead)
};
let high_bits = code_unit & 0xFC00u16;
if high_bits == 0xD800u16 {
// high surrogate
if self.lead_surrogate != 0 {
// The previous high surrogate was in
// error and this one becomes the new
// pending one.
self.lead_surrogate = code_unit as u16;
return (
DecoderResult::Malformed(2, 2),
unread_handle.consumed(),
destination_handle.written(),
);
}
self.lead_surrogate = code_unit;
continue;
}
if high_bits == 0xDC00u16 {
// low surrogate
if self.lead_surrogate == 0 {
return (
DecoderResult::Malformed(2, 0),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
self.lead_surrogate = 0;
continue;
}
// bmp
if self.lead_surrogate != 0 {
// The previous high surrogate was in
// error and this code unit becomes a
// pending BMP character.
self.lead_surrogate = code_unit;
self.pending_bmp = true;
return (
DecoderResult::Malformed(2, 2),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_bmp(code_unit);
continue;
}
}
},
self,
src_consumed,
dest,
source,
b,
destination_handle,
unread_handle,
check_space_astral
);
}
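// Byte-order note (illustrative): the lead byte is the first byte of a pair, so for
// UTF-16BE the bytes 0x26 0x03 assemble as (0x26 << 8) | 0x03 = 0x2603 (U+2603, SNOWMAN),
// while UTF-16LE delivers the same code unit as 0x03 0x26.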
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_utf_16le(bytes: &[u8], expect: &str) {
decode_without_padding(UTF_16LE, bytes, expect);
}
fn decode_utf_16be(bytes: &[u8], expect: &str) {
decode_without_padding(UTF_16BE, bytes, expect);
}
fn encode_utf_16le(string: &str, expect: &[u8]) {
encode(UTF_16LE, string, expect);
}
fn encode_utf_16be(string: &str, expect: &[u8]) {
encode(UTF_16BE, string, expect);
}
#[test]
fn test_utf_16_decode() {
decode_utf_16le(b"", "");
decode_utf_16be(b"", "");
decode_utf_16le(b"\x61\x00\x62\x00", "\u{0061}\u{0062}");
decode_utf_16be(b"\x00\x61\x00\x62", "\u{0061}\u{0062}");
decode_utf_16le(b"\xFE\xFF\x00\x61\x00\x62", "\u{0061}\u{0062}");
decode_utf_16be(b"\xFF\xFE\x61\x00\x62\x00", "\u{0061}\u{0062}");
decode_utf_16le(b"\x61\x00\x62", "\u{0061}\u{FFFD}");
decode_utf_16be(b"\x00\x61\x00", "\u{0061}\u{FFFD}");
decode_utf_16le(b"\x3D\xD8\xA9", "\u{FFFD}");
decode_utf_16be(b"\xD8\x3D\xDC", "\u{FFFD}");
decode_utf_16le(b"\x3D\xD8\xA9\xDC\x03\x26", "\u{1F4A9}\u{2603}");
decode_utf_16be(b"\xD8\x3D\xDC\xA9\x26\x03", "\u{1F4A9}\u{2603}");
decode_utf_16le(b"\xA9\xDC\x03\x26", "\u{FFFD}\u{2603}");
decode_utf_16be(b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");
decode_utf_16le(b"\x3D\xD8\x03\x26", "\u{FFFD}\u{2603}");
decode_utf_16be(b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");
// The \xFF makes sure that the parts before and after have different alignment
let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
decode_utf_16le(&long_le[..long_le.len() / 2], long_expect);
decode_utf_16be(&long_be[..long_be.len() / 2], long_expect);
decode_utf_16le(&long_le[long_le.len() / 2 + 1..], long_expect);
decode_utf_16be(&long_be[long_be.len() / 2 + 1..], long_expect);
}
#[test]
fn test_utf_16_encode() {
// Empty
encode_utf_16be("", b"");
encode_utf_16le("", b"");
// Encodes as UTF-8
assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
encode_utf_16le("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
encode_utf_16be("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
}
#[test]
fn test_utf_16be_decode_one_by_one() {
let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
for b in input.chunks(1) {
assert_eq!(b.len(), 1);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert!(!had_errors);
}
}
#[test]
fn test_utf_16le_decode_one_by_one() {
let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
for b in input.chunks(1) {
assert_eq!(b.len(), 1);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert!(!had_errors);
}
}
#[test]
fn test_utf_16be_decode_three_at_a_time() {
let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
for b in input.chunks(3) {
assert_eq!(b.len(), 3);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, b.len());
assert!(!had_errors);
}
}
#[test]
fn test_utf_16le_decode_three_at_a_time() {
let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
for b in input.chunks(3) {
assert_eq!(b.len(), 3);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, b.len());
assert!(!had_errors);
}
}
#[test]
fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFF", &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
}
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(!had_errors);
assert_eq!(output[0], 0xFDFF);
}
}
#[test]
fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFE", &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
}
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(!had_errors);
assert_eq!(output[0], 0xFEFD);
}
}
#[test]
fn test_utf_16le_decode_bom_prefix() {
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFF", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(had_errors);
assert_eq!(output[0], 0xFFFD);
}
}
#[test]
fn test_utf_16be_decode_bom_prefix() {
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFE", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(had_errors);
assert_eq!(output[0], 0xFFFD);
}
}
#[test]
fn test_utf_16le_decode_near_end() {
let mut output = [0u8; 4];
let mut decoder = UTF_16LE.new_decoder();
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x03], &mut output[..], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
assert_eq!(output[0], 0x0);
}
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false);
assert_eq!(result, CoderResult::OutputFull);
assert_eq!(read, 1);
assert_eq!(written, 3);
assert!(!had_errors);
assert_eq!(output[0], 0xE2);
assert_eq!(output[1], 0x98);
assert_eq!(output[2], 0x83);
assert_eq!(output[3], 0x00);
}
}
#[test]
fn test_utf_16be_decode_near_end() {
let mut output = [0u8; 4];
let mut decoder = UTF_16BE.new_decoder();
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x26], &mut output[..], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
assert_eq!(output[0], 0x0);
}
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false);
assert_eq!(result, CoderResult::OutputFull);
assert_eq!(read, 1);
assert_eq!(written, 3);
assert!(!had_errors);
assert_eq!(output[0], 0xE2);
assert_eq!(output[1], 0x98);
assert_eq!(output[2], 0x83);
assert_eq!(output[3], 0x00);
}
}
}

1631
zeroidc/vendor/encoding_rs/src/utf_8.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,400 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.
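// As a sketch of the pattern used throughout this module (the names here are
// illustrative, not from this crate):
//
//     enum Variant { A(DecoderA), B(DecoderB) }
//     impl Variant {
//         fn max_len(&self, n: usize) -> Option<usize> {
//             match *self {
//                 Variant::A(ref v) => v.max_len(n),
//                 Variant::B(ref v) => v.max_len(n),
//             }
//         }
//     }
//
// i.e. one enum arm per concrete type and a `match` in every method, which keeps the
// wrapper `Sized` and stack-allocatable, unlike `Box<dyn Trait>` dispatch.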
use super::*;
use big5::*;
use euc_jp::*;
use euc_kr::*;
use gb18030::*;
use iso_2022_jp::*;
use replacement::*;
use shift_jis::*;
use single_byte::*;
use utf_16::*;
use utf_8::*;
use x_user_defined::*;
pub enum VariantDecoder {
SingleByte(SingleByteDecoder),
Utf8(Utf8Decoder),
Gb18030(Gb18030Decoder),
Big5(Big5Decoder),
EucJp(EucJpDecoder),
Iso2022Jp(Iso2022JpDecoder),
ShiftJis(ShiftJisDecoder),
EucKr(EucKrDecoder),
Replacement(ReplacementDecoder),
UserDefined(UserDefinedDecoder),
Utf16(Utf16Decoder),
}
impl VariantDecoder {
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
match *self {
VariantDecoder::SingleByte(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::Utf8(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::Gb18030(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::Big5(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::EucJp(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::Iso2022Jp(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::ShiftJis(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::EucKr(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::Replacement(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::UserDefined(ref v) => v.max_utf16_buffer_length(byte_length),
VariantDecoder::Utf16(ref v) => v.max_utf16_buffer_length(byte_length),
}
}
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
match *self {
VariantDecoder::SingleByte(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::Utf8(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::Gb18030(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::Big5(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::EucJp(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::Iso2022Jp(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::ShiftJis(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::EucKr(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::Replacement(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::UserDefined(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
VariantDecoder::Utf16(ref v) => {
v.max_utf8_buffer_length_without_replacement(byte_length)
}
}
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
match *self {
VariantDecoder::SingleByte(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::Utf8(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::Gb18030(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::Big5(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::EucJp(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::Iso2022Jp(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::ShiftJis(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::EucKr(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::Replacement(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::UserDefined(ref v) => v.max_utf8_buffer_length(byte_length),
VariantDecoder::Utf16(ref v) => v.max_utf8_buffer_length(byte_length),
}
}
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
last: bool,
) -> (DecoderResult, usize, usize) {
match *self {
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::Big5(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
}
}
pub fn decode_to_utf8_raw(
&mut self,
src: &[u8],
dst: &mut [u8],
last: bool,
) -> (DecoderResult, usize, usize) {
match *self {
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::Big5(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
}
}
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
match *self {
VariantDecoder::SingleByte(ref v) => {
return Some(v.latin1_byte_compatible_up_to(buffer));
}
VariantDecoder::Utf8(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::Gb18030(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::Big5(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::EucJp(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::Iso2022Jp(ref v) => {
if v.in_neutral_state() {
return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
}
return None;
}
VariantDecoder::ShiftJis(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::EucKr(ref v) => {
if !v.in_neutral_state() {
return None;
}
}
VariantDecoder::UserDefined(_) => {}
VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
return None;
}
};
Some(Encoding::ascii_valid_up_to(buffer))
}
}
pub enum VariantEncoder {
SingleByte(SingleByteEncoder),
Utf8(Utf8Encoder),
Gb18030(Gb18030Encoder),
Big5(Big5Encoder),
EucJp(EucJpEncoder),
Iso2022Jp(Iso2022JpEncoder),
ShiftJis(ShiftJisEncoder),
EucKr(EucKrEncoder),
UserDefined(UserDefinedEncoder),
}
impl VariantEncoder {
pub fn has_pending_state(&self) -> bool {
match *self {
VariantEncoder::Iso2022Jp(ref v) => v.has_pending_state(),
_ => false,
}
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
match *self {
VariantEncoder::SingleByte(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::Utf8(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::Gb18030(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::Big5(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::EucJp(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::Iso2022Jp(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::ShiftJis(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::EucKr(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
VariantEncoder::UserDefined(ref v) => {
v.max_buffer_length_from_utf16_without_replacement(u16_length)
}
}
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
match *self {
VariantEncoder::SingleByte(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::Utf8(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::Gb18030(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::Big5(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::EucJp(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::Iso2022Jp(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::ShiftJis(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::EucKr(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
VariantEncoder::UserDefined(ref v) => {
v.max_buffer_length_from_utf8_without_replacement(byte_length)
}
}
}
pub fn encode_from_utf16_raw(
&mut self,
src: &[u16],
dst: &mut [u8],
last: bool,
) -> (EncoderResult, usize, usize) {
match *self {
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::Big5(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
}
}
pub fn encode_from_utf8_raw(
&mut self,
src: &str,
dst: &mut [u8],
last: bool,
) -> (EncoderResult, usize, usize) {
match *self {
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::Big5(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
}
}
}
pub enum VariantEncoding {
SingleByte(&'static [u16; 128], u16, u8, u8),
Utf8,
Gbk,
Gb18030,
Big5,
EucJp,
Iso2022Jp,
ShiftJis,
EucKr,
Replacement,
Utf16Be,
Utf16Le,
UserDefined,
}
impl VariantEncoding {
pub fn new_variant_decoder(&self) -> VariantDecoder {
match *self {
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
VariantEncoding::Utf8 => Utf8Decoder::new(),
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
VariantEncoding::Big5 => Big5Decoder::new(),
VariantEncoding::EucJp => EucJpDecoder::new(),
VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
VariantEncoding::EucKr => EucKrDecoder::new(),
VariantEncoding::Replacement => ReplacementDecoder::new(),
VariantEncoding::UserDefined => UserDefinedDecoder::new(),
VariantEncoding::Utf16Be => Utf16Decoder::new(true),
VariantEncoding::Utf16Le => Utf16Decoder::new(false),
}
}
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
match *self {
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => {
SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length)
}
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
VariantEncoding::Big5 => Big5Encoder::new(encoding),
VariantEncoding::EucJp => EucJpEncoder::new(encoding),
VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
VariantEncoding::EucKr => EucKrEncoder::new(encoding),
VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
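            // Encoders are not constructed for UTF-16 or replacement; callers
            // are expected to go through an output encoding (UTF-8) first, so
            // these arms should be unreachable.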
VariantEncoding::Utf16Be | VariantEncoding::Replacement | VariantEncoding::Utf16Le => {
unreachable!()
}
}
}
pub fn is_single_byte(&self) -> bool {
match *self {
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
_ => false,
}
}
}

View File

@@ -0,0 +1,249 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::handles::*;
use crate::variant::*;
cfg_if! {
if #[cfg(feature = "simd-accel")] {
use simd_funcs::*;
use packed_simd::u16x8;
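        // Adds 0xF700 to every lane above 0x7F, mapping high bytes into
        // U+F780..=U+F7FF while leaving ASCII lanes unchanged.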
#[inline(always)]
fn shift_upper(unpacked: u16x8) -> u16x8 {
let highest_ascii = u16x8::splat(0x7F);
            unpacked + unpacked.gt(highest_ascii).select(u16x8::splat(0xF700), u16x8::splat(0))
        }
} else {
}
}
pub struct UserDefinedDecoder;
impl UserDefinedDecoder {
pub fn new() -> VariantDecoder {
VariantDecoder::UserDefined(UserDefinedDecoder)
}
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
Some(byte_length)
}
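    // Each byte at or above 0x80 decodes to a code point in U+F780..=U+F7FF,
    // which takes three bytes in UTF-8, hence the factor of three.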
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
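    // Decode: bytes below 0x80 map to the same ASCII code points; bytes
    // 0x80..=0xFF map to U+F780..=U+F7FF by adding 0xF700.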
decoder_function!(
{},
{},
{},
{
if b < 0x80 {
                // ASCII runs are not optimized here, because the input is expected to be binary data.
destination_handle.write_ascii(b);
continue;
}
destination_handle.write_upper_bmp(u16::from(b) + 0xF700);
continue;
},
self,
src_consumed,
dest,
source,
b,
destination_handle,
_unread_handle,
check_space_bmp,
decode_to_utf8_raw,
u8,
Utf8Destination
);
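    // Scalar decode to UTF-16: truncate to the shorter of source and
    // destination, then map each byte (ASCII unchanged, high bytes shifted
    // up by 0xF700).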
#[cfg(not(feature = "simd-accel"))]
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
_last: bool,
) -> (DecoderResult, usize, usize) {
let (pending, length) = if dst.len() < src.len() {
(DecoderResult::OutputFull, dst.len())
} else {
(DecoderResult::InputEmpty, src.len())
};
let src_trim = &src[..length];
let dst_trim = &mut dst[..length];
src_trim
.iter()
.zip(dst_trim.iter_mut())
.for_each(|(from, to)| {
*to = {
let unit = *from;
if unit < 0x80 {
u16::from(unit)
} else {
u16::from(unit) + 0xF700
}
}
});
(pending, length, length)
}
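    // SIMD decode to UTF-16: process 16 input bytes per iteration by
    // unpacking them into two u16x8 vectors and shifting the non-ASCII
    // lanes up by 0xF700; the remainder is handled by the scalar tail loop.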
#[cfg(feature = "simd-accel")]
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
_last: bool,
) -> (DecoderResult, usize, usize) {
let (pending, length) = if dst.len() < src.len() {
(DecoderResult::OutputFull, dst.len())
} else {
(DecoderResult::InputEmpty, src.len())
};
// Not bothering with alignment
let tail_start = length & !0xF;
let simd_iterations = length >> 4;
let src_ptr = src.as_ptr();
let dst_ptr = dst.as_mut_ptr();
for i in 0..simd_iterations {
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
let (first, second) = simd_unpack(input);
unsafe {
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
}
}
let src_tail = &src[tail_start..length];
let dst_tail = &mut dst[tail_start..length];
src_tail
.iter()
.zip(dst_tail.iter_mut())
.for_each(|(from, to)| {
*to = {
let unit = *from;
if unit < 0x80 {
u16::from(unit)
} else {
u16::from(unit) + 0xF700
}
}
});
(pending, length, length)
}
}
pub struct UserDefinedEncoder;
impl UserDefinedEncoder {
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::UserDefined(UserDefinedEncoder))
}
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
Some(u16_length)
}
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
Some(byte_length)
}
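    // Encode: ASCII passes through; U+F780..=U+F7FF map back to bytes
    // 0x80..=0xFF by subtracting 0xF700; any other code point is unmappable.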
encoder_functions!(
{},
{
if c <= '\u{7F}' {
// TODO optimize ASCII run
destination_handle.write_one(c as u8);
continue;
}
if c < '\u{F780}' || c > '\u{F7FF}' {
return (
EncoderResult::Unmappable(c),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_one((u32::from(c) - 0xF700) as u8);
continue;
},
self,
src_consumed,
source,
dest,
c,
destination_handle,
unread_handle,
check_space_one
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
fn decode_x_user_defined(bytes: &[u8], expect: &str) {
decode(X_USER_DEFINED, bytes, expect);
}
fn encode_x_user_defined(string: &str, expect: &[u8]) {
encode(X_USER_DEFINED, string, expect);
}
#[test]
fn test_x_user_defined_decode() {
// Empty
decode_x_user_defined(b"", "");
// ASCII
decode_x_user_defined(b"\x61\x62", "\u{0061}\u{0062}");
decode_x_user_defined(b"\x80\xFF", "\u{F780}\u{F7FF}");
decode_x_user_defined(b"\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62", "\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}");
}
#[test]
fn test_x_user_defined_encode() {
// Empty
encode_x_user_defined("", b"");
// ASCII
encode_x_user_defined("\u{0061}\u{0062}", b"\x61\x62");
encode_x_user_defined("\u{F780}\u{F7FF}", b"\x80\xFF");
encode_x_user_defined("\u{F77F}\u{F800}", b"&#63359;&#63488;");
}
#[test]
fn test_x_user_defined_from_two_low_surrogates() {
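        // Unpaired surrogates are replaced with U+FFFD, which is unmappable
        // in x-user-defined and is therefore emitted as a numeric character
        // reference.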
let expectation = b"&#65533;&#65533;";
let mut output = [0u8; 40];
let mut encoder = X_USER_DEFINED.new_encoder();
let (result, read, written, had_errors) =
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 2);
assert_eq!(written, expectation.len());
assert!(had_errors);
assert_eq!(&output[..written], expectation);
}
}