RPM build fix (reverted CI changes which will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.

This commit is contained in:
Adam Ierymenko
2022-06-08 07:32:16 -04:00
parent 373ca30269
commit d5ca4e5f52
12611 changed files with 2898014 additions and 284 deletions

1548
zeroidc/vendor/encoding_rs/src/ascii.rs vendored Normal file

File diff suppressed because it is too large Load Diff

427
zeroidc/vendor/encoding_rs/src/big5.rs vendored Normal file
View File

@@ -0,0 +1,427 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range32;
// Decoder state for Big5. `lead` is `Some` while a lead byte is pending,
// i.e. has been seen without its trail byte (presumably stored by the
// macro-generated decode functions below; their expansion is not visible
// in this file).
pub struct Big5Decoder {
lead: Option<u8>,
}
impl Big5Decoder {
// Returns a decoder with no pending lead byte, wrapped in
// `VariantDecoder::Big5`.
pub fn new() -> VariantDecoder {
VariantDecoder::Big5(Big5Decoder { lead: None })
}
// True when no lead byte is pending.
pub fn in_neutral_state(&self) -> bool {
self.lead.is_none()
}
// `byte_length` plus one if a lead byte is pending; `None` when the
// addition overflows `usize`.
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(match self.lead {
None => 0,
Some(_) => 1,
})
}
// Bound on the UTF-16 output size for `byte_length` further input bytes:
// (byte_length [+ 1 if a lead is pending]) + 1.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
// If there is a lead but the next byte isn't a valid trail, an
// error is generated for the lead (+1). Then another iteration checks
// space, which needs +1 to account for the possibility of astral
// output or combining pair.
checked_add(1, self.plus_one_if_lead(byte_length))
}
// Bound on the UTF-8 output size when malformed input is not replaced:
// 2 * (byte_length [+ 1 if a lead is pending]) + 2.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// No need to account for REPLACEMENT CHARACTERS.
// Cases:
// ASCII: 1 to 1
// Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
// lead set and first byte is trail: 1 to 4 worst case
//
// When checking for space for the last byte:
// no lead: the last byte must be ASCII (or fatal error): 1 to 1
// lead set: space for 4 bytes was already checked when reading the
// lead, hence the last lead and the last trail together are worst
// case 2 to 4.
//
// If lead set and the input is a single trail byte, the worst-case
// output is 4, so we need to add one before multiplying if lead is
// set.
//
// Finally, add two so that if input is non-zero, the output is at
// least 4.
checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
}
// Bound on the UTF-8 output size when malformed input is replaced:
// 3 * (byte_length [+ 1 if a lead is pending]) + 3.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
// If there is a lead but the next byte isn't a valid trail, an
// error is generated for the lead (+(1*3)). Then another iteration
// checks space, which needs +3 to account for the possibility of astral
// output or combining pair. In between start and end, the worst case
// is that every byte is bad: *3.
checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
}
// The decode functions come from a macro defined elsewhere in the crate.
// The two brace blocks are code fragments handed to it (lead-byte check,
// then trail-byte handling); the remaining arguments name the bindings,
// label and helper macros those fragments rely on.
ascii_compatible_two_byte_decoder_functions!(
{
// Lead fragment: evaluates to the lead minus 0x81, or returns early with
// `Malformed(1, 0)` when the byte lies outside 0x81..=0xFE.
// If lead is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let non_ascii_minus_offset =
non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0xFE - 0x81) {
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
non_ascii_minus_offset
},
{
// Trail fragment: turns (lead, trail) into a pointer and writes the
// decoded character, or returns early with a `Malformed` result.
// If trail is between 0x40 and 0x7E, inclusive,
// subtract offset 0x40. Else if trail is
// between 0xA1 and 0xFE, inclusive, subtract
// offset 0x62.
// TODO: Find out which range is more probable.
let mut trail_minus_offset =
byte.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start =
byte.wrapping_sub(0xA1);
if trail_minus_range_start >
(0xFE - 0xA1) {
// Invalid trail. A byte below 0x80 is handed back with `unread()` and
// the shorter `Malformed(1, 0)` is reported; any other byte is
// consumed and `Malformed(2, 0)` is reported.
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
// Only reached for bytes in 0xA1..=0xFE, so the subtraction cannot
// underflow; 0xA1 - 0x62 == 0x3F continues right after the indices
// 0x00..=0x3E produced by the 0x40..=0x7E range.
trail_minus_offset = byte - 0x62;
}
// 157 trail positions per lead: 0x3F from the low range plus 0x5E from
// the high range.
let pointer = lead_minus_offset as usize *
157usize +
trail_minus_offset as usize;
// 942 == (0x87 - 0x81) * 157, i.e. the rebased pointer counts from lead
// 0x87. Smaller pointers wrap to very large values instead of
// panicking (NOTE(review): presumably `big5_low_bits` yields 0 for
// those -- it is defined in `data`, confirm there).
let rebased_pointer = pointer.wrapping_sub(942);
let low_bits = big5_low_bits(rebased_pointer);
if low_bits == 0 {
// Zero low bits: only these four pointers are valid; each is written
// as a pair of code units through `write_big5_combination`.
match pointer {
1133 => {
handle.write_big5_combination(0x00CAu16,
0x0304u16)
}
1135 => {
handle.write_big5_combination(0x00CAu16,
0x030Cu16)
}
1164 => {
handle.write_big5_combination(0x00EAu16,
0x0304u16)
}
1166 => {
handle.write_big5_combination(0x00EAu16,
0x030Cu16)
}
_ => {
// Unmapped pointer: same error split as for an invalid trail above.
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
} else if big5_is_astral(rebased_pointer) {
// Astral entries store only the low 16 bits; the code point is
// 0x20000 | low_bits.
handle.write_astral(u32::from(low_bits) |
0x20000u32)
} else {
handle.write_bmp_excl_ascii(low_bits)
}
},
// Names used by the fragments above, followed by the loop label and the
// helper macros the expansion should use (roles are defined by the
// macro, which is not visible here).
self,
non_ascii,
byte,
lead_minus_offset,
unread_handle_trail,
source,
handle,
'outermost,
copy_ascii_from_check_space_astral,
check_space_astral,
false);
}
// Encoder for Big5. A unit struct: it carries no state.
pub struct Big5Encoder;
impl Big5Encoder {
// Builds an `Encoder` for `encoding` backed by `VariantEncoder::Big5`.
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
}
// Output bound for UTF-16 input: two bytes per code unit; `None` on
// overflow.
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
// Astral: 2 to 2
// ASCII: 1 to 1
// Other: 1 to 2
u16_length.checked_mul(2)
}
// Output bound for UTF-8 input: `byte_length + 1`; `None` on overflow.
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
// Astral: 4 to 2
// Upper BMP: 3 to 2
// Lower BMP: 2 to 2
// ASCII: 1 to 1
byte_length.checked_add(1)
}
// The encode functions come from a macro defined elsewhere in the crate.
// The first brace block handles a non-ASCII BMP code unit (`bmp`), the
// second an astral character (`astral`); the remaining arguments name the
// bindings and helper macros the fragments rely on.
ascii_compatible_encoder_functions!(
{
// For simplicity, unified ideographs
// in the pointer range 11206...11212 are handled
// as Level 1 Hanzi.
if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
handle.write_two(lead, trail)
} else {
// Fall back to the box and "other" lookups, in that order; if both
// miss, report the code unit as unmappable.
let pointer = if let Some(pointer) = big5_box_encode(bmp) {
pointer
} else if let Some(pointer) = big5_other_encode(bmp) {
pointer
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
};
// Inverse of the decoder's pointer arithmetic: 157 trails per lead,
// leads start at 0x81, trail indices below 0x3F map to 0x40..=0x7E
// and the rest to 0xA1 upwards (0x3F + 0x62 == 0xA1).
let lead = pointer / 157 + 0x81;
let remainder = pointer % 157;
let trail = if remainder < 0x3F {
remainder + 0x40
} else {
remainder + 0x62
};
handle.write_two(lead as u8, trail as u8)
}
},
{
// Only U+2008A..=U+2F8A6 can be mappable. The `as u16` cast keeps the
// low 16 bits of the code point, which is what the lookup takes.
if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
// big5_astral_encode returns rebased pointer,
// so adding 0x87 instead of 0x81.
let lead = rebased_pointer / 157 + 0x87;
let remainder = rebased_pointer % 157;
let trail = if remainder < 0x3F {
remainder + 0x40
} else {
remainder + 0x62
};
handle.write_two(lead as u8, trail as u8)
} else {
return (
EncoderResult::Unmappable(astral),
source.consumed(),
handle.written(),
);
}
} else {
return (
EncoderResult::Unmappable(astral),
source.consumed(),
handle.written(),
);
}
},
// Names used by the fragments above, followed by the helper macros the
// expansion should use (roles are defined by the macro, which is not
// visible here).
bmp,
astral,
self,
source,
handle,
copy_ascii_to_check_space_two,
check_space_two,
false
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper for `BIG5`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Runs the shared `encode` test helper for `BIG5`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    /// Feeds every `(input bytes, expected text)` pair to `decode_big5`, in order.
    fn decode_each(cases: &[(&[u8], &str)]) {
        for &(bytes, expect) in cases {
            decode_big5(bytes, expect);
        }
    }

    /// Feeds every `(input text, expected bytes)` pair to `encode_big5`, in order.
    fn encode_each(cases: &[(&str, &[u8])]) {
        for &(string, expect) in cases {
            encode_big5(string, expect);
        }
    }

    #[test]
    fn test_big5_decode() {
        decode_each(&[
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Edge cases
            (b"\x87\x40", "\u{43F0}"),
            (b"\xFE\xFE", "\u{79D4}"),
            (b"\xFE\xFD", "\u{2910D}"),
            (b"\x88\x62", "\u{00CA}\u{0304}"),
            (b"\x88\x64", "\u{00CA}\u{030C}"),
            (b"\x88\x66", "\u{00CA}"),
            (b"\x88\xA3", "\u{00EA}\u{0304}"),
            (b"\x88\xA5", "\u{00EA}\u{030C}"),
            (b"\x88\xA7", "\u{00EA}"),
            (b"\x99\xD4", "\u{8991}"),
            (b"\x99\xD5", "\u{27967}"),
            (b"\x99\xD6", "\u{8A29}"),
            // Edge cases surrounded with ASCII
            (b"\x61\x87\x40\x62", "\u{0061}\u{43F0}\u{0062}"),
            (b"\x61\xFE\xFE\x62", "\u{0061}\u{79D4}\u{0062}"),
            (b"\x61\xFE\xFD\x62", "\u{0061}\u{2910D}\u{0062}"),
            (b"\x61\x88\x62\x62", "\u{0061}\u{00CA}\u{0304}\u{0062}"),
            (b"\x61\x88\x64\x62", "\u{0061}\u{00CA}\u{030C}\u{0062}"),
            (b"\x61\x88\x66\x62", "\u{0061}\u{00CA}\u{0062}"),
            (b"\x61\x88\xA3\x62", "\u{0061}\u{00EA}\u{0304}\u{0062}"),
            (b"\x61\x88\xA5\x62", "\u{0061}\u{00EA}\u{030C}\u{0062}"),
            (b"\x61\x88\xA7\x62", "\u{0061}\u{00EA}\u{0062}"),
            (b"\x61\x99\xD4\x62", "\u{0061}\u{8991}\u{0062}"),
            (b"\x61\x99\xD5\x62", "\u{0061}\u{27967}\u{0062}"),
            (b"\x61\x99\xD6\x62", "\u{0061}\u{8A29}\u{0062}"),
            // Bad sequences
            (b"\x80\x61", "\u{FFFD}\u{0061}"),
            (b"\xFF\x61", "\u{FFFD}\u{0061}"),
            (b"\xFE\x39", "\u{FFFD}\u{0039}"),
            (b"\x87\x66", "\u{FFFD}\u{0066}"),
            (b"\x81\x40", "\u{FFFD}\u{0040}"),
            (b"\x61\x81", "\u{0061}\u{FFFD}"),
        ]);
    }

    #[test]
    fn test_big5_encode() {
        encode_each(&[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
        ]);
        if !cfg!(miri) {
            // Miri is too slow
            encode_each(&[
                // Edge cases
                ("\u{9EA6}\u{0061}", b"&#40614;\x61"),
                ("\u{2626B}\u{0061}", b"&#156267;\x61"),
                ("\u{3000}", b"\xA1\x40"),
                ("\u{20AC}", b"\xA3\xE1"),
                ("\u{4E00}", b"\xA4\x40"),
                ("\u{27607}", b"\xC8\xA4"),
                ("\u{FFE2}", b"\xC8\xCD"),
                ("\u{79D4}", b"\xFE\xFE"),
                // Not in index
                ("\u{2603}\u{0061}", b"&#9731;\x61"),
            ]);
        }
        encode_each(&[
            // duplicate low bits
            ("\u{203B5}", b"\xFD\x6A"),
            ("\u{25605}", b"\xFE\x46"),
            // prefer last
            ("\u{2550}", b"\xF9\xF9"),
        ]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        // Full-table round: the input fixture is expected to contain errors.
        let encoded = include_bytes!("test_data/big5_in.txt");
        let reference = include_str!("test_data/big5_in_ref.txt");
        let (decoded, had_errors) = BIG5.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        // Full-table round: every character of the fixture must be mappable.
        let text = include_str!("test_data/big5_out.txt");
        let reference = include_bytes!("test_data/big5_out_ref.txt");
        let (encoded, used_encoding, had_errors) = BIG5.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, BIG5);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // Two unpaired low surrogates: each is expected to come out as the
        // numeric character reference for U+FFFD.
        let expectation = b"&#65533;&#65533;";
        let lone_low_surrogates = [0xDC00u16, 0xDEDEu16];
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&lone_low_surrogates, &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}

114378
zeroidc/vendor/encoding_rs/src/data.rs vendored Normal file

File diff suppressed because it is too large Load Diff

469
zeroidc/vendor/encoding_rs/src/euc_jp.rs vendored Normal file
View File

@@ -0,0 +1,469 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
/// What the EUC-JP decoder has seen of a multi-byte sequence that is not
/// complete yet. The `u8` payloads carry a lead value (presumably already
/// offset-adjusted by the decode macro -- confirm against its definition).
enum EucJpPending {
    None,
    Jis0208Lead(u8),
    Jis0212Shift,
    Jis0212Lead(u8),
    HalfWidthKatakana,
}

impl EucJpPending {
    /// True only for `EucJpPending::None`.
    fn is_none(&self) -> bool {
        match self {
            EucJpPending::None => true,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::Jis0212Lead(_)
            | EucJpPending::HalfWidthKatakana => false,
        }
    }

    /// Returns 0, 1 or 2 depending on the variant -- presumably the number of
    /// input bytes already taken for the unfinished sequence (`Jis0212Lead`
    /// would then be the shift byte plus the lead byte).
    fn count(&self) -> usize {
        match self {
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::None => 0,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::HalfWidthKatakana => 1,
        }
    }
}
// Decoder state for EUC-JP: `pending` records the part of a multi-byte
// sequence that has been seen so far (`EucJpPending::None` when idle).
pub struct EucJpDecoder {
pending: EucJpPending,
}
impl EucJpDecoder {
// Returns a decoder with nothing pending, wrapped in
// `VariantDecoder::EucJp`.
pub fn new() -> VariantDecoder {
VariantDecoder::EucJp(EucJpDecoder {
pending: EucJpPending::None,
})
}
// True when no partial sequence is pending.
pub fn in_neutral_state(&self) -> bool {
self.pending.is_none()
}
// `byte_length` plus one if anything is pending; `None` when the addition
// overflows `usize`. Note that this adds one regardless of which pending
// variant is set.
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
}
// Bound on the UTF-16 output size: one code unit per input byte (plus one
// if something is pending).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
self.plus_one_if_lead(byte_length)
}
// Bound on the UTF-8 output size when malformed input is not replaced:
// len + (len + 1) / 2 + 2, where len is the adjusted byte length.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// worst case: 2 to 3
let len = self.plus_one_if_lead(byte_length);
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
}
// Bound on the UTF-8 output size when malformed input is replaced: three
// bytes per (adjusted) input byte.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_mul(3, self.plus_one_if_lead(byte_length))
}
// The decode functions come from a macro defined elsewhere in the crate.
// It receives four code fragments, in this order: JIS X 0208 trail
// handling, JIS X 0212 lead check, JIS X 0212 trail handling and
// half-width Katakana trail handling. The remaining arguments name the
// bindings the fragments rely on.
euc_jp_decoder_functions!(
{
let trail_minus_offset = byte.wrapping_sub(0xA1);
// Fast-track Hiragana (60% according to Lunde)
// and Katakana (10% according to Lunde).
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
// Hiragana
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
// Katakana
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
} else if trail_minus_offset > (0xFE - 0xA1) {
// Trail outside 0xA1..=0xFE. A byte below 0x80 is handed back with
// `unread()` and `Malformed(1, 0)` is reported; any other byte is
// consumed and `Malformed(2, 0)` is reported.
if byte < 0x80 {
return (
DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written(),
);
} else {
// 94 cells per row. Each table is probed with a single comparison:
// `wrapping_sub` makes pointers below the table start wrap to huge
// values that fail the `< len()` test. The starts are whole rows:
// 1410 == 15 * 94, 4418 == 47 * 94, 8272 == 88 * 94.
let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
let level1_pointer = pointer.wrapping_sub(1410);
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
} else {
let level2_pointer = pointer.wrapping_sub(4418);
if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
} else {
let ibm_pointer = pointer.wrapping_sub(8272);
if ibm_pointer < IBM_KANJI.len() {
handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else if let Some(bmp) = jis0208_range_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else {
// No table knows this pointer.
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
}
}
}
},
{
// JIS X 0212 lead fragment: evaluates to the lead minus 0xA1, or
// returns early with a `Malformed` result.
// If lead is between 0xA1 and 0xFE, inclusive,
// subtract 0xA1.
let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
if jis0212_lead_minus_offset > (0xFE - 0xA1) {
if lead < 0x80 {
return (
DecoderResult::Malformed(1, 0),
unread_handle_jis0212.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(2, 0),
unread_handle_jis0212.consumed(),
handle.written(),
);
}
jis0212_lead_minus_offset
},
{
// JIS X 0212 trail fragment. The `Malformed` lengths here are one larger
// than in the JIS X 0208 fragment (2 and 3 instead of 1 and 2).
// If trail is between 0xA1 and 0xFE, inclusive,
// subtract 0xA1.
let trail_minus_offset = byte.wrapping_sub(0xA1);
if trail_minus_offset > (0xFE - 0xA1) {
if byte < 0x80 {
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(3, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
// Kanji table first (starts at pointer 1410), then the accented-letter
// lookup, then two 11-entry runs mapped arithmetically:
// pointers 597..=607 -> U+0402.. and pointers 645..=655 -> U+0452..
let pointer_minus_kanji = pointer.wrapping_sub(1410);
if pointer_minus_kanji < JIS0212_KANJI.len() {
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
} else if let Some(bmp) = jis0212_accented_decode(pointer) {
handle.write_bmp_excl_ascii(bmp)
} else {
let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
if pointer_minus_upper_cyrillic <= (607 - 597) {
handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
} else {
let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
if pointer_minus_lower_cyrillic <= (655 - 645) {
handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
} else {
return (
DecoderResult::Malformed(3, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
}
}
},
{
// Half-width Katakana trail fragment: trail 0xA1..=0xDF maps linearly
// to U+FF61..=U+FF9F.
// If trail is between 0xA1 and 0xDF, inclusive,
// subtract 0xA1 and map to half-width Katakana.
let trail_minus_offset = byte.wrapping_sub(0xA1);
if trail_minus_offset > (0xDF - 0xA1) {
if byte < 0x80 {
return (
DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written(),
);
}
return (
DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written(),
);
}
handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
},
// Names used by the fragments above (roles are defined by the macro,
// which is not visible here).
self,
non_ascii,
jis0208_lead_minus_offset,
byte,
unread_handle_trail,
jis0212_lead_minus_offset,
lead,
unread_handle_jis0212,
source,
handle
);
}
/// Kanji lookup used when the `fast-kanji-encode` feature is on: defers
/// entirely to `jis0208_kanji_euc_jp_encode`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_euc_jp_encode(bmp)
}

/// Kanji lookup used when the `fast-kanji-encode` feature is off.
///
/// Returns the EUC-JP `(lead, trail)` bytes for `bmp`, or `None` when none
/// of the lookups below knows the code unit. The lookups run in a fixed
/// order: the U+4EDD special case, Level 1 Kanji, Level 2 and additional
/// Kanji, then `IBM_KANJI`.
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    // Ideograph on the symbol row!
    if bmp == 0x4EDD {
        return Some((0xA1, 0xB8));
    }
    if let Some(pair) = jis0208_level1_kanji_euc_jp_encode(bmp) {
        return Some(pair);
    }
    // Both remaining tables are laid out in rows of 94 cells; they differ
    // only in the lead byte of their first row.
    if let Some(index) = jis0208_level2_and_additional_kanji_encode(bmp) {
        return Some(((index / 94 + 0xD0) as u8, (index % 94 + 0xA1) as u8));
    }
    position(&IBM_KANJI[..], bmp)
        .map(|index| ((index / 94 + 0xF9) as u8, (index % 94 + 0xA1) as u8))
}
// Encoder for EUC-JP. A unit struct: it carries no state.
pub struct EucJpEncoder;
impl EucJpEncoder {
// Builds an `Encoder` for `encoding` backed by `VariantEncoder::EucJp`.
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
}
// Output bound for UTF-16 input: two bytes per code unit; `None` on
// overflow.
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
u16_length.checked_mul(2)
}
// Output bound for UTF-8 input: `byte_length + 1`; `None` on overflow.
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
byte_length.checked_add(1)
}
// The encode functions come from a macro defined elsewhere in the crate.
// The brace block handles one non-ASCII BMP code unit (`bmp`); the
// remaining arguments name the bindings and helper macros it relies on.
ascii_compatible_bmp_encoder_functions!(
{
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
// `wrapping_sub` + `<` tests "bmp in start..start + n" with one compare.
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
if bmp_minus_hiragana < 0x53 {
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
// Only Kanji are looked up in this range; a miss is unmappable.
if let Some((lead, trail)) = encode_kanji(bmp) {
handle.write_two(lead, trail)
} else {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
} else {
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
if bmp_minus_katakana < 0x56 {
handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
} else {
let bmp_minus_space = bmp.wrapping_sub(0x3000);
if bmp_minus_space < 3 {
// fast-track common punctuation
handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
} else if bmp == 0xA5 {
// U+00A5 and U+203E are written as the single bytes 0x5C and 0x7E.
handle.write_one(0x5Cu8)
} else if bmp == 0x203E {
handle.write_one(0x7Eu8)
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
// Half-width Katakana: 0x8E followed by a trail in 0xA1..=0xDF
// (bmp - 0xFEC0 fits in a byte for this range).
handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
} else if bmp == 0x2212 {
handle.write_two(0xA1u8, 0xDDu8)
} else if let Some(pointer) = jis0208_range_encode(bmp) {
// Pointer-based lookups: 94 cells per row, rows start at lead 0xA1.
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|| bmp == 0xF929
|| bmp == 0xF9DC
{
// Guaranteed to be found in IBM_KANJI
let pos = position(&IBM_KANJI[..], bmp).unwrap();
let lead = (pos / 94) + 0xF9;
let trail = (pos % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if let Some(pointer) = ibm_symbol_encode(bmp) {
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
let lead = (pointer / 94) + 0xA1;
let trail = (pointer % 94) + 0xA1;
handle.write_two(lead as u8, trail as u8)
} else {
// Nothing matched: report the code unit as unmappable.
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
}
}
},
// Names used by the fragment above, followed by the helper macros the
// expansion should use (roles are defined by the macro, which is not
// visible here).
bmp,
self,
source,
handle,
copy_ascii_to_check_space_two,
check_space_two,
false
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper for `EUC_JP`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Runs the shared `encode` test helper for `EUC_JP`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    /// Feeds every `(input bytes, expected text)` pair to `decode_euc_jp`, in order.
    fn decode_each(cases: &[(&[u8], &str)]) {
        for &(bytes, expect) in cases {
            decode_euc_jp(bytes, expect);
        }
    }

    #[test]
    fn test_euc_jp_decode() {
        decode_each(&[
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Half-width
            (b"\x8E\xA1", "\u{FF61}"),
            (b"\x8E\xDF", "\u{FF9F}"),
            (b"\x8E\xA0", "\u{FFFD}"),
            (b"\x8E\xE0", "\u{FFFD}"),
            (b"\x8E\xFF", "\u{FFFD}"),
            (b"\x8E", "\u{FFFD}"),
            // JIS 0212
            (b"\x8F\xA1\xA1", "\u{FFFD}"),
            (b"\x8F\xA2\xAF", "\u{02D8}"),
            (b"\x8F\xA2\xFF", "\u{FFFD}"),
            (b"\x8F\xA1", "\u{FFFD}"),
            (b"\x8F", "\u{FFFD}"),
            // JIS 0208
            (b"\xA1\xA1", "\u{3000}"),
            (b"\xA1\xA0", "\u{FFFD}"),
            (b"\xFC\xFE", "\u{FF02}"),
            (b"\xFE\xFE", "\u{FFFD}"),
            (b"\xA1", "\u{FFFD}"),
        ]);
        // Bad leads: 0xFF, 0xA0, then 0x80 through 0x8D, each followed by a
        // valid two-byte sequence.
        let bad_leads = [0xFFu8, 0xA0u8].iter().cloned().chain(0x80u8..=0x8Du8);
        for lead in bad_leads {
            decode_euc_jp(&[lead, 0xA1, 0xA1], "\u{FFFD}\u{3000}");
        }
        // Bad ASCII trail
        decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
    }

    #[test]
    fn test_euc_jp_encode() {
        let cases: &[(&str, &[u8])] = &[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Exceptional code points
            ("\u{00A5}", b"\x5C"),
            ("\u{203E}", b"\x7E"),
            ("\u{2212}", b"\xA1\xDD"),
            // Half-width
            ("\u{FF61}", b"\x8E\xA1"),
            ("\u{FF9F}", b"\x8E\xDF"),
            // JIS 0212
            ("\u{02D8}", b"&#728;"),
            // JIS 0208
            ("\u{3000}", b"\xA1\xA1"),
            ("\u{FF02}", b"\xFC\xFE"),
        ];
        for &(string, expect) in cases {
            encode_euc_jp(string, expect);
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_decode_all() {
        // Full-table round: the input fixture is expected to contain errors.
        let encoded = include_bytes!("test_data/jis0208_in.txt");
        let reference = include_str!("test_data/jis0208_in_ref.txt");
        let (decoded, had_errors) = EUC_JP.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_encode_all() {
        // Full-table round: every character of the fixture must be mappable.
        let text = include_str!("test_data/jis0208_out.txt");
        let reference = include_bytes!("test_data/jis0208_out_ref.txt");
        let (encoded, used_encoding, had_errors) = EUC_JP.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, EUC_JP);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0212_decode_all() {
        // Full-table round: the input fixture is expected to contain errors.
        let encoded = include_bytes!("test_data/jis0212_in.txt");
        let reference = include_str!("test_data/jis0212_in_ref.txt");
        let (decoded, had_errors) = EUC_JP.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }
}

442
zeroidc/vendor/encoding_rs/src/euc_kr.rs vendored Normal file
View File

@@ -0,0 +1,442 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
use super::in_range16;
// Decoder state for EUC-KR. `lead` is `Some` while a lead byte is pending,
// i.e. has been seen without its trail byte (presumably stored by the
// macro-generated decode functions below; their expansion is not visible
// in this file).
pub struct EucKrDecoder {
lead: Option<u8>,
}
impl EucKrDecoder {
// Returns a decoder with no pending lead byte, wrapped in
// `VariantDecoder::EucKr`.
pub fn new() -> VariantDecoder {
VariantDecoder::EucKr(EucKrDecoder { lead: None })
}
// True when no lead byte is pending.
pub fn in_neutral_state(&self) -> bool {
self.lead.is_none()
}
// `byte_length` plus one if a lead byte is pending; `None` when the
// addition overflows `usize`.
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(match self.lead {
None => 0,
Some(_) => 1,
})
}
// Bound on the UTF-16 output size: one code unit per input byte (plus one
// if a lead is pending).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
self.plus_one_if_lead(byte_length)
}
// Bound on the UTF-8 output size when malformed input is not replaced:
// len + (len + 1) / 2 + 2, where len is the adjusted byte length.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// worst case: 2 to 3
let len = self.plus_one_if_lead(byte_length);
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
}
// Bound on the UTF-8 output size when malformed input is replaced: three
// bytes per (adjusted) input byte.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_mul(3, self.plus_one_if_lead(byte_length))
}
// The decode functions come from a macro defined elsewhere in the crate.
// The two brace blocks are code fragments handed to it (lead-byte check,
// then trail-byte handling); the remaining arguments name the bindings,
// label and helper macros those fragments rely on.
ascii_compatible_two_byte_decoder_functions!(
{
// Lead fragment: evaluates to the lead minus 0x81, or returns early with
// `Malformed(1, 0)` when the byte lies outside 0x81..=0xFE.
// If lead is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let non_ascii_minus_offset =
non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0xFE - 0x81) {
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
non_ascii_minus_offset
},
{
// Trail fragment. Three areas are told apart: lead offset >= 0x20 with
// trail 0xA1..=0xFE (KS X 1001), lead offset >= 0x20 with any other
// trail (extension to the left), and lead offset < 0x20 (extension
// above).
if lead_minus_offset >= 0x20 {
// Not the extension range above KS X 1001
let trail_minus_offset =
byte.wrapping_sub(0xA1);
if trail_minus_offset <= (0xFE - 0xA1) {
// KS X 1001
// 94 cells per row, rows counted from lead offset 0x20. Tables are
// probed with `wrapping_sub` + `< len()`, so pointers below a
// table's first row wrap to huge values and fall through.
let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
if hangul_pointer < KSX1001_HANGUL.len() {
let upper_bmp = KSX1001_HANGUL[hangul_pointer];
handle.write_upper_bmp(upper_bmp)
} else if ksx_pointer < KSX1001_SYMBOLS.len() {
let bmp = KSX1001_SYMBOLS[ksx_pointer];
handle.write_bmp_excl_ascii(bmp)
} else {
let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
if hanja_pointer < KSX1001_HANJA.len() {
let upper_bmp = KSX1001_HANJA[hanja_pointer];
handle.write_upper_bmp(upper_bmp)
} else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
// A zero entry in the uppercase table is treated as unassigned.
if mid_bmp == 0 {
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
handle.write_mid_bmp(mid_bmp)
} else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
handle.write_mid_bmp(mid_bmp)
} else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
handle.write_upper_bmp(upper_bmp)
} else {
// Remaining cells: pointers counted from the third row, limited to
// 0x039F entries.
let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
if other_pointer < 0x039F {
let bmp = ksx1001_other_decode(other_pointer as u16);
// ASCII range means unassigned
if bmp < 0x80 {
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
handle.write_bmp_excl_ascii(bmp)
} else {
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
}
} else {
// Extension range to the left of
// KS X 1001
// Valid trails are 0x41..=0x5A, 0x61..=0x7A and 0x81..=0xA0; they are
// packed into the contiguous indices 0..=25, 26..=51 and 52..=83
// (84 == 190 - 94 - 12 cells per row).
let left_lead = lead_minus_offset - 0x20;
let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
byte - (12 + 0x41)
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
byte - (6 + 0x41)
} else if byte.wrapping_sub(0x41) < 0x1A {
byte - 0x41
} else {
// Invalid trail. A byte below 0x80 is handed back with `unread()` and
// `Malformed(1, 0)` is reported; any other byte is consumed and
// `Malformed(2, 0)` is reported.
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
};
let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
// Pointers at or past this bound have no mapping.
if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
handle.write_upper_bmp(upper_bmp)
} else {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
}
}
} else {
// Extension range above KS X 1001
// Valid trails are 0x41..=0x5A, 0x61..=0x7A and 0x81..=0xFE; they are
// packed into the contiguous indices 0..=25, 26..=51 and 52..=177
// (178 == 190 - 12 cells per row).
let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
byte - (12 + 0x41)
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
byte - (6 + 0x41)
} else if byte.wrapping_sub(0x41) < 0x1A {
byte - 0x41
} else {
if byte < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_trail.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_trail.consumed(),
handle.written());
};
let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
handle.write_upper_bmp(upper_bmp)
}
},
// Names used by the fragments above, followed by the loop label and the
// helper macros the expansion should use (roles are defined by the
// macro, which is not visible here).
self,
non_ascii,
byte,
lead_minus_offset,
unread_handle_trail,
source,
handle,
'outermost,
copy_ascii_from_check_space_bmp,
check_space_bmp,
true);
}
/// Runs the symbol, "other", Latin and box-drawing lookups for `bmp`, in a
/// fixed order, and returns the first hit as `(lead, trail)` byte values
/// (as `usize`). Returns `None` when every lookup misses.
fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
    // U+3000..=U+3015: search only the first 0xAB - 0x60 symbol cells.
    if in_inclusive_range16(bmp, 0x3000, 0x3015) {
        let leading_symbols = &KSX1001_SYMBOLS[..(0xAB - 0x60)];
        if let Some(index) = position(leading_symbols, bmp) {
            return Some((0xA1, index + 0xA1));
        }
    }

    // Pointer-based lookup: 94 cells per row, first row at lead 0x81 + 0x22.
    if let Some(other_pointer) = ksx1001_other_encode(bmp) {
        let pointer = other_pointer as usize;
        return Some((pointer / 94 + (0x81 + 0x22), pointer % 94 + 0xA1));
    }

    if in_range16(bmp, 0x00AA, 0x0168) {
        // Latin: lowercase table first, then uppercase.
        if let Some(index) = position(&KSX1001_LOWERCASE[..], bmp) {
            return Some((0x81 + 0x28, 0xA1 + index));
        }
        if let Some(index) = position(&KSX1001_UPPERCASE[..], bmp) {
            return Some((0x81 + 0x27, 0xA1 + index));
        }
    } else if in_range16(bmp, 0x2500, 0x254C) {
        if let Some(index) = position(&KSX1001_BOX[..], bmp) {
            return Some((0x81 + 0x25, 0xA1 + index));
        }
    }

    // Only these ranges are searched in the rest of the symbol table.
    let may_be_symbol = in_inclusive_range16(bmp, 0x2015, 0x266D)
        || in_inclusive_range16(bmp, 0x321C, 0x33D8)
        || in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
        || in_inclusive_range16(bmp, 0x00A1, 0x00F7)
        || in_inclusive_range16(bmp, 0x02C7, 0x02DD);
    if may_be_symbol {
        if let Some(index) = position(&KSX1001_SYMBOLS[3..], bmp) {
            // The search skipped the first three cells, so the first row
            // contributes only 94 - 3 indices before the second row starts.
            let first_row_len = 94 - 3;
            return Some(if index < first_row_len {
                (0xA1, index + 0xA1 + 3)
            } else {
                (0xA2, index - first_row_len + 0xA1)
            });
        }
    }

    None
}
/// Encodes a Hangul syllable as a `(lead, trail)` byte pair using table
/// lookups. Compiled when the `fast-hangul-encode` feature is disabled.
///
/// `bmp` is first looked up in `KSX1001_HANGUL` with a binary search. On a
/// miss, the pointer comes from `cp949_top_hangul_encode` (for code units
/// below U+C8A5) or from `cp949_left_hangul_encode` (otherwise).
/// The second argument is ignored by this variant.
#[cfg(not(feature = "fast-hangul-encode"))]
#[inline(always)]
fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
    if let Ok(ksx_pointer) = KSX1001_HANGUL.binary_search(&bmp) {
        // Found in the KS X 1001 table: rows of 94 cells.
        let lead = (ksx_pointer / 94) + (0x81 + 0x2F);
        let trail = (ksx_pointer % 94) + 0xA1;
        return (lead as u8, trail as u8);
    }
    let (lead, cp949_trail): (u8, u8) = if bmp < 0xC8A5 {
        // Above KS X 1001
        let pointer = cp949_top_hangul_encode(bmp) as usize;
        (
            ((pointer / (190 - 12)) + 0x81) as u8,
            (pointer % (190 - 12)) as u8,
        )
    } else {
        // To the left of KS X 1001
        let pointer = cp949_left_hangul_encode(bmp) as usize;
        (
            ((pointer / (190 - 94 - 12)) + (0x81 + 0x20)) as u8,
            (pointer % (190 - 94 - 12)) as u8,
        )
    };
    // The zero-based trail is shifted by one of three offsets depending on
    // which threshold it has reached.
    let offset: u8 = match cp949_trail {
        t if t >= (0x40 - 12) => 0x41 + 12,
        t if t >= (0x20 - 6) => 0x41 + 6,
        _ => 0x41,
    };
    (lead, cp949_trail + offset)
}
/// Encodes a Hangul syllable as a `(lead, trail)` byte pair by delegating to
/// `cp949_hangul_encode`. Compiled when the `fast-hangul-encode` feature is
/// enabled.
///
/// Only the second argument (the code unit minus the Hangul block start, as
/// computed by the caller) is used; the first is ignored.
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_bmp: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
    let (lead, trail) = cp949_hangul_encode(bmp_minus_hangul_start);
    (lead, trail)
}
/// Encodes a Hanja code unit as a `(lead, trail)` byte pair by searching
/// `KSX1001_HANJA`. Compiled when the `fast-hanja-encode` feature is disabled.
///
/// Returns `None` when `position` does not find `bmp` in the table.
#[cfg(not(feature = "fast-hanja-encode"))]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    position(&KSX1001_HANJA[..], bmp).map(|pointer| {
        // Rows of 94 cells.
        let lead = (pointer / 94) + (0x81 + 0x49);
        let trail = (pointer % 94) + 0xA1;
        (lead as u8, trail as u8)
    })
}
/// Encodes a Hanja code unit as a `(lead, trail)` byte pair via the dedicated
/// lookup functions. Compiled when the `fast-hanja-encode` feature is enabled.
///
/// Code units from U+F900 upwards go to the compatibility lookup, which
/// always yields a pair; everything below goes to the unified lookup, whose
/// `Option` result is returned as-is.
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    if bmp >= 0xF900 {
        return Some(ksx1001_compatibility_hangul_encode(bmp));
    }
    ksx1001_unified_hangul_encode(bmp)
}
/// Stateless EUC-KR encoder; its encode methods are generated by
/// `ascii_compatible_bmp_encoder_functions!`.
pub struct EucKrEncoder;

impl EucKrEncoder {
    /// Builds an `Encoder` for `encoding` backed by the
    /// `VariantEncoder::EucKr` variant.
    pub fn new(encoding: &'static Encoding) -> Encoder {
        let variant = VariantEncoder::EucKr(EucKrEncoder);
        Encoder::new(encoding, variant)
    }

    /// Returns `u16_length * 2`, or `None` if that overflows `usize`.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        2usize.checked_mul(u16_length)
    }

    /// Returns `byte_length + 1`, or `None` if that overflows `usize`.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        1usize.checked_add(byte_length)
    }

    ascii_compatible_bmp_encoder_functions!(
        {
            // Work out the two-byte form first; `None` means unmappable.
            let hangul_offset = bmp.wrapping_sub(0xAC00);
            let encoded: Option<(u8, u8)> = if hangul_offset < (0xD7A4 - 0xAC00) {
                // Hangul
                Some(ksx1001_encode_hangul(bmp, hangul_offset))
            } else if in_range16(bmp, 0x33DE, 0xFF01) {
                // Vast range that includes no other
                // mappables except Hangul (already
                // processed) and Hanja.
                // Narrow the range further to Unified and
                // Compatibility ranges of Hanja.
                if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
                    ksx1001_encode_hanja(bmp)
                } else {
                    None
                }
            } else {
                ksx1001_encode_misc(bmp).map(|(lead, trail)| (lead as u8, trail as u8))
            };
            match encoded {
                Some((lead, trail)) => handle.write_two(lead, trail),
                None => return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                ),
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        true
    );
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with the EUC-KR encoding.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the EUC-KR encoding.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    #[test]
    fn test_euc_kr_decode() {
        let cases: &[(&[u8], &str)] = &[
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            (b"\x81\x41", "\u{AC02}"),
            (b"\x81\x5B", "\u{FFFD}\x5B"),
            (b"\xFD\xFE", "\u{8A70}"),
            (b"\xFE\x41", "\u{FFFD}\x41"),
            (b"\xFF\x41", "\u{FFFD}\x41"),
            (b"\x80\x41", "\u{FFFD}\x41"),
            (b"\xA1\xFF", "\u{FFFD}"),
            (b"\x81\xFF", "\u{FFFD}"),
        ];
        for &(bytes, expect) in cases {
            decode_euc_kr(bytes, expect);
        }
    }

    #[test]
    fn test_euc_kr_encode() {
        let cases: &[(&str, &[u8])] = &[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            ("\u{AC02}", b"\x81\x41"),
            ("\u{8A70}", b"\xFD\xFE"),
        ];
        for &(string, expect) in cases {
            encode_euc_kr(string, expect);
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_decode_all() {
        let encoded = include_bytes!("test_data/euc_kr_in.txt");
        let reference = include_str!("test_data/euc_kr_in_ref.txt");
        let (decoded, had_errors) = EUC_KR.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_encode_all() {
        let text = include_str!("test_data/euc_kr_out.txt");
        let reference = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (encoded, used_encoding, had_errors) = EUC_KR.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, EUC_KR);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    fn test_euc_kr_encode_from_two_low_surrogates() {
        // Each unpaired low surrogate is expected to come out as the
        // numeric character reference for U+FFFD.
        let expectation = b"&#65533;&#65533;";
        let mut buffer = [0u8; 40];
        let mut encoder = EUC_KR.new_encoder();
        let unpaired = [0xDC00u16, 0xDEDEu16];
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&unpaired, &mut buffer[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&buffer[..written], expectation);
    }
}

View File

@@ -0,0 +1,767 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
use super::in_range16;
/// Zero to three bytes held by `Gb18030Decoder` in its `pending` field.
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for the `None` variant.
    fn is_none(&self) -> bool {
        if let Gb18030Pending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns how many bytes the variant carries (0 through 3).
    fn count(&self) -> usize {
        match self {
            Gb18030Pending::None => 0,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::Three(..) => 3,
        }
    }
}
// State for the gb18030 decoder. The decoder is in its neutral state when
// every field below is empty (see `in_neutral_state`).
pub struct Gb18030Decoder {
// NOTE(review): `first`, `second` and `third` are presumably the bytes of an
// incomplete multi-byte sequence carried over between calls; they are written
// inside `gb18030_decoder_functions!`, which is not visible here — confirm.
first: Option<u8>,
second: Option<u8>,
third: Option<u8>,
// Bytes queued for reprocessing. The four-byte error path stores the third
// byte (with 0x81 already subtracted) here as `Gb18030Pending::One`.
pending: Gb18030Pending,
// An ASCII byte queued by the error paths, which recompute the second byte
// as `second_minus_offset + 0x30` and store it here.
pending_ascii: Option<u8>,
}
impl Gb18030Decoder {
/// Creates a decoder with all state fields empty, wrapped in
/// `VariantDecoder::Gb18030`.
pub fn new() -> VariantDecoder {
VariantDecoder::Gb18030(Gb18030Decoder {
first: None,
second: None,
third: None,
pending: Gb18030Pending::None,
pending_ascii: None,
})
}
/// Returns `true` when no byte is held in any of the state fields.
pub fn in_neutral_state(&self) -> bool {
self.first.is_none()
&& self.second.is_none()
&& self.third.is_none()
&& self.pending.is_none()
&& self.pending_ascii.is_none()
}
/// Adds to `byte_length` the number of bytes currently held in the decoder
/// state (pending bytes, plus one for each of `first`, `second`, `third`
/// and `pending_ascii` that is set). Returns `None` if the addition to
/// `byte_length` overflows.
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_add(
self.pending.count()
+ match self.first {
None => 0,
Some(_) => 1,
}
+ match self.second {
None => 0,
Some(_) => 1,
}
+ match self.third {
None => 0,
Some(_) => 1,
}
+ match self.pending_ascii {
None => 0,
Some(_) => 1,
},
)
}
/// Returns the state-adjusted byte length plus one, computed with the
/// `checked_add` helper (which takes the `Option` from
/// `extra_from_state` as its second argument).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
// ASCII: 1 to 1 (worst case)
// gbk: 2 to 1
// ranges: 4 to 1 or 4 to 2
checked_add(1, self.extra_from_state(byte_length))
}
/// Same bound as `max_utf8_buffer_length`; this simply delegates to it.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
// ASCII: 1 to 1
// gbk: 2 to 2 or 2 to 3
// ranges: 4 to 2, 4 to 3 or 4 to 4
// 0x80: 1 to 3 (worst case)
self.max_utf8_buffer_length(byte_length)
}
/// Returns three times the state-adjusted byte length, plus one, computed
/// with the `checked_mul` and `checked_add` helpers.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
}
// The decode functions are generated by `gb18030_decoder_functions!`.
// The four block arguments below handle, in order: the first (non-ASCII)
// byte, the second byte of a two-byte sequence, the third byte, and the
// fourth byte of a four-byte sequence. The remaining arguments are the
// identifiers and the loop label that those blocks refer to.
gb18030_decoder_functions!(
{
// If first is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
if non_ascii_minus_offset > (0xFE - 0x81) {
// Byte 0x80 on its own decodes to U+20AC; any other
// out-of-range byte is reported as malformed.
if non_ascii == 0x80 {
handle.write_upper_bmp(0x20ACu16);
continue 'outermost;
}
return (DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written());
}
non_ascii_minus_offset
},
{
// Two-byte (or error)
if first_minus_offset >= 0x20 {
// Not the gbk ideograph range above GB2312
let trail_minus_offset = second.wrapping_sub(0xA1);
if trail_minus_offset <= (0xFE - 0xA1) {
// GB2312
let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
if hanzi_lead < (0x77 - 0x2F) {
// Level 1 Hanzi, Level 2 Hanzi
// or one of the 5 PUA code
// points in between.
let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
let upper_bmp = GB2312_HANZI[hanzi_pointer];
handle.write_upper_bmp(upper_bmp)
} else if first_minus_offset == 0x20 {
// Symbols (starting with ideographic space)
let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
handle.write_bmp_excl_ascii(bmp)
} else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
} else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
} else if first_minus_offset > 0x76 {
// Bottom PUA
let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
handle.write_upper_bmp(pua)
} else {
let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
handle.write_bmp_excl_ascii(bmp)
}
} else {
// gbk range on the left
let mut trail_minus_offset = second.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start = second.wrapping_sub(0x80);
if trail_minus_range_start > (0xA0 - 0x80) {
// A trail below 0x80 is unread so that it gets
// processed again; otherwise both bytes are consumed.
if second < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_second.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_second.consumed(),
handle.written());
}
trail_minus_offset = second - 0x41;
}
// Zero-base lead
let left_lead = first_minus_offset - 0x20;
let left_pointer = left_lead as usize * (190 - 94) +
trail_minus_offset as usize;
let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
handle.write_upper_bmp(upper_bmp)
} else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
let bmp = gbk_other_decode(left_pointer as u16);
handle.write_bmp_excl_ascii(bmp)
} else {
let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
let upper_bmp = GBK_BOTTOM[bottom_pointer];
handle.write_upper_bmp(upper_bmp)
}
}
} else {
// gbk ideograph range above GB2312
let mut trail_minus_offset = second.wrapping_sub(0x40);
if trail_minus_offset > (0x7E - 0x40) {
let trail_minus_range_start = second.wrapping_sub(0x80);
if trail_minus_range_start > (0xFE - 0x80) {
// Same error split as in the left-hand gbk range above.
if second < 0x80 {
return (DecoderResult::Malformed(1, 0),
unread_handle_second.unread(),
handle.written());
}
return (DecoderResult::Malformed(2, 0),
unread_handle_second.consumed(),
handle.written());
}
trail_minus_offset = second - 0x41;
}
let pointer = first_minus_offset as usize * 190usize +
trail_minus_offset as usize;
let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
handle.write_upper_bmp(upper_bmp)
}
},
{
// If third is between 0x81 and 0xFE, inclusive,
// subtract offset 0x81.
let third_minus_offset = third.wrapping_sub(0x81);
if third_minus_offset > (0xFE - 0x81) {
// We have an error. Let's inline what's going
// to happen when `second` is
// reprocessed. (`third` gets unread.)
// `second` is guaranteed ASCII, so let's
// put it in `pending_ascii`. Recompute
// `second` from `second_minus_offset`.
self.pending_ascii = Some(second_minus_offset + 0x30);
// Now unread `third` and designate the previous
// `first` as being in error.
return (DecoderResult::Malformed(1, 1),
unread_handle_third.unread(),
handle.written());
}
third_minus_offset
},
{
// If fourth is between 0x30 and 0x39, inclusive,
// subtract offset 0x30.
//
// If we have an error, we'll inline what's going
// to happen when `second` and `third` are
// reprocessed. (`fourth` gets unread.)
// `second` is guaranteed ASCII, so let's
// put it in `pending_ascii`. Recompute
// `second` from `second_minus_offset` to
// make this block reusable when `second`
// is not in scope.
//
// `third` is guaranteed to be in the range
// that makes it become the new `self.first`.
//
// `fourth` gets unread and the previous
// `first` gets designated as being in error.
let fourth_minus_offset = fourth.wrapping_sub(0x30);
if fourth_minus_offset > (0x39 - 0x30) {
self.pending_ascii = Some(second_minus_offset + 0x30);
self.pending = Gb18030Pending::One(third_minus_offset);
return (DecoderResult::Malformed(1, 2),
unread_handle_fourth.unread(),
handle.written());
}
// Linear pointer built from the four zero-based bytes: the fourth
// has weight 1, the third 10, the second 10 * 126 and the first
// 10 * 126 * 10.
let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
(second_minus_offset as usize * (10 * 126)) +
(third_minus_offset as usize * 10) +
fourth_minus_offset as usize;
if pointer <= 39419 {
// BMP
if pointer == 7457 {
handle.write_upper_bmp(0xE7C7)
} else {
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
}
} else if pointer >= 189_000 && pointer <= 1_237_575 {
// Astral
handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
} else {
// Pointer falls in neither range: all four bytes are malformed.
return (DecoderResult::Malformed(4, 0),
unread_handle_fourth.consumed(),
handle.written());
}
},
self,
non_ascii,
first_minus_offset,
second,
second_minus_offset,
unread_handle_second,
third,
third_minus_offset,
unread_handle_third,
fourth,
fourth_minus_offset,
unread_handle_fourth,
source,
handle,
'outermost);
}
// XXX Experiment with inline directives
/// Encodes a BMP code unit into a two-byte `(lead, trail)` pair, returned as
/// `usize` values, or returns `None` when none of the lookups below yields a
/// mapping.
///
/// The checks run in a fixed order and several of them return `None` early
/// for whole ranges, so the order of the sections matters. The encoder calls
/// this only after it has dealt with the U+4E00..U+9FA5 range, U+E5E5 and
/// (for GBK) the euro sign.
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
// Try ideographic punctuation first as it's the most likely case.
// Throwing in the check for full-width currencies and tilde is probably
// more size-efficient here than elsewhere.
if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
return Some((0xA1, pos + 0xA1));
}
}
// Ext A
if in_range16(bmp, 0x3400, 0x4E00) {
// Only GBK_BOTTOM[21..100] is searched; a miss means unmappable here.
return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
(
0xFE,
pos + if pos < (0x3F - 16) {
0x40 + 16
} else {
0x41 + 16
},
)
});
}
// Compatibility ideographs
if in_range16(bmp, 0xF900, 0xFB00) {
// Only the first 21 entries of GBK_BOTTOM are searched.
return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
if pos < 5 {
// end of second to last row
(0xFD, pos + (190 - 94 - 5 + 0x41))
} else {
// last row
(0xFE, pos + (0x40 - 5))
}
});
}
// Handle everything below U+02CA, which is in GBK_OTHER.
if bmp < 0x02CA {
if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
// Pinyin except U+1E3F
if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
return Some((0xA8, pos + 0xA1));
}
} else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
|| in_inclusive_range16(bmp, 0x02C7, 0x02C9)
{
// Diacritics and Latin 1 symbols
if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
return Some((0xA1, pos + 0xA1 + 3));
}
}
return None;
}
if bmp >= 0xE794 {
// Various brackets, all in PUA or full-width regions
if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
}
} else if bmp == 0x1E3F {
// The one Pinyin placed elsewhere on the BMP
return Some((0xA8, 0x7B - 0x60 + 0xA1));
} else if in_range16(bmp, 0xA000, 0xD800) {
// Since Korean has usage in China, let's spend a branch to fast-track
// Hangul.
return None;
}
// GB2312 other (except bottom PUA and PUA between Hanzi levels).
if let Some(other_pointer) = gb2312_other_encode(bmp) {
// Rows of 94 cells.
let other_lead = other_pointer as usize / 94;
let other_trail = other_pointer as usize % 94;
return Some((0xA2 + other_lead, 0xA1 + other_trail));
}
// At this point, we've handled all mappable characters above U+02D9 but
// below U+2010. Let's check for that range in order to let lower BMP
// characters used for minority languages in China avoid the subsequent
// search that deals mainly with various symbols.
if in_range16(bmp, 0x02DA, 0x2010) {
return None;
}
// GBK other (except radicals and PUA in GBK_BOTTOM).
if let Some(other_pointer) = gbk_other_encode(bmp) {
let other_lead = other_pointer as usize / (190 - 94);
let other_trail = other_pointer as usize % (190 - 94);
// Zero-based trails below 0x3F get offset 0x40, the rest 0x41, so the
// resulting trail byte is never 0x7F.
let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
return Some((other_lead + (0x81 + 0x20), other_trail + offset));
}
// CJK Radicals Supplement or PUA in GBK_BOTTOM
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
let trail = pos + 16;
let offset = if trail < 0x3F { 0x40 } else { 0x41 };
return Some((0xFE, trail + offset));
}
}
// GB2312 bottom PUA
let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
}
// PUA between Hanzi Levels
let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
if bmp_minus_pua_between_hanzi < 5 {
return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
}
None
}
/// Encodes a CJK Unified Ideograph as a `(lead, trail)` byte pair using table
/// lookups. Compiled when the `fast-gb-hanzi-encode` feature is disabled.
///
/// The lookups are tried in order: GB2312 level 1, GB2312 level 2, then the
/// GBK ideograph tables ("top" for code units below U+72DC, "left"
/// otherwise). The second argument is ignored by this variant.
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
#[inline(always)]
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
    if let Some(level1_pair) = gb2312_level1_hanzi_encode(bmp) {
        return level1_pair;
    }
    if let Some(level2_pointer) = gb2312_level2_hanzi_encode(bmp) {
        // Rows of 94 cells.
        let lead = (level2_pointer / 94) + (0xD8);
        let trail = (level2_pointer % 94) + 0xA1;
        return (lead as u8, trail as u8);
    }
    let (lead, gbk_trail) = if bmp < 0x72DC {
        // Above GB2312
        let top_pointer = gbk_top_ideograph_encode(bmp) as usize;
        ((top_pointer / 190) + 0x81, top_pointer % 190)
    } else {
        // To the left of GB2312
        let left_pointer = gbk_left_ideograph_encode(bmp) as usize;
        (
            (left_pointer / (190 - 94)) + (0x81 + 0x29),
            left_pointer % (190 - 94),
        )
    };
    // Zero-based trails below 0x3F are offset by 0x40, the rest by 0x41.
    let trail = if gbk_trail >= 0x3F {
        gbk_trail + 0x41
    } else {
        gbk_trail + 0x40
    };
    (lead as u8, trail as u8)
}
/// Encodes a CJK Unified Ideograph as a `(lead, trail)` byte pair by
/// delegating to `gbk_hanzi_encode`. Compiled when the
/// `fast-gb-hanzi-encode` feature is enabled.
///
/// Only the second argument (the code unit minus the start of the unified
/// range, as computed by the caller) is used; the first is ignored.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_bmp: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    let (lead, trail) = gbk_hanzi_encode(bmp_minus_unified_start);
    (lead, trail)
}
/// Encoder shared by gb18030 and GBK.
pub struct Gb18030Encoder {
    // `true` enables the four-byte forms (gb18030); `false` restricts output
    // to one- and two-byte forms and maps U+20AC to the single byte 0x80.
    extended: bool,
}

impl Gb18030Encoder {
    /// Builds an `Encoder` for `encoding`; `extended_range` selects whether
    /// four-byte sequences may be produced.
    pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
        let variant = VariantEncoder::Gb18030(Gb18030Encoder {
            extended: extended_range,
        });
        Encoder::new(encoding, variant)
    }

    /// Returns `u16_length * 4` in extended mode and `u16_length * 2 + 2`
    /// otherwise; `None` on overflow.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        if !self.extended {
            // Need to add, because space check is done with the four-byte
            // assumption.
            return checked_add(2, u16_length.checked_mul(2));
        }
        u16_length.checked_mul(4)
    }

    /// Returns `byte_length * 2 + 2` in extended mode and `byte_length + 3`
    /// otherwise; `None` on overflow.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        if !self.extended {
            // 1 to 1
            // 2 to 2
            // 3 to 2
            // Need to add, because space check is done with the four-byte
            // assumption.
            return byte_length.checked_add(3);
        }
        // 1 to 1
        // 2 to 2
        // 3 to 2
        // 2 to 4 (worst)
        // 3 to 4
        // 4 to 4
        checked_add(2, byte_length.checked_mul(2))
    }

    ascii_compatible_encoder_functions!(
        {
            let unified_offset = bmp.wrapping_sub(0x4E00);
            if unified_offset < (0x9FA6 - 0x4E00) {
                // CJK Unified Ideographs
                // Can't fail now, since all are
                // mapped.
                let (lead, trail) = encode_hanzi(bmp, unified_offset);
                handle.write_two(lead, trail)
            } else if bmp == 0xE5E5 {
                // It's not optimal to check for the unmappable
                // and for euro at this stage, but getting
                // them out of the way makes the rest of the
                // code less messy.
                return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                );
            } else if bmp == 0x20AC && !self.extended {
                handle.write_one(0x80u8)
            } else if let Some((lead, trail)) = gbk_encode_non_unified(bmp) {
                handle.write_two(lead as u8, trail as u8)
            } else if !self.extended {
                return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                );
            } else {
                // Four-byte form: split the range pointer into mixed-radix
                // digits (10 * 126 * 10, 10 * 126, 10, 1).
                let pointer = gb18030_range_encode(bmp);
                let digit1 = pointer / (10 * 126 * 10);
                let after1 = pointer % (10 * 126 * 10);
                let digit2 = after1 / (10 * 126);
                let after2 = after1 % (10 * 126);
                let digit3 = after2 / 10;
                let digit4 = after2 % 10;
                handle.write_four(
                    (digit1 + 0x81) as u8,
                    (digit2 + 0x30) as u8,
                    (digit3 + 0x81) as u8,
                    (digit4 + 0x30) as u8,
                )
            }
        },
        {
            if !self.extended {
                return (
                    EncoderResult::Unmappable(astral),
                    source.consumed(),
                    handle.written(),
                );
            }
            // Astral code points map linearly onto pointers from 189_000.
            let pointer = astral as usize + (189_000usize - 0x1_0000usize);
            let digit1 = pointer / (10 * 126 * 10);
            let after1 = pointer % (10 * 126 * 10);
            let digit2 = after1 / (10 * 126);
            let after2 = after1 % (10 * 126);
            let digit3 = after2 / 10;
            let digit4 = after2 % 10;
            handle.write_four(
                (digit1 + 0x81) as u8,
                (digit2 + 0x30) as u8,
                (digit3 + 0x81) as u8,
                (digit4 + 0x30) as u8,
            )
        },
        bmp,
        astral,
        self,
        source,
        handle,
        copy_ascii_to_check_space_four,
        check_space_four,
        false
    );
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with the gb18030 encoding.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the gb18030 encoding.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Runs the shared `encode` test helper with the GBK encoding.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    /// Feeds each `(input, expected)` pair to `encode_gb18030`, in order.
    fn encode_gb18030_cases(cases: &[(&str, &[u8])]) {
        for &(string, expect) in cases {
            encode_gb18030(string, expect);
        }
    }

    /// Feeds each `(input, expected)` pair to `encode_gbk`, in order.
    fn encode_gbk_cases(cases: &[(&str, &[u8])]) {
        for &(string, expect) in cases {
            encode_gbk(string, expect);
        }
    }

    #[test]
    fn test_gb18030_decode() {
        let cases: &[(&[u8], &str)] = &[
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // euro
            (b"\x80", "\u{20AC}"),
            (b"\xA2\xE3", "\u{20AC}"),
            // two bytes
            (b"\x81\x40", "\u{4E02}"),
            (b"\x81\x7E", "\u{4E8A}"),
            (b"\x81\x7F", "\u{FFFD}\u{007F}"),
            (b"\x81\x80", "\u{4E90}"),
            (b"\x81\xFE", "\u{4FA2}"),
            (b"\xFE\x40", "\u{FA0C}"),
            (b"\xFE\x7E", "\u{E843}"),
            (b"\xFE\x7F", "\u{FFFD}\u{007F}"),
            (b"\xFE\x80", "\u{4723}"),
            (b"\xFE\xFE", "\u{E4C5}"),
            // The difference from the original GB18030
            (b"\xA3\xA0", "\u{3000}"),
            (b"\xA1\xA1", "\u{3000}"),
            // 0xFF
            (b"\xFF\x40", "\u{FFFD}\u{0040}"),
            (b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"), // not \u{FFFD}\u{FFFD}\u{0033} !
            (b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"), // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
            (b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}"),
            (b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}"),
            (
                b"\xFF\x32\x9A\x33\x00",
                "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
            ),
            // Four bytes
            (b"\x81\x30\x81\x30", "\u{0080}"),
            (b"\x81\x35\xF4\x37", "\u{E7C7}"),
            (b"\x81\x37\xA3\x30", "\u{2603}"),
            (b"\x94\x39\xDA\x33", "\u{1F4A9}"),
            (b"\xE3\x32\x9A\x35", "\u{10FFFF}"),
            (b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}"),
            (b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}"),
            (b"\xE3\x32\x9A", "\u{FFFD}"), // not \u{FFFD}\u{0032}\u{FFFD} !
            (b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}"),
        ];
        for &(bytes, expect) in cases {
            decode_gb18030(bytes, expect);
        }
    }

    #[test]
    fn test_gb18030_encode() {
        encode_gb18030_cases(&[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // euro
            ("\u{20AC}", b"\xA2\xE3"),
            // two bytes
            ("\u{4E02}", b"\x81\x40"),
            ("\u{4E8A}", b"\x81\x7E"),
        ]);
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030_cases(&[
                ("\u{4E90}", b"\x81\x80"),
                ("\u{4FA2}", b"\x81\xFE"),
                ("\u{FA0C}", b"\xFE\x40"),
                ("\u{E843}", b"\xFE\x7E"),
                ("\u{4723}", b"\xFE\x80"),
                ("\u{E4C5}", b"\xFE\xFE"),
            ]);
        }
        encode_gb18030_cases(&[
            // The difference from the original GB18030
            ("\u{E5E5}", b"&#58853;"),
            ("\u{3000}", b"\xA1\xA1"),
            // Four bytes
            ("\u{0080}", b"\x81\x30\x81\x30"),
            ("\u{E7C7}", b"\x81\x35\xF4\x37"),
        ]);
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030_cases(&[
                ("\u{2603}", b"\x81\x37\xA3\x30"),
                ("\u{1F4A9}", b"\x94\x39\xDA\x33"),
                ("\u{10FFFF}", b"\xE3\x32\x9A\x35"),
            ]);
        }
        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");
    }

    #[test]
    fn test_gbk_encode() {
        encode_gbk_cases(&[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // euro
            ("\u{20AC}", b"\x80"),
            // two bytes
            ("\u{4E02}", b"\x81\x40"),
            ("\u{4E8A}", b"\x81\x7E"),
        ]);
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk_cases(&[
                ("\u{4E90}", b"\x81\x80"),
                ("\u{4FA2}", b"\x81\xFE"),
                ("\u{FA0C}", b"\xFE\x40"),
                ("\u{E843}", b"\xFE\x7E"),
                ("\u{4723}", b"\xFE\x80"),
                ("\u{E4C5}", b"\xFE\xFE"),
            ]);
        }
        encode_gbk_cases(&[
            // The difference from the original gb18030
            ("\u{E5E5}", b"&#58853;"),
            ("\u{3000}", b"\xA1\xA1"),
            // Four bytes
            ("\u{0080}", b"&#128;"),
            ("\u{E7C7}", b"&#59335;"),
        ]);
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk_cases(&[
                ("\u{2603}", b"&#9731;"),
                ("\u{1F4A9}", b"&#128169;"),
                ("\u{10FFFF}", b"&#1114111;"),
            ]);
        }
        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let encoded = include_bytes!("test_data/gb18030_in.txt");
        let reference = include_str!("test_data/gb18030_in_ref.txt");
        let (decoded, had_errors) = GB18030.decode_without_bom_handling(encoded);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let text = include_str!("test_data/gb18030_out.txt");
        let reference = include_bytes!("test_data/gb18030_out_ref.txt");
        let (encoded, used_encoding, had_errors) = GB18030.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, GB18030);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        // The output slice is cut to exactly the advertised maximum, so the
        // encoder's space check has to pass with that size.
        let mut buffer = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        let needed = encoder
            .max_buffer_length_from_utf16_without_replacement(1)
            .unwrap();
        let (result, read, written) =
            encoder.encode_from_utf16_without_replacement(&[0x3000], &mut buffer[..needed], true);
        assert_eq!(result, EncoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 2);
        assert_eq!(buffer[0], 0xA1);
        assert_eq!(buffer[1], 0xA1);
    }
}

1969
zeroidc/vendor/encoding_rs/src/handles.rs vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

6113
zeroidc/vendor/encoding_rs/src/lib.rs vendored Normal file

File diff suppressed because it is too large Load Diff

1622
zeroidc/vendor/encoding_rs/src/macros.rs vendored Normal file

File diff suppressed because it is too large Load Diff

3356
zeroidc/vendor/encoding_rs/src/mem.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,104 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::variant::*;
/// Decoder that reports a single malformed sequence for any non-empty input
/// and then treats all further input as consumed.
pub struct ReplacementDecoder {
    // Set once the malformed sequence has been reported.
    emitted: bool,
}

impl ReplacementDecoder {
    /// Creates a decoder that has not reported anything yet, wrapped in
    /// `VariantDecoder::Replacement`.
    pub fn new() -> VariantDecoder {
        let decoder = ReplacementDecoder { emitted: false };
        VariantDecoder::Replacement(decoder)
    }

    /// Always `Some(1)`; the argument is ignored.
    // NOTE(review): the parameter is called `_u16_length`, while the sibling
    // methods here call theirs `_byte_length`; presumably this is also the
    // input byte count — confirm before renaming.
    pub fn max_utf16_buffer_length(&self, _u16_length: usize) -> Option<usize> {
        Some(1)
    }

    /// Always `Some(3)`; the argument is ignored.
    pub fn max_utf8_buffer_length_without_replacement(&self, _byte_length: usize) -> Option<usize> {
        Some(3)
    }

    /// Always `Some(3)`; the argument is ignored.
    pub fn max_utf8_buffer_length(&self, _byte_length: usize) -> Option<usize> {
        Some(3)
    }

    /// Shared body of the two raw decode methods.
    ///
    /// `src_len` is the input length and `dst_too_small` says whether the
    /// output buffer fails the caller's minimum-size check.
    fn decode_raw(&mut self, src_len: usize, dst_too_small: bool) -> (DecoderResult, usize, usize) {
        // Don't err if the input stream is empty. See
        // https://github.com/whatwg/encoding/issues/33
        if self.emitted || src_len == 0 {
            return (DecoderResult::InputEmpty, src_len, 0);
        }
        // Make sure there's room for the replacement character.
        if dst_too_small {
            return (DecoderResult::OutputFull, 0, 0);
        }
        self.emitted = true;
        (DecoderResult::Malformed(1, 0), 1, 0)
    }

    /// UTF-16 variant: the output needs room for at least one code unit.
    pub fn decode_to_utf16_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u16],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        self.decode_raw(src.len(), dst.is_empty())
    }

    /// UTF-8 variant: the output needs room for at least three bytes.
    pub fn decode_to_utf8_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u8],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        self.decode_raw(src.len(), dst.len() < 3)
    }
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode_without_padding` test helper with the
    /// replacement encoding.
    fn decode_replacement(bytes: &[u8], expect: &str) {
        decode_without_padding(REPLACEMENT, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the replacement encoding.
    fn encode_replacement(string: &str, expect: &[u8]) {
        encode(REPLACEMENT, string, expect);
    }

    #[test]
    fn test_replacement_decode() {
        let cases: &[(&[u8], &str)] = &[(b"", ""), (b"A", "\u{FFFD}"), (b"AB", "\u{FFFD}")];
        for &(bytes, expect) in cases {
            decode_replacement(bytes, expect);
        }
    }

    #[test]
    fn test_replacement_encode() {
        // Empty
        encode_replacement("", b"");
        // The encoder obtained from the replacement encoding reports UTF-8.
        let encoder = REPLACEMENT.new_encoder();
        assert_eq!(encoder.encoding(), UTF_8);
        let sample = "\u{1F4A9}\u{2603}";
        encode_replacement(sample, sample.as_bytes());
    }
}

View File

@@ -0,0 +1,426 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range;
use super::in_inclusive_range16;
// State for the Shift_JIS decoder.
pub struct ShiftJisDecoder {
// `None` in the neutral state. When set, the buffer-length methods count one
// extra input byte for it.
// NOTE(review): presumably a lead byte carried over from a previous call; it
// is written by the macro-generated decode functions — confirm.
lead: Option<u8>,
}
impl ShiftJisDecoder {
    /// Builds a decoder with no pending lead byte, wrapped in
    /// `VariantDecoder::ShiftJis`.
    pub fn new() -> VariantDecoder {
        let decoder = ShiftJisDecoder { lead: None };
        VariantDecoder::ShiftJis(decoder)
    }

    /// Returns `true` when no lead byte is pending.
    pub fn in_neutral_state(&self) -> bool {
        match self.lead {
            None => true,
            Some(_) => false,
        }
    }

    /// Returns `byte_length`, plus one if a lead byte is pending, or `None`
    /// if that addition overflows.
    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
        let pending = if self.lead.is_some() { 1 } else { 0 };
        byte_length.checked_add(pending)
    }

    /// Upper bound on the UTF-16 code units needed to decode `byte_length`
    /// more input bytes; `None` on overflow.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }

    /// Upper bound on the UTF-8 bytes needed to decode `byte_length` more
    /// input bytes without replacement; `None` on overflow.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // worst case: 1 to 3 (half-width katakana)
        checked_mul(3, self.plus_one_if_lead(byte_length))
    }

    /// Upper bound on the UTF-8 bytes needed to decode `byte_length` more
    /// input bytes; `None` on overflow.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        let input_length = self.plus_one_if_lead(byte_length);
        checked_mul(3, input_length)
    }

    ascii_compatible_two_byte_decoder_functions!(
        {
            // Lead byte ranges (all inclusive):
            //   0x81..=0x9F  two-byte lead, subtract offset 0x81
            //   0xE0..=0xFC  two-byte lead, subtract offset 0xC1
            //   0xA1..=0xDF  single-byte half-width Katakana
            //   0x80         passed through
            // Everything else is malformed.
            let low_lead_offset = non_ascii.wrapping_sub(0x81);
            if low_lead_offset <= (0x9F - 0x81) {
                low_lead_offset
            } else if non_ascii.wrapping_sub(0xE0) <= (0xFC - 0xE0) {
                non_ascii - 0xC1
            } else {
                let half_width_katakana_offset = non_ascii.wrapping_sub(0xA1);
                if half_width_katakana_offset <= (0xDF - 0xA1) {
                    handle.write_upper_bmp(0xFF61 + u16::from(half_width_katakana_offset));
                    // Not caring about optimizing subsequent non-ASCII
                    continue 'outermost;
                }
                if non_ascii == 0x80 {
                    handle.write_mid_bmp(0x80);
                    // Not caring about optimizing subsequent non-ASCII
                    continue 'outermost;
                }
                return (
                    DecoderResult::Malformed(1, 0),
                    source.consumed(),
                    handle.written(),
                );
            }
        },
        {
            // Trail byte ranges (both inclusive):
            //   0x40..=0x7E  subtract offset 0x40
            //   0x80..=0xFC  subtract offset 0x41
            // Fast-track Hiragana (60% according to Lunde)
            // and Katakana (10% according to Lunde).
            // Hiragana doesn't cross 0x7F, but Katakana does.
            // We can check for Hiragana before normalizing
            // trail.
            let hiragana_offset = byte.wrapping_sub(0x9F);
            if lead_minus_offset == 0x01 && hiragana_offset < 0x53 {
                // Hiragana
                handle.write_upper_bmp(0x3041 + u16::from(hiragana_offset))
            } else {
                let low_trail_offset = byte.wrapping_sub(0x40);
                let trail_minus_offset = if low_trail_offset <= (0x7E - 0x40) {
                    low_trail_offset
                } else if byte.wrapping_sub(0x80) <= (0xFC - 0x80) {
                    byte - 0x41
                } else if byte < 0x80 {
                    // An ASCII-range trail is unread so that it gets
                    // processed again as its own byte.
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                } else {
                    return (
                        DecoderResult::Malformed(2, 0),
                        unread_handle_trail.consumed(),
                        handle.written(),
                    );
                };
                if lead_minus_offset == 0x02 && trail_minus_offset < 0x56 {
                    // Katakana
                    handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
                } else {
                    let pointer =
                        lead_minus_offset as usize * 188usize + trail_minus_offset as usize;
                    // Wrapping subtraction turns each "start <= pointer < start + len"
                    // range test into a single unsigned comparison.
                    let level1_index = pointer.wrapping_sub(1410);
                    let level2_index = pointer.wrapping_sub(4418);
                    let upper_ibm_index = pointer.wrapping_sub(10744);
                    let lower_ibm_index = pointer.wrapping_sub(8272);
                    if level1_index < JIS0208_LEVEL1_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_index])
                    } else if level2_index < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_index])
                    } else if upper_ibm_index < IBM_KANJI.len() {
                        handle.write_upper_bmp(IBM_KANJI[upper_ibm_index])
                    } else if lower_ibm_index < IBM_KANJI.len() {
                        handle.write_upper_bmp(IBM_KANJI[lower_ibm_index])
                    } else if in_inclusive_range(pointer, 8836, 10715) {
                        handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
                    } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                        handle.write_bmp_excl_ascii(bmp)
                    } else if let Some(bmp) = jis0208_range_decode(pointer) {
                        handle.write_bmp_excl_ascii(bmp)
                    } else if byte < 0x80 {
                        return (
                            DecoderResult::Malformed(1, 0),
                            unread_handle_trail.unread(),
                            handle.written(),
                        );
                    } else {
                        return (
                            DecoderResult::Malformed(2, 0),
                            unread_handle_trail.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        self,
        non_ascii,
        byte,
        lead_minus_offset,
        unread_handle_trail,
        source,
        handle,
        'outermost,
        copy_ascii_from_check_space_bmp,
        check_space_bmp,
        false
    );
}
/// Encodes a kanji code unit to a Shift_JIS `(lead, trail)` byte pair when
/// the `fast-kanji-encode` feature is enabled.
///
/// Simply delegates to `jis0208_kanji_shift_jis_encode`; returns `None` when
/// that lookup finds no mapping for `bmp`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
jis0208_kanji_shift_jis_encode(bmp)
}
/// Encodes a kanji code unit to a Shift_JIS `(lead, trail)` byte pair when
/// the `fast-kanji-encode` feature is disabled.
///
/// Level 1 kanji are tried first. Otherwise a pointer is derived from the
/// symbol-row special case, the level 2 / additional kanji lookup or the
/// `IBM_KANJI` table (in that order) and converted to bytes. Returns `None`
/// when none of the lookups knows `bmp`.
#[cfg(not(feature = "fast-kanji-encode"))]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    let level1 = jis0208_level1_kanji_shift_jis_encode(bmp);
    if level1.is_some() {
        return level1;
    }
    let pointer = if bmp == 0x4EDD {
        // Ideograph on the symbol row!
        23
    } else {
        match jis0208_level2_and_additional_kanji_encode(bmp) {
            Some(pos) => 4418 + pos,
            None => match position(&IBM_KANJI[..], bmp) {
                Some(pos) => 10744 + pos,
                None => return None,
            },
        }
    };
    // Each lead byte covers 188 trail positions.
    let (lead, trail) = (pointer / 188, pointer % 188);
    let lead_byte = lead + if lead < 0x1F { 0x81usize } else { 0xC1usize };
    let trail_byte = trail + if trail < 0x3F { 0x40usize } else { 0x41usize };
    Some((lead_byte as u8, trail_byte as u8))
}
/// Stateless Shift_JIS encoder; constructed through `ShiftJisEncoder::new`.
pub struct ShiftJisEncoder;
impl ShiftJisEncoder {
    /// Wraps a `ShiftJisEncoder` in an `Encoder` for `encoding`.
    pub fn new(encoding: &'static Encoding) -> Encoder {
        let variant = VariantEncoder::ShiftJis(ShiftJisEncoder);
        Encoder::new(encoding, variant)
    }

    /// Returns twice `u16_length`, or `None` if that overflows.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        u16_length.checked_mul(2)
    }

    /// Returns `byte_length` plus one, or `None` if that overflows.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        byte_length.checked_add(1)
    }

    ascii_compatible_bmp_encoder_functions!(
        {
            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
            //
            // Wrapping subtraction turns each "start <= bmp < start + len"
            // range test into a single unsigned comparison.
            let hiragana_offset = bmp.wrapping_sub(0x3041);
            let katakana_offset = bmp.wrapping_sub(0x30A1);
            let space_offset = bmp.wrapping_sub(0x3000);
            if hiragana_offset < 0x53 {
                handle.write_two(0x82, 0x9F + hiragana_offset as u8)
            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                match encode_kanji(bmp) {
                    Some((lead, trail)) => handle.write_two(lead, trail),
                    None => {
                        return (
                            EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                }
            } else if katakana_offset < 0x56 {
                let trail_offset = if katakana_offset < 0x3F { 0x40 } else { 0x41 };
                handle.write_two(0x83, (trail_offset + katakana_offset) as u8)
            } else if space_offset < 3 {
                // fast-track common punctuation
                handle.write_two(0x81, 0x40 + space_offset as u8)
            } else if bmp == 0xA5 {
                handle.write_one(0x5Cu8)
            } else if bmp == 0x80 {
                handle.write_one(0x80u8)
            } else if bmp == 0x203E {
                handle.write_one(0x7Eu8)
            } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
                handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
            } else if bmp == 0x2212 {
                handle.write_two(0x81u8, 0x7Cu8)
            } else {
                let roman_offset = bmp.wrapping_sub(0x2170);
                let pointer = if roman_offset <= (0x2179 - 0x2170) {
                    10716 + roman_offset as usize
                } else if let Some(range_pointer) = jis0208_range_encode(bmp) {
                    range_pointer
                } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
                    || bmp == 0xF929
                    || bmp == 0xF9DC
                {
                    // Guaranteed to be found in IBM_KANJI
                    10744 + position(&IBM_KANJI[..], bmp).unwrap()
                } else if let Some(symbol_pointer) = jis0208_symbol_encode(bmp) {
                    symbol_pointer
                } else {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                };
                // Each lead byte covers 188 trail positions.
                let (lead, trail) = (pointer / 188, pointer % 188);
                let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
                let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
                handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        false
    );
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Checks that `bytes` decode from Shift_JIS to `expect`.
    fn decode_shift_jis(bytes: &[u8], expect: &str) {
        decode(SHIFT_JIS, bytes, expect);
    }

    /// Checks that `string` encodes to Shift_JIS as `expect`.
    fn encode_shift_jis(string: &str, expect: &[u8]) {
        encode(SHIFT_JIS, string, expect);
    }

    #[test]
    fn test_shift_jis_decode() {
        // (input bytes, expected text), checked in order.
        let cases: &[(&[u8], &str)] = &[
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Half-width
            (b"\xA1", "\u{FF61}"),
            (b"\xDF", "\u{FF9F}"),
            (b"\xA0", "\u{FFFD}"),
            (b"\xE0", "\u{FFFD}"),
            (b"\xA0+", "\u{FFFD}+"),
            (b"\xE0+", "\u{FFFD}+"),
            // EUDC
            (b"\xF0\x40", "\u{E000}"),
            (b"\xF9\xFC", "\u{E757}"),
            (b"\xEF\xFC", "\u{FFFD}"),
            (b"\xFA\x40", "\u{2170}"),
            // JIS 0208
            (b"\x81\x40", "\u{3000}"),
            (b"\x81\x3F", "\u{FFFD}?"),
            (b"\xEE\xFC", "\u{FF02}"),
            (b"\xEE\xFD", "\u{FFFD}"),
            (b"\xFA\x40", "\u{2170}"),
            (b"\xFA\x3F", "\u{FFFD}?"),
            (b"\xFC\x4B", "\u{9ED1}"),
            (b"\xFC\x4C", "\u{FFFD}L"),
        ];
        for &(bytes, expect) in cases {
            decode_shift_jis(bytes, expect);
        }
    }

    #[test]
    fn test_shift_jis_encode() {
        // (input text, expected bytes), checked in order.
        let cases: &[(&str, &[u8])] = &[
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Exceptional code points
            ("\u{0080}", b"\x80"),
            ("\u{00A5}", b"\x5C"),
            ("\u{203E}", b"\x7E"),
            ("\u{2212}", b"\x81\x7C"),
            // Half-width
            ("\u{FF61}", b"\xA1"),
            ("\u{FF9F}", b"\xDF"),
            // EUDC
            ("\u{E000}", b"&#57344;"),
            ("\u{E757}", b"&#59223;"),
            // JIS 0212
            ("\u{02D8}", b"&#728;"),
            // JIS 0208
            ("\u{3000}", b"\x81\x40"),
            ("\u{FF02}", b"\xFA\x57"),
            ("\u{2170}", b"\xFA\x40"),
            ("\u{9ED1}", b"\xFC\x4B"),
        ];
        for &(string, expect) in cases {
            encode_shift_jis(string, expect);
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_decode_all() {
        let input = include_bytes!("test_data/shift_jis_in.txt");
        let expectation = include_str!("test_data/shift_jis_in_ref.txt");
        let (decoded, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_encode_all() {
        let input = include_str!("test_data/shift_jis_out.txt");
        let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
        let (encoded, encoding, had_errors) = SHIFT_JIS.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, SHIFT_JIS);
        assert_eq!(&encoded[..], &expectation[..]);
    }

    #[test]
    fn test_shift_jis_half_width_katakana_length() {
        // A single half-width katakana byte must fit into the buffer size
        // the decoder itself reports for one input byte.
        let mut output = [0u8; 20];
        let mut decoder = SHIFT_JIS.new_decoder();
        let needed = decoder
            .max_utf8_buffer_length_without_replacement(1)
            .unwrap();
        let (result, read, written) =
            decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
        assert_eq!(result, DecoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 3);
        // U+FF61 in UTF-8
        assert_eq!(&output[..3], &[0xEFu8, 0xBD, 0xA1][..]);
    }
}

View File

@@ -0,0 +1,455 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use packed_simd::u16x8;
use packed_simd::u8x16;
use packed_simd::FromBits;
// TODO: Migrate unaligned access to stdlib code if/when the RFC
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.
#[inline(always)]
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
let mut simd = ::core::mem::uninitialized();
::core::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16);
simd
}
/// Loads a `u8x16` from `ptr`.
///
/// # Safety
///
/// `ptr` must be valid for reading a `u8x16` and aligned for `u8x16`.
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
    let vector_ptr = ptr as *const u8x16;
    ::core::ptr::read(vector_ptr)
}
/// Stores `s` to `ptr`, which does not need to be aligned.
///
/// # Safety
///
/// `ptr` must be valid for writing `size_of::<u8x16>()` bytes.
#[inline(always)]
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
    let vector_ptr = ptr as *mut u8x16;
    ::core::ptr::write_unaligned(vector_ptr, s);
}
/// Stores `s` to `ptr`.
///
/// # Safety
///
/// `ptr` must be valid for writing a `u8x16` and aligned for `u8x16`.
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
    let vector_ptr = ptr as *mut u8x16;
    ::core::ptr::write(vector_ptr, s);
}
#[inline(always)]
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
let mut simd = ::core::mem::uninitialized();
::core::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16);
simd
}
/// Loads a `u16x8` from `ptr`.
///
/// # Safety
///
/// `ptr` must be valid for reading a `u16x8` and aligned for `u16x8`.
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
    let vector_ptr = ptr as *const u16x8;
    ::core::ptr::read(vector_ptr)
}
/// Stores `s` to `ptr`, which does not need to be aligned for `u16x8`.
///
/// # Safety
///
/// `ptr` must be valid for writing `size_of::<u16x8>()` bytes.
#[inline(always)]
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
    let vector_ptr = ptr as *mut u16x8;
    ::core::ptr::write_unaligned(vector_ptr, s);
}
/// Stores `s` to `ptr`.
///
/// # Safety
///
/// `ptr` must be valid for writing a `u16x8` and aligned for `u16x8`.
#[allow(dead_code)]
#[inline(always)]
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
    let vector_ptr = ptr as *mut u16x8;
    ::core::ptr::write(vector_ptr, s);
}
cfg_if! {
if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] {
use core::arch::x86_64::__m128i;
use core::arch::x86_64::_mm_movemask_epi8;
use core::arch::x86_64::_mm_packus_epi16;
} else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] {
use core::arch::x86::__m128i;
use core::arch::x86::_mm_movemask_epi8;
use core::arch::x86::_mm_packus_epi16;
} else if #[cfg(target_arch = "aarch64")]{
use core::arch::aarch64::uint8x16_t;
use core::arch::aarch64::uint16x8_t;
use core::arch::aarch64::vmaxvq_u8;
use core::arch::aarch64::vmaxvq_u16;
} else {
}
}
// #[inline(always)]
// fn simd_byte_swap_u8(s: u8x16) -> u8x16 {
// unsafe {
// shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
// }
// }
// #[inline(always)]
// pub fn simd_byte_swap(s: u16x8) -> u16x8 {
// to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s)))
// }
/// Swaps the two bytes of every 16-bit lane of `s`.
#[inline(always)]
pub fn simd_byte_swap(s: u16x8) -> u16x8 {
    // Low byte moves up, high byte moves down; the halves cannot overlap.
    (s << 8) | (s >> 8)
}
/// Reinterprets the bits of a `u8x16` as a `u16x8` (a bit cast via
/// `FromBits`, not a per-lane numeric conversion).
#[inline(always)]
pub fn to_u16_lanes(s: u8x16) -> u16x8 {
u16x8::from_bits(s)
}
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        // Expose low-level mask instead of higher-level conclusion,
        // because the non-ASCII case would perform less well otherwise.
        /// Returns the `_mm_movemask_epi8` mask of `s`: bit `i` is set when
        /// byte lane `i` has its high bit set, i.e. is non-ASCII.
        #[inline(always)]
        pub fn mask_ascii(s: u8x16) -> i32 {
            let vector = __m128i::from_bits(s);
            unsafe { _mm_movemask_epi8(vector) }
        }
    } else {
    }
}
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Returns `true` when no byte lane of `s` has its high bit set.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            let high_bits = unsafe { _mm_movemask_epi8(__m128i::from_bits(s)) };
            high_bits == 0
        }
    } else if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest byte lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            let max_lane = unsafe { vmaxvq_u8(uint8x16_t::from_bits(s)) };
            max_lane < 0x80
        }
    } else {
        /// Returns `true` when no byte lane of `s` is above 0x7F.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            // This optimizes better on ARM than
            // the lt formulation.
            let non_ascii_lanes = s.gt(u8x16::splat(0x7F));
            !non_ascii_lanes.any()
        }
    }
}
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Returns `true` when every byte lane of `s` is below 0xC4.
        /// All-ASCII input is accepted first via `simd_is_ascii`.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            simd_is_ascii(s) || s.lt(u8x16::splat(0xC4)).all()
        }
    } else if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest byte lane of `s` is below 0xC4.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let max_lane = unsafe { vmaxvq_u8(uint8x16_t::from_bits(s)) };
            max_lane < 0xC4
        }
    } else {
        /// Returns `true` when every byte lane of `s` is below 0xC4.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let within_limit = s.lt(u8x16::splat(0xC4));
            within_limit.all()
        }
    }
}
cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest 16-bit lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let max_lane = unsafe { vmaxvq_u16(uint16x8_t::from_bits(s)) };
            max_lane < 0x80
        }

        /// Returns `true` when the largest 16-bit lane of `s` is below 0x100.
        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            let max_lane = unsafe { vmaxvq_u16(uint16x8_t::from_bits(s)) };
            max_lane < 0x100
        }
    } else {
        /// Returns `true` when every 16-bit lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let ascii_lanes = s.lt(u16x8::splat(0x80));
            ascii_lanes.all()
        }

        /// Returns `true` when no 16-bit lane of `s` is above 0xFF.
        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            // For some reason, on SSE2 this formulation
            // seems faster in this case while the above
            // function is better the other way round...
            let beyond_latin1 = s.gt(u16x8::splat(0xFF));
            !beyond_latin1.any()
        }
    }
}
/// Returns `true` when any 16-bit lane of `s` is a surrogate, i.e. its top
/// five bits equal those of 0xD800.
#[inline(always)]
pub fn contains_surrogates(s: u16x8) -> bool {
    let top_five_bits = s & u16x8::splat(0xF800);
    top_five_bits.eq(u16x8::splat(0xD800)).any()
}
// Early-return helpers for `is_u16x8_bidi`. Exactly one of the two macros
// expands to code on a given target; the other expands to nothing.
cfg_if! {
if #[cfg(target_arch = "aarch64")]{
// On aarch64: return `false` from the enclosing function when the
// largest lane of `$s` is below 0x0590.
macro_rules! aarch64_return_false_if_below_hebrew {
($s:ident) => ({
unsafe {
if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 {
return false;
}
}
})
}
// No-op on aarch64.
macro_rules! non_aarch64_return_false_if_all {
($s:ident) => ()
}
} else {
// No-op on targets other than aarch64.
macro_rules! aarch64_return_false_if_below_hebrew {
($s:ident) => ()
}
// On other targets: return `false` from the enclosing function when
// every lane of the mask `$s` is set.
macro_rules! non_aarch64_return_false_if_all {
($s:ident) => ({
if $s.all() {
return false;
}
})
}
}
}
// Evaluates to a lane mask that is set for every lane of `$s` lying in the
// half-open range `$start..$end`. A lane below `$start` wraps around to a
// large value on subtraction and therefore fails the `lt` test.
// Note that `$start` is evaluated twice.
macro_rules! in_range16x8 {
($s:ident, $start:expr, $end:expr) => {{
// SIMD sub is wrapping
($s - u16x8::splat($start)).lt(u16x8::splat($end - $start))
}};
}
/// Returns `true` when any lane of `s` falls in one of the right-to-left
/// ranges, or equals one of the individual code units, listed in the full
/// check below.
#[inline(always)]
pub fn is_u16x8_bidi(s: u16x8) -> bool {
    // We try to first quickly refute the RTLness of the vector. If that
    // fails, we do the real RTL check, so in that case we end up wasting
    // the work for the up-front quick checks. Even the quick-check is
    // two-fold in order to return `false` ASAP if everything is below
    // Hebrew.
    aarch64_return_false_if_below_hebrew!(s);

    let below_hebrew = s.lt(u16x8::splat(0x0590));

    non_aarch64_return_false_if_all!(below_hebrew);

    let quickly_refuted =
        below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802);
    if quickly_refuted.all() {
        return false;
    }

    // Quick refutation failed. Let's do the full check.
    let rtl_ranges = in_range16x8!(s, 0x0590, 0x0900)
        | in_range16x8!(s, 0xFB1D, 0xFE00)
        | in_range16x8!(s, 0xFE70, 0xFEFF)
        | in_range16x8!(s, 0xD802, 0xD804)
        | in_range16x8!(s, 0xD83A, 0xD83C);
    let rtl_singles = s.eq(u16x8::splat(0x200F))
        | s.eq(u16x8::splat(0x202B))
        | s.eq(u16x8::splat(0x202E))
        | s.eq(u16x8::splat(0x2067));
    (rtl_ranges | rtl_singles).any()
}
#[inline(always)]
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
unsafe {
let first: u8x16 = shuffle!(
s,
u8x16::splat(0),
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
);
let second: u8x16 = shuffle!(
s,
u8x16::splat(0),
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
);
(u16x8::from_bits(first), u16x8::from_bits(second))
}
}
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Packs the sixteen 16-bit lanes of `a` and `b` into byte lanes
        /// with `_mm_packus_epi16`.
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            let (first, second) = (__m128i::from_bits(a), __m128i::from_bits(b));
            unsafe { u8x16::from_bits(_mm_packus_epi16(first, second)) }
        }
    } else {
        /// Packs the sixteen 16-bit lanes of `a` and `b` into byte lanes by
        /// selecting every even-indexed byte of their bit patterns.
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let a_bytes = u8x16::from_bits(a);
                let b_bytes = u8x16::from_bits(b);
                shuffle!(
                    a_bytes,
                    b_bytes,
                    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
                )
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    /// Sixteen ASCII bytes shared by the tests below.
    static ASCII: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];

    /// The same values as `ASCII`, as 16-bit code units.
    static BASIC_LATIN: [u16; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];

    /// `ASCII` with the byte at index 4 replaced by the non-ASCII 0x81.
    fn bytes_with_non_ascii() -> [u8; 16] {
        let mut bytes = ASCII;
        bytes[4] = 0x81;
        bytes
    }

    /// `BASIC_LATIN` with the unit at index 3 replaced by `unit`.
    fn units_with_fourth(unit: u16) -> [u16; 16] {
        let mut units = BASIC_LATIN;
        units[3] = unit;
        units
    }

    /// Loads all sixteen bytes as one vector.
    fn load_bytes(bytes: &[u8; 16]) -> u8x16 {
        unsafe { load16_unaligned(bytes.as_ptr()) }
    }

    /// Loads the sixteen units as two vectors of eight.
    fn load_units(units: &[u16; 16]) -> (u16x8, u16x8) {
        let ptr = units.as_ptr();
        unsafe { (load8_unaligned(ptr), load8_unaligned(ptr.add(8))) }
    }

    #[test]
    fn test_unpack() {
        let simd = load_bytes(&ASCII);
        let mut unpacked = Vec::with_capacity(16);
        unpacked.resize(16, 0u16);
        let (first, second) = simd_unpack(simd);
        let ptr = unpacked.as_mut_ptr();
        unsafe {
            store8_unaligned(ptr, first);
            store8_unaligned(ptr.add(8), second);
        }
        assert_eq!(&unpacked[..], &BASIC_LATIN[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_success() {
        let (first, second) = load_units(&BASIC_LATIN);
        let mut packed = Vec::with_capacity(16);
        packed.resize(16, 0u8);
        let ptr = packed.as_mut_ptr();
        assert!(simd_is_basic_latin(first | second));
        unsafe {
            store16_unaligned(ptr, simd_pack(first, second));
        }
        assert_eq!(&packed[..], &ASCII[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_c0() {
        let input = units_with_fourth(0x81);
        let (first, second) = load_units(&input);
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_basic_latin_0fff() {
        let input = units_with_fourth(0x0FFF);
        let (first, second) = load_units(&input);
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_basic_latin_ffff() {
        let input = units_with_fourth(0xFFFF);
        let (first, second) = load_units(&input);
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_simd_is_ascii_success() {
        assert!(simd_is_ascii(load_bytes(&ASCII)));
    }

    #[test]
    fn test_simd_is_ascii_failure() {
        let input = bytes_with_non_ascii();
        assert!(!simd_is_ascii(load_bytes(&input)));
    }

    #[cfg(target_feature = "sse2")]
    #[test]
    fn test_check_ascii() {
        let input = bytes_with_non_ascii();
        let mask = mask_ascii(load_bytes(&input));
        assert_ne!(mask, 0);
        // The non-ASCII byte sits at index 4.
        assert_eq!(mask.trailing_zeros(), 4);
    }

    #[test]
    fn test_alu() {
        let input = bytes_with_non_ascii();
        let mut word = 0u64;
        unsafe {
            ::core::ptr::copy_nonoverlapping(input.as_ptr(), &mut word as *mut u64 as *mut u8, 8);
        }
        let high_bits = word & 0x8080808080808080;
        assert_eq!(high_bits.trailing_zeros(), 39);
    }
}

View File

@@ -0,0 +1,714 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::ascii::*;
use crate::data::position;
use crate::handles::*;
use crate::variant::*;
/// Decoder for a single-byte encoding.
///
/// `table` maps the bytes 0x80..=0xFF (indexed by `byte - 0x80`) to UTF-16
/// code units; an entry of 0 marks a byte that decodes as malformed.
pub struct SingleByteDecoder {
table: &'static [u16; 128],
}
impl SingleByteDecoder {
/// Creates a decoder for the upper-half mapping `data`, wrapped in
/// `VariantDecoder::SingleByte`.
pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
VariantDecoder::SingleByte(SingleByteDecoder { table: data })
}
/// Returns `byte_length` unchanged: each input byte yields one UTF-16
/// code unit.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
Some(byte_length)
}
/// Returns three times `byte_length`, or `None` if that overflows.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
/// Returns three times `byte_length`, or `None` if that overflows.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
/// Decodes `src` into UTF-8 in `dst`.
///
/// Returns the result together with the number of bytes read and the
/// number of bytes written. The result is `InputEmpty` when `src` is
/// exhausted, `OutputFull` when `dst` lacks space for the next character,
/// and `Malformed(1, 0)` when a byte maps to 0 in the table. `_last` is
/// unused.
pub fn decode_to_utf8_raw(
&mut self,
src: &[u8],
dst: &mut [u8],
_last: bool,
) -> (DecoderResult, usize, usize) {
let mut source = ByteSource::new(src);
let mut dest = Utf8Destination::new(dst);
// 'outermost: bulk ASCII copy until a non-ASCII byte shows up.
// 'middle: map one non-ASCII byte through the table.
// 'innermost: handle ASCII bytes that directly follow non-ASCII without
// going back to the bulk copy.
'outermost: loop {
match dest.copy_ascii_from_check_space_bmp(&mut source) {
CopyAsciiResult::Stop(ret) => return ret,
CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
// Start non-boilerplate
//
// Since the non-ASCIIness of `non_ascii` is hidden from
// the optimizer, it can't figure out that it's OK to
// statically omit the bound check when accessing
// `[u16; 128]` with an index
// `non_ascii as usize - 0x80usize`.
//
// NOTE(review): the unchecked access relies on `non_ascii`
// being >= 0x80. The loop-back below guarantees it via
// `b > 127`; for the value produced by
// `copy_ascii_from_check_space_bmp` this is assumed from
// the comment above -- confirm against that helper.
let mapped =
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
// let mapped = self.table[non_ascii as usize - 0x80usize];
if mapped == 0u16 {
return (
DecoderResult::Malformed(1, 0),
source.consumed(),
handle.written(),
);
}
let dest_again = handle.write_bmp_excl_ascii(mapped);
// End non-boilerplate
match source.check_available() {
Space::Full(src_consumed) => {
return (
DecoderResult::InputEmpty,
src_consumed,
dest_again.written(),
);
}
Space::Available(source_handle) => {
match dest_again.check_space_bmp() {
Space::Full(dst_written) => {
return (
DecoderResult::OutputFull,
source_handle.consumed(),
dst_written,
);
}
Space::Available(mut destination_handle) => {
let (mut b, unread_handle) = source_handle.read();
let source_again = unread_handle.commit();
'innermost: loop {
if b > 127 {
// Another non-ASCII byte: map it right away.
non_ascii = b;
handle = destination_handle;
continue 'middle;
}
// Testing on Haswell says that we should write the
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
let dest_again_again = destination_handle.write_ascii(b);
if b < 60 {
// We've got punctuation
match source_again.check_available() {
Space::Full(src_consumed_again) => {
return (
DecoderResult::InputEmpty,
src_consumed_again,
dest_again_again.written(),
);
}
Space::Available(source_handle_again) => {
match dest_again_again.check_space_bmp() {
Space::Full(dst_written_again) => {
return (
DecoderResult::OutputFull,
source_handle_again.consumed(),
dst_written_again,
);
}
Space::Available(
destination_handle_again,
) => {
let (b_again, _unread_handle_again) =
source_handle_again.read();
b = b_again;
destination_handle =
destination_handle_again;
continue 'innermost;
}
}
}
}
}
// We've got markup or ASCII text
continue 'outermost;
}
}
}
}
}
},
}
}
}
/// Decodes `src` into UTF-16 in `dst`.
///
/// Only `min(src.len(), dst.len())` units are processed. Returns the
/// result together with the number of bytes read and the number of code
/// units written. The result is `InputEmpty` when all of `src` fit,
/// `OutputFull` when `dst` is shorter than `src`, and `Malformed(1, 0)`
/// when a byte maps to 0 in the table. `_last` is unused.
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
_last: bool,
) -> (DecoderResult, usize, usize) {
// One input byte yields exactly one output unit, so the shorter buffer
// decides how far we go and which result is reported at the end.
let (pending, length) = if dst.len() < src.len() {
(DecoderResult::OutputFull, dst.len())
} else {
(DecoderResult::InputEmpty, src.len())
};
// Number of bytes read so far, which is also the number of units written.
let mut converted = 0usize;
'outermost: loop {
match unsafe {
ascii_to_basic_latin(
src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted),
length - converted,
)
} {
None => {
return (pending, length, length);
}
Some((mut non_ascii, consumed)) => {
converted += consumed;
'middle: loop {
// `converted` doesn't count the reading of `non_ascii` yet.
// Since the non-ASCIIness of `non_ascii` is hidden from
// the optimizer, it can't figure out that it's OK to
// statically omit the bound check when accessing
// `[u16; 128]` with an index
// `non_ascii as usize - 0x80usize`.
let mapped =
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
// let mapped = self.table[non_ascii as usize - 0x80usize];
if mapped == 0u16 {
return (
DecoderResult::Malformed(1, 0),
converted + 1, // +1 for `non_ascii`
converted,
);
}
unsafe {
// The bound check has already been performed
*(dst.get_unchecked_mut(converted)) = mapped;
}
converted += 1;
// Next, handle ASCII punctuation and non-ASCII without
// going back to ASCII acceleration. Non-ASCII scripts
// use ASCII punctuation, so this avoids going to
// acceleration just for punctuation/space and then
// failing. This is a significant boost to non-ASCII
// scripts.
// TODO: Split out Latin converters without this part;
// this stuff makes Latin script-conversion slower.
if converted == length {
return (pending, length, length);
}
// `converted < length <= src.len()` holds here.
let mut b = unsafe { *(src.get_unchecked(converted)) };
'innermost: loop {
if b > 127 {
non_ascii = b;
continue 'middle;
}
// Testing on Haswell says that we should write the
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
unsafe {
*(dst.get_unchecked_mut(converted)) = u16::from(b);
}
converted += 1;
if b < 60 {
// We've got punctuation
if converted == length {
return (pending, length, length);
}
b = unsafe { *(src.get_unchecked(converted)) };
continue 'innermost;
}
// We've got markup or ASCII text
continue 'outermost;
}
}
}
}
}
}
/// Returns the length of the longest prefix of `buffer` in which every
/// byte is either ASCII or a non-ASCII byte that the table maps to the
/// code unit with the same numeric value.
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
let mut bytes = buffer;
let mut total = 0;
loop {
if let Some((non_ascii, offset)) = validate_ascii(bytes) {
total += offset;
let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
if mapped != u16::from(non_ascii) {
return total;
}
total += 1;
// Resume right after the byte that was just accepted.
bytes = &bytes[offset + 1..];
} else {
return total;
}
}
}
}
/// Encoder for a single-byte encoding.
///
/// `table` is the decode table for bytes 0x80..=0xFF. The three `run_*`
/// fields describe a run of consecutive code units that maps to consecutive
/// bytes, so those can be encoded by offset instead of by table search (see
/// `encode_u16`).
pub struct SingleByteEncoder {
table: &'static [u16; 128],
// First code unit of the run.
run_bmp_offset: usize,
// Index into `table` (byte value minus 128) at which the run starts.
run_byte_offset: usize,
// Number of code units in the run.
run_length: usize,
}
impl SingleByteEncoder {
/// Creates an `Encoder` for `encoding` backed by the table `data` and
/// the run described by `run_bmp_offset`, `run_byte_offset` and
/// `run_length` (widened to `usize` for storage).
pub fn new(
encoding: &'static Encoding,
data: &'static [u16; 128],
run_bmp_offset: u16,
run_byte_offset: u8,
run_length: u8,
) -> Encoder {
Encoder::new(
encoding,
VariantEncoder::SingleByte(SingleByteEncoder {
table: data,
run_bmp_offset: run_bmp_offset as usize,
run_byte_offset: run_byte_offset as usize,
run_length: run_length as usize,
}),
)
}
/// Returns `u16_length` unchanged.
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
Some(u16_length)
}
/// Returns `byte_length` unchanged.
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
Some(byte_length)
}
/// Maps `code_unit` to its byte in this encoding, or `None` when the
/// table does not contain it. The search order is described below.
#[inline(always)]
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
// First, we see if the code unit falls into a run of consecutive
// code units that can be mapped by offset. This is very efficient
// for most non-Latin encodings as well as Latin1-ish encodings.
//
// For encodings that don't fit this pattern, the run (which may
// have the length of just one) just establishes the starting point
// for the next rule.
//
// Next, we do a forward linear search in the part of the index
// after the run. Even in non-Latin1-ish Latin encodings (except
// macintosh), the lower case letters are here.
//
// Next, we search the third quadrant up to the start of the run
// (upper case letters in Latin encodings except macintosh, in
// Greek and in KOI encodings) and then the second quadrant,
// except if the run started before the third quadrant, we search
// the second quadrant up to the run.
//
// Last, we search the first quadrant, which has unused controls
// or punctuation in most encodings. This is bad for macintosh
// and IBM866, but those are rare.
// Run of consecutive units
let unit_as_usize = code_unit as usize;
// Wraps to a huge value when `code_unit` lies before the run, which
// then fails the comparison below.
let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
if offset < self.run_length {
return Some((128 + self.run_byte_offset + offset) as u8);
}
// Search after the run
let tail_start = self.run_byte_offset + self.run_length;
if let Some(pos) = position(&self.table[tail_start..], code_unit) {
return Some((128 + tail_start + pos) as u8);
}
if self.run_byte_offset >= 64 {
// Search third quadrant before the run
if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
return Some(((128 + 64) + pos) as u8);
}
// Search second quadrant
if let Some(pos) = position(&self.table[32..64], code_unit) {
return Some(((128 + 32) + pos) as u8);
}
} else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
// Search second quadrant before the run
return Some(((128 + 32) + pos) as u8);
}
// Search first quadrant
if let Some(pos) = position(&self.table[..32], code_unit) {
return Some((128 + pos) as u8);
}
None
}
// Generates `encode_from_utf8_raw`: each non-ASCII BMP code unit is
// encoded with `encode_u16`; unmappable ones end the call with
// `EncoderResult::unmappable_from_bmp`.
ascii_compatible_bmp_encoder_function!(
{
match self.encode_u16(bmp) {
Some(byte) => handle.write_one(byte),
None => {
return (
EncoderResult::unmappable_from_bmp(bmp),
source.consumed(),
handle.written(),
);
}
}
},
bmp,
self,
source,
handle,
copy_ascii_to_check_space_one,
check_space_one,
encode_from_utf8_raw,
str,
Utf8Source,
true
);
/// Encodes the UTF-16 in `src` into `dst`.
///
/// Only `min(src.len(), dst.len())` units are processed. Returns the
/// result together with the number of code units read and the number of
/// bytes written. The result is `InputEmpty` when all of `src` fit,
/// `OutputFull` when `dst` is shorter than `src`, and `Unmappable` for a
/// code unit that `encode_u16` cannot map; unpaired surrogates are
/// reported as U+FFFD. `_last` is unused.
pub fn encode_from_utf16_raw(
&mut self,
src: &[u16],
dst: &mut [u8],
_last: bool,
) -> (EncoderResult, usize, usize) {
// One mappable code unit yields exactly one byte, so the shorter buffer
// decides how far we go and which result is reported at the end.
let (pending, length) = if dst.len() < src.len() {
(EncoderResult::OutputFull, dst.len())
} else {
(EncoderResult::InputEmpty, src.len())
};
// Number of units read so far, which is also the number of bytes written.
let mut converted = 0usize;
'outermost: loop {
match unsafe {
basic_latin_to_ascii(
src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted),
length - converted,
)
} {
None => {
return (pending, length, length);
}
Some((mut non_ascii, consumed)) => {
converted += consumed;
'middle: loop {
// `converted` doesn't count the reading of `non_ascii` yet.
match self.encode_u16(non_ascii) {
Some(byte) => {
unsafe {
*(dst.get_unchecked_mut(converted)) = byte;
}
converted += 1;
}
None => {
// At this point, we need to know if we
// have a surrogate.
let high_bits = non_ascii & 0xFC00u16;
if high_bits == 0xD800u16 {
// high surrogate
if converted + 1 == length {
// End of buffer. This surrogate is unpaired.
return (
EncoderResult::Unmappable('\u{FFFD}'),
converted + 1, // +1 for `non_ascii`
converted,
);
}
let second =
u32::from(unsafe { *src.get_unchecked(converted + 1) });
if second & 0xFC00u32 != 0xDC00u32 {
return (
EncoderResult::Unmappable('\u{FFFD}'),
converted + 1, // +1 for `non_ascii`
converted,
);
}
// The next code unit is a low surrogate.
// Combine the pair into its scalar value:
// (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000,
// with the constant parts folded into one subtraction.
let astral: char = unsafe {
::core::char::from_u32_unchecked(
(u32::from(non_ascii) << 10) + second
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
)
};
return (
EncoderResult::Unmappable(astral),
converted + 2, // +2 for `non_ascii` and `second`
converted,
);
}
if high_bits == 0xDC00u16 {
// Unpaired low surrogate
return (
EncoderResult::Unmappable('\u{FFFD}'),
converted + 1, // +1 for `non_ascii`
converted,
);
}
return (
EncoderResult::unmappable_from_bmp(non_ascii),
converted + 1, // +1 for `non_ascii`
converted,
);
}
}
// Next, handle ASCII punctuation and non-ASCII without
// going back to ASCII acceleration. Non-ASCII scripts
// use ASCII punctuation, so this avoids going to
// acceleration just for punctuation/space and then
// failing. This is a significant boost to non-ASCII
// scripts.
// TODO: Split out Latin converters without this part;
// this stuff makes Latin script-conversion slower.
if converted == length {
return (pending, length, length);
}
// `converted < length <= src.len()` holds here.
let mut unit = unsafe { *(src.get_unchecked(converted)) };
'innermost: loop {
if unit > 127 {
non_ascii = unit;
continue 'middle;
}
// Testing on Haswell says that we should write the
// byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride.
unsafe {
*(dst.get_unchecked_mut(converted)) = unit as u8;
}
converted += 1;
if unit < 60 {
// We've got punctuation
if converted == length {
return (pending, length, length);
}
unit = unsafe { *(src.get_unchecked(converted)) };
continue 'innermost;
}
// We've got markup or ASCII text
continue 'outermost;
}
}
}
}
}
}
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
#[test]
fn test_windows_1255_ca() {
    // Byte 0xCA and U+05BA are expected to map to each other in windows-1255.
    let byte = b"\xCA";
    let text = "\u{05BA}";
    decode(WINDOWS_1255, byte, text);
    encode(WINDOWS_1255, text, byte);
}
#[test]
fn test_ascii_punctuation() {
    // Greek words separated by ASCII spaces and terminated by ASCII full
    // stops, checked in both directions against windows-1253.
    let text = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
                \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
                \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
    let encoded = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
    decode(WINDOWS_1253, encoded, text);
    encode(WINDOWS_1253, text, encoded);
}
#[test]
fn test_decode_malformed() {
    // The third input byte (0xD2) is expected to come out as U+FFFD while
    // the bytes around it decode normally.
    let input = b"\xC1\xF5\xD2\xF4\xFC";
    let expected = "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}";
    decode(WINDOWS_1253, input, expected);
}
#[test]
fn test_encode_unmappables() {
    // A BMP character without a windows-1253 mapping is expected to be
    // written as a decimal numeric character reference.
    let bmp_input = "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}";
    let bmp_expected = b"\xC1\xF5&#9731;\xF4\xFC";
    encode(WINDOWS_1253, bmp_input, bmp_expected);

    // Same expectation for a character outside the BMP.
    let astral_input = "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}";
    let astral_expected = b"\xC1\xF5&#128169;\xF4\xFC";
    encode(WINDOWS_1253, astral_input, astral_expected);
}
#[test]
fn test_encode_unpaired_surrogates() {
    // In every case the lone surrogate is expected to be emitted as the
    // numeric character reference for U+FFFD (&#65533;).

    // Low surrogate in the middle of the input.
    let lone_low = [0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16];
    encode_from_utf16(WINDOWS_1253, &lone_low, b"\xC1\xF5&#65533;\xF4\xFC");

    // High surrogate followed by a non-surrogate.
    let lone_high = [0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16];
    encode_from_utf16(WINDOWS_1253, &lone_high, b"\xC1\xF5&#65533;\xF4\xFC");

    // High surrogate as the very last code unit.
    let trailing_high = [0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16];
    encode_from_utf16(WINDOWS_1253, &trailing_high, b"\xC1\xF5\xF4\xFC&#65533;");
}
/// The 128 byte values 0x80 through 0xFF in ascending order, i.e.
/// `HIGH_BYTES[i] == 0x80 + i`.
pub const HIGH_BYTES: &[u8; 128] = &[
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
];
// Checks that decoding `HIGH_BYTES` with `encoding` yields `data`, where
// every 0 entry in `data` is expected to decode to U+FFFD instead.
fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
    let mut with_replacement = [0u16; 128];
    for (slot, &code_point) in with_replacement.iter_mut().zip(data.iter()) {
        *slot = if code_point == 0 { 0xFFFD } else { code_point };
    }
    decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
}
// Checks that encoding the code units in `data` with `encoding` yields
// `HIGH_BYTES[i]` at each index `i`, except that a 0 entry in `data` is
// expected to produce byte 0.
fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
    let mut with_zeros = [0u8; 128];
    for (i, slot) in with_zeros.iter_mut().enumerate() {
        *slot = if data[i] == 0 { 0 } else { HIGH_BYTES[i] };
    }
    encode_from_utf16(encoding, data, &with_zeros[..]);
}
#[test]
fn test_single_byte_from_two_low_surrogates() {
    // Two consecutive low surrogates: each one is expected to be replaced
    // by its own &#65533; reference, and both input units must be consumed.
    let input = [0xDC00u16, 0xDEDEu16];
    let expected = b"&#65533;&#65533;";
    let mut buffer = [0u8; 40];
    let mut encoder = WINDOWS_1253.new_encoder();
    let (status, consumed, produced, replaced) =
        encoder.encode_from_utf16(&input, &mut buffer[..], true);
    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 2);
    assert_eq!(produced, expected.len());
    assert!(replaced);
    assert_eq!(&buffer[..produced], expected);
}
// These tests are so self-referential that they are pretty useless.
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
#[test]
fn test_single_byte_decode() {
// Runs `decode_single_byte` once per encoding, pairing each encoding
// constant with its table in `data::SINGLE_BYTE_DATA`. Under Miri only
// the first two encodings are exercised before the early return.
decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
if cfg!(miri) {
// Miri is too slow
return;
}
decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
#[test]
fn test_single_byte_encode() {
// Runs `encode_single_byte` once per encoding, pairing each encoding
// constant with its table in `data::SINGLE_BYTE_DATA`. Under Miri only
// the first two encodings are exercised before the early return.
encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
if cfg!(miri) {
// Miri is too slow
return;
}
encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
// END GENERATED CODE
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,242 @@
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
use super::*;
#[test]
fn test_all_labels() {
// Each assertion checks that `Encoding::for_label` resolves one label
// (given as a byte string) to the expected encoding constant.
assert_eq!(Encoding::for_label(b"l1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"l2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"l3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"l4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"l5"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"l6"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"l9"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"mac"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"koi"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"gbk"), Some(GBK));
assert_eq!(Encoding::for_label(b"big5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"koi8"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"sjis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"ucs-2"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"ms932"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"cp866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"cp819"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"ascii"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"x-gbk"), Some(GBK));
assert_eq!(Encoding::for_label(b"greek"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"cp1250"), Some(WINDOWS_1250));
assert_eq!(Encoding::for_label(b"cp1251"), Some(WINDOWS_1251));
assert_eq!(Encoding::for_label(b"latin1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"gb2312"), Some(GBK));
assert_eq!(Encoding::for_label(b"cp1252"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"latin2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"cp1253"), Some(WINDOWS_1253));
assert_eq!(Encoding::for_label(b"latin3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"cp1254"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"latin4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"cp1255"), Some(WINDOWS_1255));
assert_eq!(Encoding::for_label(b"csbig5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"latin5"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"utf-16"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"cp1256"), Some(WINDOWS_1256));
assert_eq!(Encoding::for_label(b"ibm866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"latin6"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"cp1257"), Some(WINDOWS_1257));
assert_eq!(Encoding::for_label(b"cp1258"), Some(WINDOWS_1258));
assert_eq!(Encoding::for_label(b"greek8"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"ibm819"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"arabic"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"visual"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"korean"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"euc-jp"), Some(EUC_JP));
assert_eq!(Encoding::for_label(b"koi8-r"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"koi8_r"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"euc-kr"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"x-sjis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"koi8-u"), Some(KOI8_U));
assert_eq!(Encoding::for_label(b"hebrew"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"tis-620"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"gb18030"), Some(GB18030));
assert_eq!(Encoding::for_label(b"ksc5601"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"gb_2312"), Some(GBK));
assert_eq!(Encoding::for_label(b"dos-874"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"cn-big5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"unicode"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"chinese"), Some(GBK));
assert_eq!(Encoding::for_label(b"logical"), Some(ISO_8859_8_I));
assert_eq!(Encoding::for_label(b"cskoi8r"), Some(KOI8_R));
assert_eq!(Encoding::for_label(b"cseuckr"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"koi8-ru"), Some(KOI8_U));
assert_eq!(Encoding::for_label(b"x-cp1250"), Some(WINDOWS_1250));
assert_eq!(Encoding::for_label(b"ksc_5601"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"x-cp1251"), Some(WINDOWS_1251));
assert_eq!(Encoding::for_label(b"iso88591"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"csgb2312"), Some(GBK));
assert_eq!(Encoding::for_label(b"x-cp1252"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso88592"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"x-cp1253"), Some(WINDOWS_1253));
assert_eq!(Encoding::for_label(b"iso88593"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"ecma-114"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"x-cp1254"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso88594"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"x-cp1255"), Some(WINDOWS_1255));
assert_eq!(Encoding::for_label(b"iso88595"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"x-x-big5"), Some(BIG5));
assert_eq!(Encoding::for_label(b"x-cp1256"), Some(WINDOWS_1256));
assert_eq!(Encoding::for_label(b"csibm866"), Some(IBM866));
assert_eq!(Encoding::for_label(b"iso88596"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"x-cp1257"), Some(WINDOWS_1257));
assert_eq!(Encoding::for_label(b"iso88597"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"asmo-708"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"ecma-118"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"elot_928"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"x-cp1258"), Some(WINDOWS_1258));
assert_eq!(Encoding::for_label(b"iso88598"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso88599"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"cyrillic"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"utf-16be"), Some(UTF_16BE));
assert_eq!(Encoding::for_label(b"utf-16le"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"us-ascii"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"ms_kanji"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"x-euc-jp"), Some(EUC_JP));
assert_eq!(Encoding::for_label(b"iso885910"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso8859-1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso885911"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"iso8859-2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso8859-3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso885913"), Some(ISO_8859_13));
assert_eq!(Encoding::for_label(b"iso8859-4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso885914"), Some(ISO_8859_14));
assert_eq!(Encoding::for_label(b"iso8859-5"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso885915"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"iso8859-6"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso8859-7"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso8859-8"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-ir-58"), Some(GBK));
assert_eq!(Encoding::for_label(b"iso8859-9"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"csunicode"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"macintosh"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"shift-jis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"shift_jis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"iso-ir-100"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso8859-10"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso-ir-110"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"gb_2312-80"), Some(GBK));
assert_eq!(Encoding::for_label(b"iso-8859-1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso_8859-1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso-ir-101"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso8859-11"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"iso-8859-2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso_8859-2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"hz-gb-2312"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"iso-8859-3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso_8859-3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso8859-13"), Some(ISO_8859_13));
assert_eq!(Encoding::for_label(b"iso-8859-4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso_8859-4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso8859-14"), Some(ISO_8859_14));
assert_eq!(Encoding::for_label(b"iso-ir-144"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso-8859-5"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso_8859-5"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso8859-15"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"iso-8859-6"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso_8859-6"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-ir-126"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso-8859-7"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso_8859-7"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso-ir-127"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-ir-157"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso-8859-8"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso_8859-8"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-ir-138"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-ir-148"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso-8859-9"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso_8859-9"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso-ir-109"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso-ir-149"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"big5-hkscs"), Some(BIG5));
assert_eq!(Encoding::for_label(b"csshiftjis"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"iso-8859-10"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"iso-8859-11"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"csisolatin1"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"csisolatin2"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso-8859-13"), Some(ISO_8859_13));
assert_eq!(Encoding::for_label(b"csisolatin3"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso-8859-14"), Some(ISO_8859_14));
assert_eq!(Encoding::for_label(b"windows-874"), Some(WINDOWS_874));
assert_eq!(Encoding::for_label(b"csisolatin4"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso-8859-15"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"iso_8859-15"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"csisolatin5"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"iso-8859-16"), Some(ISO_8859_16));
assert_eq!(Encoding::for_label(b"csisolatin6"), Some(ISO_8859_10));
assert_eq!(Encoding::for_label(b"windows-949"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"csisolatin9"), Some(ISO_8859_15));
assert_eq!(Encoding::for_label(b"csiso88596e"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"csiso88598e"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"unicodefffe"), Some(UTF_16BE));
assert_eq!(Encoding::for_label(b"unicodefeff"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"csmacintosh"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"csiso88596i"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"csiso88598i"), Some(ISO_8859_8_I));
assert_eq!(Encoding::for_label(b"windows-31j"), Some(SHIFT_JIS));
assert_eq!(Encoding::for_label(b"x-mac-roman"), Some(MACINTOSH));
assert_eq!(Encoding::for_label(b"iso-2022-cn"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"iso-2022-jp"), Some(ISO_2022_JP));
assert_eq!(Encoding::for_label(b"csiso2022jp"), Some(ISO_2022_JP));
assert_eq!(Encoding::for_label(b"iso-2022-kr"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"csiso2022kr"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"replacement"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"windows-1250"), Some(WINDOWS_1250));
assert_eq!(Encoding::for_label(b"windows-1251"), Some(WINDOWS_1251));
assert_eq!(Encoding::for_label(b"windows-1252"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"windows-1253"), Some(WINDOWS_1253));
assert_eq!(Encoding::for_label(b"windows-1254"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"windows-1255"), Some(WINDOWS_1255));
assert_eq!(Encoding::for_label(b"windows-1256"), Some(WINDOWS_1256));
assert_eq!(Encoding::for_label(b"windows-1257"), Some(WINDOWS_1257));
assert_eq!(Encoding::for_label(b"windows-1258"), Some(WINDOWS_1258));
assert_eq!(Encoding::for_label(b"iso-8859-6-e"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-8859-8-e"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"iso-8859-6-i"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso-8859-8-i"), Some(ISO_8859_8_I));
assert_eq!(Encoding::for_label(b"sun_eu_greek"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"csksc56011987"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"unicode20utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"unicode11utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"ks_c_5601-1987"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"ansi_x3.4-1968"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"ks_c_5601-1989"), Some(EUC_KR));
assert_eq!(Encoding::for_label(b"x-mac-cyrillic"), Some(X_MAC_CYRILLIC));
assert_eq!(Encoding::for_label(b"x-user-defined"), Some(X_USER_DEFINED));
assert_eq!(Encoding::for_label(b"csiso58gb231280"), Some(GBK));
assert_eq!(Encoding::for_label(b"iso-10646-ucs-2"), Some(UTF_16LE));
assert_eq!(Encoding::for_label(b"iso_8859-1:1987"), Some(WINDOWS_1252));
assert_eq!(Encoding::for_label(b"iso_8859-2:1987"), Some(ISO_8859_2));
assert_eq!(Encoding::for_label(b"iso_8859-6:1987"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"iso_8859-7:1987"), Some(ISO_8859_7));
assert_eq!(Encoding::for_label(b"iso_8859-3:1988"), Some(ISO_8859_3));
assert_eq!(Encoding::for_label(b"iso_8859-4:1988"), Some(ISO_8859_4));
assert_eq!(Encoding::for_label(b"iso_8859-5:1988"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"iso_8859-8:1988"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"x-unicode20utf8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"iso_8859-9:1989"), Some(WINDOWS_1254));
assert_eq!(Encoding::for_label(b"csisolatingreek"), Some(ISO_8859_7));
assert_eq!(
Encoding::for_label(b"x-mac-ukrainian"),
Some(X_MAC_CYRILLIC)
);
assert_eq!(Encoding::for_label(b"iso-2022-cn-ext"), Some(REPLACEMENT));
assert_eq!(Encoding::for_label(b"csisolatinarabic"), Some(ISO_8859_6));
assert_eq!(Encoding::for_label(b"csisolatinhebrew"), Some(ISO_8859_8));
assert_eq!(Encoding::for_label(b"unicode-1-1-utf-8"), Some(UTF_8));
assert_eq!(Encoding::for_label(b"csisolatincyrillic"), Some(ISO_8859_5));
assert_eq!(Encoding::for_label(b"cseucpkdfmtjapanese"), Some(EUC_JP));
}

View File

@@ -0,0 +1,262 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
/// Checks that `bytes` decodes to `expect` under `encoding`, repeating the
/// check with 0 to 31 (0 to 3 under Miri) ASCII bytes prepended to both the
/// input and the expectation.
///
/// The prefix bytes are `0x40`, `0x41`, ... in order; the prefix length is
/// forwarded to `decode_without_padding_impl` as its `padding` argument.
pub fn decode(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
    let max_padding = if cfg!(miri) { 4usize } else { 32usize };
    let mut padded_bytes: Vec<u8> = Vec::with_capacity(bytes.len() + 32);
    let mut padded_expect = String::with_capacity(expect.len() + 32);
    for padding in 0..max_padding {
        padded_bytes.clear();
        padded_expect.clear();
        padded_bytes.extend((0..padding).map(|j| 0x40u8 + (j as u8)));
        padded_expect.extend((0..padding).map(|j| char::from(0x40u8 + (j as u8))));
        padded_bytes.extend_from_slice(bytes);
        padded_expect.push_str(expect);
        decode_without_padding_impl(encoding, &padded_bytes[..], &padded_expect[..], padding);
    }
}
/// Checks that `bytes` decodes to `expect` under `encoding` exactly as
/// given, without any prefix added: delegates to
/// `decode_without_padding_impl` with a `padding` of 0.
pub fn decode_without_padding(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
decode_without_padding_impl(encoding, bytes, expect, 0);
}
/// Runs the three decode checks on `bytes`: streaming to UTF-8, streaming
/// to UTF-16 (against `expect` converted by `utf16_from_utf8`), and
/// one-shot decoding to a string.
///
/// `padding` is forwarded to the two streaming checks, where it is the
/// first split offset they try.
fn decode_without_padding_impl(
    encoding: &'static Encoding,
    bytes: &[u8],
    expect: &str,
    padding: usize,
) {
    decode_to_utf8_impl(encoding, bytes, expect, padding);
    let expect_utf16 = utf16_from_utf8(expect);
    decode_to_utf16_impl(encoding, bytes, &expect_utf16[..], padding);
    decode_to_string(encoding, bytes, expect);
}
/// Checks that `str` encodes to `expect` under `encoding`, repeating the
/// check with 0 to 31 (0 to 3 under Miri) ASCII characters prepended to
/// both the input and the expected bytes.
///
/// The prefix bytes are `0x40`, `0x41`, ... in order.
pub fn encode(encoding: &'static Encoding, str: &str, expect: &[u8]) {
    let max_padding = if cfg!(miri) { 4usize } else { 32usize };
    let mut padded_expect: Vec<u8> = Vec::with_capacity(expect.len() + 32);
    let mut padded_input = String::with_capacity(str.len() + 32);
    for padding in 0..max_padding {
        padded_expect.clear();
        padded_input.clear();
        padded_expect.extend((0..padding).map(|j| 0x40u8 + (j as u8)));
        padded_input.extend((0..padding).map(|j| char::from(0x40u8 + (j as u8))));
        padded_expect.extend_from_slice(expect);
        padded_input.push_str(str);
        encode_without_padding(encoding, &padded_input[..], &padded_expect[..]);
    }
}
/// Runs the three encode checks on `string` as given: from UTF-8, from
/// UTF-16 (input converted by `utf16_from_utf8`), and one-shot encoding
/// to a byte vector. Each must produce `expect`.
pub fn encode_without_padding(encoding: &'static Encoding, string: &str, expect: &[u8]) {
    encode_from_utf8(encoding, string, expect);
    let units = utf16_from_utf8(string);
    encode_from_utf16(encoding, &units[..], expect);
    encode_to_vec(encoding, string, expect);
}
/// Checks streaming decode of `bytes` to UTF-16 against `expect`, trying
/// split offsets from 0: delegates to `decode_to_utf16_impl` with a
/// `padding` of 0.
pub fn decode_to_utf16(encoding: &'static Encoding, bytes: &[u8], expect: &[u16]) {
decode_to_utf16_impl(encoding, bytes, expect, 0);
}
/// Calls `decode_to_utf16_with_boundary` once for every split offset in
/// `padding..bytes.len()`, passing the two halves of `bytes` as head and
/// tail.
///
/// The range is half-open, so the split with an empty tail is not tried
/// and nothing runs when `padding >= bytes.len()`.
pub fn decode_to_utf16_impl(
    encoding: &'static Encoding,
    bytes: &[u8],
    expect: &[u16],
    padding: usize,
) {
    (padding..bytes.len())
        .map(|boundary| bytes.split_at(boundary))
        .for_each(|(head, tail)| decode_to_utf16_with_boundary(encoding, head, tail, expect));
}
/// Decodes `head` and then `tail` with one fresh decoder into a single
/// UTF-16 buffer and asserts the combined result.
///
/// `head` is fed with `last == false`, `tail` with `last == true`. The
/// destination is sized from `max_utf16_buffer_length`, so an
/// `OutputFull` result is treated as unreachable. Afterwards the total
/// bytes read must equal the input length and the output must equal
/// `expect`.
pub fn decode_to_utf16_with_boundary(
    encoding: &'static Encoding,
    head: &[u8],
    tail: &[u8],
    expect: &[u16],
) {
    let mut decoder = encoding.new_decoder();
    let input_len = head.len() + tail.len();
    let worst_case = decoder.max_utf16_buffer_length(input_len).unwrap();
    let mut dest: Vec<u16> = Vec::with_capacity(worst_case);
    // Use whatever capacity was actually allocated as the buffer length.
    let capacity = dest.capacity();
    dest.resize(capacity, 0u16);

    let mut total_read = 0;
    let mut total_written = 0;
    for &(chunk, last) in [(head, false), (tail, true)].iter() {
        let (status, read, written, _) =
            decoder.decode_to_utf16(chunk, &mut dest[total_written..], last);
        if let CoderResult::OutputFull = status {
            unreachable!();
        }
        total_read += read;
        total_written += written;
    }

    assert_eq!(total_read, input_len);
    assert_eq!(total_written, expect.len());
    dest.truncate(total_written);
    assert_eq!(&dest[..], expect);
}
/// Checks streaming decode of `bytes` to UTF-8 against `expect`, trying
/// split offsets from 0: delegates to `decode_to_utf8_impl` with a
/// `padding` of 0.
pub fn decode_to_utf8(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
decode_to_utf8_impl(encoding, bytes, expect, 0);
}
/// Calls `decode_to_utf8_with_boundary` once for every split offset in
/// `padding..bytes.len()`, passing the two halves of `bytes` as head and
/// tail.
///
/// The range is half-open, so the split with an empty tail is not tried
/// and nothing runs when `padding >= bytes.len()`.
pub fn decode_to_utf8_impl(
    encoding: &'static Encoding,
    bytes: &[u8],
    expect: &str,
    padding: usize,
) {
    (padding..bytes.len())
        .map(|boundary| bytes.split_at(boundary))
        .for_each(|(head, tail)| decode_to_utf8_with_boundary(encoding, head, tail, expect));
}
/// Decodes `head` and then `tail` with one fresh decoder into a single
/// UTF-8 buffer and asserts the combined result.
///
/// `head` is fed with `last == false`, `tail` with `last == true`. The
/// destination is sized from `max_utf8_buffer_length`, so an `OutputFull`
/// result is treated as unreachable. Afterwards the total bytes read must
/// equal the input length and the output must equal `expect`.
pub fn decode_to_utf8_with_boundary(
    encoding: &'static Encoding,
    head: &[u8],
    tail: &[u8],
    expect: &str,
) {
    let mut decoder = encoding.new_decoder();
    let input_len = head.len() + tail.len();
    let worst_case = decoder.max_utf8_buffer_length(input_len).unwrap();
    let mut dest: Vec<u8> = Vec::with_capacity(worst_case);
    // Use whatever capacity was actually allocated as the buffer length.
    let capacity = dest.capacity();
    dest.resize(capacity, 0u8);

    let mut total_read = 0;
    let mut total_written = 0;
    for &(chunk, last) in [(head, false), (tail, true)].iter() {
        let (status, read, written, _) =
            decoder.decode_to_utf8(chunk, &mut dest[total_written..], last);
        if let CoderResult::OutputFull = status {
            unreachable!();
        }
        total_read += read;
        total_written += written;
    }

    assert_eq!(total_read, input_len);
    assert_eq!(total_written, expect.len());
    dest.truncate(total_written);
    assert_eq!(&dest[..], expect.as_bytes());
}
/// Asserts that one-shot `encoding.decode(bytes)` produces `expect`; the
/// other two elements of the returned tuple are ignored.
pub fn decode_to_string(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
    let decoded = encoding.decode(bytes).0;
    assert_eq!(&decoded[..], expect);
}
/// Encodes `string` in a single `encode_from_utf8` call with `last` set
/// and asserts that all input was read and that the output equals
/// `expect`.
///
/// The destination holds at least `10 * (string.len() + 1)` bytes, so an
/// `OutputFull` result is treated as unreachable.
pub fn encode_from_utf8(encoding: &'static Encoding, string: &str, expect: &[u8]) {
    let mut encoder = encoding.new_encoder();
    // 10 is replacement worst case
    let worst_case = 10 * (string.len() + 1);
    let mut dest: Vec<u8> = Vec::with_capacity(worst_case);
    let capacity = dest.capacity();
    dest.resize(capacity, 0u8);

    let (status, read, written, _) = encoder.encode_from_utf8(string, &mut dest, true);
    if let CoderResult::OutputFull = status {
        unreachable!();
    }

    assert_eq!(read, string.len());
    assert_eq!(written, expect.len());
    dest.truncate(written);
    assert_eq!(&dest[..], expect);
}
/// Encodes the UTF-16 code units in `string` in a single
/// `encode_from_utf16` call with `last` set and asserts that all input
/// was read and that the output equals `expect`.
///
/// The destination holds at least `10 * (string.len() + 1)` bytes, so an
/// `OutputFull` result is treated as unreachable.
pub fn encode_from_utf16(encoding: &'static Encoding, string: &[u16], expect: &[u8]) {
    let mut encoder = encoding.new_encoder();
    // 10 is replacement worst case
    let worst_case = 10 * (string.len() + 1);
    let mut dest: Vec<u8> = Vec::with_capacity(worst_case);
    let capacity = dest.capacity();
    dest.resize(capacity, 0u8);

    let (status, read, written, _) = encoder.encode_from_utf16(string, &mut dest, true);
    if let CoderResult::OutputFull = status {
        unreachable!();
    }

    assert_eq!(read, string.len());
    // There is no separate assertion on `written`: the slice comparison
    // below already fails when the lengths differ.
    dest.truncate(written);
    assert_eq!(&dest[..], expect);
}
/// Asserts that one-shot `encoding.encode(string)` produces `expect`; the
/// other two elements of the returned tuple are ignored.
pub fn encode_to_vec(encoding: &'static Encoding, string: &str, expect: &[u8]) {
    let encoded = encoding.encode(string).0;
    assert_eq!(&encoded[..], expect);
}
/// Converts `string` to a vector of UTF-16 code units using the crate's
/// own UTF-8 decoder (without BOM handling).
///
/// The buffer is sized from `max_utf16_buffer_length` and the input is a
/// `&str`, so `Malformed` and `OutputFull` results are treated as
/// unreachable.
pub fn utf16_from_utf8(string: &str) -> Vec<u16> {
    let mut decoder = UTF_8.new_decoder_without_bom_handling();
    let worst_case = decoder.max_utf16_buffer_length(string.len()).unwrap();
    let mut units = Vec::with_capacity(worst_case);
    let capacity = units.capacity();
    units.resize(capacity, 0);

    let (result, read, written) =
        decoder.decode_to_utf16_without_replacement(string.as_bytes(), &mut units[..], true);
    match result {
        DecoderResult::Malformed(_, _) => unreachable!("Malformed"),
        DecoderResult::OutputFull => unreachable!("Output full"),
        DecoderResult::InputEmpty => {}
    }
    debug_assert_eq!(read, string.len());
    // `written` counts units stored in the buffer, so this only shrinks it.
    units.truncate(written);
    units
}

472
zeroidc/vendor/encoding_rs/src/utf_16.rs vendored Normal file
View File

@@ -0,0 +1,472 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::handles::*;
use crate::variant::*;
/// Decoder state for UTF-16 input in either byte order. Input is consumed one
/// byte at a time, so a code unit and a surrogate pair may both be split
/// across calls; the fields below carry that partial state.
pub struct Utf16Decoder {
/// If non-zero and `pending_bmp == false`, a pending lead (high) surrogate.
/// If `pending_bmp == true`, this instead holds a BMP code unit that still
/// has to be written to the output.
lead_surrogate: u16,
/// First byte of a code unit whose second byte has not been seen yet.
lead_byte: Option<u8>,
/// `true` when bytes are combined big-endian, `false` for little-endian.
be: bool,
/// If `true`, `lead_surrogate` is actually a pending BMP code unit.
pending_bmp: bool,
}
impl Utf16Decoder {
/// Creates a UTF-16 decoder with no pending state, wrapped in
/// `VariantDecoder::Utf16`. `big_endian` selects the byte order used to
/// assemble code units.
pub fn new(big_endian: bool) -> VariantDecoder {
    let decoder = Utf16Decoder {
        be: big_endian,
        lead_byte: None,
        lead_surrogate: 0,
        pending_bmp: false,
    };
    VariantDecoder::Utf16(decoder)
}
/// Returns the number of extra input bytes the buffer-length estimates add
/// on top of the caller's byte count: always 1, plus 1 if a lead byte is
/// buffered, plus 2 if `lead_surrogate` is non-zero.
pub fn additional_from_state(&self) -> usize {
    let mut extra = 1;
    if self.lead_byte.is_some() {
        extra += 1;
    }
    if self.lead_surrogate != 0 {
        extra += 2;
    }
    extra
}
/// Upper bound on the number of UTF-16 code units needed to decode
/// `byte_length` more bytes from the current state:
/// `(byte_length + additional_from_state()) / 2 + 1`.
///
/// Returns `None` if the addition of the state allowance overflows
/// (the `checked_*` helpers presumably propagate `None` — defined elsewhere).
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
    let padded_bytes = byte_length.checked_add(self.additional_from_state());
    let code_units = checked_div(padded_bytes, 2);
    checked_add(1, code_units)
}
/// Upper bound on the number of UTF-8 bytes needed to decode `byte_length`
/// more bytes from the current state when errors are not replaced:
/// `3 * ((byte_length + additional_from_state()) / 2) + 1`.
///
/// Returns `None` if the arithmetic overflows (the `checked_*` helpers
/// presumably propagate `None` — defined elsewhere).
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
    let padded_bytes = byte_length.checked_add(self.additional_from_state());
    let code_units = checked_div(padded_bytes, 2);
    let utf8_bytes = checked_mul(3, code_units);
    checked_add(1, utf8_bytes)
}
/// Upper bound on the number of UTF-8 bytes needed to decode `byte_length`
/// more bytes from the current state with replacement enabled. Uses the same
/// formula as the without-replacement variant:
/// `3 * ((byte_length + additional_from_state()) / 2) + 1`.
///
/// Returns `None` if the arithmetic overflows (the `checked_*` helpers
/// presumably propagate `None` — defined elsewhere).
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
    let padded_bytes = byte_length.checked_add(self.additional_from_state());
    let code_units = checked_div(padded_bytes, 2);
    let utf8_bytes = checked_mul(3, code_units);
    checked_add(1, utf8_bytes)
}
// Generates the raw decode methods for `Utf16Decoder` from the code fragments
// below. The macro is defined elsewhere; the role of each fragment is
// inferred from its contents — TODO confirm against the macro definition.
decoder_functions!(
{
// Fragment 1 (presumably run once at the start of a call): flush a BMP
// code unit that a previous call parked in `lead_surrogate` after
// reporting an unpaired lead surrogate. If there is no room, report
// `OutputFull` without consuming or writing anything.
if self.pending_bmp {
match dest.check_space_bmp() {
Space::Full(_) => {
return (DecoderResult::OutputFull, 0, 0);
}
Space::Available(destination_handle) => {
destination_handle.write_bmp(self.lead_surrogate);
self.pending_bmp = false;
self.lead_surrogate = 0;
}
}
}
},
{
// This is the fast path. The rest runs only at the
// start and end for partial sequences.
// It is only taken when no half code unit and no lead surrogate is
// pending. A `Some((read, written))` result from the bulk copy is
// reported as malformed input.
if self.lead_byte.is_none() && self.lead_surrogate == 0 {
if let Some((read, written)) = if self.be {
dest.copy_utf16_from::<BigEndian>(&mut source)
} else {
dest.copy_utf16_from::<LittleEndian>(&mut source)
} {
return (DecoderResult::Malformed(2, 0), read, written);
}
}
},
{
// Fragment 3 (presumably the end-of-stream handler): any state still
// pending here is reported as malformed and then cleared — 2 for a lone
// lead surrogate, 3 for a lead surrogate plus a dangling byte, 1 for a
// dangling byte alone.
debug_assert!(!self.pending_bmp);
if self.lead_surrogate != 0 || self.lead_byte.is_some() {
// We need to check space without intent to write in order to
// make sure that there is space for the replacement character.
match dest.check_space_bmp() {
Space::Full(_) => {
return (DecoderResult::OutputFull, 0, 0);
}
Space::Available(_) => {
if self.lead_surrogate != 0 {
self.lead_surrogate = 0;
match self.lead_byte {
None => {
return (
DecoderResult::Malformed(2, 0),
src_consumed,
dest.written(),
);
}
Some(_) => {
self.lead_byte = None;
return (
DecoderResult::Malformed(3, 0),
src_consumed,
dest.written(),
);
}
}
}
debug_assert!(self.lead_byte.is_some());
self.lead_byte = None;
return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
}
}
}
},
{
// Fragment 4: per-byte state machine for input byte `b`.
match self.lead_byte {
None => {
// First half of a code unit: remember it and wait for the second byte.
self.lead_byte = Some(b);
continue;
}
Some(lead) => {
self.lead_byte = None;
// Assemble the 16-bit code unit according to the configured byte order.
let code_unit = if self.be {
u16::from(lead) << 8 | u16::from(b)
} else {
u16::from(b) << 8 | u16::from(lead)
};
// The top six bits distinguish high surrogates (0xD800..=0xDBFF) from
// low surrogates (0xDC00..=0xDFFF) and from everything else.
let high_bits = code_unit & 0xFC00u16;
if high_bits == 0xD800u16 {
// high surrogate
if self.lead_surrogate != 0 {
// The previous high surrogate was in
// error and this one becomes the new
// pending one.
self.lead_surrogate = code_unit as u16;
return (
DecoderResult::Malformed(2, 2),
unread_handle.consumed(),
destination_handle.written(),
);
}
self.lead_surrogate = code_unit;
continue;
}
if high_bits == 0xDC00u16 {
// low surrogate
if self.lead_surrogate == 0 {
// Low surrogate without a preceding high surrogate.
return (
DecoderResult::Malformed(2, 0),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
self.lead_surrogate = 0;
continue;
}
// bmp
if self.lead_surrogate != 0 {
// The previous high surrogate was in
// error and this code unit becomes a
// pending BMP character.
self.lead_surrogate = code_unit;
self.pending_bmp = true;
return (
DecoderResult::Malformed(2, 2),
unread_handle.consumed(),
destination_handle.written(),
);
}
destination_handle.write_bmp(code_unit);
continue;
}
}
},
// Identifiers the fragments above refer to, followed by the name of the
// destination space check the macro should use.
self,
src_consumed,
dest,
source,
b,
destination_handle,
unread_handle,
check_space_astral
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// Unit tests for the UTF-16LE / UTF-16BE decoders and for the encoders
// obtained from the UTF-16 encodings.
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
// Decodes `bytes` as UTF-16LE via the shared `decode_without_padding` helper
// and checks the result against `expect`.
fn decode_utf_16le(bytes: &[u8], expect: &str) {
decode_without_padding(UTF_16LE, bytes, expect);
}
// Decodes `bytes` as UTF-16BE via the shared `decode_without_padding` helper
// and checks the result against `expect`.
fn decode_utf_16be(bytes: &[u8], expect: &str) {
decode_without_padding(UTF_16BE, bytes, expect);
}
// Encodes `string` with the encoder obtained from UTF_16LE and checks the bytes.
fn encode_utf_16le(string: &str, expect: &[u8]) {
encode(UTF_16LE, string, expect);
}
// Encodes `string` with the encoder obtained from UTF_16BE and checks the bytes.
fn encode_utf_16be(string: &str, expect: &[u8]) {
encode(UTF_16BE, string, expect);
}
// Decoding vectors: empty input, plain BMP text, input starting with the
// other byte order's BOM bytes, truncated code units, truncated and unpaired
// surrogates, and two long inputs with surrogates at various offsets.
#[test]
fn test_utf_16_decode() {
decode_utf_16le(b"", "");
decode_utf_16be(b"", "");
decode_utf_16le(b"\x61\x00\x62\x00", "\u{0061}\u{0062}");
decode_utf_16be(b"\x00\x61\x00\x62", "\u{0061}\u{0062}");
decode_utf_16le(b"\xFE\xFF\x00\x61\x00\x62", "\u{0061}\u{0062}");
decode_utf_16be(b"\xFF\xFE\x61\x00\x62\x00", "\u{0061}\u{0062}");
decode_utf_16le(b"\x61\x00\x62", "\u{0061}\u{FFFD}");
decode_utf_16be(b"\x00\x61\x00", "\u{0061}\u{FFFD}");
decode_utf_16le(b"\x3D\xD8\xA9", "\u{FFFD}");
decode_utf_16be(b"\xD8\x3D\xDC", "\u{FFFD}");
decode_utf_16le(b"\x3D\xD8\xA9\xDC\x03\x26", "\u{1F4A9}\u{2603}");
decode_utf_16be(b"\xD8\x3D\xDC\xA9\x26\x03", "\u{1F4A9}\u{2603}");
decode_utf_16le(b"\xA9\xDC\x03\x26", "\u{FFFD}\u{2603}");
decode_utf_16be(b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");
decode_utf_16le(b"\x3D\xD8\x03\x26", "\u{FFFD}\u{2603}");
decode_utf_16be(b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");
// The \xFF makes sure that the parts before and after have different alignment
let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
// Each long input is the same sequence twice, separated by the single
// \xFF byte; both halves must decode to `long_expect`.
decode_utf_16le(&long_le[..long_le.len() / 2], long_expect);
decode_utf_16be(&long_be[..long_be.len() / 2], long_expect);
decode_utf_16le(&long_le[long_le.len() / 2 + 1..], long_expect);
decode_utf_16be(&long_be[long_be.len() / 2 + 1..], long_expect);
}
// The encoders obtained from UTF_16LE / UTF_16BE report UTF_8 as their
// encoding and produce UTF-8 bytes.
#[test]
fn test_utf_16_encode() {
// Empty
encode_utf_16be("", b"");
encode_utf_16le("", b"");
// Encodes as UTF-8
assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
encode_utf_16le("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
encode_utf_16be("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
}
// Feeds valid UTF-16BE one byte per call, sizing the output slice with
// `max_utf16_buffer_length`; every call must consume its byte without error.
#[test]
fn test_utf_16be_decode_one_by_one() {
let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
for b in input.chunks(1) {
assert_eq!(b.len(), 1);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert!(!had_errors);
}
}
// Little-endian counterpart of the one-byte-per-call test.
#[test]
fn test_utf_16le_decode_one_by_one() {
let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
for b in input.chunks(1) {
assert_eq!(b.len(), 1);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert!(!had_errors);
}
}
// Feeds valid UTF-16BE three bytes per call, so code units straddle call
// boundaries; the output slice is again sized with `max_utf16_buffer_length`.
#[test]
fn test_utf_16be_decode_three_at_a_time() {
let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
for b in input.chunks(3) {
assert_eq!(b.len(), 3);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, b.len());
assert!(!had_errors);
}
}
// Little-endian counterpart of the three-bytes-per-call test.
#[test]
fn test_utf_16le_decode_three_at_a_time() {
let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
for b in input.chunks(3) {
assert_eq!(b.len(), 3);
let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
let (result, read, _, had_errors) =
decoder.decode_to_utf16(b, &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, b.len());
assert!(!had_errors);
}
}
// 0xFF alone is only the first byte of the little-endian BOM; when the next
// byte is 0xFD, the two bytes must come out as the single code unit 0xFDFF.
#[test]
fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFF", &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
}
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(!had_errors);
assert_eq!(output[0], 0xFDFF);
}
}
// Big-endian counterpart: 0xFE followed by 0xFD must yield the code unit 0xFEFD.
#[test]
fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFE", &mut output[..needed], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
}
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(!had_errors);
assert_eq!(output[0], 0xFEFD);
}
}
// A stream consisting only of the first BOM byte (0xFF) with `last == true`
// must produce one U+FFFD and report an error.
#[test]
fn test_utf_16le_decode_bom_prefix() {
let mut output = [0u16; 20];
let mut decoder = UTF_16LE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFF", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(had_errors);
assert_eq!(output[0], 0xFFFD);
}
}
// Big-endian counterpart with the single byte 0xFE.
#[test]
fn test_utf_16be_decode_bom_prefix() {
let mut output = [0u16; 20];
let mut decoder = UTF_16BE.new_decoder();
{
let needed = decoder.max_utf16_buffer_length(1).unwrap();
let (result, read, written, had_errors) =
decoder.decode_to_utf16(b"\xFE", &mut output[..needed], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 1);
assert!(had_errors);
assert_eq!(output[0], 0xFFFD);
}
}
// Decoding to UTF-8 into a 4-byte buffer: after a buffered first byte, the
// second call is expected to consume one byte, write the 3-byte UTF-8 form
// of U+2603 and then stop with `OutputFull`.
#[test]
fn test_utf_16le_decode_near_end() {
let mut output = [0u8; 4];
let mut decoder = UTF_16LE.new_decoder();
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x03], &mut output[..], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
assert_eq!(output[0], 0x0);
}
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false);
assert_eq!(result, CoderResult::OutputFull);
assert_eq!(read, 1);
assert_eq!(written, 3);
assert!(!had_errors);
assert_eq!(output[0], 0xE2);
assert_eq!(output[1], 0x98);
assert_eq!(output[2], 0x83);
assert_eq!(output[3], 0x00);
}
}
// Big-endian counterpart of the near-end-of-buffer test.
#[test]
fn test_utf_16be_decode_near_end() {
let mut output = [0u8; 4];
let mut decoder = UTF_16BE.new_decoder();
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x26], &mut output[..], false);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 1);
assert_eq!(written, 0);
assert!(!had_errors);
assert_eq!(output[0], 0x0);
}
{
let (result, read, written, had_errors) =
decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false);
assert_eq!(result, CoderResult::OutputFull);
assert_eq!(read, 1);
assert_eq!(written, 3);
assert!(!had_errors);
assert_eq!(output[0], 0xE2);
assert_eq!(output[1], 0x98);
assert_eq!(output[2], 0x83);
assert_eq!(output[3], 0x00);
}
}
}

1631
zeroidc/vendor/encoding_rs/src/utf_8.rs vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,400 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py
//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.
use super::*;
use big5::*;
use euc_jp::*;
use euc_kr::*;
use gb18030::*;
use iso_2022_jp::*;
use replacement::*;
use shift_jis::*;
use single_byte::*;
use utf_16::*;
use utf_8::*;
use x_user_defined::*;
/// Closed set of the concrete decoder implementations. Each variant wraps
/// one decoder type; the methods on this enum dispatch to the wrapped value
/// with an explicit `match`.
pub enum VariantDecoder {
SingleByte(SingleByteDecoder),
Utf8(Utf8Decoder),
Gb18030(Gb18030Decoder),
Big5(Big5Decoder),
EucJp(EucJpDecoder),
Iso2022Jp(Iso2022JpDecoder),
ShiftJis(ShiftJisDecoder),
EucKr(EucKrDecoder),
Replacement(ReplacementDecoder),
UserDefined(UserDefinedDecoder),
/// Used for both UTF-16BE and UTF-16LE; the byte order is a constructor
/// argument of `Utf16Decoder`.
Utf16(Utf16Decoder),
}
impl VariantDecoder {
/// Forwards to `max_utf16_buffer_length` of the wrapped decoder.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
    match self {
        VariantDecoder::SingleByte(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::Utf8(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::Gb18030(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::Big5(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::EucJp(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::Iso2022Jp(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::ShiftJis(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::EucKr(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::Replacement(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::UserDefined(inner) => inner.max_utf16_buffer_length(byte_length),
        VariantDecoder::Utf16(inner) => inner.max_utf16_buffer_length(byte_length),
    }
}
/// Forwards to `max_utf8_buffer_length_without_replacement` of the wrapped
/// decoder.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
    match self {
        VariantDecoder::SingleByte(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::Utf8(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::Gb18030(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::Big5(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::EucJp(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::Iso2022Jp(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::ShiftJis(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::EucKr(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::Replacement(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::UserDefined(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
        VariantDecoder::Utf16(inner) => {
            inner.max_utf8_buffer_length_without_replacement(byte_length)
        }
    }
}
/// Forwards to `max_utf8_buffer_length` of the wrapped decoder.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
    match self {
        VariantDecoder::SingleByte(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::Utf8(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::Gb18030(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::Big5(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::EucJp(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::Iso2022Jp(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::ShiftJis(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::EucKr(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::Replacement(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::UserDefined(inner) => inner.max_utf8_buffer_length(byte_length),
        VariantDecoder::Utf16(inner) => inner.max_utf8_buffer_length(byte_length),
    }
}
/// Forwards to `decode_to_utf16_raw` of the wrapped decoder and returns its
/// `(result, read, written)` tuple unchanged.
pub fn decode_to_utf16_raw(
    &mut self,
    src: &[u8],
    dst: &mut [u16],
    last: bool,
) -> (DecoderResult, usize, usize) {
    match self {
        VariantDecoder::SingleByte(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::Utf8(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::Gb18030(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::Big5(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::EucJp(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::Iso2022Jp(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::ShiftJis(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::EucKr(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::Replacement(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::UserDefined(inner) => inner.decode_to_utf16_raw(src, dst, last),
        VariantDecoder::Utf16(inner) => inner.decode_to_utf16_raw(src, dst, last),
    }
}
/// Forwards to `decode_to_utf8_raw` of the wrapped decoder and returns its
/// `(result, read, written)` tuple unchanged.
pub fn decode_to_utf8_raw(
    &mut self,
    src: &[u8],
    dst: &mut [u8],
    last: bool,
) -> (DecoderResult, usize, usize) {
    match self {
        VariantDecoder::SingleByte(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::Utf8(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::Gb18030(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::Big5(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::EucJp(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::Iso2022Jp(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::ShiftJis(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::EucKr(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::Replacement(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::UserDefined(inner) => inner.decode_to_utf8_raw(src, dst, last),
        VariantDecoder::Utf16(inner) => inner.decode_to_utf8_raw(src, dst, last),
    }
}
/// Dispatches the "Latin-1 byte compatible prefix" query for `buffer`.
///
/// * Single-byte decoders answer the query themselves.
/// * ISO-2022-JP uses `Encoding::iso_2022_jp_ascii_valid_up_to` when the
///   decoder is in its neutral state and returns `None` otherwise.
/// * Replacement and UTF-16 always return `None`.
/// * x-user-defined always uses `Encoding::ascii_valid_up_to`.
/// * Every other decoder uses `Encoding::ascii_valid_up_to` when in its
///   neutral state and returns `None` otherwise.
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
    let neutral = match self {
        VariantDecoder::SingleByte(inner) => {
            return Some(inner.latin1_byte_compatible_up_to(buffer));
        }
        VariantDecoder::Iso2022Jp(inner) => {
            return if inner.in_neutral_state() {
                Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer))
            } else {
                None
            };
        }
        VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => return None,
        VariantDecoder::UserDefined(_) => true,
        VariantDecoder::Utf8(inner) => inner.in_neutral_state(),
        VariantDecoder::Gb18030(inner) => inner.in_neutral_state(),
        VariantDecoder::Big5(inner) => inner.in_neutral_state(),
        VariantDecoder::EucJp(inner) => inner.in_neutral_state(),
        VariantDecoder::ShiftJis(inner) => inner.in_neutral_state(),
        VariantDecoder::EucKr(inner) => inner.in_neutral_state(),
    };
    if neutral {
        Some(Encoding::ascii_valid_up_to(buffer))
    } else {
        None
    }
}
}
/// Closed set of the concrete encoder implementations. Each variant wraps
/// one encoder type; the methods on this enum dispatch to the wrapped value
/// with an explicit `match`. There is no variant for the replacement or
/// UTF-16 encodings.
pub enum VariantEncoder {
SingleByte(SingleByteEncoder),
Utf8(Utf8Encoder),
Gb18030(Gb18030Encoder),
Big5(Big5Encoder),
EucJp(EucJpEncoder),
Iso2022Jp(Iso2022JpEncoder),
ShiftJis(ShiftJisEncoder),
EucKr(EucKrEncoder),
UserDefined(UserDefinedEncoder),
}
impl VariantEncoder {
/// Returns the wrapped encoder's `has_pending_state()` for ISO-2022-JP and
/// `false` for every other encoder.
pub fn has_pending_state(&self) -> bool {
    if let VariantEncoder::Iso2022Jp(inner) = self {
        inner.has_pending_state()
    } else {
        false
    }
}
/// Forwards to `max_buffer_length_from_utf16_without_replacement` of the
/// wrapped encoder.
pub fn max_buffer_length_from_utf16_without_replacement(
    &self,
    u16_length: usize,
) -> Option<usize> {
    match self {
        VariantEncoder::SingleByte(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::Utf8(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::Gb18030(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::Big5(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::EucJp(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::Iso2022Jp(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::ShiftJis(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::EucKr(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
        VariantEncoder::UserDefined(inner) => {
            inner.max_buffer_length_from_utf16_without_replacement(u16_length)
        }
    }
}
/// Forwards to `max_buffer_length_from_utf8_without_replacement` of the
/// wrapped encoder.
pub fn max_buffer_length_from_utf8_without_replacement(
    &self,
    byte_length: usize,
) -> Option<usize> {
    match self {
        VariantEncoder::SingleByte(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::Utf8(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::Gb18030(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::Big5(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::EucJp(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::Iso2022Jp(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::ShiftJis(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::EucKr(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
        VariantEncoder::UserDefined(inner) => {
            inner.max_buffer_length_from_utf8_without_replacement(byte_length)
        }
    }
}
/// Forwards to `encode_from_utf16_raw` of the wrapped encoder and returns its
/// `(result, read, written)` tuple unchanged.
pub fn encode_from_utf16_raw(
    &mut self,
    src: &[u16],
    dst: &mut [u8],
    last: bool,
) -> (EncoderResult, usize, usize) {
    match self {
        VariantEncoder::SingleByte(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::Utf8(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::Gb18030(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::Big5(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::EucJp(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::Iso2022Jp(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::ShiftJis(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::EucKr(inner) => inner.encode_from_utf16_raw(src, dst, last),
        VariantEncoder::UserDefined(inner) => inner.encode_from_utf16_raw(src, dst, last),
    }
}
/// Forwards to `encode_from_utf8_raw` of the wrapped encoder and returns its
/// `(result, read, written)` tuple unchanged.
pub fn encode_from_utf8_raw(
    &mut self,
    src: &str,
    dst: &mut [u8],
    last: bool,
) -> (EncoderResult, usize, usize) {
    match self {
        VariantEncoder::SingleByte(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::Utf8(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::Gb18030(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::Big5(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::EucJp(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::Iso2022Jp(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::ShiftJis(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::EucKr(inner) => inner.encode_from_utf8_raw(src, dst, last),
        VariantEncoder::UserDefined(inner) => inner.encode_from_utf8_raw(src, dst, last),
    }
}
}
/// Identifies which decoder/encoder implementation an encoding uses; the
/// methods on this enum construct the matching `VariantDecoder` / `Encoder`.
pub enum VariantEncoding {
/// Single-byte encoding. The fields are, in order, the 128-entry table
/// handed to `SingleByteDecoder::new` / `SingleByteEncoder::new`, followed
/// by the `run_bmp_offset`, `run_byte_offset` and `run_length` values that
/// are passed on to `SingleByteEncoder::new`.
SingleByte(&'static [u16; 128], u16, u8, u8),
Utf8,
/// Decoded with the gb18030 decoder; encoded with `Gb18030Encoder` created
/// with its flag set to `false`.
Gbk,
/// Encoded with `Gb18030Encoder` created with its flag set to `true`.
Gb18030,
Big5,
EucJp,
Iso2022Jp,
ShiftJis,
EucKr,
/// Decode-only here: `new_encoder` treats this variant as unreachable.
Replacement,
/// Decode-only here: `new_encoder` treats this variant as unreachable.
Utf16Be,
/// Decode-only here: `new_encoder` treats this variant as unreachable.
Utf16Le,
UserDefined,
}
impl VariantEncoding {
/// Creates a fresh decoder for this encoding variant.
///
/// GBK and gb18030 share `Gb18030Decoder`; UTF-16BE and UTF-16LE share
/// `Utf16Decoder`, constructed with `true` and `false` respectively.
pub fn new_variant_decoder(&self) -> VariantDecoder {
    match *self {
        VariantEncoding::Utf16Be => Utf16Decoder::new(true),
        VariantEncoding::Utf16Le => Utf16Decoder::new(false),
        VariantEncoding::Utf8 => Utf8Decoder::new(),
        VariantEncoding::SingleByte(table, ..) => SingleByteDecoder::new(table),
        VariantEncoding::UserDefined => UserDefinedDecoder::new(),
        VariantEncoding::Replacement => ReplacementDecoder::new(),
        VariantEncoding::Gb18030 | VariantEncoding::Gbk => Gb18030Decoder::new(),
        VariantEncoding::Big5 => Big5Decoder::new(),
        VariantEncoding::EucKr => EucKrDecoder::new(),
        VariantEncoding::EucJp => EucJpDecoder::new(),
        VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
        VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
    }
}
/// Creates a fresh encoder for this encoding variant, tagged with `encoding`.
///
/// GBK and gb18030 share `Gb18030Encoder`, constructed with `false` and
/// `true` respectively.
///
/// # Panics
///
/// Hits `unreachable!()` for the UTF-16BE, UTF-16LE and replacement
/// variants, for which no encoder variant exists. Callers are presumably
/// expected to map those to another encoding first — confirm at call sites.
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
    match *self {
        VariantEncoding::Utf16Be | VariantEncoding::Utf16Le | VariantEncoding::Replacement => {
            unreachable!()
        }
        VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => {
            SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length)
        }
        VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
        VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
        VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
        VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
        VariantEncoding::Big5 => Big5Encoder::new(encoding),
        VariantEncoding::EucKr => EucKrEncoder::new(encoding),
        VariantEncoding::EucJp => EucJpEncoder::new(encoding),
        VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
        VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
    }
}
/// Returns `true` for the `SingleByte` and `UserDefined` variants and
/// `false` for all others.
pub fn is_single_byte(&self) -> bool {
    match self {
        VariantEncoding::SingleByte(..) | VariantEncoding::UserDefined => true,
        VariantEncoding::Utf8
        | VariantEncoding::Gbk
        | VariantEncoding::Gb18030
        | VariantEncoding::Big5
        | VariantEncoding::EucJp
        | VariantEncoding::Iso2022Jp
        | VariantEncoding::ShiftJis
        | VariantEncoding::EucKr
        | VariantEncoding::Replacement
        | VariantEncoding::Utf16Be
        | VariantEncoding::Utf16Le => false,
    }
}
}

View File

@@ -0,0 +1,249 @@
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
use super::*;
use crate::handles::*;
use crate::variant::*;
// SIMD support code, compiled only with the `simd-accel` feature; the
// fallback branch is intentionally empty.
cfg_if! {
if #[cfg(feature = "simd-accel")] {
use simd_funcs::*;
use packed_simd::u16x8;
// Adds 0xF700 to every lane that is above 0x7F and leaves the other lanes
// unchanged: the vector form of the scalar mapping used in
// `decode_to_utf16_raw`.
#[inline(always)]
fn shift_upper(unpacked: u16x8) -> u16x8 {
let highest_ascii = u16x8::splat(0x7F);
unpacked + unpacked.gt(highest_ascii).select(u16x8::splat(0xF700), u16x8::splat(0)) }
} else {
}
}
/// Decoder for x-user-defined. It is a unit struct: decoding maps each byte
/// independently, so no state is carried between calls.
pub struct UserDefinedDecoder;
impl UserDefinedDecoder {
/// Creates an x-user-defined decoder wrapped in `VariantDecoder::UserDefined`.
pub fn new() -> VariantDecoder {
    let decoder = UserDefinedDecoder;
    VariantDecoder::UserDefined(decoder)
}
/// Returns `byte_length`: `decode_to_utf16_raw` writes exactly one UTF-16
/// code unit per input byte.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
Some(byte_length)
}
/// Returns `byte_length * 3`, or `None` if that multiplication overflows.
/// Bytes >= 0x80 decode to U+F780..=U+F7FF, which take three bytes in UTF-8.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
/// Returns `byte_length * 3`, or `None` if that multiplication overflows;
/// the same bound as the without-replacement variant.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
byte_length.checked_mul(3)
}
// Generates `decode_to_utf8_raw` (named near the end of the argument list)
// from the fragments below. The macro is defined elsewhere; the three empty
// fragments are presumably hooks that this decoder does not need because it
// keeps no state — TODO confirm against the macro definition.
decoder_function!(
{},
{},
{},
{
// Per-byte mapping: ASCII bytes pass through unchanged, bytes >= 0x80 are
// shifted by 0xF700 (0x80 -> U+F780, 0xFF -> U+F7FF).
if b < 0x80 {
// ASCII run not optimized, because binary data expected
destination_handle.write_ascii(b);
continue;
}
destination_handle.write_upper_bmp(u16::from(b) + 0xF700);
continue;
},
// Identifiers the fragment above refers to, then the space check, the name
// of the generated method, the output unit type and the destination type.
self,
src_consumed,
dest,
source,
b,
destination_handle,
_unread_handle,
check_space_bmp,
decode_to_utf8_raw,
u8,
Utf8Destination
);
/// Scalar (non-SIMD) x-user-defined to UTF-16 decode.
///
/// Converts `min(src.len(), dst.len())` bytes, one code unit per byte:
/// bytes below 0x80 are widened as-is, all others have 0xF700 added.
/// Returns `(OutputFull, n, n)` when `dst` is shorter than `src` and
/// `(InputEmpty, n, n)` otherwise, where `n` is the number of bytes
/// converted. `_last` is ignored.
#[cfg(not(feature = "simd-accel"))]
pub fn decode_to_utf16_raw(
    &mut self,
    src: &[u8],
    dst: &mut [u16],
    _last: bool,
) -> (DecoderResult, usize, usize) {
    let fits = src.len() <= dst.len();
    let length = if fits { src.len() } else { dst.len() };

    for (to, &unit) in dst[..length].iter_mut().zip(src[..length].iter()) {
        *to = if unit < 0x80 {
            u16::from(unit)
        } else {
            u16::from(unit) + 0xF700
        };
    }

    let pending = if fits {
        DecoderResult::InputEmpty
    } else {
        DecoderResult::OutputFull
    };
    (pending, length, length)
}
// SIMD x-user-defined to UTF-16 decode (built only with `simd-accel`).
//
// Converts `min(src.len(), dst.len())` bytes, one code unit per byte, in
// blocks of 16 followed by a scalar tail. Returns `(OutputFull, n, n)` when
// `dst` is shorter than `src` and `(InputEmpty, n, n)` otherwise, where `n`
// is the number of bytes converted. `_last` is ignored.
#[cfg(feature = "simd-accel")]
pub fn decode_to_utf16_raw(
&mut self,
src: &[u8],
dst: &mut [u16],
_last: bool,
) -> (DecoderResult, usize, usize) {
let (pending, length) = if dst.len() < src.len() {
(DecoderResult::OutputFull, dst.len())
} else {
(DecoderResult::InputEmpty, src.len())
};
// Not bothering with alignment
// `tail_start` is `length` rounded down to a multiple of 16 and therefore
// equals `simd_iterations * 16`.
let tail_start = length & !0xF;
let simd_iterations = length >> 4;
let src_ptr = src.as_ptr();
let dst_ptr = dst.as_mut_ptr();
for i in 0..simd_iterations {
// SAFETY: `i < length >> 4`, so `i * 16 + 16 <= length <= src.len()`.
// This assumes `load16_unaligned` reads exactly 16 bytes from the
// pointer — defined in `simd_funcs`, TODO confirm.
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
let (first, second) = simd_unpack(input);
// SAFETY: as above, `i * 16 + 16 <= length <= dst.len()`. This assumes
// each `store8_unaligned` writes exactly 8 `u16` values, so the two
// stores cover `dst[i * 16..i * 16 + 16]` — TODO confirm.
unsafe {
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
}
}
// Scalar loop for the remaining `length % 16` bytes.
let src_tail = &src[tail_start..length];
let dst_tail = &mut dst[tail_start..length];
src_tail
.iter()
.zip(dst_tail.iter_mut())
.for_each(|(from, to)| {
*to = {
let unit = *from;
if unit < 0x80 {
u16::from(unit)
} else {
u16::from(unit) + 0xF700
}
}
});
(pending, length, length)
}
}
/// Encoder for x-user-defined. It is a unit struct: each character is mapped
/// independently, so no state is carried between calls.
pub struct UserDefinedEncoder;
impl UserDefinedEncoder {
/// Creates an `Encoder` for `encoding` backed by
/// `VariantEncoder::UserDefined`.
pub fn new(encoding: &'static Encoding) -> Encoder {
Encoder::new(encoding, VariantEncoder::UserDefined(UserDefinedEncoder))
}
/// Returns `u16_length`, i.e. a bound of one output byte per UTF-16 code
/// unit.
pub fn max_buffer_length_from_utf16_without_replacement(
&self,
u16_length: usize,
) -> Option<usize> {
Some(u16_length)
}
/// Returns `byte_length`, i.e. a bound of one output byte per UTF-8 input
/// byte.
pub fn max_buffer_length_from_utf8_without_replacement(
&self,
byte_length: usize,
) -> Option<usize> {
Some(byte_length)
}
// Generates the raw encode methods from the fragments below. The macro is
// defined elsewhere; the empty first fragment is presumably a hook this
// encoder does not need — TODO confirm against the macro definition.
encoder_functions!(
{},
{
// Per-character mapping: ASCII is written as-is, U+F780..=U+F7FF maps back
// to bytes 0x80..=0xFF, and everything else is reported as unmappable.
if c <= '\u{7F}' {
// TODO optimize ASCII run
destination_handle.write_one(c as u8);
continue;
}
if c < '\u{F780}' || c > '\u{F7FF}' {
return (
EncoderResult::Unmappable(c),
unread_handle.consumed(),
destination_handle.written(),
);
}
// In range here, so the subtraction yields 0x80..=0xFF and the cast is lossless.
destination_handle.write_one((u32::from(c) - 0xF700) as u8);
continue;
},
// Identifiers the fragment above refers to, followed by the name of the
// destination space check the macro should use.
self,
src_consumed,
source,
dest,
c,
destination_handle,
unread_handle,
check_space_one
);
}
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
// Unit tests for the x-user-defined decoder and encoder.
#[cfg(all(test, feature = "alloc"))]
mod tests {
use super::super::testing::*;
use super::super::*;
// Decodes `bytes` as x-user-defined via the shared `decode` helper.
fn decode_x_user_defined(bytes: &[u8], expect: &str) {
decode(X_USER_DEFINED, bytes, expect);
}
// Encodes `string` as x-user-defined via the shared `encode` helper.
fn encode_x_user_defined(string: &str, expect: &[u8]) {
encode(X_USER_DEFINED, string, expect);
}
#[test]
fn test_x_user_defined_decode() {
// Empty
decode_x_user_defined(b"", "");
// ASCII
decode_x_user_defined(b"\x61\x62", "\u{0061}\u{0062}");
// Both ends of the upper half: 0x80 -> U+F780, 0xFF -> U+F7FF.
decode_x_user_defined(b"\x80\xFF", "\u{F780}\u{F7FF}");
// 20 bytes of mixed input: longer than 16, so with `simd-accel` this
// covers one 16-byte block plus a scalar tail.
decode_x_user_defined(b"\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62", "\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}");
}
#[test]
fn test_x_user_defined_encode() {
// Empty
encode_x_user_defined("", b"");
// ASCII
encode_x_user_defined("\u{0061}\u{0062}", b"\x61\x62");
// Both ends of the mappable private-use range.
encode_x_user_defined("\u{F780}\u{F7FF}", b"\x80\xFF");
// One below and one above the mappable range: both are expected as
// decimal numeric character references.
encode_x_user_defined("\u{F77F}\u{F800}", b"&#63359;&#63488;");
}
// Two unpaired low surrogates in UTF-16 input are each expected as the
// numeric character reference for U+FFFD, with `had_errors` set.
#[test]
fn test_x_user_defined_from_two_low_surrogates() {
let expectation = b"&#65533;&#65533;";
let mut output = [0u8; 40];
let mut encoder = X_USER_DEFINED.new_encoder();
let (result, read, written, had_errors) =
encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
assert_eq!(result, CoderResult::InputEmpty);
assert_eq!(read, 2);
assert_eq!(written, expectation.len());
assert!(had_errors);
assert_eq!(&output[..written], expectation);
}
}