RPM build fix (reverted CI changes which will need to be un-reverted or made conditional) and vendor Rust dependencies to make builds much faster in any CI system.
This commit is contained in:
1548
zeroidc/vendor/encoding_rs/src/ascii.rs
vendored
Normal file
1548
zeroidc/vendor/encoding_rs/src/ascii.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
427
zeroidc/vendor/encoding_rs/src/big5.rs
vendored
Normal file
427
zeroidc/vendor/encoding_rs/src/big5.rs
vendored
Normal file
@@ -0,0 +1,427 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range32;
|
||||
|
||||
pub struct Big5Decoder {
|
||||
lead: Option<u8>,
|
||||
}
|
||||
|
||||
impl Big5Decoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Big5(Big5Decoder { lead: None })
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.lead.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// If there is a lead but the next byte isn't a valid trail, an
|
||||
// error is generated for the lead (+1). Then another iteration checks
|
||||
// space, which needs +1 to account for the possibility of astral
|
||||
// output or combining pair.
|
||||
checked_add(1, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// No need to account for REPLACEMENT CHARACTERS.
|
||||
// Cases:
|
||||
// ASCII: 1 to 1
|
||||
// Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
|
||||
// lead set and first byte is trail: 1 to 4 worst case
|
||||
//
|
||||
// When checking for space for the last byte:
|
||||
// no lead: the last byte must be ASCII (or fatal error): 1 to 1
|
||||
// lead set: space for 4 bytes was already checked when reading the
|
||||
// lead, hence the last lead and the last trail together are worst
|
||||
// case 2 to 4.
|
||||
//
|
||||
// If lead set and the input is a single trail byte, the worst-case
|
||||
// output is 4, so we need to add one before multiplying if lead is
|
||||
// set.
|
||||
//
|
||||
// Finally, add two so that if input is non-zero, the output is at
|
||||
// least 4.
|
||||
checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// If there is a lead but the next byte isn't a valid trail, an
|
||||
// error is generated for the lead (+(1*3)). Then another iteration
|
||||
// checks space, which needs +3 to account for the possibility of astral
|
||||
// output or combining pair. In between start and end, the worst case
|
||||
// is that every byte is bad: *3.
|
||||
checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0x40 and 0x7E, inclusive,
|
||||
// subtract offset 0x40. Else if trail is
|
||||
// between 0xA1 and 0xFE, inclusive, subtract
|
||||
// offset 0x62.
|
||||
// TODO: Find out which range is more probable.
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start =
|
||||
byte.wrapping_sub(0xA1);
|
||||
if trail_minus_range_start >
|
||||
(0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = byte - 0x62;
|
||||
}
|
||||
let pointer = lead_minus_offset as usize *
|
||||
157usize +
|
||||
trail_minus_offset as usize;
|
||||
let rebased_pointer = pointer.wrapping_sub(942);
|
||||
let low_bits = big5_low_bits(rebased_pointer);
|
||||
if low_bits == 0 {
|
||||
match pointer {
|
||||
1133 => {
|
||||
handle.write_big5_combination(0x00CAu16,
|
||||
0x0304u16)
|
||||
}
|
||||
1135 => {
|
||||
handle.write_big5_combination(0x00CAu16,
|
||||
0x030Cu16)
|
||||
}
|
||||
1164 => {
|
||||
handle.write_big5_combination(0x00EAu16,
|
||||
0x0304u16)
|
||||
}
|
||||
1166 => {
|
||||
handle.write_big5_combination(0x00EAu16,
|
||||
0x030Cu16)
|
||||
}
|
||||
_ => {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
} else if big5_is_astral(rebased_pointer) {
|
||||
handle.write_astral(u32::from(low_bits) |
|
||||
0x20000u32)
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(low_bits)
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_astral,
|
||||
check_space_astral,
|
||||
false);
|
||||
}
|
||||
|
||||
pub struct Big5Encoder;
|
||||
|
||||
impl Big5Encoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
// Astral: 2 to 2
|
||||
// ASCII: 1 to 1
|
||||
// Other: 1 to 2
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
// Astral: 4 to 2
|
||||
// Upper BMP: 3 to 2
|
||||
// Lower BMP: 2 to 2
|
||||
// ASCII: 1 to 1
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_encoder_functions!(
|
||||
{
|
||||
// For simplicity, unified ideographs
|
||||
// in the pointer range 11206...11212 are handled
|
||||
// as Level 1 Hanzi.
|
||||
if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
let pointer = if let Some(pointer) = big5_box_encode(bmp) {
|
||||
pointer
|
||||
} else if let Some(pointer) = big5_other_encode(bmp) {
|
||||
pointer
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
let lead = pointer / 157 + 0x81;
|
||||
let remainder = pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
}
|
||||
},
|
||||
{
|
||||
if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
|
||||
if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
|
||||
// big5_astral_encode returns rebased pointer,
|
||||
// so adding 0x87 instead of 0x81.
|
||||
let lead = rebased_pointer / 157 + 0x87;
|
||||
let remainder = rebased_pointer % 157;
|
||||
let trail = if remainder < 0x3F {
|
||||
remainder + 0x40
|
||||
} else {
|
||||
remainder + 0x62
|
||||
};
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
astral,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` as Big5 via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", "");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], "\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], "\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], "\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], "\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            "\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            "\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            "\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            "\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            "\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            "\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            "\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            "\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            // The expected bytes for unmappable input are the ASCII text of
            // a decimal numeric character reference (ampersand, hash,
            // decimal code point, semicolon); byte-string literals must
            // stay ASCII-only.
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // Each unpaired low surrogate is expected to come out as the
        // numeric character reference for U+FFFD (decimal 65533).
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
|
||||
114378
zeroidc/vendor/encoding_rs/src/data.rs
vendored
Normal file
114378
zeroidc/vendor/encoding_rs/src/data.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
469
zeroidc/vendor/encoding_rs/src/euc_jp.rs
vendored
Normal file
469
zeroidc/vendor/encoding_rs/src/euc_jp.rs
vendored
Normal file
@@ -0,0 +1,469 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// Partial-sequence state that the EUC-JP decoder carries between calls.
enum EucJpPending {
    /// Nothing is pending.
    None,
    /// A JIS X 0208 lead value has been stored; one pending input byte.
    Jis0208Lead(u8),
    /// The JIS X 0212 shift has been seen; one pending input byte.
    Jis0212Shift,
    /// The JIS X 0212 shift and a lead value have been seen; two pending
    /// input bytes.
    Jis0212Lead(u8),
    /// The half-width katakana shift has been seen; one pending input byte.
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Returns `true` only for `EucJpPending::None`.
    fn is_none(&self) -> bool {
        match *self {
            EucJpPending::None => true,
            _ => false,
        }
    }

    /// Number of input bytes represented by the pending state (0, 1 or 2).
    fn count(&self) -> usize {
        match *self {
            EucJpPending::None => 0,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::HalfWidthKatakana => 1,
            EucJpPending::Jis0212Lead(_) => 2,
        }
    }
}
|
||||
|
||||
pub struct EucJpDecoder {
|
||||
pending: EucJpPending,
|
||||
}
|
||||
|
||||
impl EucJpDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::EucJp(EucJpDecoder {
|
||||
pending: EucJpPending::None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.pending.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 2 to 3
|
||||
let len = self.plus_one_if_lead(byte_length);
|
||||
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
euc_jp_decoder_functions!(
|
||||
{
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
|
||||
} else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
|
||||
} else if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
} else {
|
||||
let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
|
||||
} else {
|
||||
let ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// If lead is between 0xA1 and 0xFE, inclusive,
|
||||
// subtract 0xA1.
|
||||
let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
|
||||
if jis0212_lead_minus_offset > (0xFE - 0xA1) {
|
||||
if lead < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
unread_handle_jis0212.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_jis0212.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
jis0212_lead_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0xA1 and 0xFE, inclusive,
|
||||
// subtract 0xA1.
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset > (0xFE - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(3, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
|
||||
let pointer_minus_kanji = pointer.wrapping_sub(1410);
|
||||
if pointer_minus_kanji < JIS0212_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
|
||||
} else if let Some(bmp) = jis0212_accented_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
|
||||
if pointer_minus_upper_cyrillic <= (607 - 597) {
|
||||
handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
|
||||
} else {
|
||||
let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
|
||||
if pointer_minus_lower_cyrillic <= (655 - 645) {
|
||||
handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
|
||||
} else {
|
||||
return (
|
||||
DecoderResult::Malformed(3, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// If trail is between 0xA1 and 0xDF, inclusive,
|
||||
// subtract 0xA1 and map to half-width Katakana.
|
||||
let trail_minus_offset = byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset > (0xDF - 0xA1) {
|
||||
if byte < 0x80 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
jis0208_lead_minus_offset,
|
||||
byte,
|
||||
unread_handle_trail,
|
||||
jis0212_lead_minus_offset,
|
||||
lead,
|
||||
unread_handle_jis0212,
|
||||
source,
|
||||
handle
|
||||
);
|
||||
}
|
||||
|
||||
/// Encodes a kanji code point as an EUC-JP (lead, trail) pair by delegating
/// to `jis0208_kanji_euc_jp_encode`. Compiled only when the
/// `fast-kanji-encode` feature is enabled.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_euc_jp_encode(bmp)
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
Some((0xA1, 0xB8))
|
||||
} else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
|
||||
Some((lead, trail))
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
let lead = (pos / 94) + 0xD0;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
Some((lead as u8, trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EucJpEncoder;
|
||||
|
||||
impl EucJpEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
|
||||
} else if bmp == 0xA5 {
|
||||
handle.write_one(0x5Cu8)
|
||||
} else if bmp == 0x203E {
|
||||
handle.write_one(0x7Eu8)
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0xA1u8, 0xDDu8)
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|
||||
|| bmp == 0xF929
|
||||
|| bmp == 0xF9DC
|
||||
{
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
let lead = (pos / 94) + 0xF9;
|
||||
let trail = (pos % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pointer) = ibm_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
let lead = (pointer / 94) + 0xA1;
|
||||
let trail = (pointer % 94) + 0xA1;
|
||||
handle.write_two(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as EUC-JP via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Encodes `string` as EUC-JP via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    #[test]
    fn test_euc_jp_decode() {
        // Empty
        decode_euc_jp(b"", "");

        // ASCII
        decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
        decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
        decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
        decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
        decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8E", "\u{FFFD}");

        // JIS 0212
        decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
        decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F", "\u{FFFD}");

        // JIS 0208
        decode_euc_jp(b"\xA1\xA1", "\u{3000}");
        decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
        decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
        decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
        decode_euc_jp(b"\xA1", "\u{FFFD}");

        // Bad leads
        decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x80\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x81\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x82\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x83\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x84\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x85\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x86\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x87\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x88\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x89\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8A\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8B\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8C\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\x8D\xA1\xA1", "\u{FFFD}\u{3000}");

        // Bad ASCII trail
        decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
    }

    #[test]
    fn test_euc_jp_encode() {
        // Empty
        encode_euc_jp("", b"");

        // ASCII
        encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_euc_jp("\u{00A5}", b"\x5C");
        encode_euc_jp("\u{203E}", b"\x7E");
        encode_euc_jp("\u{2212}", b"\xA1\xDD");

        // Half-width
        encode_euc_jp("\u{FF61}", b"\x8E\xA1");
        encode_euc_jp("\u{FF9F}", b"\x8E\xDF");

        // JIS 0212
        // Expected output is the ASCII text of a decimal numeric character
        // reference (U+02D8 == 728); byte-string literals must stay
        // ASCII-only.
        encode_euc_jp("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_euc_jp("\u{3000}", b"\xA1\xA1");
        encode_euc_jp("\u{FF02}", b"\xFC\xFE");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_decode_all() {
        let input = include_bytes!("test_data/jis0208_in.txt");
        let expectation = include_str!("test_data/jis0208_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_encode_all() {
        let input = include_str!("test_data/jis0208_out.txt");
        let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_JP.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_JP);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0212_decode_all() {
        let input = include_bytes!("test_data/jis0212_in.txt");
        let expectation = include_str!("test_data/jis0212_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }
}
|
||||
442
zeroidc/vendor/encoding_rs/src/euc_kr.rs
vendored
Normal file
442
zeroidc/vendor/encoding_rs/src/euc_kr.rs
vendored
Normal file
@@ -0,0 +1,442 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
use super::in_range16;
|
||||
|
||||
/// Decoder state for EUC-KR.
pub struct EucKrDecoder {
    /// Lead state carried between calls; `None` when no lead is pending.
    lead: Option<u8>,
}
|
||||
|
||||
impl EucKrDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::EucKr(EucKrDecoder { lead: None })
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.lead.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 2 to 3
|
||||
let len = self.plus_one_if_lead(byte_length);
|
||||
checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
if lead_minus_offset >= 0x20 {
|
||||
// Not the extension range above KS X 1001
|
||||
let trail_minus_offset =
|
||||
byte.wrapping_sub(0xA1);
|
||||
if trail_minus_offset <= (0xFE - 0xA1) {
|
||||
// KS X 1001
|
||||
let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
|
||||
let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
|
||||
if hangul_pointer < KSX1001_HANGUL.len() {
|
||||
let upper_bmp = KSX1001_HANGUL[hangul_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if ksx_pointer < KSX1001_SYMBOLS.len() {
|
||||
let bmp = KSX1001_SYMBOLS[ksx_pointer];
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
|
||||
if hanja_pointer < KSX1001_HANJA.len() {
|
||||
let upper_bmp = KSX1001_HANJA[hanja_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
|
||||
let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
|
||||
if mid_bmp == 0 {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_mid_bmp(mid_bmp)
|
||||
} else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
|
||||
let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
|
||||
handle.write_mid_bmp(mid_bmp)
|
||||
} else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
|
||||
let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else {
|
||||
let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
|
||||
if other_pointer < 0x039F {
|
||||
let bmp = ksx1001_other_decode(other_pointer as u16);
|
||||
// ASCII range means unassigned
|
||||
if bmp < 0x80 {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Extension range to the left of
|
||||
// KS X 1001
|
||||
let left_lead = lead_minus_offset - 0x20;
|
||||
let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
|
||||
byte - (12 + 0x41)
|
||||
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
|
||||
byte - (6 + 0x41)
|
||||
} else if byte.wrapping_sub(0x41) < 0x1A {
|
||||
byte - 0x41
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
|
||||
if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
|
||||
let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Extension range above KS X 1001
|
||||
let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
|
||||
byte - (12 + 0x41)
|
||||
} else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
|
||||
byte - (6 + 0x41)
|
||||
} else if byte.wrapping_sub(0x41) < 0x1A {
|
||||
byte - 0x41
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
};
|
||||
let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
|
||||
let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_bmp,
|
||||
check_space_bmp,
|
||||
true);
|
||||
}
|
||||
|
||||
fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
|
||||
if in_inclusive_range16(bmp, 0x3000, 0x3015) {
|
||||
if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
|
||||
return Some((0xA1, pos + 0xA1));
|
||||
}
|
||||
}
|
||||
if let Some(other_pointer) = ksx1001_other_encode(bmp) {
|
||||
let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
|
||||
let other_trail = ((other_pointer as usize) % 94) + 0xA1;
|
||||
return Some((other_lead, other_trail));
|
||||
}
|
||||
if in_range16(bmp, 0x00AA, 0x0168) {
|
||||
// Latin
|
||||
if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
|
||||
return Some((0x81 + 0x28, 0xA1 + pos));
|
||||
}
|
||||
if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
|
||||
return Some((0x81 + 0x27, 0xA1 + pos));
|
||||
}
|
||||
} else if in_range16(bmp, 0x2500, 0x254C) {
|
||||
if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
|
||||
return Some((0x81 + 0x25, 0xA1 + pos));
|
||||
}
|
||||
}
|
||||
if in_inclusive_range16(bmp, 0x2015, 0x266D)
|
||||
|| in_inclusive_range16(bmp, 0x321C, 0x33D8)
|
||||
|| in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
|
||||
|| in_inclusive_range16(bmp, 0x00A1, 0x00F7)
|
||||
|| in_inclusive_range16(bmp, 0x02C7, 0x02DD)
|
||||
{
|
||||
if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
|
||||
if pos < (94 - 3) {
|
||||
return Some((0xA1, pos + 0xA1 + 3));
|
||||
}
|
||||
return Some((0xA2, pos - (94 - 3) + 0xA1));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-hangul-encode"))]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
|
||||
match KSX1001_HANGUL.binary_search(&bmp) {
|
||||
Ok(ksx_hangul_pointer) => {
|
||||
let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
|
||||
let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
|
||||
(ksx_hangul_lead as u8, ksx_hangul_trail as u8)
|
||||
}
|
||||
Err(_) => {
|
||||
let (lead, cp949_trail) = if bmp < 0xC8A5 {
|
||||
// Above KS X 1001
|
||||
let top_pointer = cp949_top_hangul_encode(bmp) as usize;
|
||||
let top_lead = (top_pointer / (190 - 12)) + 0x81;
|
||||
let top_trail = top_pointer % (190 - 12);
|
||||
(top_lead as u8, top_trail as u8)
|
||||
} else {
|
||||
// To the left of KS X 1001
|
||||
let left_pointer = cp949_left_hangul_encode(bmp) as usize;
|
||||
let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
|
||||
let left_trail = left_pointer % (190 - 94 - 12);
|
||||
(left_lead as u8, left_trail as u8)
|
||||
};
|
||||
let offset = if cp949_trail >= (0x40 - 12) {
|
||||
0x41 + 12
|
||||
} else if cp949_trail >= (0x20 - 6) {
|
||||
0x41 + 6
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
(lead as u8, (cp949_trail + offset) as u8)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a Hangul code point by delegating to `cp949_hangul_encode` with
/// the offset from the start of the Hangul block. The first argument is
/// unused in this build configuration.
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
    let (lead, trail) = cp949_hangul_encode(bmp_minus_hangul_start);
    (lead, trail)
}
|
||||
|
||||
#[cfg(not(feature = "fast-hanja-encode"))]
|
||||
#[inline(always)]
|
||||
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
|
||||
if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
|
||||
let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
|
||||
let hanja_trail = (hanja_pointer % 94) + 0xA1;
|
||||
Some((hanja_lead as u8, hanja_trail as u8))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a Hanja code point via the fast lookup helpers: code points from
/// U+F900 upward go to the compatibility helper (always mapped), the rest to
/// the unified helper (which may return `None`).
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    if bmp >= 0xF900 {
        return Some(ksx1001_compatibility_hangul_encode(bmp));
    }
    ksx1001_unified_hangul_encode(bmp)
}
|
||||
|
||||
/// Stateless encoder for the EUC-KR variant.
pub struct EucKrEncoder;
|
||||
|
||||
impl EucKrEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
|
||||
let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
|
||||
// Hangul
|
||||
ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
|
||||
} else if in_range16(bmp, 0x33DE, 0xFF01) {
|
||||
// Vast range that includes no other
|
||||
// mappables except Hangul (already
|
||||
// processed) and Hanja.
|
||||
// Narrow the range further to Unified and
|
||||
// Compatibility ranges of Hanja.
|
||||
if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
|
||||
if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
|
||||
(hanja_lead, hanja_trail)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
|
||||
(lead as u8, trail as u8)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
handle.write_two(lead, trail)
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as EUC-KR via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Encodes `string` as EUC-KR via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    #[test]
    fn test_euc_kr_decode() {
        // Empty
        decode_euc_kr(b"", "");

        // ASCII
        decode_euc_kr(b"\x61\x62", "\u{0061}\u{0062}");

        decode_euc_kr(b"\x81\x41", "\u{AC02}");
        decode_euc_kr(b"\x81\x5B", "\u{FFFD}\x5B");
        decode_euc_kr(b"\xFD\xFE", "\u{8A70}");
        decode_euc_kr(b"\xFE\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\xFF\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\x80\x41", "\u{FFFD}\x41");
        decode_euc_kr(b"\xA1\xFF", "\u{FFFD}");
        decode_euc_kr(b"\x81\xFF", "\u{FFFD}");
    }

    #[test]
    fn test_euc_kr_encode() {
        // Empty
        encode_euc_kr("", b"");

        // ASCII
        encode_euc_kr("\u{0061}\u{0062}", b"\x61\x62");

        encode_euc_kr("\u{AC02}", b"\x81\x41");
        encode_euc_kr("\u{8A70}", b"\xFD\xFE");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_decode_all() {
        let input = include_bytes!("test_data/euc_kr_in.txt");
        let expectation = include_str!("test_data/euc_kr_in_ref.txt");
        let (cow, had_errors) = EUC_KR.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_encode_all() {
        let input = include_str!("test_data/euc_kr_out.txt");
        let expectation = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_KR.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_KR);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_euc_kr_encode_from_two_low_surrogates() {
        // Byte string literals must be ASCII-only, so the expectation is
        // spelled as the numeric character reference for U+FFFD, once per
        // unpaired low surrogate in the input.
        // NOTE(review): restored from a garbled literal; confirm against the
        // upstream encoding_rs source.
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = EUC_KR.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
|
||||
767
zeroidc/vendor/encoding_rs/src/gb18030.rs
vendored
Normal file
767
zeroidc/vendor/encoding_rs/src/gb18030.rs
vendored
Normal file
@@ -0,0 +1,767 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range16;
|
||||
use super::in_range16;
|
||||
|
||||
/// Up to three bytes held back by the GB18030 decoder.
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for the `None` variant.
    fn is_none(&self) -> bool {
        self.count() == 0
    }

    /// Number of bytes carried by the current variant (0 to 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::None => 0,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::Three(..) => 3,
        }
    }
}
|
||||
|
||||
pub struct Gb18030Decoder {
|
||||
first: Option<u8>,
|
||||
second: Option<u8>,
|
||||
third: Option<u8>,
|
||||
pending: Gb18030Pending,
|
||||
pending_ascii: Option<u8>,
|
||||
}
|
||||
|
||||
impl Gb18030Decoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Gb18030(Gb18030Decoder {
|
||||
first: None,
|
||||
second: None,
|
||||
third: None,
|
||||
pending: Gb18030Pending::None,
|
||||
pending_ascii: None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.first.is_none()
|
||||
&& self.second.is_none()
|
||||
&& self.third.is_none()
|
||||
&& self.pending.is_none()
|
||||
&& self.pending_ascii.is_none()
|
||||
}
|
||||
|
||||
fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(
|
||||
self.pending.count()
|
||||
+ match self.first {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.second {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.third {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
}
|
||||
+ match self.pending_ascii {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// ASCII: 1 to 1 (worst case)
|
||||
// gbk: 2 to 1
|
||||
// ranges: 4 to 1 or 4 to 2
|
||||
checked_add(1, self.extra_from_state(byte_length))
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// ASCII: 1 to 1
|
||||
// gbk: 2 to 2 or 2 to 3
|
||||
// ranges: 4 to 2, 4 to 3 or 4 to 4
|
||||
// 0x80: 1 to 3 (worst case)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
|
||||
}
|
||||
|
||||
gb18030_decoder_functions!(
|
||||
{
|
||||
// If first is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0xFE - 0x81) {
|
||||
if non_ascii == 0x80 {
|
||||
handle.write_upper_bmp(0x20ACu16);
|
||||
continue 'outermost;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// Two-byte (or error)
|
||||
if first_minus_offset >= 0x20 {
|
||||
// Not the gbk ideograph range above GB2312
|
||||
let trail_minus_offset = second.wrapping_sub(0xA1);
|
||||
if trail_minus_offset <= (0xFE - 0xA1) {
|
||||
// GB2312
|
||||
let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
|
||||
if hanzi_lead < (0x77 - 0x2F) {
|
||||
// Level 1 Hanzi, Level 2 Hanzi
|
||||
// or one of the 5 PUA code
|
||||
// points in between.
|
||||
let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
|
||||
let upper_bmp = GB2312_HANZI[hanzi_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if first_minus_offset == 0x20 {
|
||||
// Symbols (starting with ideographic space)
|
||||
let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
|
||||
handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
|
||||
} else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
|
||||
handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
|
||||
} else if first_minus_offset > 0x76 {
|
||||
// Bottom PUA
|
||||
let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
|
||||
handle.write_upper_bmp(pua)
|
||||
} else {
|
||||
let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
}
|
||||
} else {
|
||||
// gbk range on the left
|
||||
let mut trail_minus_offset = second.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start = second.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xA0 - 0x80) {
|
||||
if second < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_second.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_second.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = second - 0x41;
|
||||
}
|
||||
// Zero-base lead
|
||||
let left_lead = first_minus_offset - 0x20;
|
||||
let left_pointer = left_lead as usize * (190 - 94) +
|
||||
trail_minus_offset as usize;
|
||||
let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
|
||||
if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
|
||||
let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
} else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
|
||||
let bmp = gbk_other_decode(left_pointer as u16);
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
|
||||
let upper_bmp = GBK_BOTTOM[bottom_pointer];
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// gbk ideograph range above GB2312
|
||||
let mut trail_minus_offset = second.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start = second.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xFE - 0x80) {
|
||||
if second < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_second.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_second.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = second - 0x41;
|
||||
}
|
||||
let pointer = first_minus_offset as usize * 190usize +
|
||||
trail_minus_offset as usize;
|
||||
let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
|
||||
handle.write_upper_bmp(upper_bmp)
|
||||
}
|
||||
},
|
||||
{
|
||||
// If third is between 0x81 and 0xFE, inclusive,
|
||||
// subtract offset 0x81.
|
||||
let third_minus_offset = third.wrapping_sub(0x81);
|
||||
if third_minus_offset > (0xFE - 0x81) {
|
||||
// We have an error. Let's inline what's going
|
||||
// to happen when `second` is
|
||||
// reprocessed. (`third` gets unread.)
|
||||
// `second` is guaranteed ASCII, so let's
|
||||
// put it in `pending_ascii`. Recompute
|
||||
// `second` from `second_minus_offset`.
|
||||
self.pending_ascii = Some(second_minus_offset + 0x30);
|
||||
// Now unread `third` and designate the previous
|
||||
// `first` as being in error.
|
||||
return (DecoderResult::Malformed(1, 1),
|
||||
unread_handle_third.unread(),
|
||||
handle.written());
|
||||
}
|
||||
third_minus_offset
|
||||
},
|
||||
{
|
||||
// If fourth is between 0x30 and 0x39, inclusive,
|
||||
// subtract offset 0x30.
|
||||
//
|
||||
// If we have an error, we'll inline what's going
|
||||
// to happen when `second` and `third` are
|
||||
// reprocessed. (`fourth` gets unread.)
|
||||
// `second` is guaranteed ASCII, so let's
|
||||
// put it in `pending_ascii`. Recompute
|
||||
// `second` from `second_minus_offset` to
|
||||
// make this block reusable when `second`
|
||||
// is not in scope.
|
||||
//
|
||||
// `third` is guaranteed to be in the range
|
||||
// that makes it become the new `self.first`.
|
||||
//
|
||||
// `fourth` gets unread and the previous
|
||||
// `first` gets designates as being in error.
|
||||
let fourth_minus_offset = fourth.wrapping_sub(0x30);
|
||||
if fourth_minus_offset > (0x39 - 0x30) {
|
||||
self.pending_ascii = Some(second_minus_offset + 0x30);
|
||||
self.pending = Gb18030Pending::One(third_minus_offset);
|
||||
return (DecoderResult::Malformed(1, 2),
|
||||
unread_handle_fourth.unread(),
|
||||
handle.written());
|
||||
}
|
||||
let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
|
||||
(second_minus_offset as usize * (10 * 126)) +
|
||||
(third_minus_offset as usize * 10) +
|
||||
fourth_minus_offset as usize;
|
||||
if pointer <= 39419 {
|
||||
// BMP
|
||||
if pointer == 7457 {
|
||||
handle.write_upper_bmp(0xE7C7)
|
||||
} else {
|
||||
handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
|
||||
}
|
||||
} else if pointer >= 189_000 && pointer <= 1_237_575 {
|
||||
// Astral
|
||||
handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
|
||||
} else {
|
||||
return (DecoderResult::Malformed(4, 0),
|
||||
unread_handle_fourth.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
first_minus_offset,
|
||||
second,
|
||||
second_minus_offset,
|
||||
unread_handle_second,
|
||||
third,
|
||||
third_minus_offset,
|
||||
unread_handle_third,
|
||||
fourth,
|
||||
fourth_minus_offset,
|
||||
unread_handle_fourth,
|
||||
source,
|
||||
handle,
|
||||
'outermost);
|
||||
}
|
||||
|
||||
// XXX Experiment with inline directives
|
||||
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
|
||||
// Try ideographic punctuation first as it's the most likely case.
|
||||
// Throwing in the check for full-width currencies and tilde is probably
|
||||
// more size-efficient here than elsewhere.
|
||||
if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
|
||||
return Some((0xA1, pos + 0xA1));
|
||||
}
|
||||
}
|
||||
// Ext A
|
||||
if in_range16(bmp, 0x3400, 0x4E00) {
|
||||
return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
|
||||
(
|
||||
0xFE,
|
||||
pos + if pos < (0x3F - 16) {
|
||||
0x40 + 16
|
||||
} else {
|
||||
0x41 + 16
|
||||
},
|
||||
)
|
||||
});
|
||||
}
|
||||
// Compatibility ideographs
|
||||
if in_range16(bmp, 0xF900, 0xFB00) {
|
||||
return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
|
||||
if pos < 5 {
|
||||
// end of second to last row
|
||||
(0xFD, pos + (190 - 94 - 5 + 0x41))
|
||||
} else {
|
||||
// last row
|
||||
(0xFE, pos + (0x40 - 5))
|
||||
}
|
||||
});
|
||||
}
|
||||
// Handle everything below U+02CA, which is in GBK_OTHER.
|
||||
if bmp < 0x02CA {
|
||||
if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
|
||||
// Pinyin except U+1E3F
|
||||
if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
|
||||
return Some((0xA8, pos + 0xA1));
|
||||
}
|
||||
} else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
|
||||
|| in_inclusive_range16(bmp, 0x02C7, 0x02C9)
|
||||
{
|
||||
// Diacritics and Latin 1 symbols
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
|
||||
return Some((0xA1, pos + 0xA1 + 3));
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
if bmp >= 0xE794 {
|
||||
// Various brackets, all in PUA or full-width regions
|
||||
if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
|
||||
return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
|
||||
}
|
||||
} else if bmp == 0x1E3F {
|
||||
// The one Pinyin placed elsewhere on the BMP
|
||||
return Some((0xA8, 0x7B - 0x60 + 0xA1));
|
||||
} else if in_range16(bmp, 0xA000, 0xD800) {
|
||||
// Since Korean has usage in China, let's spend a branch to fast-track
|
||||
// Hangul.
|
||||
return None;
|
||||
}
|
||||
// GB2312 other (except bottom PUA and PUA between Hanzi levels).
|
||||
if let Some(other_pointer) = gb2312_other_encode(bmp) {
|
||||
let other_lead = other_pointer as usize / 94;
|
||||
let other_trail = other_pointer as usize % 94;
|
||||
return Some((0xA2 + other_lead, 0xA1 + other_trail));
|
||||
}
|
||||
// At this point, we've handled all mappable characters above U+02D9 but
|
||||
// below U+2010. Let's check for that range in order to let lower BMP
|
||||
// characters used for minority languages in China avoid the subsequent
|
||||
// search that deals mainly with various symbols.
|
||||
if in_range16(bmp, 0x02DA, 0x2010) {
|
||||
return None;
|
||||
}
|
||||
// GBK other (except radicals and PUA in GBK_BOTTOM).
|
||||
if let Some(other_pointer) = gbk_other_encode(bmp) {
|
||||
let other_lead = other_pointer as usize / (190 - 94);
|
||||
let other_trail = other_pointer as usize % (190 - 94);
|
||||
let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
return Some((other_lead + (0x81 + 0x20), other_trail + offset));
|
||||
}
|
||||
// CJK Radicals Supplement or PUA in GBK_BOTTOM
|
||||
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
|
||||
if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
|
||||
let trail = pos + 16;
|
||||
let offset = if trail < 0x3F { 0x40 } else { 0x41 };
|
||||
return Some((0xFE, trail + offset));
|
||||
}
|
||||
}
|
||||
// GB2312 bottom PUA
|
||||
let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
|
||||
if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
|
||||
let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
|
||||
let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
|
||||
return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
|
||||
}
|
||||
// PUA between Hanzi Levels
|
||||
let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
|
||||
if bmp_minus_pua_between_hanzi < 5 {
|
||||
return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
|
||||
if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
|
||||
(lead, trail)
|
||||
} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
|
||||
let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
|
||||
let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
|
||||
(hanzi_lead as u8, hanzi_trail as u8)
|
||||
} else {
|
||||
let (lead, gbk_trail) = if bmp < 0x72DC {
|
||||
// Above GB2312
|
||||
let pointer = gbk_top_ideograph_encode(bmp) as usize;
|
||||
let lead = (pointer / 190) + 0x81;
|
||||
let gbk_trail = pointer % 190;
|
||||
(lead, gbk_trail)
|
||||
} else {
|
||||
// To the left of GB2312
|
||||
let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
|
||||
let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
|
||||
let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
|
||||
(lead, gbk_trail)
|
||||
};
|
||||
let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
|
||||
(lead as u8, (gbk_trail + offset) as u8)
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a CJK Unified Ideograph by delegating to `gbk_hanzi_encode` with
/// the offset from U+4E00. The first argument is unused in this build
/// configuration.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    let (lead, trail) = gbk_hanzi_encode(bmp_minus_unified_start);
    (lead, trail)
}
|
||||
|
||||
/// Encoder shared by the two-byte-only and the extended (four-byte capable)
/// configurations.
pub struct Gb18030Encoder {
    /// When `true`, code points without a two-byte form are written as
    /// four-byte sequences instead of being reported as unmappable.
    extended: bool,
}
|
||||
|
||||
impl Gb18030Encoder {
|
||||
pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::Gb18030(Gb18030Encoder {
|
||||
extended: extended_range,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
if self.extended {
|
||||
u16_length.checked_mul(4)
|
||||
} else {
|
||||
// Need to add, because space check is done with the four-byte
|
||||
// assumption.
|
||||
checked_add(2, u16_length.checked_mul(2))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
if self.extended {
|
||||
// 1 to 1
|
||||
// 2 to 2
|
||||
// 3 to 2
|
||||
// 2 to 4 (worst)
|
||||
// 3 to 4
|
||||
// 4 to 4
|
||||
checked_add(2, byte_length.checked_mul(2))
|
||||
} else {
|
||||
// 1 to 1
|
||||
// 2 to 2
|
||||
// 3 to 2
|
||||
// Need to add, because space check is done with the four-byte
|
||||
// assumption.
|
||||
byte_length.checked_add(3)
|
||||
}
|
||||
}
|
||||
|
||||
ascii_compatible_encoder_functions!(
|
||||
{
|
||||
let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
|
||||
if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
|
||||
// CJK Unified Ideographs
|
||||
// Can't fail now, since all are
|
||||
// mapped.
|
||||
let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
|
||||
handle.write_two(lead, trail)
|
||||
} else if bmp == 0xE5E5 {
|
||||
// It's not optimal to check for the unmappable
|
||||
// and for euro at this stage, but getting
|
||||
// the out of the way makes the rest of the
|
||||
// code less messy.
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
} else if bmp == 0x20AC && !self.extended {
|
||||
handle.write_one(0x80u8)
|
||||
} else {
|
||||
match gbk_encode_non_unified(bmp) {
|
||||
Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
|
||||
None => {
|
||||
if !self.extended {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let range_pointer = gb18030_range_encode(bmp);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
let rem_second = rem_first % (10 * 126);
|
||||
let third = rem_second / 10;
|
||||
let fourth = rem_second % 10;
|
||||
handle.write_four(
|
||||
(first + 0x81) as u8,
|
||||
(second + 0x30) as u8,
|
||||
(third + 0x81) as u8,
|
||||
(fourth + 0x30) as u8,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
if !self.extended {
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
|
||||
let first = range_pointer / (10 * 126 * 10);
|
||||
let rem_first = range_pointer % (10 * 126 * 10);
|
||||
let second = rem_first / (10 * 126);
|
||||
let rem_second = rem_first % (10 * 126);
|
||||
let third = rem_second / 10;
|
||||
let fourth = rem_second % 10;
|
||||
handle.write_four(
|
||||
(first + 0x81) as u8,
|
||||
(second + 0x30) as u8,
|
||||
(third + 0x81) as u8,
|
||||
(fourth + 0x30) as u8,
|
||||
)
|
||||
},
|
||||
bmp,
|
||||
astral,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_four,
|
||||
check_space_four,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as gb18030 and checks the result against `expect`.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Encodes `string` as gb18030 and checks the result against `expect`.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Encodes `string` as GBK and checks the result against `expect`.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", "");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7E", "\u{E843}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{4E90}", b"\x81\x80");
            encode_gb18030("\u{4FA2}", b"\x81\xFE");
            encode_gb18030("\u{FA0C}", b"\xFE\x40");
            encode_gb18030("\u{E843}", b"\xFE\x7E");
            encode_gb18030("\u{4723}", b"\xFE\x80");
            encode_gb18030("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original GB18030.
        // U+E5E5 is unmappable, so the expected output is its decimal
        // numeric character reference, spelled out in ASCII.
        encode_gb18030("\u{E5E5}", b"&#58853;");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
            encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
            encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{4E90}", b"\x81\x80");
            encode_gbk("\u{4FA2}", b"\x81\xFE");
            encode_gbk("\u{FA0C}", b"\xFE\x40");
            encode_gbk("\u{E843}", b"\xFE\x7E");
            encode_gbk("\u{4723}", b"\xFE\x80");
            encode_gbk("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original gb18030.
        // U+E5E5 is unmappable, so the expected output is its decimal
        // numeric character reference, spelled out in ASCII.
        encode_gbk("\u{E5E5}", b"&#58853;");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Code points that gb18030 writes as four bytes. GBK has no four-byte
        // form, so each one is expected as a decimal numeric character
        // reference. Byte string literals must stay ASCII-only, so the
        // references are written out literally.
        encode_gbk("\u{0080}", b"&#128;");
        encode_gbk("\u{E7C7}", b"&#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{2603}", b"&#9731;");
            encode_gbk("\u{1F4A9}", b"&#128169;");
            encode_gbk("\u{10FFFF}", b"&#1114111;");
        }

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        // The buffer is sized by the encoder's own worst-case estimate, so a
        // single two-byte character must fit without `OutputFull`.
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
|
||||
1969
zeroidc/vendor/encoding_rs/src/handles.rs
vendored
Normal file
1969
zeroidc/vendor/encoding_rs/src/handles.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1068
zeroidc/vendor/encoding_rs/src/iso_2022_jp.rs
vendored
Normal file
1068
zeroidc/vendor/encoding_rs/src/iso_2022_jp.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
6113
zeroidc/vendor/encoding_rs/src/lib.rs
vendored
Normal file
6113
zeroidc/vendor/encoding_rs/src/lib.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
1622
zeroidc/vendor/encoding_rs/src/macros.rs
vendored
Normal file
1622
zeroidc/vendor/encoding_rs/src/macros.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
3356
zeroidc/vendor/encoding_rs/src/mem.rs
vendored
Normal file
3356
zeroidc/vendor/encoding_rs/src/mem.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
104
zeroidc/vendor/encoding_rs/src/replacement.rs
vendored
Normal file
104
zeroidc/vendor/encoding_rs/src/replacement.rs
vendored
Normal file
@@ -0,0 +1,104 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::variant::*;
|
||||
|
||||
/// Decoder for the replacement encoding: it reports a single malformed
/// sequence for any non-empty input and produces no other output.
pub struct ReplacementDecoder {
    /// Set once the malformed sequence has been reported; from then on all
    /// further input is consumed without reporting anything.
    emitted: bool,
}
|
||||
|
||||
impl ReplacementDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::Replacement(ReplacementDecoder { emitted: false })
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, _u16_length: usize) -> Option<usize> {
|
||||
Some(1)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, _byte_length: usize) -> Option<usize> {
|
||||
Some(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, _byte_length: usize) -> Option<usize> {
|
||||
Some(3)
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
// Don't err if the input stream is empty. See
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.is_empty() {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
self.emitted = true;
|
||||
(DecoderResult::Malformed(1, 0), 1, 0)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
// Don't err if the input stream is empty. See
|
||||
// https://github.com/whatwg/encoding/issues/33
|
||||
if self.emitted || src.is_empty() {
|
||||
(DecoderResult::InputEmpty, src.len(), 0)
|
||||
} else if dst.len() < 3 {
|
||||
// Make sure there's room for the replacement character.
|
||||
(DecoderResult::OutputFull, 0, 0)
|
||||
} else {
|
||||
self.emitted = true;
|
||||
(DecoderResult::Malformed(1, 0), 1, 0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` with the replacement encoding and checks the result
    /// against `expect`.
    fn decode_replacement(bytes: &[u8], expect: &str) {
        decode_without_padding(REPLACEMENT, bytes, expect);
    }

    /// Encodes `string` via the replacement encoding and checks the result
    /// against `expect`.
    fn encode_replacement(string: &str, expect: &[u8]) {
        encode(REPLACEMENT, string, expect);
    }

    #[test]
    fn test_replacement_decode() {
        // Empty input yields nothing; any other input yields exactly one
        // U+FFFD regardless of its length.
        let cases: [(&[u8], &str); 3] = [
            (&b""[..], ""),
            (&b"A"[..], "\u{FFFD}"),
            (&b"AB"[..], "\u{FFFD}"),
        ];
        for &(bytes, expect) in cases.iter() {
            decode_replacement(bytes, expect);
        }
    }

    #[test]
    fn test_replacement_encode() {
        // Empty
        encode_replacement("", b"");

        // The encoder handed out for the replacement encoding is UTF-8, so
        // the output is the UTF-8 bytes of the input.
        assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_replacement(text, text.as_bytes());
    }
}
|
||||
426
zeroidc/vendor/encoding_rs/src/shift_jis.rs
vendored
Normal file
426
zeroidc/vendor/encoding_rs/src/shift_jis.rs
vendored
Normal file
@@ -0,0 +1,426 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::data::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
// Rust 1.14.0 requires the following despite the asterisk above.
|
||||
use super::in_inclusive_range;
|
||||
use super::in_inclusive_range16;
|
||||
|
||||
/// Streaming decoder state for Shift_JIS.
pub struct ShiftJisDecoder {
    /// Pending lead byte of a two-byte sequence, if any; `None` means the
    /// decoder is in its neutral state.
    lead: Option<u8>,
}
|
||||
|
||||
impl ShiftJisDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
|
||||
}
|
||||
|
||||
pub fn in_neutral_state(&self) -> bool {
|
||||
self.lead.is_none()
|
||||
}
|
||||
|
||||
fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_add(match self.lead {
|
||||
None => 0,
|
||||
Some(_) => 1,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
self.plus_one_if_lead(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// worst case: 1 to 3 (half-width katakana)
|
||||
self.max_utf8_buffer_length(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_mul(3, self.plus_one_if_lead(byte_length))
|
||||
}
|
||||
|
||||
ascii_compatible_two_byte_decoder_functions!(
|
||||
{
|
||||
// If lead is between 0x81 and 0x9F, inclusive,
|
||||
// subtract offset 0x81. Else if lead is
|
||||
// between 0xE0 and 0xFC, inclusive, subtract
|
||||
// offset 0xC1. Else if lead is between
|
||||
// 0xA1 and 0xDF, inclusive, map to half-width
|
||||
// Katakana. Else if lead is 0x80, pass through.
|
||||
let mut non_ascii_minus_offset =
|
||||
non_ascii.wrapping_sub(0x81);
|
||||
if non_ascii_minus_offset > (0x9F - 0x81) {
|
||||
let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0);
|
||||
if non_ascii_minus_range_start > (0xFC - 0xE0) {
|
||||
let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1);
|
||||
if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) {
|
||||
if non_ascii == 0x80 {
|
||||
handle.write_mid_bmp(0x80);
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
|
||||
// Not caring about optimizing subsequent non-ASCII
|
||||
continue 'outermost;
|
||||
}
|
||||
non_ascii_minus_offset = non_ascii - 0xC1;
|
||||
}
|
||||
non_ascii_minus_offset
|
||||
},
|
||||
{
|
||||
// If trail is between 0x40 and 0x7E, inclusive,
|
||||
// subtract offset 0x40. Else if trail is
|
||||
// between 0x80 and 0xFC, inclusive, subtract
|
||||
// offset 0x41.
|
||||
// Fast-track Hiragana (60% according to Lunde)
|
||||
// and Katakana (10% acconding to Lunde).
|
||||
// Hiragana doesn't cross 0x7F, but Katakana does.
|
||||
// We can check for Hiragana before normalizing
|
||||
// trail.
|
||||
let trail_minus_hiragana = byte.wrapping_sub(0x9F);
|
||||
if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
|
||||
// Hiragana
|
||||
handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
|
||||
} else {
|
||||
let mut trail_minus_offset =
|
||||
byte.wrapping_sub(0x40);
|
||||
if trail_minus_offset > (0x7E - 0x40) {
|
||||
let trail_minus_range_start =
|
||||
byte.wrapping_sub(0x80);
|
||||
if trail_minus_range_start > (0xFC - 0x80) {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
trail_minus_offset = byte - 0x41;
|
||||
}
|
||||
if lead_minus_offset == 0x02 &&
|
||||
trail_minus_offset < 0x56 {
|
||||
// Katakana
|
||||
handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
|
||||
} else {
|
||||
let pointer = lead_minus_offset as usize *
|
||||
188usize +
|
||||
trail_minus_offset as usize;
|
||||
let level1_pointer = pointer.wrapping_sub(1410);
|
||||
if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
|
||||
} else {
|
||||
let level2_pointer = pointer.wrapping_sub(4418);
|
||||
if level2_pointer <
|
||||
JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
|
||||
handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
|
||||
} else {
|
||||
let upper_ibm_pointer = pointer.wrapping_sub(10744);
|
||||
if upper_ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
|
||||
} else {
|
||||
let lower_ibm_pointer = pointer.wrapping_sub(8272);
|
||||
if lower_ibm_pointer < IBM_KANJI.len() {
|
||||
handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
|
||||
} else if in_inclusive_range(pointer, 8836, 10715) {
|
||||
handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
|
||||
} else if let Some(bmp) = jis0208_symbol_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else if let Some(bmp) = jis0208_range_decode(pointer) {
|
||||
handle.write_bmp_excl_ascii(bmp)
|
||||
} else {
|
||||
if byte < 0x80 {
|
||||
return (DecoderResult::Malformed(1, 0),
|
||||
unread_handle_trail.unread(),
|
||||
handle.written());
|
||||
}
|
||||
return (DecoderResult::Malformed(2, 0),
|
||||
unread_handle_trail.consumed(),
|
||||
handle.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
non_ascii,
|
||||
byte,
|
||||
lead_minus_offset,
|
||||
unread_handle_trail,
|
||||
source,
|
||||
handle,
|
||||
'outermost,
|
||||
copy_ascii_from_check_space_bmp,
|
||||
check_space_bmp,
|
||||
false);
|
||||
}
|
||||
|
||||
/// Encodes a kanji code unit as a Shift_JIS (lead, trail) pair by delegating
/// to `jis0208_kanji_shift_jis_encode`; `None` means there is no mapping.
///
/// This variant is compiled only when the `fast-kanji-encode` feature is
/// enabled.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_shift_jis_encode(bmp)
}
|
||||
|
||||
#[cfg(not(feature = "fast-kanji-encode"))]
|
||||
#[inline(always)]
|
||||
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
|
||||
if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
|
||||
return Some((lead, trail));
|
||||
}
|
||||
let pointer = if 0x4EDD == bmp {
|
||||
// Ideograph on the symbol row!
|
||||
23
|
||||
} else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
|
||||
4418 + pos
|
||||
} else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
|
||||
10744 + pos
|
||||
} else {
|
||||
return None;
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
|
||||
}
|
||||
|
||||
/// Stateless Shift_JIS encoder.
pub struct ShiftJisEncoder;
|
||||
|
||||
impl ShiftJisEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
u16_length.checked_mul(2)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
byte_length.checked_add(1)
|
||||
}
|
||||
|
||||
ascii_compatible_bmp_encoder_functions!(
|
||||
{
|
||||
// Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
|
||||
let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
|
||||
if bmp_minus_hiragana < 0x53 {
|
||||
handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
|
||||
} else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
|
||||
if let Some((lead, trail)) = encode_kanji(bmp) {
|
||||
handle.write_two(lead, trail)
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
|
||||
if bmp_minus_katakana < 0x56 {
|
||||
let trail_offset = if bmp_minus_katakana < 0x3F {
|
||||
0x40
|
||||
} else {
|
||||
0x41
|
||||
};
|
||||
handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8)
|
||||
} else {
|
||||
let bmp_minus_space = bmp.wrapping_sub(0x3000);
|
||||
if bmp_minus_space < 3 {
|
||||
// fast-track common punctuation
|
||||
handle.write_two(0x81, 0x40 + bmp_minus_space as u8)
|
||||
} else if bmp == 0xA5 {
|
||||
handle.write_one(0x5Cu8)
|
||||
} else if bmp == 0x80 {
|
||||
handle.write_one(0x80u8)
|
||||
} else if bmp == 0x203E {
|
||||
handle.write_one(0x7Eu8)
|
||||
} else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
|
||||
handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
|
||||
} else if bmp == 0x2212 {
|
||||
handle.write_two(0x81u8, 0x7Cu8)
|
||||
} else {
|
||||
let bmp_minus_roman = bmp.wrapping_sub(0x2170);
|
||||
let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) {
|
||||
10716 + bmp_minus_roman as usize
|
||||
} else if let Some(pointer) = jis0208_range_encode(bmp) {
|
||||
pointer
|
||||
} else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
|
||||
|| bmp == 0xF929
|
||||
|| bmp == 0xF9DC
|
||||
{
|
||||
// Guaranteed to be found in IBM_KANJI
|
||||
let pos = position(&IBM_KANJI[..], bmp).unwrap();
|
||||
10744 + pos
|
||||
} else if let Some(pointer) = jis0208_symbol_encode(bmp) {
|
||||
pointer
|
||||
} else {
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(bmp),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
};
|
||||
let lead = pointer / 188;
|
||||
let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
|
||||
let trail = pointer % 188;
|
||||
let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
|
||||
handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
bmp,
|
||||
self,
|
||||
source,
|
||||
handle,
|
||||
copy_ascii_to_check_space_two,
|
||||
check_space_two,
|
||||
false
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Shift_JIS and checks the result against `expect`.
    fn decode_shift_jis(bytes: &[u8], expect: &str) {
        decode(SHIFT_JIS, bytes, expect);
    }

    /// Encodes `string` as Shift_JIS and checks the result against `expect`.
    fn encode_shift_jis(string: &str, expect: &[u8]) {
        encode(SHIFT_JIS, string, expect);
    }

    #[test]
    fn test_shift_jis_decode() {
        // Empty
        decode_shift_jis(b"", "");

        // ASCII
        decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_shift_jis(b"\xA1", "\u{FF61}");
        decode_shift_jis(b"\xDF", "\u{FF9F}");
        decode_shift_jis(b"\xA0", "\u{FFFD}");
        decode_shift_jis(b"\xE0", "\u{FFFD}");
        decode_shift_jis(b"\xA0+", "\u{FFFD}+");
        decode_shift_jis(b"\xE0+", "\u{FFFD}+");

        // EUDC
        decode_shift_jis(b"\xF0\x40", "\u{E000}");
        decode_shift_jis(b"\xF9\xFC", "\u{E757}");
        decode_shift_jis(b"\xEF\xFC", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");

        // JIS 0208
        decode_shift_jis(b"\x81\x40", "\u{3000}");
        decode_shift_jis(b"\x81\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xEE\xFC", "\u{FF02}");
        decode_shift_jis(b"\xEE\xFD", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");
        decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xFC\x4B", "\u{9ED1}");
        decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L");
    }

    #[test]
    fn test_shift_jis_encode() {
        // Empty
        encode_shift_jis("", b"");

        // ASCII
        encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_shift_jis("\u{0080}", b"\x80");
        encode_shift_jis("\u{00A5}", b"\x5C");
        encode_shift_jis("\u{203E}", b"\x7E");
        encode_shift_jis("\u{2212}", b"\x81\x7C");

        // Half-width
        encode_shift_jis("\u{FF61}", b"\xA1");
        encode_shift_jis("\u{FF9F}", b"\xDF");

        // EUDC: decodable but not encodable, so the expected output is the
        // decimal numeric character reference, spelled out in ASCII.
        encode_shift_jis("\u{E000}", b"&#57344;");
        encode_shift_jis("\u{E757}", b"&#59223;");

        // JIS 0212: not part of Shift_JIS, so again a numeric character
        // reference. Byte string literals must stay ASCII-only.
        encode_shift_jis("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_shift_jis("\u{3000}", b"\x81\x40");
        encode_shift_jis("\u{FF02}", b"\xFA\x57");
        encode_shift_jis("\u{2170}", b"\xFA\x40");
        encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_decode_all() {
        let input = include_bytes!("test_data/shift_jis_in.txt");
        let expectation = include_str!("test_data/shift_jis_in_ref.txt");
        let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_encode_all() {
        let input = include_str!("test_data/shift_jis_out.txt");
        let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
        let (cow, encoding, had_errors) = SHIFT_JIS.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, SHIFT_JIS);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_shift_jis_half_width_katakana_length() {
        // The buffer is sized by the decoder's own worst-case estimate, so a
        // single half-width katakana byte (1 byte in, 3 bytes of UTF-8 out)
        // must fit without `OutputFull`.
        let mut output = [0u8; 20];
        let mut decoder = SHIFT_JIS.new_decoder();
        {
            let needed = decoder
                .max_utf8_buffer_length_without_replacement(1)
                .unwrap();
            let (result, read, written) =
                decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
            assert_eq!(result, DecoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            assert_eq!(output[0], 0xEF);
            assert_eq!(output[1], 0xBD);
            assert_eq!(output[2], 0xA1);
        }
    }
}
|
||||
455
zeroidc/vendor/encoding_rs/src/simd_funcs.rs
vendored
Normal file
455
zeroidc/vendor/encoding_rs/src/simd_funcs.rs
vendored
Normal file
@@ -0,0 +1,455 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use packed_simd::u16x8;
|
||||
use packed_simd::u8x16;
|
||||
use packed_simd::FromBits;
|
||||
|
||||
// TODO: Migrate unaligned access to stdlib code if/when the RFC
|
||||
// https://github.com/rust-lang/rfcs/pull/1725 is implemented.
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
|
||||
let mut simd = ::core::mem::uninitialized();
|
||||
::core::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16);
|
||||
simd
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
|
||||
*(ptr as *const u8x16)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
|
||||
::core::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
|
||||
*(ptr as *mut u8x16) = s;
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
|
||||
let mut simd = ::core::mem::uninitialized();
|
||||
::core::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16);
|
||||
simd
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
|
||||
*(ptr as *const u16x8)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
|
||||
::core::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[inline(always)]
|
||||
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
|
||||
*(ptr as *mut u16x8) = s;
|
||||
}
|
||||
|
||||
cfg_if! {
|
||||
if #[cfg(all(target_feature = "sse2", target_arch = "x86_64"))] {
|
||||
use core::arch::x86_64::__m128i;
|
||||
use core::arch::x86_64::_mm_movemask_epi8;
|
||||
use core::arch::x86_64::_mm_packus_epi16;
|
||||
} else if #[cfg(all(target_feature = "sse2", target_arch = "x86"))] {
|
||||
use core::arch::x86::__m128i;
|
||||
use core::arch::x86::_mm_movemask_epi8;
|
||||
use core::arch::x86::_mm_packus_epi16;
|
||||
} else if #[cfg(target_arch = "aarch64")]{
|
||||
use core::arch::aarch64::uint8x16_t;
|
||||
use core::arch::aarch64::uint16x8_t;
|
||||
use core::arch::aarch64::vmaxvq_u8;
|
||||
use core::arch::aarch64::vmaxvq_u16;
|
||||
} else {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// #[inline(always)]
|
||||
// fn simd_byte_swap_u8(s: u8x16) -> u8x16 {
|
||||
// unsafe {
|
||||
// shuffle!(s, s, [1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[inline(always)]
|
||||
// pub fn simd_byte_swap(s: u16x8) -> u16x8 {
|
||||
// to_u16_lanes(simd_byte_swap_u8(to_u8_lanes(s)))
|
||||
// }
|
||||
|
||||
#[inline(always)]
|
||||
pub fn simd_byte_swap(s: u16x8) -> u16x8 {
|
||||
let left = s << 8;
|
||||
let right = s >> 8;
|
||||
left | right
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn to_u16_lanes(s: u8x16) -> u16x8 {
|
||||
u16x8::from_bits(s)
|
||||
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {

        /// Returns the `_mm_movemask_epi8` mask of `s`; the result is zero
        /// exactly when `simd_is_ascii` would report `true`.
        //
        // The low-level mask is exposed instead of a higher-level
        // conclusion, because the non-ASCII case would perform less well
        // otherwise.
        #[inline(always)]
        pub fn mask_ascii(s: u8x16) -> i32 {
            let raw = __m128i::from_bits(s);
            unsafe { _mm_movemask_epi8(raw) }
        }

    } else {

    }
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Returns `true` when no byte lane of `s` has its high bit set.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            let high_bits = unsafe { _mm_movemask_epi8(__m128i::from_bits(s)) };
            high_bits == 0
        }
    } else if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest byte lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            let largest = unsafe { vmaxvq_u8(uint8x16_t::from_bits(s)) };
            largest < 0x80
        }
    } else {
        /// Returns `true` when no byte lane of `s` exceeds 0x7F.
        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            // This optimizes better on ARM than
            // the lt formulation.
            let non_ascii = s.gt(u8x16::splat(0x7F));
            !non_ascii.any()
        }
    }
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Returns `true` when every byte lane of `s` is below 0xC4.
        ///
        /// NOTE(review): 0xC4 is presumably the first UTF-8 lead byte that
        /// encodes a code point above U+00FF - confirm against callers.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            // Try the ASCII check first; only fall back to the lane
            // comparison when some byte is non-ASCII.
            simd_is_ascii(s) || s.lt(u8x16::splat(0xC4)).all()
        }
    } else if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when the largest byte lane of `s` is below 0xC4.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let largest = unsafe { vmaxvq_u8(uint8x16_t::from_bits(s)) };
            largest < 0xC4
        }
    } else {
        /// Returns `true` when every byte lane of `s` is below 0xC4.
        #[inline(always)]
        pub fn simd_is_str_latin1(s: u8x16) -> bool {
            let within_latin1 = s.lt(u8x16::splat(0xC4));
            within_latin1.all()
        }
    }
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        /// Returns `true` when every 16-bit lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let largest = unsafe { vmaxvq_u16(uint16x8_t::from_bits(s)) };
            largest < 0x80
        }

        /// Returns `true` when every 16-bit lane of `s` is below 0x100.
        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            let largest = unsafe { vmaxvq_u16(uint16x8_t::from_bits(s)) };
            largest < 0x100
        }
    } else {
        /// Returns `true` when every 16-bit lane of `s` is below 0x80.
        #[inline(always)]
        pub fn simd_is_basic_latin(s: u16x8) -> bool {
            let within_ascii = s.lt(u16x8::splat(0x80));
            within_ascii.all()
        }

        /// Returns `true` when no 16-bit lane of `s` exceeds 0xFF.
        #[inline(always)]
        pub fn simd_is_latin1(s: u16x8) -> bool {
            // For some reason, on SSE2 this formulation
            // seems faster in this case while the above
            // function is better the other way round...
            let beyond_latin1 = s.gt(u16x8::splat(0xFF));
            !beyond_latin1.any()
        }
    }
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn contains_surrogates(s: u16x8) -> bool {
|
||||
let mask = u16x8::splat(0xF800);
|
||||
let surrogate_bits = u16x8::splat(0xD800);
|
||||
(s & mask).eq(surrogate_bits).any()
|
||||
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_arch = "aarch64")]{
        // On aarch64: return `false` from the enclosing function when the
        // largest lane of the vector is below U+0590.
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ({
                unsafe {
                    if vmaxvq_u16(uint16x8_t::from_bits($s)) < 0x0590 {
                        return false;
                    }
                }
            })
        }

        // On aarch64 the mask-based early return expands to nothing.
        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ()
        }
    } else {
        // Outside aarch64 the horizontal-max early return expands to nothing.
        macro_rules! aarch64_return_false_if_below_hebrew {
            ($s:ident) => ()
        }

        // Outside aarch64: return `false` from the enclosing function when
        // every lane of the given mask is set.
        macro_rules! non_aarch64_return_false_if_all {
            ($s:ident) => ({
                if $s.all() {
                    return false;
                }
            })
        }
    }
}
|
||||
|
||||
// Evaluates to a lane mask that is set where `$start <= lane < $end`.
macro_rules! in_range16x8 {
    ($s:ident, $start:expr, $end:expr) => {{
        // SIMD sub is wrapping, so lanes below `$start` wrap around to large
        // values and fail the single unsigned comparison below.
        let rebased = $s - u16x8::splat($start);
        let width = u16x8::splat($end - $start);
        rebased.lt(width)
    }};
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn is_u16x8_bidi(s: u16x8) -> bool {
|
||||
// We try to first quickly refute the RTLness of the vector. If that
|
||||
// fails, we do the real RTL check, so in that case we end up wasting
|
||||
// the work for the up-front quick checks. Even the quick-check is
|
||||
// two-fold in order to return `false` ASAP if everything is below
|
||||
// Hebrew.
|
||||
|
||||
aarch64_return_false_if_below_hebrew!(s);
|
||||
|
||||
let below_hebrew = s.lt(u16x8::splat(0x0590));
|
||||
|
||||
non_aarch64_return_false_if_all!(below_hebrew);
|
||||
|
||||
if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Quick refutation failed. Let's do the full check.
|
||||
|
||||
(in_range16x8!(s, 0x0590, 0x0900)
|
||||
| in_range16x8!(s, 0xFB1D, 0xFE00)
|
||||
| in_range16x8!(s, 0xFE70, 0xFEFF)
|
||||
| in_range16x8!(s, 0xD802, 0xD804)
|
||||
| in_range16x8!(s, 0xD83A, 0xD83C)
|
||||
| s.eq(u16x8::splat(0x200F))
|
||||
| s.eq(u16x8::splat(0x202B))
|
||||
| s.eq(u16x8::splat(0x202E))
|
||||
| s.eq(u16x8::splat(0x2067)))
|
||||
.any()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
|
||||
unsafe {
|
||||
let first: u8x16 = shuffle!(
|
||||
s,
|
||||
u8x16::splat(0),
|
||||
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
|
||||
);
|
||||
let second: u8x16 = shuffle!(
|
||||
s,
|
||||
u8x16::splat(0),
|
||||
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
|
||||
);
|
||||
(u16x8::from_bits(first), u16x8::from_bits(second))
|
||||
}
|
||||
}
|
||||
|
||||
cfg_if! {
    if #[cfg(target_feature = "sse2")] {
        /// Narrows two `u16x8` vectors into one `u8x16` using
        /// `_mm_packus_epi16`; lanes of `a` come first, then lanes of `b`.
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let packed = _mm_packus_epi16(__m128i::from_bits(a), __m128i::from_bits(b));
                u8x16::from_bits(packed)
            }
        }
    } else {
        /// Narrows two `u16x8` vectors into one `u8x16` by keeping every
        /// even-indexed byte of `a` followed by every even-indexed byte
        /// of `b`.
        #[inline(always)]
        pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
            unsafe {
                let a_bytes = u8x16::from_bits(a);
                let b_bytes = u8x16::from_bits(b);
                shuffle!(
                    a_bytes,
                    b_bytes,
                    [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
                )
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    /// Sixteen ASCII bytes used as the all-ASCII fixture.
    const ASCII: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];

    /// The same sixteen values as `ASCII`, as 16-bit units.
    const BASIC_LATIN: [u16; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];

    /// `ASCII` with the byte at index 4 replaced by the non-ASCII 0x81.
    const NON_ASCII_AT_4: [u8; 16] = [
        0x61, 0x62, 0x63, 0x64, 0x81, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75,
        0x76,
    ];

    /// Loads sixteen 16-bit units as two vectors of eight.
    fn load_halves(units: &[u16; 16]) -> (u16x8, u16x8) {
        let base = units.as_ptr();
        unsafe { (load8_unaligned(base), load8_unaligned(base.add(8))) }
    }

    /// Asserts that `BASIC_LATIN` with `unit` placed at index 3 is rejected
    /// by `simd_is_basic_latin`.
    fn assert_not_basic_latin_with(unit: u16) {
        let mut input = BASIC_LATIN;
        input[3] = unit;
        let (first, second) = load_halves(&input);
        assert!(!simd_is_basic_latin(first | second));
    }

    #[test]
    fn test_unpack() {
        let simd = unsafe { load16_unaligned(ASCII.as_ptr()) };
        let mut widened: Vec<u16> = Vec::with_capacity(16);
        widened.resize(16, 0u16);
        let (first, second) = simd_unpack(simd);
        let out = widened.as_mut_ptr();
        unsafe {
            store8_unaligned(out, first);
            store8_unaligned(out.add(8), second);
        }
        assert_eq!(&widened[..], &BASIC_LATIN[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_success() {
        let (first, second) = load_halves(&BASIC_LATIN);
        let mut narrowed: Vec<u8> = Vec::with_capacity(16);
        narrowed.resize(16, 0u8);
        let out = narrowed.as_mut_ptr();
        assert!(simd_is_basic_latin(first | second));
        unsafe {
            store16_unaligned(out, simd_pack(first, second));
        }
        assert_eq!(&narrowed[..], &ASCII[..]);
    }

    #[test]
    fn test_simd_is_basic_latin_c0() {
        assert_not_basic_latin_with(0x81);
    }

    #[test]
    fn test_simd_is_basic_latin_0fff() {
        assert_not_basic_latin_with(0x0FFF);
    }

    #[test]
    fn test_simd_is_basic_latin_ffff() {
        assert_not_basic_latin_with(0xFFFF);
    }

    #[test]
    fn test_simd_is_ascii_success() {
        let simd = unsafe { load16_unaligned(ASCII.as_ptr()) };
        assert!(simd_is_ascii(simd));
    }

    #[test]
    fn test_simd_is_ascii_failure() {
        let simd = unsafe { load16_unaligned(NON_ASCII_AT_4.as_ptr()) };
        assert!(!simd_is_ascii(simd));
    }

    #[cfg(target_feature = "sse2")]
    #[test]
    fn test_check_ascii() {
        let simd = unsafe { load16_unaligned(NON_ASCII_AT_4.as_ptr()) };
        let mask = mask_ascii(simd);
        assert_ne!(mask, 0);
        // The lowest set bit is the index of the first non-ASCII byte.
        assert_eq!(mask.trailing_zeros(), 4);
    }

    #[test]
    fn test_alu() {
        let mut alu = 0u64;
        unsafe {
            ::core::ptr::copy_nonoverlapping(
                NON_ASCII_AT_4.as_ptr(),
                &mut alu as *mut u64 as *mut u8,
                8,
            );
        }
        let masked = alu & 0x8080808080808080;
        // NOTE(review): bit 39 is the high bit of byte 4 only when the bytes
        // land in little-endian order - confirm big-endian targets are out
        // of scope for this test.
        assert_eq!(masked.trailing_zeros(), 39);
    }
}
|
||||
714
zeroidc/vendor/encoding_rs/src/single_byte.rs
vendored
Normal file
714
zeroidc/vendor/encoding_rs/src/single_byte.rs
vendored
Normal file
@@ -0,0 +1,714 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::ascii::*;
|
||||
use crate::data::position;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
|
||||
/// Decoder for a single-byte encoding, driven by a 128-entry lookup table
/// for the bytes 0x80..=0xFF.
pub struct SingleByteDecoder {
|
||||
// Indexed by `byte - 0x80`. Each entry is the UTF-16 code unit the byte
// decodes to; an entry of 0 marks the byte as unmapped, which the decode
// methods report as malformed input.
table: &'static [u16; 128],
|
||||
}
|
||||
|
||||
impl SingleByteDecoder {
|
||||
pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
|
||||
VariantDecoder::SingleByte(SingleByteDecoder { table: data })
|
||||
}
|
||||
|
||||
/// Returns the UTF-16 output size, in code units, to reserve for decoding
/// `byte_length` input bytes. Always `Some`.
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// Each input byte is written as exactly one `u16` by the decoder.
Some(byte_length)
|
||||
}
|
||||
|
||||
/// Returns three times `byte_length` as the UTF-8 output size to reserve,
/// or `None` if that multiplication overflows `usize`.
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
// Table entries are single `u16` values, and a code point that fits in
// 16 bits takes at most three bytes in UTF-8.
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
/// Returns three times `byte_length` as the UTF-8 output size to reserve,
/// or `None` if that multiplication overflows `usize`.
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
// Same bound as the without-replacement variant.
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let mut source = ByteSource::new(src);
|
||||
let mut dest = Utf8Destination::new(dst);
|
||||
'outermost: loop {
|
||||
match dest.copy_ascii_from_check_space_bmp(&mut source) {
|
||||
CopyAsciiResult::Stop(ret) => return ret,
|
||||
CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
|
||||
// Start non-boilerplate
|
||||
//
|
||||
// Since the non-ASCIIness of `non_ascii` is hidden from
|
||||
// the optimizer, it can't figure out that it's OK to
|
||||
// statically omit the bound check when accessing
|
||||
// `[u16; 128]` with an index
|
||||
// `non_ascii as usize - 0x80usize`.
|
||||
let mapped =
|
||||
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
// let mapped = self.table[non_ascii as usize - 0x80usize];
|
||||
if mapped == 0u16 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
source.consumed(),
|
||||
handle.written(),
|
||||
);
|
||||
}
|
||||
let dest_again = handle.write_bmp_excl_ascii(mapped);
|
||||
// End non-boilerplate
|
||||
match source.check_available() {
|
||||
Space::Full(src_consumed) => {
|
||||
return (
|
||||
DecoderResult::InputEmpty,
|
||||
src_consumed,
|
||||
dest_again.written(),
|
||||
);
|
||||
}
|
||||
Space::Available(source_handle) => {
|
||||
match dest_again.check_space_bmp() {
|
||||
Space::Full(dst_written) => {
|
||||
return (
|
||||
DecoderResult::OutputFull,
|
||||
source_handle.consumed(),
|
||||
dst_written,
|
||||
);
|
||||
}
|
||||
Space::Available(mut destination_handle) => {
|
||||
let (mut b, unread_handle) = source_handle.read();
|
||||
let source_again = unread_handle.commit();
|
||||
'innermost: loop {
|
||||
if b > 127 {
|
||||
non_ascii = b;
|
||||
handle = destination_handle;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
let dest_again_again = destination_handle.write_ascii(b);
|
||||
if b < 60 {
|
||||
// We've got punctuation
|
||||
match source_again.check_available() {
|
||||
Space::Full(src_consumed_again) => {
|
||||
return (
|
||||
DecoderResult::InputEmpty,
|
||||
src_consumed_again,
|
||||
dest_again_again.written(),
|
||||
);
|
||||
}
|
||||
Space::Available(source_handle_again) => {
|
||||
match dest_again_again.check_space_bmp() {
|
||||
Space::Full(dst_written_again) => {
|
||||
return (
|
||||
DecoderResult::OutputFull,
|
||||
source_handle_again.consumed(),
|
||||
dst_written_again,
|
||||
);
|
||||
}
|
||||
Space::Available(
|
||||
destination_handle_again,
|
||||
) => {
|
||||
let (b_again, _unread_handle_again) =
|
||||
source_handle_again.read();
|
||||
b = b_again;
|
||||
destination_handle =
|
||||
destination_handle_again;
|
||||
continue 'innermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let mut converted = 0usize;
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
ascii_to_basic_latin(
|
||||
src.as_ptr().add(converted),
|
||||
dst.as_mut_ptr().add(converted),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
None => {
|
||||
return (pending, length, length);
|
||||
}
|
||||
Some((mut non_ascii, consumed)) => {
|
||||
converted += consumed;
|
||||
'middle: loop {
|
||||
// `converted` doesn't count the reading of `non_ascii` yet.
|
||||
// Since the non-ASCIIness of `non_ascii` is hidden from
|
||||
// the optimizer, it can't figure out that it's OK to
|
||||
// statically omit the bound check when accessing
|
||||
// `[u16; 128]` with an index
|
||||
// `non_ascii as usize - 0x80usize`.
|
||||
let mapped =
|
||||
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
// let mapped = self.table[non_ascii as usize - 0x80usize];
|
||||
if mapped == 0u16 {
|
||||
return (
|
||||
DecoderResult::Malformed(1, 0),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
unsafe {
|
||||
// The bound check has already been performed
|
||||
*(dst.get_unchecked_mut(converted)) = mapped;
|
||||
}
|
||||
converted += 1;
|
||||
// Next, handle ASCII punctuation and non-ASCII without
|
||||
// going back to ASCII acceleration. Non-ASCII scripts
|
||||
// use ASCII punctuation, so this avoid going to
|
||||
// acceleration just for punctuation/space and then
|
||||
// failing. This is a significant boost to non-ASCII
|
||||
// scripts.
|
||||
// TODO: Split out Latin converters without this part
|
||||
// this stuff makes Latin script-conversion slower.
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
let mut b = unsafe { *(src.get_unchecked(converted)) };
|
||||
'innermost: loop {
|
||||
if b > 127 {
|
||||
non_ascii = b;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = u16::from(b);
|
||||
}
|
||||
converted += 1;
|
||||
if b < 60 {
|
||||
// We've got punctuation
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
b = unsafe { *(src.get_unchecked(converted)) };
|
||||
continue 'innermost;
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
|
||||
let mut bytes = buffer;
|
||||
let mut total = 0;
|
||||
loop {
|
||||
if let Some((non_ascii, offset)) = validate_ascii(bytes) {
|
||||
total += offset;
|
||||
let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
|
||||
if mapped != u16::from(non_ascii) {
|
||||
return total;
|
||||
}
|
||||
total += 1;
|
||||
bytes = &bytes[offset + 1..];
|
||||
} else {
|
||||
return total;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Encoder for a single-byte encoding. It searches the same 128-entry table
/// the decoder uses, with a fast path for one run of consecutive code units.
pub struct SingleByteEncoder {
|
||||
// Indexed by `byte - 0x80`; holds the UTF-16 code unit for that byte.
table: &'static [u16; 128],
|
||||
// First code unit of the run that is mapped by plain offset.
run_bmp_offset: usize,
|
||||
// Table index at which that run starts.
run_byte_offset: usize,
|
||||
// Number of code units in the run.
run_length: usize,
|
||||
}
|
||||
|
||||
impl SingleByteEncoder {
|
||||
pub fn new(
|
||||
encoding: &'static Encoding,
|
||||
data: &'static [u16; 128],
|
||||
run_bmp_offset: u16,
|
||||
run_byte_offset: u8,
|
||||
run_length: u8,
|
||||
) -> Encoder {
|
||||
Encoder::new(
|
||||
encoding,
|
||||
VariantEncoder::SingleByte(SingleByteEncoder {
|
||||
table: data,
|
||||
run_bmp_offset: run_bmp_offset as usize,
|
||||
run_byte_offset: run_byte_offset as usize,
|
||||
run_length: run_length as usize,
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// Returns the output size, in bytes, to reserve for encoding `u16_length`
/// UTF-16 code units. Always `Some(u16_length)`.
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
// `encode_u16` yields a single byte per code unit.
Some(u16_length)
|
||||
}
|
||||
|
||||
/// Returns the output size, in bytes, to reserve for encoding `byte_length`
/// bytes of UTF-8 input. Always `Some(byte_length)`.
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
// The reported bound is the input length itself.
Some(byte_length)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn encode_u16(&self, code_unit: u16) -> Option<u8> {
|
||||
// First, we see if the code unit falls into a run of consecutive
|
||||
// code units that can be mapped by offset. This is very efficient
|
||||
// for most non-Latin encodings as well as Latin1-ish encodings.
|
||||
//
|
||||
// For encodings that don't fit this pattern, the run (which may
|
||||
// have the length of just one) just establishes the starting point
|
||||
// for the next rule.
|
||||
//
|
||||
// Next, we do a forward linear search in the part of the index
|
||||
// after the run. Even in non-Latin1-ish Latin encodings (except
|
||||
// macintosh), the lower case letters are here.
|
||||
//
|
||||
// Next, we search the third quadrant up to the start of the run
|
||||
// (upper case letters in Latin encodings except macintosh, in
|
||||
// Greek and in KOI encodings) and then the second quadrant,
|
||||
// except if the run stared before the third quadrant, we search
|
||||
// the second quadrant up to the run.
|
||||
//
|
||||
// Last, we search the first quadrant, which has unused controls
|
||||
// or punctuation in most encodings. This is bad for macintosh
|
||||
// and IBM866, but those are rare.
|
||||
|
||||
// Run of consecutive units
|
||||
let unit_as_usize = code_unit as usize;
|
||||
let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
|
||||
if offset < self.run_length {
|
||||
return Some((128 + self.run_byte_offset + offset) as u8);
|
||||
}
|
||||
|
||||
// Search after the run
|
||||
let tail_start = self.run_byte_offset + self.run_length;
|
||||
if let Some(pos) = position(&self.table[tail_start..], code_unit) {
|
||||
return Some((128 + tail_start + pos) as u8);
|
||||
}
|
||||
|
||||
if self.run_byte_offset >= 64 {
|
||||
// Search third quadrant before the run
|
||||
if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
|
||||
return Some(((128 + 64) + pos) as u8);
|
||||
}
|
||||
|
||||
// Search second quadrant
|
||||
if let Some(pos) = position(&self.table[32..64], code_unit) {
|
||||
return Some(((128 + 32) + pos) as u8);
|
||||
}
|
||||
} else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
|
||||
// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
|
||||
// Search second quadrant before the run
|
||||
return Some(((128 + 32) + pos) as u8);
|
||||
}
|
||||
|
||||
// Search first quadrant
|
||||
if let Some(pos) = position(&self.table[..32], code_unit) {
|
||||
return Some((128 + pos) as u8);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
// Generates `encode_from_utf8_raw`. The block below supplies the handling
// of one non-ASCII BMP code unit: write its byte, or stop and report the
// code unit as unmappable.
ascii_compatible_bmp_encoder_function!(
    {
        if let Some(byte) = self.encode_u16(bmp) {
            handle.write_one(byte)
        } else {
            return (
                EncoderResult::unmappable_from_bmp(bmp),
                source.consumed(),
                handle.written(),
            );
        }
    },
    bmp,
    self,
    source,
    handle,
    copy_ascii_to_check_space_one,
    check_space_one,
    encode_from_utf8_raw,
    str,
    Utf8Source,
    true
);
|
||||
|
||||
pub fn encode_from_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u16],
|
||||
dst: &mut [u8],
|
||||
_last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(EncoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(EncoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let mut converted = 0usize;
|
||||
'outermost: loop {
|
||||
match unsafe {
|
||||
basic_latin_to_ascii(
|
||||
src.as_ptr().add(converted),
|
||||
dst.as_mut_ptr().add(converted),
|
||||
length - converted,
|
||||
)
|
||||
} {
|
||||
None => {
|
||||
return (pending, length, length);
|
||||
}
|
||||
Some((mut non_ascii, consumed)) => {
|
||||
converted += consumed;
|
||||
'middle: loop {
|
||||
// `converted` doesn't count the reading of `non_ascii` yet.
|
||||
match self.encode_u16(non_ascii) {
|
||||
Some(byte) => {
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = byte;
|
||||
}
|
||||
converted += 1;
|
||||
}
|
||||
None => {
|
||||
// At this point, we need to know if we
|
||||
// have a surrogate.
|
||||
let high_bits = non_ascii & 0xFC00u16;
|
||||
if high_bits == 0xD800u16 {
|
||||
// high surrogate
|
||||
if converted + 1 == length {
|
||||
// End of buffer. This surrogate is unpaired.
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
let second =
|
||||
u32::from(unsafe { *src.get_unchecked(converted + 1) });
|
||||
if second & 0xFC00u32 != 0xDC00u32 {
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
// The next code unit is a low surrogate.
|
||||
let astral: char = unsafe {
|
||||
::core::char::from_u32_unchecked(
|
||||
(u32::from(non_ascii) << 10) + second
|
||||
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
|
||||
)
|
||||
};
|
||||
return (
|
||||
EncoderResult::Unmappable(astral),
|
||||
converted + 2, // +2 `for non_ascii` and `second`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
if high_bits == 0xDC00u16 {
|
||||
// Unpaired low surrogate
|
||||
return (
|
||||
EncoderResult::Unmappable('\u{FFFD}'),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
return (
|
||||
EncoderResult::unmappable_from_bmp(non_ascii),
|
||||
converted + 1, // +1 `for non_ascii`
|
||||
converted,
|
||||
);
|
||||
}
|
||||
}
|
||||
// Next, handle ASCII punctuation and non-ASCII without
|
||||
// going back to ASCII acceleration. Non-ASCII scripts
|
||||
// use ASCII punctuation, so this avoid going to
|
||||
// acceleration just for punctuation/space and then
|
||||
// failing. This is a significant boost to non-ASCII
|
||||
// scripts.
|
||||
// TODO: Split out Latin converters without this part
|
||||
// this stuff makes Latin script-conversion slower.
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
let mut unit = unsafe { *(src.get_unchecked(converted)) };
|
||||
'innermost: loop {
|
||||
if unit > 127 {
|
||||
non_ascii = unit;
|
||||
continue 'middle;
|
||||
}
|
||||
// Testing on Haswell says that we should write the
|
||||
// byte unconditionally instead of trying to unread it
|
||||
// to make it part of the next SIMD stride.
|
||||
unsafe {
|
||||
*(dst.get_unchecked_mut(converted)) = unit as u8;
|
||||
}
|
||||
converted += 1;
|
||||
if unit < 60 {
|
||||
// We've got punctuation
|
||||
if converted == length {
|
||||
return (pending, length, length);
|
||||
}
|
||||
unit = unsafe { *(src.get_unchecked(converted)) };
|
||||
continue 'innermost;
|
||||
}
|
||||
// We've got markup or ASCII text
|
||||
continue 'outermost;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
|
||||
mod tests {
|
||||
use super::super::testing::*;
|
||||
use super::super::*;
|
||||
|
||||
// Byte 0xCA in windows-1255 must round-trip with U+05BA.
#[test]
fn test_windows_1255_ca() {
    let encoded = b"\xCA";
    let decoded = "\u{05BA}";
    decode(WINDOWS_1255, encoded, decoded);
    encode(WINDOWS_1255, decoded, encoded);
}
|
||||
|
||||
#[test]
|
||||
fn test_ascii_punctuation() {
|
||||
let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
|
||||
let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
|
||||
\u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
|
||||
\u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
|
||||
\u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
|
||||
\u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
|
||||
decode(WINDOWS_1253, bytes, characters);
|
||||
encode(WINDOWS_1253, characters, bytes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_malformed() {
|
||||
decode(
|
||||
WINDOWS_1253,
|
||||
b"\xC1\xF5\xD2\xF4\xFC",
|
||||
"\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_unmappables() {
|
||||
encode(
|
||||
WINDOWS_1253,
|
||||
"\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
|
||||
b"\xC1\xF5☃\xF4\xFC",
|
||||
);
|
||||
encode(
|
||||
WINDOWS_1253,
|
||||
"\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
|
||||
b"\xC1\xF5💩\xF4\xFC",
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_unpaired_surrogates() {
|
||||
encode_from_utf16(
|
||||
WINDOWS_1253,
|
||||
&[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
|
||||
b"\xC1\xF5�\xF4\xFC",
|
||||
);
|
||||
encode_from_utf16(
|
||||
WINDOWS_1253,
|
||||
&[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
|
||||
b"\xC1\xF5�\xF4\xFC",
|
||||
);
|
||||
encode_from_utf16(
|
||||
WINDOWS_1253,
|
||||
&[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
|
||||
b"\xC1\xF5\xF4\xFC�",
|
||||
);
|
||||
}
|
||||
|
||||
/// Every byte value from 0x80 through 0xFF, in ascending order.
pub const HIGH_BYTES: &[u8; 128] = &[
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
];
|
||||
|
||||
fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
|
||||
let mut with_replacement = [0u16; 128];
|
||||
let mut it = data.iter().enumerate();
|
||||
loop {
|
||||
match it.next() {
|
||||
Some((i, code_point)) => {
|
||||
if *code_point == 0 {
|
||||
with_replacement[i] = 0xFFFD;
|
||||
} else {
|
||||
with_replacement[i] = *code_point;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
|
||||
}
|
||||
|
||||
fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
|
||||
let mut with_zeros = [0u8; 128];
|
||||
let mut it = data.iter().enumerate();
|
||||
loop {
|
||||
match it.next() {
|
||||
Some((i, code_point)) => {
|
||||
if *code_point == 0 {
|
||||
with_zeros[i] = 0;
|
||||
} else {
|
||||
with_zeros[i] = HIGH_BYTES[i];
|
||||
}
|
||||
}
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
encode_from_utf16(encoding, data, &with_zeros[..]);
|
||||
}
|
||||
|
||||
// Two consecutive unpaired low surrogates must each be replaced by the
// numeric character reference for U+FFFD (65533).
//
// The expectation must spell the references out in ASCII; a byte string
// literal cannot contain U+FFFD itself.
#[test]
fn test_single_byte_from_two_low_surrogates() {
    let expectation = b"&#65533;&#65533;";
    let mut output = [0u8; 40];
    let mut encoder = WINDOWS_1253.new_encoder();
    let (result, read, written, had_errors) =
        encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, expectation.len());
    assert!(had_errors);
    assert_eq!(&output[..written], expectation);
}
|
||||
|
||||
// These tests are so self-referential that they are pretty useless.
|
||||
|
||||
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
// Runs `decode_single_byte` for each single-byte encoding paired with its
// table from `data::SINGLE_BYTE_DATA`. Under Miri only the first two
// encodings are checked.
#[test]
|
||||
fn test_single_byte_decode() {
|
||||
decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
|
||||
decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
|
||||
if cfg!(miri) {
|
||||
// Miri is too slow
|
||||
return;
|
||||
}
|
||||
decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
|
||||
decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
|
||||
decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
|
||||
decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
|
||||
decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
|
||||
decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
|
||||
decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
|
||||
decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
|
||||
decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
|
||||
decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
|
||||
decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
|
||||
decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
|
||||
decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
|
||||
decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
|
||||
decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
|
||||
decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
|
||||
decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
|
||||
decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
|
||||
decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
|
||||
decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
|
||||
decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
|
||||
decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
|
||||
decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
|
||||
decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
|
||||
decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
|
||||
}
|
||||
|
||||
#[test]
fn test_single_byte_encode() {
    // Every single-byte encoding paired with its index table, in the order
    // the checks are run.
    let cases = [
        (IBM866, &data::SINGLE_BYTE_DATA.ibm866),
        (ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10),
        (ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13),
        (ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14),
        (ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15),
        (ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16),
        (ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2),
        (ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3),
        (ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4),
        (ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5),
        (ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6),
        (ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7),
        (ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8),
        (KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r),
        (KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u),
        (MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh),
        (WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250),
        (WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251),
        (WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252),
        (WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253),
        (WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254),
        (WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255),
        (WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256),
        (WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257),
        (WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258),
        (WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874),
        (X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic),
    ];
    // Miri is too slow, so only the first two encodings run under it.
    let limit = if cfg!(miri) { 2 } else { cases.len() };
    for &(encoding, table) in cases[..limit].iter() {
        encode_single_byte(encoding, table);
    }
}
|
||||
// END GENERATED CODE
|
||||
}
|
||||
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in.txt
vendored
Normal file
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in_ref.txt
vendored
Normal file
19787
zeroidc/vendor/encoding_rs/src/test_data/big5_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out.txt
vendored
Normal file
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out_ref.txt
vendored
Normal file
14601
zeroidc/vendor/encoding_rs/src/test_data/big5_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in_ref.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out.txt
vendored
Normal file
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out_ref.txt
vendored
Normal file
17053
zeroidc/vendor/encoding_rs/src/test_data/euc_kr_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in_ref.txt
vendored
Normal file
23945
zeroidc/vendor/encoding_rs/src/test_data/gb18030_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out.txt
vendored
Normal file
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out_ref.txt
vendored
Normal file
23944
zeroidc/vendor/encoding_rs/src/test_data/gb18030_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in_ref.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out.txt
vendored
Normal file
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out_ref.txt
vendored
Normal file
7404
zeroidc/vendor/encoding_rs/src/test_data/iso_2022_jp_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in_ref.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0208_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out.txt
vendored
Normal file
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out_ref.txt
vendored
Normal file
7341
zeroidc/vendor/encoding_rs/src/test_data/jis0208_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in_ref.txt
vendored
Normal file
8841
zeroidc/vendor/encoding_rs/src/test_data/jis0212_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in.txt
vendored
Normal file
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in_ref.txt
vendored
Normal file
11285
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_in_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out.txt
vendored
Normal file
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out_ref.txt
vendored
Normal file
7355
zeroidc/vendor/encoding_rs/src/test_data/shift_jis_out_ref.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
242
zeroidc/vendor/encoding_rs/src/test_labels_names.rs
vendored
Normal file
242
zeroidc/vendor/encoding_rs/src/test_labels_names.rs
vendored
Normal file
@@ -0,0 +1,242 @@
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
fn test_all_labels() {
    // Asserts that looking up `$label` yields exactly `$encoding`. A macro is
    // used (rather than a helper fn) so a failure still points at the line of
    // the offending label.
    macro_rules! expect_label {
        ($label:expr, $encoding:expr) => {
            assert_eq!(Encoding::for_label($label), Some($encoding));
        };
    }

    expect_label!(b"l1", WINDOWS_1252);
    expect_label!(b"l2", ISO_8859_2);
    expect_label!(b"l3", ISO_8859_3);
    expect_label!(b"l4", ISO_8859_4);
    expect_label!(b"l5", WINDOWS_1254);
    expect_label!(b"l6", ISO_8859_10);
    expect_label!(b"l9", ISO_8859_15);
    expect_label!(b"866", IBM866);
    expect_label!(b"mac", MACINTOSH);
    expect_label!(b"koi", KOI8_R);
    expect_label!(b"gbk", GBK);
    expect_label!(b"big5", BIG5);
    expect_label!(b"utf8", UTF_8);
    expect_label!(b"koi8", KOI8_R);
    expect_label!(b"sjis", SHIFT_JIS);
    expect_label!(b"ucs-2", UTF_16LE);
    expect_label!(b"ms932", SHIFT_JIS);
    expect_label!(b"cp866", IBM866);
    expect_label!(b"utf-8", UTF_8);
    expect_label!(b"cp819", WINDOWS_1252);
    expect_label!(b"ascii", WINDOWS_1252);
    expect_label!(b"x-gbk", GBK);
    expect_label!(b"greek", ISO_8859_7);
    expect_label!(b"cp1250", WINDOWS_1250);
    expect_label!(b"cp1251", WINDOWS_1251);
    expect_label!(b"latin1", WINDOWS_1252);
    expect_label!(b"gb2312", GBK);
    expect_label!(b"cp1252", WINDOWS_1252);
    expect_label!(b"latin2", ISO_8859_2);
    expect_label!(b"cp1253", WINDOWS_1253);
    expect_label!(b"latin3", ISO_8859_3);
    expect_label!(b"cp1254", WINDOWS_1254);
    expect_label!(b"latin4", ISO_8859_4);
    expect_label!(b"cp1255", WINDOWS_1255);
    expect_label!(b"csbig5", BIG5);
    expect_label!(b"latin5", WINDOWS_1254);
    expect_label!(b"utf-16", UTF_16LE);
    expect_label!(b"cp1256", WINDOWS_1256);
    expect_label!(b"ibm866", IBM866);
    expect_label!(b"latin6", ISO_8859_10);
    expect_label!(b"cp1257", WINDOWS_1257);
    expect_label!(b"cp1258", WINDOWS_1258);
    expect_label!(b"greek8", ISO_8859_7);
    expect_label!(b"ibm819", WINDOWS_1252);
    expect_label!(b"arabic", ISO_8859_6);
    expect_label!(b"visual", ISO_8859_8);
    expect_label!(b"korean", EUC_KR);
    expect_label!(b"euc-jp", EUC_JP);
    expect_label!(b"koi8-r", KOI8_R);
    expect_label!(b"koi8_r", KOI8_R);
    expect_label!(b"euc-kr", EUC_KR);
    expect_label!(b"x-sjis", SHIFT_JIS);
    expect_label!(b"koi8-u", KOI8_U);
    expect_label!(b"hebrew", ISO_8859_8);
    expect_label!(b"tis-620", WINDOWS_874);
    expect_label!(b"gb18030", GB18030);
    expect_label!(b"ksc5601", EUC_KR);
    expect_label!(b"gb_2312", GBK);
    expect_label!(b"dos-874", WINDOWS_874);
    expect_label!(b"cn-big5", BIG5);
    expect_label!(b"unicode", UTF_16LE);
    expect_label!(b"chinese", GBK);
    expect_label!(b"logical", ISO_8859_8_I);
    expect_label!(b"cskoi8r", KOI8_R);
    expect_label!(b"cseuckr", EUC_KR);
    expect_label!(b"koi8-ru", KOI8_U);
    expect_label!(b"x-cp1250", WINDOWS_1250);
    expect_label!(b"ksc_5601", EUC_KR);
    expect_label!(b"x-cp1251", WINDOWS_1251);
    expect_label!(b"iso88591", WINDOWS_1252);
    expect_label!(b"csgb2312", GBK);
    expect_label!(b"x-cp1252", WINDOWS_1252);
    expect_label!(b"iso88592", ISO_8859_2);
    expect_label!(b"x-cp1253", WINDOWS_1253);
    expect_label!(b"iso88593", ISO_8859_3);
    expect_label!(b"ecma-114", ISO_8859_6);
    expect_label!(b"x-cp1254", WINDOWS_1254);
    expect_label!(b"iso88594", ISO_8859_4);
    expect_label!(b"x-cp1255", WINDOWS_1255);
    expect_label!(b"iso88595", ISO_8859_5);
    expect_label!(b"x-x-big5", BIG5);
    expect_label!(b"x-cp1256", WINDOWS_1256);
    expect_label!(b"csibm866", IBM866);
    expect_label!(b"iso88596", ISO_8859_6);
    expect_label!(b"x-cp1257", WINDOWS_1257);
    expect_label!(b"iso88597", ISO_8859_7);
    expect_label!(b"asmo-708", ISO_8859_6);
    expect_label!(b"ecma-118", ISO_8859_7);
    expect_label!(b"elot_928", ISO_8859_7);
    expect_label!(b"x-cp1258", WINDOWS_1258);
    expect_label!(b"iso88598", ISO_8859_8);
    expect_label!(b"iso88599", WINDOWS_1254);
    expect_label!(b"cyrillic", ISO_8859_5);
    expect_label!(b"utf-16be", UTF_16BE);
    expect_label!(b"utf-16le", UTF_16LE);
    expect_label!(b"us-ascii", WINDOWS_1252);
    expect_label!(b"ms_kanji", SHIFT_JIS);
    expect_label!(b"x-euc-jp", EUC_JP);
    expect_label!(b"iso885910", ISO_8859_10);
    expect_label!(b"iso8859-1", WINDOWS_1252);
    expect_label!(b"iso885911", WINDOWS_874);
    expect_label!(b"iso8859-2", ISO_8859_2);
    expect_label!(b"iso8859-3", ISO_8859_3);
    expect_label!(b"iso885913", ISO_8859_13);
    expect_label!(b"iso8859-4", ISO_8859_4);
    expect_label!(b"iso885914", ISO_8859_14);
    expect_label!(b"iso8859-5", ISO_8859_5);
    expect_label!(b"iso885915", ISO_8859_15);
    expect_label!(b"iso8859-6", ISO_8859_6);
    expect_label!(b"iso8859-7", ISO_8859_7);
    expect_label!(b"iso8859-8", ISO_8859_8);
    expect_label!(b"iso-ir-58", GBK);
    expect_label!(b"iso8859-9", WINDOWS_1254);
    expect_label!(b"csunicode", UTF_16LE);
    expect_label!(b"macintosh", MACINTOSH);
    expect_label!(b"shift-jis", SHIFT_JIS);
    expect_label!(b"shift_jis", SHIFT_JIS);
    expect_label!(b"iso-ir-100", WINDOWS_1252);
    expect_label!(b"iso8859-10", ISO_8859_10);
    expect_label!(b"iso-ir-110", ISO_8859_4);
    expect_label!(b"gb_2312-80", GBK);
    expect_label!(b"iso-8859-1", WINDOWS_1252);
    expect_label!(b"iso_8859-1", WINDOWS_1252);
    expect_label!(b"iso-ir-101", ISO_8859_2);
    expect_label!(b"iso8859-11", WINDOWS_874);
    expect_label!(b"iso-8859-2", ISO_8859_2);
    expect_label!(b"iso_8859-2", ISO_8859_2);
    expect_label!(b"hz-gb-2312", REPLACEMENT);
    expect_label!(b"iso-8859-3", ISO_8859_3);
    expect_label!(b"iso_8859-3", ISO_8859_3);
    expect_label!(b"iso8859-13", ISO_8859_13);
    expect_label!(b"iso-8859-4", ISO_8859_4);
    expect_label!(b"iso_8859-4", ISO_8859_4);
    expect_label!(b"iso8859-14", ISO_8859_14);
    expect_label!(b"iso-ir-144", ISO_8859_5);
    expect_label!(b"iso-8859-5", ISO_8859_5);
    expect_label!(b"iso_8859-5", ISO_8859_5);
    expect_label!(b"iso8859-15", ISO_8859_15);
    expect_label!(b"iso-8859-6", ISO_8859_6);
    expect_label!(b"iso_8859-6", ISO_8859_6);
    expect_label!(b"iso-ir-126", ISO_8859_7);
    expect_label!(b"iso-8859-7", ISO_8859_7);
    expect_label!(b"iso_8859-7", ISO_8859_7);
    expect_label!(b"iso-ir-127", ISO_8859_6);
    expect_label!(b"iso-ir-157", ISO_8859_10);
    expect_label!(b"iso-8859-8", ISO_8859_8);
    expect_label!(b"iso_8859-8", ISO_8859_8);
    expect_label!(b"iso-ir-138", ISO_8859_8);
    expect_label!(b"iso-ir-148", WINDOWS_1254);
    expect_label!(b"iso-8859-9", WINDOWS_1254);
    expect_label!(b"iso_8859-9", WINDOWS_1254);
    expect_label!(b"iso-ir-109", ISO_8859_3);
    expect_label!(b"iso-ir-149", EUC_KR);
    expect_label!(b"big5-hkscs", BIG5);
    expect_label!(b"csshiftjis", SHIFT_JIS);
    expect_label!(b"iso-8859-10", ISO_8859_10);
    expect_label!(b"iso-8859-11", WINDOWS_874);
    expect_label!(b"csisolatin1", WINDOWS_1252);
    expect_label!(b"csisolatin2", ISO_8859_2);
    expect_label!(b"iso-8859-13", ISO_8859_13);
    expect_label!(b"csisolatin3", ISO_8859_3);
    expect_label!(b"iso-8859-14", ISO_8859_14);
    expect_label!(b"windows-874", WINDOWS_874);
    expect_label!(b"csisolatin4", ISO_8859_4);
    expect_label!(b"iso-8859-15", ISO_8859_15);
    expect_label!(b"iso_8859-15", ISO_8859_15);
    expect_label!(b"csisolatin5", WINDOWS_1254);
    expect_label!(b"iso-8859-16", ISO_8859_16);
    expect_label!(b"csisolatin6", ISO_8859_10);
    expect_label!(b"windows-949", EUC_KR);
    expect_label!(b"csisolatin9", ISO_8859_15);
    expect_label!(b"csiso88596e", ISO_8859_6);
    expect_label!(b"csiso88598e", ISO_8859_8);
    expect_label!(b"unicodefffe", UTF_16BE);
    expect_label!(b"unicodefeff", UTF_16LE);
    expect_label!(b"csmacintosh", MACINTOSH);
    expect_label!(b"csiso88596i", ISO_8859_6);
    expect_label!(b"csiso88598i", ISO_8859_8_I);
    expect_label!(b"windows-31j", SHIFT_JIS);
    expect_label!(b"x-mac-roman", MACINTOSH);
    expect_label!(b"iso-2022-cn", REPLACEMENT);
    expect_label!(b"iso-2022-jp", ISO_2022_JP);
    expect_label!(b"csiso2022jp", ISO_2022_JP);
    expect_label!(b"iso-2022-kr", REPLACEMENT);
    expect_label!(b"csiso2022kr", REPLACEMENT);
    expect_label!(b"replacement", REPLACEMENT);
    expect_label!(b"windows-1250", WINDOWS_1250);
    expect_label!(b"windows-1251", WINDOWS_1251);
    expect_label!(b"windows-1252", WINDOWS_1252);
    expect_label!(b"windows-1253", WINDOWS_1253);
    expect_label!(b"windows-1254", WINDOWS_1254);
    expect_label!(b"windows-1255", WINDOWS_1255);
    expect_label!(b"windows-1256", WINDOWS_1256);
    expect_label!(b"windows-1257", WINDOWS_1257);
    expect_label!(b"windows-1258", WINDOWS_1258);
    expect_label!(b"iso-8859-6-e", ISO_8859_6);
    expect_label!(b"iso-8859-8-e", ISO_8859_8);
    expect_label!(b"iso-8859-6-i", ISO_8859_6);
    expect_label!(b"iso-8859-8-i", ISO_8859_8_I);
    expect_label!(b"sun_eu_greek", ISO_8859_7);
    expect_label!(b"csksc56011987", EUC_KR);
    expect_label!(b"unicode20utf8", UTF_8);
    expect_label!(b"unicode11utf8", UTF_8);
    expect_label!(b"ks_c_5601-1987", EUC_KR);
    expect_label!(b"ansi_x3.4-1968", WINDOWS_1252);
    expect_label!(b"ks_c_5601-1989", EUC_KR);
    expect_label!(b"x-mac-cyrillic", X_MAC_CYRILLIC);
    expect_label!(b"x-user-defined", X_USER_DEFINED);
    expect_label!(b"csiso58gb231280", GBK);
    expect_label!(b"iso-10646-ucs-2", UTF_16LE);
    expect_label!(b"iso_8859-1:1987", WINDOWS_1252);
    expect_label!(b"iso_8859-2:1987", ISO_8859_2);
    expect_label!(b"iso_8859-6:1987", ISO_8859_6);
    expect_label!(b"iso_8859-7:1987", ISO_8859_7);
    expect_label!(b"iso_8859-3:1988", ISO_8859_3);
    expect_label!(b"iso_8859-4:1988", ISO_8859_4);
    expect_label!(b"iso_8859-5:1988", ISO_8859_5);
    expect_label!(b"iso_8859-8:1988", ISO_8859_8);
    expect_label!(b"x-unicode20utf8", UTF_8);
    expect_label!(b"iso_8859-9:1989", WINDOWS_1254);
    expect_label!(b"csisolatingreek", ISO_8859_7);
    expect_label!(b"x-mac-ukrainian", X_MAC_CYRILLIC);
    expect_label!(b"iso-2022-cn-ext", REPLACEMENT);
    expect_label!(b"csisolatinarabic", ISO_8859_6);
    expect_label!(b"csisolatinhebrew", ISO_8859_8);
    expect_label!(b"unicode-1-1-utf-8", UTF_8);
    expect_label!(b"csisolatincyrillic", ISO_8859_5);
    expect_label!(b"cseucpkdfmtjapanese", EUC_JP);
}
|
||||
262
zeroidc/vendor/encoding_rs/src/testing.rs
vendored
Normal file
262
zeroidc/vendor/encoding_rs/src/testing.rs
vendored
Normal file
@@ -0,0 +1,262 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
|
||||
pub fn decode(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let mut vec = Vec::with_capacity(bytes.len() + 32);
|
||||
let mut string = String::with_capacity(expect.len() + 32);
|
||||
let range = if cfg!(miri) {
|
||||
0usize..4usize
|
||||
} else {
|
||||
0usize..32usize
|
||||
};
|
||||
for i in range {
|
||||
vec.clear();
|
||||
string.clear();
|
||||
for j in 0usize..i {
|
||||
let c = 0x40u8 + (j as u8);
|
||||
vec.push(c);
|
||||
string.push(c as char);
|
||||
}
|
||||
vec.extend_from_slice(bytes);
|
||||
string.push_str(expect);
|
||||
decode_without_padding_impl(encoding, &vec[..], &string[..], i);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_without_padding(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
decode_without_padding_impl(encoding, bytes, expect, 0);
|
||||
}
|
||||
|
||||
fn decode_without_padding_impl(
|
||||
encoding: &'static Encoding,
|
||||
bytes: &[u8],
|
||||
expect: &str,
|
||||
padding: usize,
|
||||
) {
|
||||
decode_to_utf8_impl(encoding, bytes, expect, padding);
|
||||
decode_to_utf16_impl(encoding, bytes, &utf16_from_utf8(expect)[..], padding);
|
||||
decode_to_string(encoding, bytes, expect);
|
||||
}
|
||||
|
||||
pub fn encode(encoding: &'static Encoding, str: &str, expect: &[u8]) {
|
||||
let mut vec = Vec::with_capacity(expect.len() + 32);
|
||||
let mut string = String::with_capacity(str.len() + 32);
|
||||
let range = if cfg!(miri) {
|
||||
0usize..4usize
|
||||
} else {
|
||||
0usize..32usize
|
||||
};
|
||||
for i in range {
|
||||
vec.clear();
|
||||
string.clear();
|
||||
for j in 0usize..i {
|
||||
let c = 0x40u8 + (j as u8);
|
||||
vec.push(c);
|
||||
string.push(c as char);
|
||||
}
|
||||
vec.extend_from_slice(expect);
|
||||
string.push_str(str);
|
||||
encode_without_padding(encoding, &string[..], &vec[..]);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_without_padding(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
encode_from_utf8(encoding, string, expect);
|
||||
encode_from_utf16(encoding, &utf16_from_utf8(string)[..], expect);
|
||||
encode_to_vec(encoding, string, expect);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16(encoding: &'static Encoding, bytes: &[u8], expect: &[u16]) {
|
||||
decode_to_utf16_impl(encoding, bytes, expect, 0);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_impl(
|
||||
encoding: &'static Encoding,
|
||||
bytes: &[u8],
|
||||
expect: &[u16],
|
||||
padding: usize,
|
||||
) {
|
||||
for i in padding..bytes.len() {
|
||||
let (head, tail) = bytes.split_at(i);
|
||||
decode_to_utf16_with_boundary(encoding, head, tail, expect);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_with_boundary(
|
||||
encoding: &'static Encoding,
|
||||
head: &[u8],
|
||||
tail: &[u8],
|
||||
expect: &[u16],
|
||||
) {
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let mut dest: Vec<u16> = Vec::with_capacity(
|
||||
decoder
|
||||
.max_utf16_buffer_length(head.len() + tail.len())
|
||||
.unwrap(),
|
||||
);
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u16);
|
||||
let mut total_read = 0;
|
||||
let mut total_written = 0;
|
||||
{
|
||||
let (complete, read, written, _) = decoder.decode_to_utf16(head, &mut dest, false);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
{
|
||||
let (complete, read, written, _) =
|
||||
decoder.decode_to_utf16(tail, &mut dest[total_written..], true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
assert_eq!(total_read, head.len() + tail.len());
|
||||
assert_eq!(total_written, expect.len());
|
||||
dest.truncate(total_written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
decode_to_utf8_impl(encoding, bytes, expect, 0);
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_impl(
|
||||
encoding: &'static Encoding,
|
||||
bytes: &[u8],
|
||||
expect: &str,
|
||||
padding: usize,
|
||||
) {
|
||||
for i in padding..bytes.len() {
|
||||
let (head, tail) = bytes.split_at(i);
|
||||
decode_to_utf8_with_boundary(encoding, head, tail, expect);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_with_boundary(
|
||||
encoding: &'static Encoding,
|
||||
head: &[u8],
|
||||
tail: &[u8],
|
||||
expect: &str,
|
||||
) {
|
||||
let mut decoder = encoding.new_decoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(
|
||||
decoder
|
||||
.max_utf8_buffer_length(head.len() + tail.len())
|
||||
.unwrap(),
|
||||
);
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let mut total_read = 0;
|
||||
let mut total_written = 0;
|
||||
{
|
||||
let (complete, read, written, _) = decoder.decode_to_utf8(head, &mut dest, false);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
{
|
||||
let (complete, read, written, _) =
|
||||
decoder.decode_to_utf8(tail, &mut dest[total_written..], true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
total_read += read;
|
||||
total_written += written;
|
||||
}
|
||||
assert_eq!(total_read, head.len() + tail.len());
|
||||
assert_eq!(total_written, expect.len());
|
||||
dest.truncate(total_written);
|
||||
assert_eq!(&dest[..], expect.as_bytes());
|
||||
}
|
||||
|
||||
pub fn decode_to_string(encoding: &'static Encoding, bytes: &[u8], expect: &str) {
|
||||
let (cow, _, _) = encoding.decode(bytes);
|
||||
assert_eq!(&cow[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = encoder.encode_from_utf8(string, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, string.len());
|
||||
assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_from_utf16(encoding: &'static Encoding, string: &[u16], expect: &[u8]) {
|
||||
let mut encoder = encoding.new_encoder();
|
||||
let mut dest: Vec<u8> = Vec::with_capacity(10 * (string.len() + 1)); // 10 is replacement worst case
|
||||
let capacity = dest.capacity();
|
||||
dest.resize(capacity, 0u8);
|
||||
let (complete, read, written, _) = encoder.encode_from_utf16(string, &mut dest, true);
|
||||
match complete {
|
||||
CoderResult::InputEmpty => {}
|
||||
CoderResult::OutputFull => {
|
||||
unreachable!();
|
||||
}
|
||||
}
|
||||
assert_eq!(read, string.len());
|
||||
// assert_eq!(written, expect.len());
|
||||
dest.truncate(written);
|
||||
assert_eq!(&dest[..], expect);
|
||||
}
|
||||
|
||||
pub fn encode_to_vec(encoding: &'static Encoding, string: &str, expect: &[u8]) {
|
||||
let (cow, _, _) = encoding.encode(string);
|
||||
assert_eq!(&cow[..], expect);
|
||||
}
|
||||
|
||||
pub fn utf16_from_utf8(string: &str) -> Vec<u16> {
|
||||
let mut decoder = UTF_8.new_decoder_without_bom_handling();
|
||||
let mut vec = Vec::with_capacity(decoder.max_utf16_buffer_length(string.len()).unwrap());
|
||||
let capacity = vec.capacity();
|
||||
vec.resize(capacity, 0);
|
||||
|
||||
let (result, read, written) =
|
||||
decoder.decode_to_utf16_without_replacement(string.as_bytes(), &mut vec[..], true);
|
||||
match result {
|
||||
DecoderResult::InputEmpty => {
|
||||
debug_assert_eq!(read, string.len());
|
||||
vec.resize(written, 0);
|
||||
vec
|
||||
}
|
||||
DecoderResult::Malformed(_, _) => unreachable!("Malformed"),
|
||||
DecoderResult::OutputFull => unreachable!("Output full"),
|
||||
}
|
||||
}
|
||||
472
zeroidc/vendor/encoding_rs/src/utf_16.rs
vendored
Normal file
472
zeroidc/vendor/encoding_rs/src/utf_16.rs
vendored
Normal file
@@ -0,0 +1,472 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
|
||||
/// Streaming state of the UTF-16 decoder (both byte orders).
pub struct Utf16Decoder {
    /// If non-zero and `pending_bmp` is `false`, a pending lead surrogate.
    /// If `pending_bmp` is `true`, this is actually a pending BMP code unit.
    lead_surrogate: u16,
    /// First byte of a code unit whose second byte has not arrived yet.
    lead_byte: Option<u8>,
    /// `true` when decoding big-endian input, `false` for little-endian.
    be: bool,
    /// When `true`, `lead_surrogate` holds a BMP code unit that is written
    /// out at the start of the next decode call.
    pending_bmp: bool,
}
|
||||
|
||||
impl Utf16Decoder {
|
||||
pub fn new(big_endian: bool) -> VariantDecoder {
|
||||
VariantDecoder::Utf16(Utf16Decoder {
|
||||
lead_surrogate: 0,
|
||||
lead_byte: None,
|
||||
be: big_endian,
|
||||
pending_bmp: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn additional_from_state(&self) -> usize {
|
||||
1 + if self.lead_byte.is_some() { 1 } else { 0 }
|
||||
+ if self.lead_surrogate == 0 { 0 } else { 2 }
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_mul(
|
||||
3,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
checked_add(
|
||||
1,
|
||||
checked_mul(
|
||||
3,
|
||||
checked_div(byte_length.checked_add(self.additional_from_state()), 2),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
decoder_functions!(
|
||||
{
|
||||
if self.pending_bmp {
|
||||
match dest.check_space_bmp() {
|
||||
Space::Full(_) => {
|
||||
return (DecoderResult::OutputFull, 0, 0);
|
||||
}
|
||||
Space::Available(destination_handle) => {
|
||||
destination_handle.write_bmp(self.lead_surrogate);
|
||||
self.pending_bmp = false;
|
||||
self.lead_surrogate = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
// This is the fast path. The rest runs only at the
|
||||
// start and end for partial sequences.
|
||||
if self.lead_byte.is_none() && self.lead_surrogate == 0 {
|
||||
if let Some((read, written)) = if self.be {
|
||||
dest.copy_utf16_from::<BigEndian>(&mut source)
|
||||
} else {
|
||||
dest.copy_utf16_from::<LittleEndian>(&mut source)
|
||||
} {
|
||||
return (DecoderResult::Malformed(2, 0), read, written);
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
debug_assert!(!self.pending_bmp);
|
||||
if self.lead_surrogate != 0 || self.lead_byte.is_some() {
|
||||
// We need to check space without intent to write in order to
|
||||
// make sure that there is space for the replacement character.
|
||||
match dest.check_space_bmp() {
|
||||
Space::Full(_) => {
|
||||
return (DecoderResult::OutputFull, 0, 0);
|
||||
}
|
||||
Space::Available(_) => {
|
||||
if self.lead_surrogate != 0 {
|
||||
self.lead_surrogate = 0;
|
||||
match self.lead_byte {
|
||||
None => {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
src_consumed,
|
||||
dest.written(),
|
||||
);
|
||||
}
|
||||
Some(_) => {
|
||||
self.lead_byte = None;
|
||||
return (
|
||||
DecoderResult::Malformed(3, 0),
|
||||
src_consumed,
|
||||
dest.written(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
debug_assert!(self.lead_byte.is_some());
|
||||
self.lead_byte = None;
|
||||
return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
match self.lead_byte {
|
||||
None => {
|
||||
self.lead_byte = Some(b);
|
||||
continue;
|
||||
}
|
||||
Some(lead) => {
|
||||
self.lead_byte = None;
|
||||
let code_unit = if self.be {
|
||||
u16::from(lead) << 8 | u16::from(b)
|
||||
} else {
|
||||
u16::from(b) << 8 | u16::from(lead)
|
||||
};
|
||||
let high_bits = code_unit & 0xFC00u16;
|
||||
if high_bits == 0xD800u16 {
|
||||
// high surrogate
|
||||
if self.lead_surrogate != 0 {
|
||||
// The previous high surrogate was in
|
||||
// error and this one becomes the new
|
||||
// pending one.
|
||||
self.lead_surrogate = code_unit as u16;
|
||||
return (
|
||||
DecoderResult::Malformed(2, 2),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
self.lead_surrogate = code_unit;
|
||||
continue;
|
||||
}
|
||||
if high_bits == 0xDC00u16 {
|
||||
// low surrogate
|
||||
if self.lead_surrogate == 0 {
|
||||
return (
|
||||
DecoderResult::Malformed(2, 0),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
|
||||
self.lead_surrogate = 0;
|
||||
continue;
|
||||
}
|
||||
// bmp
|
||||
if self.lead_surrogate != 0 {
|
||||
// The previous high surrogate was in
|
||||
// error and this code unit becomes a
|
||||
// pending BMP character.
|
||||
self.lead_surrogate = code_unit;
|
||||
self.pending_bmp = true;
|
||||
return (
|
||||
DecoderResult::Malformed(2, 2),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_bmp(code_unit);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
dest,
|
||||
source,
|
||||
b,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_astral
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    fn decode_utf_16le(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16LE, bytes, expect);
    }

    fn decode_utf_16be(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16BE, bytes, expect);
    }

    fn encode_utf_16le(string: &str, expect: &[u8]) {
        encode(UTF_16LE, string, expect);
    }

    fn encode_utf_16be(string: &str, expect: &[u8]) {
        encode(UTF_16BE, string, expect);
    }

    /// Runs the UTF-16LE check on `le` and then the UTF-16BE check on `be`,
    /// both against the same expected text.
    fn decode_both(le: &[u8], be: &[u8], expect: &str) {
        decode_utf_16le(le, expect);
        decode_utf_16be(be, expect);
    }

    /// Feeds `input` to a fresh decoder in chunks of exactly `chunk_len`
    /// bytes and asserts that every chunk is fully consumed without errors.
    fn check_chunked_decode(encoding: &'static Encoding, input: &[u8], chunk_len: usize) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();
        for chunk in input.chunks(chunk_len) {
            assert_eq!(chunk.len(), chunk_len);
            let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (result, read, _, had_errors) =
                decoder.decode_to_utf16(chunk, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, chunk.len());
            assert!(!had_errors);
        }
    }

    /// Feeds two single bytes in separate calls (the second one with
    /// `last == true`) and asserts that together they decode to the single
    /// code unit `expected` without errors.
    fn check_split_byte_pair(
        encoding: &'static Encoding,
        first: &[u8],
        second: &[u8],
        expected: u16,
    ) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(first, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
        }
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(second, &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(!had_errors);
            assert_eq!(output[0], expected);
        }
    }

    /// Feeds a single byte as the entire input (`last == true`) and asserts
    /// that it is replaced by U+FFFD with the error flag set.
    fn check_lone_byte_is_replaced(encoding: &'static Encoding, byte: &[u8]) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(byte, &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(had_errors);
        assert_eq!(output[0], 0xFFFD);
    }

    /// Decodes to UTF-8 into a four-byte buffer: `first` (one byte) must
    /// produce nothing, then `rest` must stop with `OutputFull` after one
    /// byte read and the three bytes E2 98 83 written.
    fn check_decode_near_end(encoding: &'static Encoding, first: &[u8], rest: &[u8]) {
        let mut output = [0u8; 4];
        let mut decoder = encoding.new_decoder();
        {
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(first, &mut output[..], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
            assert_eq!(output[0], 0x0);
        }
        {
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(rest, &mut output[..], false);
            assert_eq!(result, CoderResult::OutputFull);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            assert!(!had_errors);
            assert_eq!(output, [0xE2u8, 0x98, 0x83, 0x00]);
        }
    }

    #[test]
    fn test_utf_16_decode() {
        decode_both(b"", b"", "");

        decode_both(b"\x61\x00\x62\x00", b"\x00\x61\x00\x62", "\u{0061}\u{0062}");

        // Each input starts with the BOM of the opposite byte order.
        decode_both(
            b"\xFE\xFF\x00\x61\x00\x62",
            b"\xFF\xFE\x61\x00\x62\x00",
            "\u{0061}\u{0062}",
        );

        decode_both(b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}");

        decode_both(b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}");

        decode_both(
            b"\x3D\xD8\xA9\xDC\x03\x26",
            b"\xD8\x3D\xDC\xA9\x26\x03",
            "\u{1F4A9}\u{2603}",
        );

        decode_both(b"\xA9\xDC\x03\x26", b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");

        decode_both(b"\x3D\xD8\x03\x26", b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");

        // The \xFF makes sure that the parts before and after have different alignment
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
        let le_half = long_le.len() / 2;
        let be_half = long_be.len() / 2;
        decode_both(&long_le[..le_half], &long_be[..be_half], long_expect);
        decode_both(&long_le[le_half + 1..], &long_be[be_half + 1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_utf_16le(text, text.as_bytes());
        encode_utf_16be(text, text.as_bytes());
    }

    #[test]
    fn test_utf_16be_decode_one_by_one() {
        check_chunked_decode(UTF_16BE, b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9", 1);
    }

    #[test]
    fn test_utf_16le_decode_one_by_one() {
        check_chunked_decode(UTF_16LE, b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC", 1);
    }

    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        check_chunked_decode(
            UTF_16BE,
            b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4",
            3,
        );
    }

    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        check_chunked_decode(
            UTF_16LE,
            b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00",
            3,
        );
    }

    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        check_split_byte_pair(UTF_16LE, b"\xFF", b"\xFD", 0xFDFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        check_split_byte_pair(UTF_16BE, b"\xFE", b"\xFD", 0xFEFD);
    }

    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        check_lone_byte_is_replaced(UTF_16LE, b"\xFF");
    }

    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        check_lone_byte_is_replaced(UTF_16BE, b"\xFE");
    }

    #[test]
    fn test_utf_16le_decode_near_end() {
        check_decode_near_end(UTF_16LE, &[0x03], &[0x26, 0x03, 0x26]);
    }

    #[test]
    fn test_utf_16be_decode_near_end() {
        check_decode_near_end(UTF_16BE, &[0x26], &[0x03, 0x26, 0x03]);
    }
}
|
||||
1631
zeroidc/vendor/encoding_rs/src/utf_8.rs
vendored
Normal file
1631
zeroidc/vendor/encoding_rs/src/utf_8.rs
vendored
Normal file
File diff suppressed because it is too large
Load Diff
400
zeroidc/vendor/encoding_rs/src/variant.rs
vendored
Normal file
400
zeroidc/vendor/encoding_rs/src/variant.rs
vendored
Normal file
@@ -0,0 +1,400 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
|
||||
// Instead, please regenerate using generate-encoding-data.py
|
||||
|
||||
//! This module provides enums that wrap the various decoders and encoders.
|
||||
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
|
||||
//! dispatch explicitly for a finite set of specialized decoders and encoders.
|
||||
//! Unfortunately, this means the compiler doesn't generate the dispatch code
|
||||
//! and it has to be written here instead.
|
||||
//!
|
||||
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
|
||||
//! allocation in Rust code, including the convenience methods on `Encoding`.
|
||||
|
||||
use super::*;
|
||||
use big5::*;
|
||||
use euc_jp::*;
|
||||
use euc_kr::*;
|
||||
use gb18030::*;
|
||||
use iso_2022_jp::*;
|
||||
use replacement::*;
|
||||
use shift_jis::*;
|
||||
use single_byte::*;
|
||||
use utf_16::*;
|
||||
use utf_8::*;
|
||||
use x_user_defined::*;
|
||||
|
||||
pub enum VariantDecoder {
|
||||
SingleByte(SingleByteDecoder),
|
||||
Utf8(Utf8Decoder),
|
||||
Gb18030(Gb18030Decoder),
|
||||
Big5(Big5Decoder),
|
||||
EucJp(EucJpDecoder),
|
||||
Iso2022Jp(Iso2022JpDecoder),
|
||||
ShiftJis(ShiftJisDecoder),
|
||||
EucKr(EucKrDecoder),
|
||||
Replacement(ReplacementDecoder),
|
||||
UserDefined(UserDefinedDecoder),
|
||||
Utf16(Utf16Decoder),
|
||||
}
|
||||
|
||||
impl VariantDecoder {
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Utf8(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Gb18030(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Big5(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::EucJp(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Iso2022Jp(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::ShiftJis(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::EucKr(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Replacement(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::UserDefined(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
VariantDecoder::Utf16(ref v) => v.max_utf16_buffer_length(byte_length),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Utf8(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Gb18030(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Big5(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::EucJp(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Iso2022Jp(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::ShiftJis(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::EucKr(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Replacement(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::UserDefined(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
VariantDecoder::Utf16(ref v) => {
|
||||
v.max_utf8_buffer_length_without_replacement(byte_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Utf8(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Gb18030(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Big5(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::EucJp(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Iso2022Jp(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::ShiftJis(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::EucKr(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Replacement(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::UserDefined(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
VariantDecoder::Utf16(ref v) => v.max_utf8_buffer_length(byte_length),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Big5(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf16_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_to_utf8_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u8],
|
||||
last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Utf8(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Gb18030(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Big5(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::EucJp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Iso2022Jp(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::ShiftJis(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::EucKr(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Replacement(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::UserDefined(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
VariantDecoder::Utf16(ref mut v) => v.decode_to_utf8_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
|
||||
match *self {
|
||||
VariantDecoder::SingleByte(ref v) => {
|
||||
return Some(v.latin1_byte_compatible_up_to(buffer));
|
||||
}
|
||||
VariantDecoder::Utf8(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::Gb18030(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::Big5(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::EucJp(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::Iso2022Jp(ref v) => {
|
||||
if v.in_neutral_state() {
|
||||
return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
|
||||
}
|
||||
return None;
|
||||
}
|
||||
VariantDecoder::ShiftJis(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::EucKr(ref v) => {
|
||||
if !v.in_neutral_state() {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
VariantDecoder::UserDefined(_) => {}
|
||||
VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
Some(Encoding::ascii_valid_up_to(buffer))
|
||||
}
|
||||
}
|
||||
|
||||
pub enum VariantEncoder {
|
||||
SingleByte(SingleByteEncoder),
|
||||
Utf8(Utf8Encoder),
|
||||
Gb18030(Gb18030Encoder),
|
||||
Big5(Big5Encoder),
|
||||
EucJp(EucJpEncoder),
|
||||
Iso2022Jp(Iso2022JpEncoder),
|
||||
ShiftJis(ShiftJisEncoder),
|
||||
EucKr(EucKrEncoder),
|
||||
UserDefined(UserDefinedEncoder),
|
||||
}
|
||||
|
||||
impl VariantEncoder {
|
||||
pub fn has_pending_state(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoder::Iso2022Jp(ref v) => v.has_pending_state(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Utf8(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Gb18030(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Big5(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::EucJp(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::Iso2022Jp(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::ShiftJis(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::EucKr(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
VariantEncoder::UserDefined(ref v) => {
|
||||
v.max_buffer_length_from_utf16_without_replacement(u16_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Utf8(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Gb18030(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Big5(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::EucJp(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::Iso2022Jp(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::ShiftJis(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::EucKr(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
VariantEncoder::UserDefined(ref v) => {
|
||||
v.max_buffer_length_from_utf8_without_replacement(byte_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_from_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u16],
|
||||
dst: &mut [u8],
|
||||
last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Big5(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf16_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_from_utf8_raw(
|
||||
&mut self,
|
||||
src: &str,
|
||||
dst: &mut [u8],
|
||||
last: bool,
|
||||
) -> (EncoderResult, usize, usize) {
|
||||
match *self {
|
||||
VariantEncoder::SingleByte(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Utf8(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Gb18030(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Big5(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::EucJp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::Iso2022Jp(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::ShiftJis(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::EucKr(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
VariantEncoder::UserDefined(ref mut v) => v.encode_from_utf8_raw(src, dst, last),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum VariantEncoding {
|
||||
SingleByte(&'static [u16; 128], u16, u8, u8),
|
||||
Utf8,
|
||||
Gbk,
|
||||
Gb18030,
|
||||
Big5,
|
||||
EucJp,
|
||||
Iso2022Jp,
|
||||
ShiftJis,
|
||||
EucKr,
|
||||
Replacement,
|
||||
Utf16Be,
|
||||
Utf16Le,
|
||||
UserDefined,
|
||||
}
|
||||
|
||||
impl VariantEncoding {
|
||||
pub fn new_variant_decoder(&self) -> VariantDecoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
|
||||
VariantEncoding::Utf8 => Utf8Decoder::new(),
|
||||
VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
|
||||
VariantEncoding::Big5 => Big5Decoder::new(),
|
||||
VariantEncoding::EucJp => EucJpDecoder::new(),
|
||||
VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
|
||||
VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
|
||||
VariantEncoding::EucKr => EucKrDecoder::new(),
|
||||
VariantEncoding::Replacement => ReplacementDecoder::new(),
|
||||
VariantEncoding::UserDefined => UserDefinedDecoder::new(),
|
||||
VariantEncoding::Utf16Be => Utf16Decoder::new(true),
|
||||
VariantEncoding::Utf16Le => Utf16Decoder::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => {
|
||||
SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length)
|
||||
}
|
||||
VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
|
||||
VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
|
||||
VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
|
||||
VariantEncoding::Big5 => Big5Encoder::new(encoding),
|
||||
VariantEncoding::EucJp => EucJpEncoder::new(encoding),
|
||||
VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
|
||||
VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
|
||||
VariantEncoding::EucKr => EucKrEncoder::new(encoding),
|
||||
VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
|
||||
VariantEncoding::Utf16Be | VariantEncoding::Replacement | VariantEncoding::Utf16Le => {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_single_byte(&self) -> bool {
|
||||
match *self {
|
||||
VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
249
zeroidc/vendor/encoding_rs/src/x_user_defined.rs
vendored
Normal file
249
zeroidc/vendor/encoding_rs/src/x_user_defined.rs
vendored
Normal file
@@ -0,0 +1,249 @@
|
||||
// Copyright Mozilla Foundation. See the COPYRIGHT
|
||||
// file at the top-level directory of this distribution.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
||||
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
||||
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
||||
// option. This file may not be copied, modified, or distributed
|
||||
// except according to those terms.
|
||||
|
||||
use super::*;
|
||||
use crate::handles::*;
|
||||
use crate::variant::*;
|
||||
|
||||
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use simd_funcs::*;
        use packed_simd::u16x8;

        // Adds 0xF700 to every lane whose value is greater than 0x7F and
        // leaves the remaining lanes unchanged. This is the vector form of
        // the per-byte mapping used by the scalar decode paths in this file.
        #[inline(always)]
        fn shift_upper(unpacked: u16x8) -> u16x8 {
            let ascii_max = u16x8::splat(0x7F);
            let above_ascii = unpacked.gt(ascii_max);
            let offset = above_ascii.select(u16x8::splat(0xF700), u16x8::splat(0));
            unpacked + offset
        }
    } else {
    }
}
|
||||
|
||||
pub struct UserDefinedDecoder;
|
||||
|
||||
impl UserDefinedDecoder {
|
||||
pub fn new() -> VariantDecoder {
|
||||
VariantDecoder::UserDefined(UserDefinedDecoder)
|
||||
}
|
||||
|
||||
pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
|
||||
byte_length.checked_mul(3)
|
||||
}
|
||||
|
||||
decoder_function!(
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{
|
||||
if b < 0x80 {
|
||||
// ASCII run not optimized, because binary data expected
|
||||
destination_handle.write_ascii(b);
|
||||
continue;
|
||||
}
|
||||
destination_handle.write_upper_bmp(u16::from(b) + 0xF700);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
dest,
|
||||
source,
|
||||
b,
|
||||
destination_handle,
|
||||
_unread_handle,
|
||||
check_space_bmp,
|
||||
decode_to_utf8_raw,
|
||||
u8,
|
||||
Utf8Destination
|
||||
);
|
||||
|
||||
#[cfg(not(feature = "simd-accel"))]
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
let src_trim = &src[..length];
|
||||
let dst_trim = &mut dst[..length];
|
||||
src_trim
|
||||
.iter()
|
||||
.zip(dst_trim.iter_mut())
|
||||
.for_each(|(from, to)| {
|
||||
*to = {
|
||||
let unit = *from;
|
||||
if unit < 0x80 {
|
||||
u16::from(unit)
|
||||
} else {
|
||||
u16::from(unit) + 0xF700
|
||||
}
|
||||
}
|
||||
});
|
||||
(pending, length, length)
|
||||
}
|
||||
|
||||
#[cfg(feature = "simd-accel")]
|
||||
pub fn decode_to_utf16_raw(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
dst: &mut [u16],
|
||||
_last: bool,
|
||||
) -> (DecoderResult, usize, usize) {
|
||||
let (pending, length) = if dst.len() < src.len() {
|
||||
(DecoderResult::OutputFull, dst.len())
|
||||
} else {
|
||||
(DecoderResult::InputEmpty, src.len())
|
||||
};
|
||||
// Not bothering with alignment
|
||||
let tail_start = length & !0xF;
|
||||
let simd_iterations = length >> 4;
|
||||
let src_ptr = src.as_ptr();
|
||||
let dst_ptr = dst.as_mut_ptr();
|
||||
for i in 0..simd_iterations {
|
||||
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
|
||||
let (first, second) = simd_unpack(input);
|
||||
unsafe {
|
||||
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
|
||||
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
|
||||
}
|
||||
}
|
||||
let src_tail = &src[tail_start..length];
|
||||
let dst_tail = &mut dst[tail_start..length];
|
||||
src_tail
|
||||
.iter()
|
||||
.zip(dst_tail.iter_mut())
|
||||
.for_each(|(from, to)| {
|
||||
*to = {
|
||||
let unit = *from;
|
||||
if unit < 0x80 {
|
||||
u16::from(unit)
|
||||
} else {
|
||||
u16::from(unit) + 0xF700
|
||||
}
|
||||
}
|
||||
});
|
||||
(pending, length, length)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct UserDefinedEncoder;
|
||||
|
||||
impl UserDefinedEncoder {
|
||||
pub fn new(encoding: &'static Encoding) -> Encoder {
|
||||
Encoder::new(encoding, VariantEncoder::UserDefined(UserDefinedEncoder))
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf16_without_replacement(
|
||||
&self,
|
||||
u16_length: usize,
|
||||
) -> Option<usize> {
|
||||
Some(u16_length)
|
||||
}
|
||||
|
||||
pub fn max_buffer_length_from_utf8_without_replacement(
|
||||
&self,
|
||||
byte_length: usize,
|
||||
) -> Option<usize> {
|
||||
Some(byte_length)
|
||||
}
|
||||
|
||||
encoder_functions!(
|
||||
{},
|
||||
{
|
||||
if c <= '\u{7F}' {
|
||||
// TODO optimize ASCII run
|
||||
destination_handle.write_one(c as u8);
|
||||
continue;
|
||||
}
|
||||
if c < '\u{F780}' || c > '\u{F7FF}' {
|
||||
return (
|
||||
EncoderResult::Unmappable(c),
|
||||
unread_handle.consumed(),
|
||||
destination_handle.written(),
|
||||
);
|
||||
}
|
||||
destination_handle.write_one((u32::from(c) - 0xF700) as u8);
|
||||
continue;
|
||||
},
|
||||
self,
|
||||
src_consumed,
|
||||
source,
|
||||
dest,
|
||||
c,
|
||||
destination_handle,
|
||||
unread_handle,
|
||||
check_space_one
|
||||
);
|
||||
}
|
||||
|
||||
// Any copyright to the test code below this comment is dedicated to the
|
||||
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as x-user-defined and checks the result against `expect`.
    fn decode_x_user_defined(bytes: &[u8], expect: &str) {
        decode(X_USER_DEFINED, bytes, expect);
    }

    /// Encodes `string` as x-user-defined and checks the result against `expect`.
    fn encode_x_user_defined(string: &str, expect: &[u8]) {
        encode(X_USER_DEFINED, string, expect);
    }

    #[test]
    fn test_x_user_defined_decode() {
        // Empty
        decode_x_user_defined(b"", "");

        // ASCII
        decode_x_user_defined(b"\x61\x62", "\u{0061}\u{0062}");

        // Both ends of the non-ASCII byte range
        decode_x_user_defined(b"\x80\xFF", "\u{F780}\u{F7FF}");
        // More than 16 bytes, so a SIMD build runs both the vector loop and the tail
        decode_x_user_defined(b"\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62\x80\xFF\x61\x62", "\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}\u{F780}\u{F7FF}\u{0061}\u{0062}");
    }

    #[test]
    fn test_x_user_defined_encode() {
        // Empty
        encode_x_user_defined("", b"");

        // ASCII
        encode_x_user_defined("\u{0061}\u{0062}", b"\x61\x62");

        // Both ends of the mapped range
        encode_x_user_defined("\u{F780}\u{F7FF}", b"\x80\xFF");
        // One below and one above the mapped range: both are unmappable and
        // are expected to come out as decimal numeric character references
        // (0xF77F = 63359, 0xF800 = 63488).
        // NOTE(review): assumes the `encode` test helper encodes with
        // replacement - confirm against the `testing` module.
        encode_x_user_defined("\u{F77F}\u{F800}", b"&#63359;&#63488;");
    }

    #[test]
    fn test_x_user_defined_from_two_low_surrogates() {
        // Each unpaired surrogate is treated as U+FFFD (65533), which is
        // unmappable here and therefore replaced by a numeric character
        // reference. Byte string literals must be ASCII, so the expectation
        // is spelled out as the reference text itself.
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = X_USER_DEFINED.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
|
||||
Reference in New Issue
Block a user