Added support for char encoding/decoding

This commit is contained in:
Victor Koenders 2021-10-14 19:28:28 +02:00
parent b480d2b3b3
commit c4cb220fb2
8 changed files with 126 additions and 1 deletions

View File

@ -231,4 +231,51 @@ impl<'a, 'de, R: Reader<'de>, C: Config> Decode for &'a mut Decoder<R, C> {
self.reader.read(&mut array)?;
Ok(array)
}
/// Decode a single UTF-8 encoded `char` from the reader.
///
/// The first byte is read on its own to find out how long the encoded sequence is; any
/// remaining bytes are read afterwards and the complete sequence is validated.
///
/// Returns `DecodeError::InvalidCharEncoding`, carrying the bytes read so far, when the
/// first byte cannot start a sequence or the complete sequence is not valid UTF-8. Errors
/// reported by the reader are passed through unchanged.
fn decode_char(&mut self) -> Result<char, DecodeError> {
    let mut bytes = [0u8; 4];

    // The leading byte tells us how many bytes the sequence occupies.
    self.reader.read(&mut bytes[..1])?;

    match utf8_char_width(bytes[0]) {
        // This byte cannot start a UTF-8 sequence.
        0 => Err(DecodeError::InvalidCharEncoding(bytes)),
        // Single-byte sequence: nothing more to read.
        1 => Ok(bytes[0] as char),
        len => {
            // Read the remaining part of the sequence.
            self.reader.read(&mut bytes[1..len])?;
            match core::str::from_utf8(&bytes[..len]) {
                Ok(text) => text
                    .chars()
                    .next()
                    .ok_or(DecodeError::InvalidCharEncoding(bytes)),
                Err(_) => Err(DecodeError::InvalidCharEncoding(bytes)),
            }
        }
    }
}
}
/// Length in bytes of a UTF-8 sequence, indexed by the value of its first byte.
///
/// An entry of `0` marks a byte that cannot start a sequence.
const UTF8_CHAR_WIDTH: [u8; 256] = {
    let mut table = [0u8; 256];
    let mut byte = 0;
    while byte < 256 {
        table[byte] = match byte {
            // One-byte (ASCII) sequences.
            0x00..=0x7F => 1,
            // Lead bytes of two-byte sequences.
            0xC2..=0xDF => 2,
            // Lead bytes of three-byte sequences.
            0xE0..=0xEF => 3,
            // Lead bytes of four-byte sequences.
            0xF0..=0xF4 => 4,
            // 0x80..=0xC1 and 0xF5..=0xFF never start a sequence.
            _ => 0,
        };
        byte += 1;
    }
    table
};
/// Look up how many bytes the UTF-8 sequence starting with byte `b` occupies.
///
/// The value comes straight from `UTF8_CHAR_WIDTH`; `0` means `b` cannot start a sequence.
// This function is a copy of core::str::utf8_char_width
const fn utf8_char_width(b: u8) -> usize {
    let index = b as usize;
    let width = UTF8_CHAR_WIDTH[index];
    width as usize
}

View File

@ -85,6 +85,12 @@ impl<'de> Decodable for f64 {
}
}
/// A `char` is decoded by handing the work to the decoder's `decode_char`.
impl<'de> Decodable for char {
    fn decode<D: Decode>(mut decoder: D) -> Result<Self, DecodeError> {
        D::decode_char(&mut decoder)
    }
}
impl<'a, 'de: 'a> BorrowDecodable<'de> for &'a [u8] {
fn borrow_decode<D: BorrowDecode<'de>>(mut decoder: D) -> Result<Self, DecodeError> {
let len = usize::decode(&mut decoder)?;
@ -174,6 +180,10 @@ where
fn decode_array<const N: usize>(&mut self) -> Result<[u8; N], DecodeError> {
T::decode_array::<N>(self)
}
fn decode_char(&mut self) -> Result<char, DecodeError> {
T::decode_char(self)
}
}
impl<'a, 'de, T> BorrowDecode<'de> for &'a mut T

View File

@ -66,6 +66,9 @@ pub trait Decode {
fn decode_f64(&mut self) -> Result<f64, DecodeError>;
/// Attempt to decode a byte array of exactly `N` entries.
///
/// Returns a [DecodeError] when the array could not be decoded.
fn decode_array<const N: usize>(&mut self) -> Result<[u8; N], DecodeError>;
/// Attempt to decode a single `char`.
///
/// Returns a [DecodeError] when no valid `char` could be decoded.
fn decode_char(&mut self) -> Result<char, DecodeError>;
}
/// Any source that can decode basic types. This type is most notably implemented for [Decoder].

View File

@ -197,4 +197,42 @@ impl<'a, W: Writer, C: Config> Encode for &'a mut Encoder<W, C> {
/// Hands all `N` bytes of `val` to the underlying writer in one `write` call.
fn encode_array<const N: usize>(&mut self, val: [u8; N]) -> Result<(), EncodeError> {
    let bytes: &[u8] = &val;
    self.writer.write(bytes)
}
/// Writes `val` to the underlying writer as UTF-8, via `encode_utf8`.
fn encode_char(&mut self, val: char) -> Result<(), EncodeError> {
    let writer = &mut self.writer;
    encode_utf8(writer, val)
}
}
/// Write the UTF-8 encoding of `c` to `writer`.
///
/// The character is encoded into a small stack buffer with the standard library's
/// [`char::encode_utf8`], and only the 1 to 4 bytes that make up the encoding are handed
/// to the writer, in a single `write` call.
///
/// Returns the writer's error if the write fails.
fn encode_utf8(writer: &mut impl Writer, c: char) -> Result<(), EncodeError> {
    // Four bytes is the longest UTF-8 encoding of any `char`.
    let mut buf = [0u8; 4];
    writer.write(c.encode_utf8(&mut buf).as_bytes())
}

View File

@ -85,6 +85,12 @@ impl Encodeable for f64 {
}
}
/// A `char` is encoded by handing a copy of it to the encoder's `encode_char`.
impl Encodeable for char {
    fn encode<E: Encode>(&self, mut encoder: E) -> Result<(), EncodeError> {
        let value = *self;
        E::encode_char(&mut encoder, value)
    }
}
impl Encodeable for &'_ [u8] {
fn encode<E: Encode>(&self, mut encoder: E) -> Result<(), EncodeError> {
encoder.encode_slice(*self)
@ -157,4 +163,8 @@ where
fn encode_array<const N: usize>(&mut self, val: [u8; N]) -> Result<(), EncodeError> {
T::encode_array(self, val)
}
fn encode_char(&mut self, val: char) -> Result<(), EncodeError> {
T::encode_char(self, val)
}
}

View File

@ -53,4 +53,7 @@ pub trait Encode {
fn encode_slice(&mut self, val: &[u8]) -> Result<(), EncodeError>;
/// Encode a byte array. Exactly `N` bytes must be encoded, otherwise an error should be
/// returned.
fn encode_array<const N: usize>(&mut self, val: [u8; N]) -> Result<(), EncodeError>;
/// Encode a single `char` as UTF-8.
///
/// Returns an [EncodeError] when the `char` could not be encoded.
fn encode_char(&mut self, val: char) -> Result<(), EncodeError>;
}

View File

@ -46,6 +46,9 @@ pub enum DecodeError {
/// The decoder tried to decode a `str`, but a UTF-8 error was encountered.
Utf8(core::str::Utf8Error),
/// The decoder tried to decode a `char` and failed. The given buffer contains the bytes
/// that had been read at the moment of failure.
InvalidCharEncoding([u8; 4]),
}
/// Integer types. Used by [DecodeError]. These types have no purpose other than being shown in errors.

View File

@ -12,7 +12,13 @@ where
C: Config,
{
let mut buffer = [0u8; 32];
bincode::encode_into_slice_with_config(element.clone(), &mut buffer, config).unwrap();
let len = bincode::encode_into_slice_with_config(element.clone(), &mut buffer, config).unwrap();
println!(
"{:?}: {:?} ({:?})",
element,
&buffer[..len],
core::any::type_name::<C>()
);
let decoded: V = bincode::decode_with_config(&mut buffer, config).unwrap();
assert_eq!(element, decoded);
@ -68,6 +74,11 @@ fn test_numbers() {
the_same(5.0f32);
the_same(5.0f64);
for char in "aÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö文".chars()
{
the_same(char);
}
}
#[test]