diff --git a/src/de/decoder.rs b/src/de/decoder.rs index 4991963..cff80d1 100644 --- a/src/de/decoder.rs +++ b/src/de/decoder.rs @@ -231,4 +231,51 @@ impl<'a, 'de, R: Reader<'de>, C: Config> Decode for &'a mut Decoder { self.reader.read(&mut array)?; Ok(array) } + + fn decode_char(&mut self) -> Result { + let mut array = [0u8; 4]; + + // Read the leading byte first; it determines how many bytes the encoded char occupies + self.reader.read(&mut array[..1])?; + + let width = utf8_char_width(array[0]); + if width == 0 { + return Err(DecodeError::InvalidCharEncoding(array)); + } + if width == 1 { + return Ok(array[0] as char); + } + + // Read the remaining `width - 1` bytes, then validate the whole sequence as UTF-8 + self.reader.read(&mut array[1..width])?; + let res = core::str::from_utf8(&array[..width]) + .ok() + .and_then(|s| s.chars().next()) + .ok_or(DecodeError::InvalidCharEncoding(array))?; + Ok(res) + } +} + +const UTF8_CHAR_WIDTH: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x3F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x5F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, // 0x7F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 0xBF + 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, // 0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF +]; + +// Copy of core::str::utf8_char_width: maps a leading byte to its sequence width, where 0 means the byte cannot start a sequence +const fn utf8_char_width(b: u8) -> usize { + UTF8_CHAR_WIDTH[b as usize] as usize } diff --git a/src/de/impls.rs b/src/de/impls.rs index 21164be..3fee1ac 100644 --- 
a/src/de/impls.rs +++ b/src/de/impls.rs @@ -85,6 +85,12 @@ impl<'de> Decodable for f64 { } } +impl<'de> Decodable for char { + fn decode(mut decoder: D) -> Result { + decoder.decode_char() + } +} + impl<'a, 'de: 'a> BorrowDecodable<'de> for &'a [u8] { fn borrow_decode>(mut decoder: D) -> Result { let len = usize::decode(&mut decoder)?; @@ -174,6 +180,10 @@ where fn decode_array(&mut self) -> Result<[u8; N], DecodeError> { T::decode_array::(self) } + + fn decode_char(&mut self) -> Result { + T::decode_char(self) + } } impl<'a, 'de, T> BorrowDecode<'de> for &'a mut T diff --git a/src/de/mod.rs b/src/de/mod.rs index b6ea6e5..49407e4 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -66,6 +66,9 @@ pub trait Decode { fn decode_f64(&mut self) -> Result; /// Attempt to decode an array of `N` entries. fn decode_array(&mut self) -> Result<[u8; N], DecodeError>; + + /// Attempt to decode a `char`. + fn decode_char(&mut self) -> Result; } /// Any source that can decode basic types. This type is most notably implemented for [Decoder]. 
diff --git a/src/enc/encoder.rs b/src/enc/encoder.rs index a308d2d..09ac901 100644 --- a/src/enc/encoder.rs +++ b/src/enc/encoder.rs @@ -197,4 +197,42 @@ impl<'a, W: Writer, C: Config> Encode for &'a mut Encoder { fn encode_array(&mut self, val: [u8; N]) -> Result<(), EncodeError> { self.writer.write(&val) } + + fn encode_char(&mut self, val: char) -> Result<(), EncodeError> { + encode_utf8(&mut self.writer, val) + } +} + +const TAG_CONT: u8 = 0b1000_0000; +const TAG_TWO_B: u8 = 0b1100_0000; +const TAG_THREE_B: u8 = 0b1110_0000; +const TAG_FOUR_B: u8 = 0b1111_0000; +const MAX_ONE_B: u32 = 0x80; +const MAX_TWO_B: u32 = 0x800; +const MAX_THREE_B: u32 = 0x10000; + +fn encode_utf8(writer: &mut impl Writer, c: char) -> Result<(), EncodeError> { + let code = c as u32; + + if code < MAX_ONE_B { + writer.write(&[c as u8]) + } else if code < MAX_TWO_B { + let mut buf = [0u8; 2]; + buf[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; + buf[1] = (code & 0x3F) as u8 | TAG_CONT; + writer.write(&buf) + } else if code < MAX_THREE_B { + let mut buf = [0u8; 3]; + buf[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; + buf[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + buf[2] = (code & 0x3F) as u8 | TAG_CONT; + writer.write(&buf) + } else { + let mut buf = [0u8; 4]; + buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; + buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; + buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; + buf[3] = (code & 0x3F) as u8 | TAG_CONT; + writer.write(&buf) + } } diff --git a/src/enc/impls.rs b/src/enc/impls.rs index 5d199e0..eed2f03 100644 --- a/src/enc/impls.rs +++ b/src/enc/impls.rs @@ -85,6 +85,12 @@ impl Encodeable for f64 { } } +impl Encodeable for char { + fn encode(&self, mut encoder: E) -> Result<(), EncodeError> { + encoder.encode_char(*self) + } +} + impl Encodeable for &'_ [u8] { fn encode(&self, mut encoder: E) -> Result<(), EncodeError> { encoder.encode_slice(*self) @@ -157,4 +163,8 @@ where fn encode_array(&mut self, val: [u8; N]) -> Result<(), EncodeError> { 
T::encode_array(self, val) } + + fn encode_char(&mut self, val: char) -> Result<(), EncodeError> { + T::encode_char(self, val) + } } diff --git a/src/enc/mod.rs b/src/enc/mod.rs index d5e9db9..f4d76a8 100644 --- a/src/enc/mod.rs +++ b/src/enc/mod.rs @@ -53,4 +53,7 @@ pub trait Encode { fn encode_slice(&mut self, val: &[u8]) -> Result<(), EncodeError>; /// Encode an array. Exactly `N` bytes must be encoded, else an error should be thrown. fn encode_array(&mut self, val: [u8; N]) -> Result<(), EncodeError>; + + /// Encode a single `char` as UTF-8. + fn encode_char(&mut self, val: char) -> Result<(), EncodeError>; } diff --git a/src/error.rs b/src/error.rs index 4a7a6f2..e5279db 100644 --- a/src/error.rs +++ b/src/error.rs @@ -46,6 +46,9 @@ pub enum DecodeError { /// The decoder tried to decode a `str`, but an utf8 error was encountered. Utf8(core::str::Utf8Error), + + /// The decoder tried to decode a `char` and failed. The given buffer contains the bytes that had been read at the moment of failure; positions not yet read are left as `0`. + InvalidCharEncoding([u8; 4]), } /// Integer types. Used by [DecodeError]. These types have no purpose other than being shown in errors. diff --git a/tests/test.rs b/tests/basic_types.rs similarity index 86% rename from tests/test.rs rename to tests/basic_types.rs index b0ca166..b8c5132 100644 --- a/tests/test.rs +++ b/tests/basic_types.rs @@ -12,7 +12,13 @@ where C: Config, { let mut buffer = [0u8; 32]; - bincode::encode_into_slice_with_config(element.clone(), &mut buffer, config).unwrap(); + let len = bincode::encode_into_slice_with_config(element.clone(), &mut buffer, config).unwrap(); + println!( + "{:?}: {:?} ({:?})", + element, + &buffer[..len], + core::any::type_name::() + ); let decoded: V = bincode::decode_with_config(&mut buffer, config).unwrap(); assert_eq!(element, decoded); @@ -68,6 +74,11 @@ fn test_numbers() { the_same(5.0f32); the_same(5.0f64); + + for char in "aÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö文".chars() + { + the_same(char); + } } #[test]