Merge pull request #351 from Kijewski/pr-ascii_str

Replace `from_utf8_unsafe()` with `AsciiStr`
rinja-rs · Feb 24, 2025 · 0b098f1 · 0b098f1
2 parents c3fac0c + 6ae689c
commit 0b098f1
Show file tree

Hide file tree

Showing 14 changed files with 471 additions and 245 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,6 +6,6 @@ members = [
     "testing",
     "testing-alloc",
     "testing-no-std",
-    "testing-renamed"
+    "testing-renamed",
 ]
 resolver = "2"
diff --git a/_typos.toml b/_typos.toml
@@ -17,9 +17,13 @@ extend-exclude = [
 ]
 
 [default.extend-words]
+# It's actually called that in the ASCII standard
+Enquiry = "Enquiry"
+
 # French words
 exemple = "exemple"
 existant = "existant"
+
 # used in tests
 Ba = "Ba"
 fo = "fo"

diff --git a/fuzzing/fuzz/src/ascii_str.rs b/fuzzing/fuzz/src/ascii_str.rs
@@ -0,0 +1 @@
+../../../rinja/src/ascii_str.rs
diff --git a/fuzzing/fuzz/src/lib.rs b/fuzzing/fuzz/src/lib.rs
@@ -3,6 +3,7 @@
 #![deny(unreachable_pub)]
 
 pub mod all;
+mod ascii_str;
 pub mod filters;
 pub mod html;
 pub mod parser;

diff --git a/rinja/Cargo.toml b/rinja/Cargo.toml
@@ -57,7 +57,7 @@ blocks = ["rinja_derive?/blocks"]
 code-in-doc = ["rinja_derive?/code-in-doc"]
 config = ["rinja_derive?/config"]
 derive = ["rinja_derive"]
-serde_json = ["rinja_derive?/serde_json", "dep:serde", "dep:serde_json"]
+serde_json = ["std", "rinja_derive?/serde_json", "dep:serde", "dep:serde_json"]
 std = [
     "alloc",
     "rinja_derive?/std",

diff --git a/rinja/src/ascii_str.rs b/rinja/src/ascii_str.rs
@@ -0,0 +1,144 @@
+// FIXME: Replace `AsciiChar` with [`core::ascii::Char`] once [#110998] is stable
+// [#110998]: https://github.com/rust-lang/rust/issues/110998
+
+#![allow(unreachable_pub)]
+
+use core::ops::{Deref, Index, IndexMut};
+
+pub use _ascii_char::AsciiChar;
+
+/// A string that only contains ASCII characters, same layout as [`str`].
+#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(transparent)]
+pub struct AsciiStr([AsciiChar]);
+
+impl AsciiStr {
+    #[inline]
+    pub const fn new_sized<const N: usize>(src: &str) -> [AsciiChar; N] {
+        if !src.is_ascii() || src.len() > N {
+            panic!();
+        }
+
+        let src = src.as_bytes();
+        let mut result = [AsciiChar::NULL; N];
+        let mut i = 0;
+        while i < src.len() {
+            result[i] = AsciiChar::new(src[i]);
+            i += 1;
+        }
+        result
+    }
+
+    #[inline]
+    pub const fn from_slice(src: &[AsciiChar]) -> &Self {
+        // SAFETY: `Self` is transparent over `[AsciiChar]`.
+        unsafe { core::mem::transmute::<&[AsciiChar], &AsciiStr>(src) }
+    }
+
+    #[inline]
+    pub const fn as_str(&self) -> &str {
+        // SAFETY: `Self` has the same layout as `str`,
+        // and all ASCII characters are valid UTF-8 characters.
+        unsafe { core::mem::transmute::<&AsciiStr, &str>(self) }
+    }
+
+    #[inline]
+    pub const fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    #[inline]
+    pub const fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+}
+
+// Must not implement `DerefMut`. Not every `char` is an ASCII character.
+impl Deref for AsciiStr {
+    type Target = str;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+impl<Idx> Index<Idx> for AsciiStr
+where
+    [AsciiChar]: Index<Idx, Output = [AsciiChar]>,
+{
+    type Output = [AsciiChar];
+
+    #[inline]
+    fn index(&self, index: Idx) -> &Self::Output {
+        &self.0[index]
+    }
+}
+
+impl<Idx> IndexMut<Idx> for AsciiStr
+where
+    [AsciiChar]: IndexMut<Idx, Output = [AsciiChar]>,
+{
+    #[inline]
+    fn index_mut(&mut self, index: Idx) -> &mut Self::Output {
+        &mut self.0[index]
+    }
+}
+
+impl Default for &'static AsciiStr {
+    #[inline]
+    fn default() -> Self {
+        // SAFETY: `Self` has the same layout as `str`, and the empty string is trivially ASCII.
+        unsafe { core::mem::transmute::<&str, &AsciiStr>("") }
+    }
+}
+
+impl AsciiChar {
+    pub const NULL: AsciiChar = AsciiChar::new(0);
+
+    #[inline]
+    pub const fn slice_as_bytes<const N: usize>(src: &[AsciiChar; N]) -> &[u8; N] {
+        // SAFETY: `[AsciiChar]` has the same layout as `[u8]`.
+        unsafe { core::mem::transmute::<&[AsciiChar; N], &[u8; N]>(src) }
+    }
+
+    #[inline]
+    pub const fn two_digits(d: u32) -> [Self; 2] {
+        const ALPHABET: &[u8; 10] = b"0123456789";
+
+        if d >= ALPHABET.len().pow(2) as u32 {
+            panic!();
+        }
+        [
+            Self::new(ALPHABET[d as usize / ALPHABET.len()]),
+            Self::new(ALPHABET[d as usize % ALPHABET.len()]),
+        ]
+    }
+
+    #[inline]
+    pub const fn two_hex_digits(d: u32) -> [Self; 2] {
+        const ALPHABET: &[u8; 16] = b"0123456789abcdef";
+
+        if d >= ALPHABET.len().pow(2) as u32 {
+            panic!();
+        }
+        [
+            Self::new(ALPHABET[d as usize / ALPHABET.len()]),
+            Self::new(ALPHABET[d as usize % ALPHABET.len()]),
+        ]
+    }
+}
+
+mod _ascii_char {
+    /// A character that is known to be in ASCII range, same layout as [`u8`].
+    #[derive(Debug, Clone, Copy, Default, Hash, PartialEq, Eq, PartialOrd, Ord)]
+    #[repr(transparent)]
+    pub struct AsciiChar(u8);
+
+    impl AsciiChar {
+        #[inline]
+        pub const fn new(c: u8) -> Self {
+            if c.is_ascii() { Self(c) } else { panic!() }
+        }
+    }
+}
diff --git a/rinja/src/filters/humansize.rs b/rinja/src/filters/humansize.rs
@@ -1,9 +1,9 @@
 use core::convert::Infallible;
 use core::fmt;
 use core::mem::MaybeUninit;
-use core::str::from_utf8_unchecked;
 
 use super::FastWritable;
+use crate::ascii_str::{AsciiChar, AsciiStr};
 
 /// Returns adequate string representation (in KB, ..) of number of bytes
 ///
@@ -58,26 +58,27 @@ impl FastWritable for FilesizeFormatFilter {
 }
 
 /// Formats `buffer` to contain the decimal point, decimal places and unit
-fn format_frac(buffer: &mut MaybeUninit<[u8; 8]>, prefix: u8, scaled: u32) -> &str {
-    // LLVM generates better byte code for register sized buffers, so we add some NULs
-    let buffer = buffer.write(*b"..0 kB\0\0");
+fn format_frac(buffer: &mut MaybeUninit<[AsciiChar; 8]>, prefix: AsciiChar, scaled: u32) -> &str {
+    // LLVM generates better machine code for register-sized buffers, so the text is NUL-padded to 8 bytes
+    let buffer = buffer.write(AsciiStr::new_sized("..0 kB"));
     buffer[4] = prefix;
 
     let frac = scaled % 100;
     let buffer = if frac == 0 {
         &buffer[3..6]
-    } else if frac % 10 == 0 {
-        // the decimal separator '.' is already contained in buffer[1]
-        buffer[2] = b'0' + (frac / 10) as u8;
-        &buffer[1..6]
     } else {
-        // the decimal separator '.' is already contained in buffer[0]
-        buffer[1] = b'0' + (frac / 10) as u8;
-        buffer[2] = b'0' + (frac % 10) as u8;
-        &buffer[0..6]
+        let digits = AsciiChar::two_digits(frac);
+        if digits[1] == AsciiChar::new(b'0') {
+            // the decimal separator '.' is already contained in buffer[1]
+            buffer[2] = digits[0];
+            &buffer[1..6]
+        } else {
+            // the decimal separator '.' is already contained in buffer[0]
+            [buffer[1], buffer[2]] = digits;
+            &buffer[0..6]
+        }
     };
-    // SAFETY: we know that the buffer contains only ASCII data
-    unsafe { from_utf8_unchecked(buffer) }
+    AsciiStr::from_slice(buffer).as_str()
 }
 
 #[cold]
@@ -87,17 +88,17 @@ fn too_big<W: fmt::Write + ?Sized>(value: f32, dest: &mut W) -> crate::Result<()
 }
 
 /// `((si_prefix, factor), limit)`, the factor is offset by 10**2 to account for 2 decimal places
-const SI_PREFIXES: &[((u8, f32), f32)] = &[
-    ((b'k', 1e-1), 1e6),
-    ((b'M', 1e-4), 1e9),
-    ((b'G', 1e-7), 1e12),
-    ((b'T', 1e-10), 1e15),
-    ((b'P', 1e-13), 1e18),
-    ((b'E', 1e-16), 1e21),
-    ((b'Z', 1e-19), 1e24),
-    ((b'Y', 1e-22), 1e27),
-    ((b'R', 1e-25), 1e30),
-    ((b'Q', 1e-28), 1e33),
+const SI_PREFIXES: &[((AsciiChar, f32), f32)] = &[
+    ((AsciiChar::new(b'k'), 1e-1), 1e6),
+    ((AsciiChar::new(b'M'), 1e-4), 1e9),
+    ((AsciiChar::new(b'G'), 1e-7), 1e12),
+    ((AsciiChar::new(b'T'), 1e-10), 1e15),
+    ((AsciiChar::new(b'P'), 1e-13), 1e18),
+    ((AsciiChar::new(b'E'), 1e-16), 1e21),
+    ((AsciiChar::new(b'Z'), 1e-19), 1e24),
+    ((AsciiChar::new(b'Y'), 1e-22), 1e27),
+    ((AsciiChar::new(b'R'), 1e-25), 1e30),
+    ((AsciiChar::new(b'Q'), 1e-28), 1e33),
 ];
 
 #[test]