Skip to content

Commit

Permalink
Merge pull request #351 from Kijewski/pr-ascii_str
Browse files Browse the repository at this point in the history
Replace `from_utf8_unsafe()` with `AsciiStr`
  • Loading branch information
GuillaumeGomez authored Feb 24, 2025
2 parents c3fac0c + 6ae689c commit 0b098f1
Show file tree
Hide file tree
Showing 14 changed files with 471 additions and 245 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ members = [
"testing",
"testing-alloc",
"testing-no-std",
"testing-renamed"
"testing-renamed",
]
resolver = "2"
4 changes: 4 additions & 0 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ extend-exclude = [
]

[default.extend-words]
# It's actually called that in the ASCII standard
Enquiry = "Enquiry"

# French words
exemple = "exemple"
existant = "existant"

# used in tests
Ba = "Ba"
fo = "fo"
Expand Down
1 change: 1 addition & 0 deletions fuzzing/fuzz/src/ascii_str.rs
1 change: 1 addition & 0 deletions fuzzing/fuzz/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#![deny(unreachable_pub)]

pub mod all;
mod ascii_str;
pub mod filters;
pub mod html;
pub mod parser;
Expand Down
2 changes: 1 addition & 1 deletion rinja/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ blocks = ["rinja_derive?/blocks"]
code-in-doc = ["rinja_derive?/code-in-doc"]
config = ["rinja_derive?/config"]
derive = ["rinja_derive"]
serde_json = ["rinja_derive?/serde_json", "dep:serde", "dep:serde_json"]
serde_json = ["std", "rinja_derive?/serde_json", "dep:serde", "dep:serde_json"]
std = [
"alloc",
"rinja_derive?/std",
Expand Down
144 changes: 144 additions & 0 deletions rinja/src/ascii_str.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// FIXME: Replace `AsciiChar` with [`core::ascii::Char`] once [#110998] is stable
// [#110998]: https://github.com/rust-lang/rust/issues/110998

#![allow(unreachable_pub)]

use core::ops::{Deref, Index, IndexMut};

pub use _ascii_char::AsciiChar;

/// A string that only contains ASCII characters, same layout as [`str`].
///
/// The type is a `#[repr(transparent)]` wrapper around a slice of [`AsciiChar`]s. The unsafe
/// reference conversions in this module rely on that representation, so it must not be changed.
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct AsciiStr([AsciiChar]);

impl AsciiStr {
    /// Copies `src` into a new array of `N` ASCII characters.
    ///
    /// If `src` is shorter than `N` bytes, the remaining elements are [`AsciiChar::NULL`].
    ///
    /// # Panics
    ///
    /// Panics if `src` contains a non-ASCII character, or if it is longer than `N` bytes.
    #[inline]
    pub const fn new_sized<const N: usize>(src: &str) -> [AsciiChar; N] {
        let fits = src.is_ascii() && src.len() <= N;
        if !fits {
            panic!();
        }

        // Iterators are not usable in a `const fn`, hence the manual loop.
        let bytes = src.as_bytes();
        let mut chars = [AsciiChar::NULL; N];
        let mut pos = 0;
        loop {
            if pos == bytes.len() {
                break;
            }
            chars[pos] = AsciiChar::new(bytes[pos]);
            pos += 1;
        }
        chars
    }

    /// Reinterprets a slice of ASCII characters as an [`AsciiStr`], without copying.
    #[inline]
    pub const fn from_slice(src: &[AsciiChar]) -> &Self {
        let ptr = src as *const [AsciiChar] as *const Self;
        // SAFETY: `Self` is transparent over `[AsciiChar]`, so the cast keeps address and length,
        // and the returned reference is bound to the lifetime of `src`.
        unsafe { &*ptr }
    }

    /// Borrows the ASCII string as a [`str`], without copying.
    #[inline]
    pub const fn as_str(&self) -> &str {
        let ptr = self as *const Self as *const str;
        // SAFETY: `Self` has the same layout as `str`,
        // and all ASCII characters are valid UTF-8 characters.
        unsafe { &*ptr }
    }

    /// Returns the number of ASCII characters in the string.
    #[inline]
    pub const fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` if the string contains no characters.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len() == 0
    }
}

// Must not implement `DerefMut`. Not every `char` is an ASCII character.
impl Deref for AsciiStr {
type Target = str;

#[inline]
fn deref(&self) -> &Self::Target {
self.as_str()
}
}

/// Indexing is forwarded to the wrapped `[AsciiChar]`. Only index types that yield a sub-slice
/// of `[AsciiChar]` are accepted.
impl<Idx> Index<Idx> for AsciiStr
where
    [AsciiChar]: Index<Idx, Output = [AsciiChar]>,
{
    type Output = [AsciiChar];

    #[inline]
    fn index(&self, index: Idx) -> &Self::Output {
        <[AsciiChar] as Index<Idx>>::index(&self.0, index)
    }
}

/// Mutable indexing is forwarded to the wrapped `[AsciiChar]`. The result is a slice of
/// [`AsciiChar`]s, so only ASCII characters can be written through it.
impl<Idx> IndexMut<Idx> for AsciiStr
where
    [AsciiChar]: IndexMut<Idx, Output = [AsciiChar]>,
{
    #[inline]
    fn index_mut(&mut self, index: Idx) -> &mut Self::Output {
        <[AsciiChar] as IndexMut<Idx>>::index_mut(&mut self.0, index)
    }
}

impl Default for &'static AsciiStr {
#[inline]
fn default() -> Self {
// SAFETY: `Self` has the same layout as `str`.
unsafe { core::mem::transmute::<&str, &AsciiStr>("") }
}
}

impl AsciiChar {
    /// The NUL character (byte value `0`).
    pub const NULL: AsciiChar = AsciiChar::new(b'\0');

    /// Reinterprets an array of ASCII characters as an array of bytes, without copying.
    #[inline]
    pub const fn slice_as_bytes<const N: usize>(src: &[AsciiChar; N]) -> &[u8; N] {
        let ptr = src as *const [AsciiChar; N] as *const [u8; N];
        // SAFETY: `[AsciiChar]` has the same layout as `[u8]`.
        unsafe { &*ptr }
    }

    /// Returns the two decimal digits of `d`, most significant digit first.
    ///
    /// # Panics
    ///
    /// Panics if `d` is greater than 99.
    #[inline]
    pub const fn two_digits(d: u32) -> [Self; 2] {
        const DIGITS: &[u8; 10] = b"0123456789";
        const BASE: usize = DIGITS.len();
        // The range check is done on the `u32` value, before any conversion to `usize`.
        const LIMIT: u32 = BASE.pow(2) as u32;

        if d < LIMIT {
            let value = d as usize;
            [
                Self::new(DIGITS[value / BASE]),
                Self::new(DIGITS[value % BASE]),
            ]
        } else {
            panic!()
        }
    }

    /// Returns the two lowercase hexadecimal digits of `d`, most significant digit first.
    ///
    /// # Panics
    ///
    /// Panics if `d` is greater than 255.
    #[inline]
    pub const fn two_hex_digits(d: u32) -> [Self; 2] {
        const DIGITS: &[u8; 16] = b"0123456789abcdef";
        const BASE: usize = DIGITS.len();
        // The range check is done on the `u32` value, before any conversion to `usize`.
        const LIMIT: u32 = BASE.pow(2) as u32;

        if d < LIMIT {
            let value = d as usize;
            [
                Self::new(DIGITS[value / BASE]),
                Self::new(DIGITS[value % BASE]),
            ]
        } else {
            panic!()
        }
    }
}

// The wrapped byte is private to this module, so that `AsciiChar::new()` is the only way to
// construct a value.
mod _ascii_char {
    /// A character that is known to be in ASCII range, same layout as [`u8`].
    #[derive(Debug, Clone, Copy, Default, Hash, PartialEq, Eq, PartialOrd, Ord)]
    #[repr(transparent)]
    pub struct AsciiChar(u8);

    impl AsciiChar {
        /// Wraps the byte `c` as an ASCII character.
        ///
        /// # Panics
        ///
        /// Panics if `c` is not in the ASCII range (`0x00..=0x7f`).
        #[inline]
        pub const fn new(c: u8) -> Self {
            match c {
                0x00..=0x7f => Self(c),
                _ => panic!(),
            }
        }
    }
}
51 changes: 26 additions & 25 deletions rinja/src/filters/humansize.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
use core::convert::Infallible;
use core::fmt;
use core::mem::MaybeUninit;
use core::str::from_utf8_unchecked;

use super::FastWritable;
use crate::ascii_str::{AsciiChar, AsciiStr};

/// Returns adequate string representation (in KB, ..) of number of bytes
///
Expand Down Expand Up @@ -58,26 +58,27 @@ impl FastWritable for FilesizeFormatFilter {
}

/// Formats `buffer` to contain the decimal point, decimal places and unit
fn format_frac(buffer: &mut MaybeUninit<[u8; 8]>, prefix: u8, scaled: u32) -> &str {
// LLVM generates better byte code for register sized buffers, so we add some NULs
let buffer = buffer.write(*b"..0 kB\0\0");
fn format_frac(buffer: &mut MaybeUninit<[AsciiChar; 8]>, prefix: AsciiChar, scaled: u32) -> &str {
// LLVM generates better byte code for register sized buffers
let buffer = buffer.write(AsciiStr::new_sized("..0 kB"));
buffer[4] = prefix;

let frac = scaled % 100;
let buffer = if frac == 0 {
&buffer[3..6]
} else if frac % 10 == 0 {
// the decimal separator '.' is already contained in buffer[1]
buffer[2] = b'0' + (frac / 10) as u8;
&buffer[1..6]
} else {
// the decimal separator '.' is already contained in buffer[0]
buffer[1] = b'0' + (frac / 10) as u8;
buffer[2] = b'0' + (frac % 10) as u8;
&buffer[0..6]
let digits = AsciiChar::two_digits(frac);
if digits[1] == AsciiChar::new(b'0') {
// the decimal separator '.' is already contained in buffer[1]
buffer[2] = digits[0];
&buffer[1..6]
} else {
// the decimal separator '.' is already contained in buffer[0]
[buffer[1], buffer[2]] = digits;
&buffer[0..6]
}
};
// SAFETY: we know that the buffer contains only ASCII data
unsafe { from_utf8_unchecked(buffer) }
AsciiStr::from_slice(buffer).as_str()
}

#[cold]
Expand All @@ -87,17 +88,17 @@ fn too_big<W: fmt::Write + ?Sized>(value: f32, dest: &mut W) -> crate::Result<()
}

/// `((si_prefix, factor), limit)`, the factor is offset by 10**2 to account for 2 decimal places
const SI_PREFIXES: &[((u8, f32), f32)] = &[
((b'k', 1e-1), 1e6),
((b'M', 1e-4), 1e9),
((b'G', 1e-7), 1e12),
((b'T', 1e-10), 1e15),
((b'P', 1e-13), 1e18),
((b'E', 1e-16), 1e21),
((b'Z', 1e-19), 1e24),
((b'Y', 1e-22), 1e27),
((b'R', 1e-25), 1e30),
((b'Q', 1e-28), 1e33),
const SI_PREFIXES: &[((AsciiChar, f32), f32)] = &[
((AsciiChar::new(b'k'), 1e-1), 1e6),
((AsciiChar::new(b'M'), 1e-4), 1e9),
((AsciiChar::new(b'G'), 1e-7), 1e12),
((AsciiChar::new(b'T'), 1e-10), 1e15),
((AsciiChar::new(b'P'), 1e-13), 1e18),
((AsciiChar::new(b'E'), 1e-16), 1e21),
((AsciiChar::new(b'Z'), 1e-19), 1e24),
((AsciiChar::new(b'Y'), 1e-22), 1e27),
((AsciiChar::new(b'R'), 1e-25), 1e30),
((AsciiChar::new(b'Q'), 1e-28), 1e33),
];

#[test]
Expand Down
Loading

0 comments on commit 0b098f1

Please sign in to comment.