diff --git a/library/alloc/src/ffi/mod.rs b/library/alloc/src/ffi/mod.rs index 4f9dc40a3cfc9..0880e8a340f4e 100644 --- a/library/alloc/src/ffi/mod.rs +++ b/library/alloc/src/ffi/mod.rs @@ -89,3 +89,19 @@ pub use self::c_str::{FromVecWithNulError, IntoStringError, NulError}; #[unstable(feature = "c_str_module", issue = "112134")] pub mod c_str; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub mod os_str; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub mod wtf8; diff --git a/library/alloc/src/ffi/os_str.rs b/library/alloc/src/ffi/os_str.rs new file mode 100644 index 0000000000000..521b0366df24a --- /dev/null +++ b/library/alloc/src/ffi/os_str.rs @@ -0,0 +1,1272 @@ +//! The [`OsStr`] and [`OsString`] types and associated utilities. + +#[cfg(test)] +mod tests; + +use core::ffi::os_str::{OsStr, Slice}; +use core::hash::{Hash, Hasher}; +use core::{cmp, fmt, ops}; + +use crate::borrow::{Borrow, Cow, ToOwned}; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::rc::Rc; +use crate::str::FromStr; +use crate::string::String; +use crate::sync::Arc; +use crate::vec::Vec; + +mod private { + /// This trait being unreachable from outside the crate + /// prevents outside implementations of our extension traits. + /// This allows adding more trait methods in the future. + #[unstable(feature = "sealed", issue = "none")] + pub trait Sealed {} +} + +#[cfg(any(target_os = "windows", target_os = "uefi"))] +#[stable(feature = "rust1", since = "1.0.0")] +pub mod os_str_ext_windows; + +#[cfg(not(any(target_os = "windows", target_os = "uefi")))] +#[stable(feature = "rust1", since = "1.0.0")] +pub mod os_str_ext_unix; + +#[cfg(any(target_os = "windows", target_os = "uefi"))] +mod wtf8; +#[cfg(any(target_os = "windows", target_os = "uefi"))] +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub use wtf8::Buf; + +#[cfg(not(any(target_os = "windows", target_os = "uefi")))] +mod bytes; +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[cfg(not(any(target_os = "windows", target_os = "uefi")))] +#[doc(hidden)] +pub use bytes::Buf; + +/// A type that can represent owned, mutable platform-native strings, but is +/// cheaply inter-convertible with Rust strings. +/// +/// The need for this type arises from the fact that: +/// +/// * On Unix systems, strings are often arbitrary sequences of non-zero +/// bytes, in many cases interpreted as UTF-8. +/// +/// * On Windows, strings are often arbitrary sequences of non-zero 16-bit +/// values, interpreted as UTF-16 when it is valid to do so. +/// +/// * In Rust, strings are always valid UTF-8, which may contain zeros. +/// +/// `OsString` and [`OsStr`] bridge this gap by simultaneously representing Rust +/// and platform-native string values, and in particular allowing a Rust string +/// to be converted into an "OS" string with no cost if possible. A consequence +/// of this is that `OsString` instances are *not* `NUL` terminated; in order +/// to pass to e.g., Unix system call, you should create a [`CStr`]. +/// +/// `OsString` is to &[OsStr] as [`String`] is to &[str]: the former +/// in each pair are owned strings; the latter are borrowed +/// references. +/// +/// Note, `OsString` and [`OsStr`] internally do not necessarily hold strings in +/// the form native to the platform; While on Unix, strings are stored as a +/// sequence of 8-bit values, on Windows, where strings are 16-bit value based +/// as just discussed, strings are also actually stored as a sequence of 8-bit +/// values, encoded in a less-strict variant of UTF-8. This is useful to +/// understand when handling capacity and length values. +/// +/// # Capacity of `OsString` +/// +/// Capacity uses units of UTF-8 bytes for OS strings which were created from valid unicode, and +/// uses units of bytes in an unspecified encoding for other contents. On a given target, all +/// `OsString` and `OsStr` values use the same units for capacity, so the following will work: +/// ``` +/// use std::ffi::{OsStr, OsString}; +/// +/// fn concat_os_strings(a: &OsStr, b: &OsStr) -> OsString { +/// let mut ret = OsString::with_capacity(a.len() + b.len()); // This will allocate +/// ret.push(a); // This will not allocate further +/// ret.push(b); // This will not allocate further +/// ret +/// } +/// ``` +/// +/// # Creating an `OsString` +/// +/// **From a Rust string**: `OsString` implements +/// [From]<[String]>, so you can use my_string.[into]\() to +/// create an `OsString` from a normal Rust string. +/// +/// **From slices:** Just like you can start with an empty Rust +/// [`String`] and then [`String::push_str`] some &[str] +/// sub-string slices into it, you can create an empty `OsString` with +/// the [`OsString::new`] method and then push string slices into it with the +/// [`OsString::push`] method. +/// +/// # Extracting a borrowed reference to the whole OS string +/// +/// You can use the [`OsString::as_os_str`] method to get an &[OsStr] from +/// an `OsString`; this is effectively a borrowed reference to the +/// whole string. +/// +/// # Conversions +/// +/// See the [module's toplevel documentation about conversions][conversions] for a discussion on +/// the traits which `OsString` implements for [conversions] from/to native representations. +/// +/// [`CStr`]: crate::ffi::CStr +/// [conversions]: super#conversions +/// [into]: Into::into +#[cfg_attr(not(test), rustc_diagnostic_item = "OsString")] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct OsString { + inner: Buf, +} + +/// Allows extension traits within `std`. +#[unstable(feature = "sealed", issue = "none")] +impl private::Sealed for OsString {} + +impl OsString { + /// Construct [`OsString`] from [`Buf`]. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[must_use] + #[inline] + #[doc(hidden)] + pub fn from_inner(inner: Buf) -> Self { + Self { inner } + } + + /// Constructs a new empty `OsString`. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let os_string = OsString::new(); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use] + #[inline] + pub fn new() -> OsString { + OsString { inner: Buf::from_string(String::new()) } + } + + /// Converts bytes to an `OsString` without checking that the bytes contains + /// valid [`OsStr`]-encoded data. + /// + /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. + /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit + /// ASCII. + /// + /// See the [module's toplevel documentation about conversions][conversions] for safe, + /// cross-platform [conversions] from/to native representations. + /// + /// # Safety + /// + /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of + /// validated UTF-8 and bytes from [`OsStr::as_encoded_bytes`] from within the same Rust version + /// built for the same target platform. For example, reconstructing an `OsString` from bytes sent + /// over the network or stored in a file will likely violate these safety rules. + /// + /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_encoded_bytes`] can be + /// split either immediately before or immediately after any valid non-empty UTF-8 substring. + /// + /// # Example + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("Mary had a little lamb"); + /// let bytes = os_str.as_encoded_bytes(); + /// let words = bytes.split(|b| *b == b' '); + /// let words: Vec<&OsStr> = words.map(|word| { + /// // SAFETY: + /// // - Each `word` only contains content that originated from `OsStr::as_encoded_bytes` + /// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring + /// unsafe { OsStr::from_encoded_bytes_unchecked(word) } + /// }).collect(); + /// ``` + /// + /// [conversions]: super#conversions + #[inline] + #[stable(feature = "os_str_bytes", since = "1.74.0")] + pub unsafe fn from_encoded_bytes_unchecked(bytes: Vec) -> Self { + OsString { inner: unsafe { Buf::from_encoded_bytes_unchecked(bytes) } } + } + + /// Converts to an [`OsStr`] slice. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::{OsString, OsStr}; + /// + /// let os_string = OsString::from("foo"); + /// let os_str = OsStr::new("foo"); + /// assert_eq!(os_string.as_os_str(), os_str); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use] + #[inline] + pub fn as_os_str(&self) -> &OsStr { + self + } + + /// Converts the `OsString` into a byte slice. To convert the byte slice back into an + /// `OsString`, use the [`OsStr::from_encoded_bytes_unchecked`] function. + /// + /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. + /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit + /// ASCII. + /// + /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should + /// be treated as opaque and only comparable within the same Rust version built for the same + /// target platform. For example, sending the bytes over the network or storing it in a file + /// will likely result in incompatible data. See [`OsString`] for more encoding details + /// and [`std::ffi`] for platform-specific, specified conversions. + /// + /// [`std::ffi`]: crate::ffi + #[inline] + #[stable(feature = "os_str_bytes", since = "1.74.0")] + pub fn into_encoded_bytes(self) -> Vec { + self.inner.into_encoded_bytes() + } + + /// Converts the `OsString` into a [`String`] if it contains valid Unicode data. + /// + /// On failure, ownership of the original `OsString` is returned. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let os_string = OsString::from("foo"); + /// let string = os_string.into_string(); + /// assert_eq!(string, Ok(String::from("foo"))); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn into_string(self) -> Result { + self.inner.into_string().map_err(|buf| OsString { inner: buf }) + } + + /// Extends the string with the given &[OsStr] slice. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut os_string = OsString::from("foo"); + /// os_string.push("bar"); + /// assert_eq!(&os_string, "foobar"); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + #[rustc_confusables("append", "put")] + pub fn push>(&mut self, s: T) { + self.inner.push_slice(s.as_ref().as_inner()) + } + + /// Creates a new `OsString` with at least the given capacity. + /// + /// The string will be able to hold at least `capacity` length units of other + /// OS strings without reallocating. This method is allowed to allocate for + /// more units than `capacity`. If `capacity` is 0, the string will not + /// allocate. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut os_string = OsString::with_capacity(10); + /// let capacity = os_string.capacity(); + /// + /// // This push is done without reallocating + /// os_string.push("foo"); + /// + /// assert_eq!(capacity, os_string.capacity()); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[must_use] + #[inline] + pub fn with_capacity(capacity: usize) -> OsString { + OsString { inner: Buf::with_capacity(capacity) } + } + + /// Truncates the `OsString` to zero length. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut os_string = OsString::from("foo"); + /// assert_eq!(&os_string, "foo"); + /// + /// os_string.clear(); + /// assert_eq!(&os_string, ""); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[inline] + pub fn clear(&mut self) { + self.inner.clear() + } + + /// Returns the capacity this `OsString` can hold without reallocating. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let os_string = OsString::with_capacity(10); + /// assert!(os_string.capacity() >= 10); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[must_use] + #[inline] + pub fn capacity(&self) -> usize { + self.inner.capacity() + } + + /// Reserves capacity for at least `additional` more capacity to be inserted + /// in the given `OsString`. Does nothing if the capacity is + /// already sufficient. + /// + /// The collection may reserve more space to speculatively avoid frequent reallocations. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut s = OsString::new(); + /// s.reserve(10); + /// assert!(s.capacity() >= 10); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[inline] + pub fn reserve(&mut self, additional: usize) { + self.inner.reserve(additional) + } + + /// Tries to reserve capacity for at least `additional` more length units + /// in the given `OsString`. The string may reserve more space to speculatively avoid + /// frequent reallocations. After calling `try_reserve`, capacity will be + /// greater than or equal to `self.len() + additional` if it returns `Ok(())`. + /// Does nothing if capacity is already sufficient. This method preserves + /// the contents even if an error occurs. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Errors + /// + /// If the capacity overflows, or the allocator reports a failure, then an error + /// is returned. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::{OsStr, OsString}; + /// use std::collections::TryReserveError; + /// + /// fn process_data(data: &str) -> Result { + /// let mut s = OsString::new(); + /// + /// // Pre-reserve the memory, exiting if we can't + /// s.try_reserve(OsStr::new(data).len())?; + /// + /// // Now we know this can't OOM in the middle of our complex work + /// s.push(data); + /// + /// Ok(s) + /// } + /// # process_data("123").expect("why is the test harness OOMing on 3 bytes?"); + /// ``` + #[stable(feature = "try_reserve_2", since = "1.63.0")] + #[inline] + pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.inner.try_reserve(additional) + } + + /// Reserves the minimum capacity for at least `additional` more capacity to + /// be inserted in the given `OsString`. Does nothing if the capacity is + /// already sufficient. + /// + /// Note that the allocator may give the collection more space than it + /// requests. Therefore, capacity can not be relied upon to be precisely + /// minimal. Prefer [`reserve`] if future insertions are expected. + /// + /// [`reserve`]: OsString::reserve + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut s = OsString::new(); + /// s.reserve_exact(10); + /// assert!(s.capacity() >= 10); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[inline] + pub fn reserve_exact(&mut self, additional: usize) { + self.inner.reserve_exact(additional) + } + + /// Tries to reserve the minimum capacity for at least `additional` + /// more length units in the given `OsString`. After calling + /// `try_reserve_exact`, capacity will be greater than or equal to + /// `self.len() + additional` if it returns `Ok(())`. + /// Does nothing if the capacity is already sufficient. + /// + /// Note that the allocator may give the `OsString` more space than it + /// requests. Therefore, capacity can not be relied upon to be precisely + /// minimal. Prefer [`try_reserve`] if future insertions are expected. + /// + /// [`try_reserve`]: OsString::try_reserve + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Errors + /// + /// If the capacity overflows, or the allocator reports a failure, then an error + /// is returned. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::{OsStr, OsString}; + /// use std::collections::TryReserveError; + /// + /// fn process_data(data: &str) -> Result { + /// let mut s = OsString::new(); + /// + /// // Pre-reserve the memory, exiting if we can't + /// s.try_reserve_exact(OsStr::new(data).len())?; + /// + /// // Now we know this can't OOM in the middle of our complex work + /// s.push(data); + /// + /// Ok(s) + /// } + /// # process_data("123").expect("why is the test harness OOMing on 3 bytes?"); + /// ``` + #[stable(feature = "try_reserve_2", since = "1.63.0")] + #[inline] + pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.inner.try_reserve_exact(additional) + } + + /// Shrinks the capacity of the `OsString` to match its length. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut s = OsString::from("foo"); + /// + /// s.reserve(100); + /// assert!(s.capacity() >= 100); + /// + /// s.shrink_to_fit(); + /// assert_eq!(3, s.capacity()); + /// ``` + #[stable(feature = "osstring_shrink_to_fit", since = "1.19.0")] + #[inline] + pub fn shrink_to_fit(&mut self) { + self.inner.shrink_to_fit() + } + + /// Shrinks the capacity of the `OsString` with a lower bound. + /// + /// The capacity will remain at least as large as both the length + /// and the supplied value. + /// + /// If the current capacity is less than the lower limit, this is a no-op. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut s = OsString::from("foo"); + /// + /// s.reserve(100); + /// assert!(s.capacity() >= 100); + /// + /// s.shrink_to(10); + /// assert!(s.capacity() >= 10); + /// s.shrink_to(0); + /// assert!(s.capacity() >= 3); + /// ``` + #[inline] + #[stable(feature = "shrink_to", since = "1.56.0")] + pub fn shrink_to(&mut self, min_capacity: usize) { + self.inner.shrink_to(min_capacity) + } + + /// Converts this `OsString` into a boxed [`OsStr`]. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::{OsString, OsStr}; + /// + /// let s = OsString::from("hello"); + /// + /// let b: Box = s.into_boxed_os_str(); + /// ``` + #[must_use = "`self` will be dropped if the result is not used"] + #[stable(feature = "into_boxed_os_str", since = "1.20.0")] + pub fn into_boxed_os_str(self) -> Box { + let rw = Box::into_raw(self.inner.into_box()) as *mut OsStr; + unsafe { Box::from_raw(rw) } + } + + /// Consumes and leaks the `OsString`, returning a mutable reference to the contents, + /// `&'a mut OsStr`. + /// + /// The caller has free choice over the returned lifetime, including 'static. + /// Indeed, this function is ideally used for data that lives for the remainder of + /// the program’s life, as dropping the returned reference will cause a memory leak. + /// + /// It does not reallocate or shrink the `OsString`, so the leaked allocation may include + /// unused capacity that is not part of the returned slice. If you want to discard excess + /// capacity, call [`into_boxed_os_str`], and then [`Box::leak`] instead. + /// However, keep in mind that trimming the capacity may result in a reallocation and copy. + /// + /// [`into_boxed_os_str`]: Self::into_boxed_os_str + #[unstable(feature = "os_string_pathbuf_leak", issue = "125965")] + #[inline] + pub fn leak<'a>(self) -> &'a mut OsStr { + OsStr::from_inner_mut(self.inner.leak()) + } + + /// Provides plumbing to core `Vec::truncate`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[inline] + #[allow(unused)] + pub(crate) fn truncate(&mut self, len: usize) { + self.inner.truncate(len); + } + + /// Provides plumbing to core `Vec::extend_from_slice`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[inline] + #[allow(unused)] + pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { + self.inner.extend_from_slice(other); + } +} + +impl OsStr { + /// Returns a copy of this string where each character is mapped to its + /// ASCII upper case equivalent. + /// + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', + /// but non-ASCII letters are unchanged. + /// + /// To uppercase the value in-place, use [`OsStr::make_ascii_uppercase`]. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// let s = OsString::from("Grüße, Jürgen ❤"); + /// + /// assert_eq!("GRüßE, JüRGEN ❤", s.to_ascii_uppercase()); + /// ``` + #[rustc_allow_incoherent_impl] + #[must_use = "to uppercase the value in-place, use `make_ascii_uppercase`"] + #[stable(feature = "osstring_ascii", since = "1.53.0")] + pub fn to_ascii_uppercase(&self) -> OsString { + OsString::from_inner(self.as_inner().to_ascii_uppercase()) + } + + /// Returns a copy of this string where each character is mapped to its + /// ASCII lower case equivalent. + /// + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', + /// but non-ASCII letters are unchanged. + /// + /// To lowercase the value in-place, use [`OsStr::make_ascii_lowercase`]. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// let s = OsString::from("Grüße, Jürgen ❤"); + /// + /// assert_eq!("grüße, jürgen ❤", s.to_ascii_lowercase()); + /// ``` + #[rustc_allow_incoherent_impl] + #[must_use = "to lowercase the value in-place, use `make_ascii_lowercase`"] + #[stable(feature = "osstring_ascii", since = "1.53.0")] + pub fn to_ascii_lowercase(&self) -> OsString { + OsString::from_inner(self.as_inner().to_ascii_lowercase()) + } + + /// Converts a [Box]<[OsStr]> into an [`OsString`] without copying or allocating. + #[rustc_allow_incoherent_impl] + #[stable(feature = "into_boxed_os_str", since = "1.20.0")] + #[must_use = "`self` will be dropped if the result is not used"] + pub fn into_os_string(self: Box) -> OsString { + let boxed = unsafe { Box::from_raw(Box::into_raw(self) as *mut Slice) }; + OsString { inner: Buf::from_box(boxed) } + } + + /// Converts an `OsStr` to a [Cow]<[str]>. + /// + /// Any non-Unicode sequences are replaced with + /// [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]. + /// + /// [U+FFFD]: crate::char::REPLACEMENT_CHARACTER + /// + /// # Examples + /// + /// Calling `to_string_lossy` on an `OsStr` with invalid unicode: + /// + /// ``` + /// // Note, due to differences in how Unix and Windows represent strings, + /// // we are forced to complicate this example, setting up example `OsStr`s + /// // with different source data and via different platform extensions. + /// // Understand that in reality you could end up with such example invalid + /// // sequences simply through collecting user command line arguments, for + /// // example. + /// + /// #[cfg(unix)] { + /// use std::ffi::OsStr; + /// use std::os::unix::ffi::OsStrExt; + /// + /// // Here, the values 0x66 and 0x6f correspond to 'f' and 'o' + /// // respectively. The value 0x80 is a lone continuation byte, invalid + /// // in a UTF-8 sequence. + /// let source = [0x66, 0x6f, 0x80, 0x6f]; + /// let os_str = OsStr::from_bytes(&source[..]); + /// + /// assert_eq!(os_str.to_string_lossy(), "fo�o"); + /// } + /// #[cfg(windows)] { + /// use std::ffi::OsString; + /// use std::os::windows::prelude::*; + /// + /// // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' + /// // respectively. The value 0xD800 is a lone surrogate half, invalid + /// // in a UTF-16 sequence. + /// let source = [0x0066, 0x006f, 0xD800, 0x006f]; + /// let os_string = OsString::from_wide(&source[..]); + /// let os_str = os_string.as_os_str(); + /// + /// assert_eq!(os_str.to_string_lossy(), "fo�o"); + /// } + /// ``` + #[rustc_allow_incoherent_impl] + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use = "this returns the result of the operation, \ + without modifying the original"] + #[inline] + pub fn to_string_lossy(&self) -> Cow<'_, str> { + self.as_inner().to_string_lossy() + } + + /// Copies the slice into an owned [`OsString`]. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::{OsStr, OsString}; + /// + /// let os_str = OsStr::new("foo"); + /// let os_string = os_str.to_os_string(); + /// assert_eq!(os_string, OsString::from("foo")); + /// ``` + #[rustc_allow_incoherent_impl] + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use = "this returns the result of the operation, \ + without modifying the original"] + #[inline] + pub fn to_os_string(&self) -> OsString { + OsString { inner: self.as_inner().to_owned() } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl From for OsString { + /// Converts a [`String`] into an [`OsString`]. + /// + /// This conversion does not allocate or copy memory. + #[inline] + fn from(s: String) -> OsString { + OsString { inner: Buf::from_string(s) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl> From<&T> for OsString { + /// Copies any value implementing [AsRef]<[OsStr]> + /// into a newly allocated [`OsString`]. + fn from(s: &T) -> OsString { + s.as_ref().to_os_string() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Index for OsString { + type Output = OsStr; + + #[inline] + fn index(&self, _index: ops::RangeFull) -> &OsStr { + OsStr::from_inner(self.inner.as_slice()) + } +} + +#[stable(feature = "mut_osstr", since = "1.44.0")] +impl ops::IndexMut for OsString { + #[inline] + fn index_mut(&mut self, _index: ops::RangeFull) -> &mut OsStr { + OsStr::from_inner_mut(self.inner.as_mut_slice()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Deref for OsString { + type Target = OsStr; + + #[inline] + fn deref(&self) -> &OsStr { + &self[..] + } +} + +#[stable(feature = "mut_osstr", since = "1.44.0")] +impl ops::DerefMut for OsString { + #[inline] + fn deref_mut(&mut self) -> &mut OsStr { + &mut self[..] + } +} + +#[stable(feature = "osstring_default", since = "1.9.0")] +impl Default for OsString { + /// Constructs an empty `OsString`. + #[inline] + fn default() -> OsString { + OsString::new() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Clone for OsString { + #[inline] + fn clone(&self) -> Self { + OsString { inner: self.inner.clone() } + } + + /// Clones the contents of `source` into `self`. + /// + /// This method is preferred over simply assigning `source.clone()` to `self`, + /// as it avoids reallocation if possible. + #[inline] + fn clone_from(&mut self, source: &Self) { + self.inner.clone_from(&source.inner) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Debug for OsString { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, formatter) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for OsString { + #[inline] + fn eq(&self, other: &OsString) -> bool { + &**self == &**other + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for OsString { + #[inline] + fn eq(&self, other: &str) -> bool { + &**self == other + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for str { + #[inline] + fn eq(&self, other: &OsString) -> bool { + &**other == self + } +} + +#[stable(feature = "os_str_str_ref_eq", since = "1.29.0")] +impl PartialEq<&str> for OsString { + #[inline] + fn eq(&self, other: &&str) -> bool { + **self == **other + } +} + +#[stable(feature = "os_str_str_ref_eq", since = "1.29.0")] +impl<'a> PartialEq for &'a str { + #[inline] + fn eq(&self, other: &OsString) -> bool { + **other == **self + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for OsString {} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for OsString { + #[inline] + fn partial_cmp(&self, other: &OsString) -> Option { + (&**self).partial_cmp(&**other) + } + #[inline] + fn lt(&self, other: &OsString) -> bool { + &**self < &**other + } + #[inline] + fn le(&self, other: &OsString) -> bool { + &**self <= &**other + } + #[inline] + fn gt(&self, other: &OsString) -> bool { + &**self > &**other + } + #[inline] + fn ge(&self, other: &OsString) -> bool { + &**self >= &**other + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for OsString { + #[inline] + fn partial_cmp(&self, other: &str) -> Option { + (&**self).partial_cmp(other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for OsString { + #[inline] + fn cmp(&self, other: &OsString) -> cmp::Ordering { + (&**self).cmp(&**other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Hash for OsString { + #[inline] + fn hash(&self, state: &mut H) { + (&**self).hash(state) + } +} + +#[stable(feature = "os_string_fmt_write", since = "1.64.0")] +impl fmt::Write for OsString { + fn write_str(&mut self, s: &str) -> fmt::Result { + self.push(s); + Ok(()) + } +} + +#[stable(feature = "box_from_os_str", since = "1.17.0")] +impl From<&OsStr> for Box { + /// Copies the string into a newly allocated [Box]<[OsStr]>. + #[inline] + fn from(s: &OsStr) -> Box { + let rw = Box::into_raw(s.as_inner().into_box()) as *mut OsStr; + unsafe { Box::from_raw(rw) } + } +} + +#[stable(feature = "box_from_cow", since = "1.45.0")] +impl From> for Box { + /// Converts a `Cow<'a, OsStr>` into a [Box]<[OsStr]>, + /// by copying the contents if they are borrowed. + #[inline] + fn from(cow: Cow<'_, OsStr>) -> Box { + match cow { + Cow::Borrowed(s) => Box::from(s), + Cow::Owned(s) => Box::from(s), + } + } +} + +#[stable(feature = "os_string_from_box", since = "1.18.0")] +impl From> for OsString { + /// Converts a [Box]<[OsStr]> into an [`OsString`] without copying or + /// allocating. + #[inline] + fn from(boxed: Box) -> OsString { + boxed.into_os_string() + } +} + +#[stable(feature = "box_from_os_string", since = "1.20.0")] +impl From for Box { + /// Converts an [`OsString`] into a [Box]<[OsStr]> without copying or allocating. + #[inline] + fn from(s: OsString) -> Box { + s.into_boxed_os_str() + } +} + +#[stable(feature = "more_box_slice_clone", since = "1.29.0")] +impl Clone for Box { + #[inline] + fn clone(&self) -> Self { + self.to_os_string().into_boxed_os_str() + } +} + +#[stable(feature = "shared_from_slice2", since = "1.24.0")] +impl From for Arc { + /// Converts an [`OsString`] into an [Arc]<[OsStr]> by moving the [`OsString`] + /// data into a new [`Arc`] buffer. + #[inline] + fn from(s: OsString) -> Arc { + let arc = s.inner.into_arc(); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const OsStr) } + } +} + +#[stable(feature = "shared_from_slice2", since = "1.24.0")] +impl From<&OsStr> for Arc { + /// Copies the string into a newly allocated [Arc]<[OsStr]>. + #[inline] + fn from(s: &OsStr) -> Arc { + let arc = s.as_inner().into_arc(); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const OsStr) } + } +} + +#[stable(feature = "shared_from_slice2", since = "1.24.0")] +impl From for Rc { + /// Converts an [`OsString`] into an [Rc]<[OsStr]> by moving the [`OsString`] + /// data into a new [`Rc`] buffer. + #[inline] + fn from(s: OsString) -> Rc { + let rc = s.inner.into_rc(); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const OsStr) } + } +} + +#[stable(feature = "shared_from_slice2", since = "1.24.0")] +impl From<&OsStr> for Rc { + /// Copies the string into a newly allocated [Rc]<[OsStr]>. + #[inline] + fn from(s: &OsStr) -> Rc { + let rc = s.as_inner().into_rc(); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const OsStr) } + } +} + +#[stable(feature = "cow_from_osstr", since = "1.28.0")] +impl<'a> From for Cow<'a, OsStr> { + /// Moves the string into a [`Cow::Owned`]. + #[inline] + fn from(s: OsString) -> Cow<'a, OsStr> { + Cow::Owned(s) + } +} + +#[stable(feature = "cow_from_osstr", since = "1.28.0")] +impl<'a> From<&'a OsStr> for Cow<'a, OsStr> { + /// Converts the string reference into a [`Cow::Borrowed`]. + #[inline] + fn from(s: &'a OsStr) -> Cow<'a, OsStr> { + Cow::Borrowed(s) + } +} + +#[stable(feature = "cow_from_osstr", since = "1.28.0")] +impl<'a> From<&'a OsString> for Cow<'a, OsStr> { + /// Converts the string reference into a [`Cow::Borrowed`]. + #[inline] + fn from(s: &'a OsString) -> Cow<'a, OsStr> { + Cow::Borrowed(s.as_os_str()) + } +} + +#[stable(feature = "osstring_from_cow_osstr", since = "1.28.0")] +impl<'a> From> for OsString { + /// Converts a `Cow<'a, OsStr>` into an [`OsString`], + /// by copying the contents if they are borrowed. + #[inline] + fn from(s: Cow<'a, OsStr>) -> Self { + s.into_owned() + } +} + +#[stable(feature = "box_default_extra", since = "1.17.0")] +impl Default for Box { + #[inline] + fn default() -> Box { + let rw = Box::into_raw(Slice::empty_box()) as *mut OsStr; + unsafe { Box::from_raw(rw) } + } +} + +macro_rules! impl_cmp { + ($lhs:ty, $rhs: ty) => { + #[allow(unused_lifetimes)] + #[stable(feature = "cmp_os_str", since = "1.8.0")] + impl<'a, 'b> PartialEq<$rhs> for $lhs { + #[inline] + fn eq(&self, other: &$rhs) -> bool { + ::eq(self, other) + } + } + + #[allow(unused_lifetimes)] + #[stable(feature = "cmp_os_str", since = "1.8.0")] + impl<'a, 'b> PartialEq<$lhs> for $rhs { + #[inline] + fn eq(&self, other: &$lhs) -> bool { + ::eq(self, other) + } + } + + #[allow(unused_lifetimes)] + #[stable(feature = "cmp_os_str", since = "1.8.0")] + impl<'a, 'b> PartialOrd<$rhs> for $lhs { + #[inline] + fn partial_cmp(&self, other: &$rhs) -> Option { + ::partial_cmp(self, other) + } + } + + #[allow(unused_lifetimes)] + #[stable(feature = "cmp_os_str", since = "1.8.0")] + impl<'a, 'b> PartialOrd<$lhs> for $rhs { + #[inline] + fn partial_cmp(&self, other: &$lhs) -> Option { + ::partial_cmp(self, other) + } + } + }; +} + +impl_cmp!(OsString, OsStr); +impl_cmp!(OsString, &'a OsStr); +impl_cmp!(Cow<'a, OsStr>, OsStr); +impl_cmp!(Cow<'a, OsStr>, &'b OsStr); +impl_cmp!(Cow<'a, OsStr>, OsString); + +#[unstable(feature = "slice_concat_ext", issue = "27747")] +impl> crate::slice::Join<&OsStr> for [S] { + type Output = OsString; + + fn join(slice: &Self, sep: &OsStr) -> OsString { + let Some((first, suffix)) = slice.split_first() else { + return OsString::new(); + }; + let first_owned = first.borrow().to_owned(); + suffix.iter().fold(first_owned, |mut a, b| { + a.push(sep); + a.push(b.borrow()); + a + }) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Borrow for OsString { + #[inline] + fn borrow(&self) -> &OsStr { + &self[..] + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ToOwned for OsStr { + type Owned = OsString; + #[inline] + fn to_owned(&self) -> OsString { + self.to_os_string() + } + #[inline] + fn clone_into(&self, target: &mut OsString) { + self.as_inner().clone_into(&mut target.inner) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef for OsString { + #[inline] + fn as_ref(&self) -> &OsStr { + self + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef for String { + #[inline] + fn as_ref(&self) -> &OsStr { + (&**self).as_ref() + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +impl From for OsString { + #[inline] + fn from(buf: Buf) -> OsString { + OsString { inner: buf } + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +impl Into for OsString { + #[inline] + fn into(self) -> Buf { + self.inner + } +} + +#[stable(feature = "osstring_from_str", since = "1.45.0")] +impl FromStr for OsString { + type Err = core::convert::Infallible; + + #[inline] + fn from_str(s: &str) -> Result { + Ok(OsString::from(s)) + } +} + +#[stable(feature = "osstring_extend", since = "1.52.0")] +impl Extend for OsString { + #[inline] + fn extend>(&mut self, iter: T) { + for s in iter { + self.push(&s); + } + } +} + +#[stable(feature = "osstring_extend", since = "1.52.0")] +impl<'a> Extend<&'a OsStr> for OsString { + #[inline] + fn extend>(&mut self, iter: T) { + for s in iter { + self.push(s); + } + } +} + +#[stable(feature = "osstring_extend", since = "1.52.0")] +impl<'a> Extend> for OsString { + #[inline] + fn extend>>(&mut self, iter: T) { + for s in iter { + self.push(&s); + } + } +} + +#[stable(feature = "osstring_extend", since = "1.52.0")] +impl FromIterator for OsString { + #[inline] + fn from_iter>(iter: I) -> Self { + let mut iterator = iter.into_iter(); + + // Because we're iterating over `OsString`s, we can avoid at least + // one allocation by getting the first string from the iterator + // and appending to it all the subsequent strings. + match iterator.next() { + None => OsString::new(), + Some(mut buf) => { + buf.extend(iterator); + buf + } + } + } +} + +#[stable(feature = "osstring_extend", since = "1.52.0")] +impl<'a> FromIterator<&'a OsStr> for OsString { + #[inline] + fn from_iter>(iter: I) -> Self { + let mut buf = Self::new(); + for s in iter { + buf.push(s); + } + buf + } +} + +#[stable(feature = "osstring_extend", since = "1.52.0")] +impl<'a> FromIterator> for OsString { + #[inline] + fn from_iter>>(iter: I) -> Self { + let mut iterator = iter.into_iter(); + + // Because we're iterating over `OsString`s, we can avoid at least + // one allocation by getting the first owned string from the iterator + // and appending to it all the subsequent strings. + match iterator.next() { + None => OsString::new(), + Some(Cow::Owned(mut buf)) => { + buf.extend(iterator); + buf + } + Some(Cow::Borrowed(buf)) => { + let mut buf = OsString::from(buf); + buf.extend(iterator); + buf + } + } + } +} diff --git a/library/alloc/src/ffi/os_str/bytes.rs b/library/alloc/src/ffi/os_str/bytes.rs new file mode 100644 index 0000000000000..55fe072c7be6e --- /dev/null +++ b/library/alloc/src/ffi/os_str/bytes.rs @@ -0,0 +1,435 @@ +#![allow(missing_docs)] +#![allow(missing_debug_implementations)] + +//! The underlying OsString/OsStr implementation on Unix and many other +//! systems: just a `Vec`/`[u8]`. + +use core::ffi::os_str::Slice; +use core::{fmt, mem, str}; + +use crate::borrow::{Cow, ToOwned}; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::rc::Rc; +use crate::string::String; +use crate::sync::Arc; +use crate::vec::Vec; + +#[cfg(test)] +mod tests; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[derive(Hash)] +#[repr(transparent)] +pub struct Buf { + pub inner: Vec, +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Debug for Buf { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(self.as_slice(), formatter) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Display for Buf { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self.as_slice(), formatter) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl Clone for Buf { + #[inline] + fn clone(&self) -> Self { + Buf { inner: self.inner.clone() } + } + + #[inline] + fn clone_from(&mut self, source: &Self) { + self.inner.clone_from(&source.inner) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl Into> for Buf { + fn into(self) -> Vec { + self.inner + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl AsRef<[u8]> for Buf { + #[inline] + fn as_ref(&self) -> &[u8] { + &self.inner + } +} + +impl Buf { + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_encoded_bytes(self) -> Vec { + self.inner + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub unsafe fn from_encoded_bytes_unchecked(s: Vec) -> Self { + Self { inner: s } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn from_string(s: String) -> Buf { + Buf { inner: s.into_bytes() } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn with_capacity(capacity: usize) -> Buf { + Buf { inner: Vec::with_capacity(capacity) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn clear(&mut self) { + self.inner.clear() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn capacity(&self) -> usize { + self.inner.capacity() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn reserve(&mut self, additional: usize) { + self.inner.reserve(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.inner.try_reserve(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn reserve_exact(&mut self, additional: usize) { + self.inner.reserve_exact(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.inner.try_reserve_exact(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn shrink_to_fit(&mut self) { + self.inner.shrink_to_fit() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn shrink_to(&mut self, min_capacity: usize) { + self.inner.shrink_to(min_capacity) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn as_slice(&self) -> &Slice { + // SAFETY: Slice just wraps [u8], + // and &*self.inner is &[u8], therefore + // transmuting &[u8] to &Slice is safe. + unsafe { mem::transmute(&*self.inner) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn as_mut_slice(&mut self) -> &mut Slice { + // SAFETY: Slice just wraps [u8], + // and &mut *self.inner is &mut [u8], therefore + // transmuting &mut [u8] to &mut Slice is safe. + unsafe { mem::transmute(&mut *self.inner) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn into_string(self) -> Result { + String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() }) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn push_slice(&mut self, s: &Slice) { + self.inner.extend_from_slice(&s.inner) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn leak<'a>(self) -> &'a mut Slice { + unsafe { mem::transmute(self.inner.leak()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_box(self) -> Box { + unsafe { mem::transmute(self.inner.into_boxed_slice()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn from_box(boxed: Box) -> Buf { + let inner: Box<[u8]> = unsafe { mem::transmute(boxed) }; + Buf { inner: inner.into_vec() } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_arc(&self) -> Arc { + self.as_slice().into_arc() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_rc(&self) -> Rc { + self.as_slice().into_rc() + } + + /// Provides plumbing to core `Vec::truncate`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub(crate) fn truncate(&mut self, len: usize) { + self.inner.truncate(len); + } + + /// Provides plumbing to core `Vec::extend_from_slice`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { + self.inner.extend_from_slice(other); + } +} + +impl Slice { + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_string_lossy(&self) -> Cow<'_, str> { + String::from_utf8_lossy(&self.inner) + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_owned(&self) -> Buf { + Buf { inner: self.inner.to_vec() } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn clone_into(&self, buf: &mut Buf) { + self.inner.clone_into(&mut buf.inner) + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_box(&self) -> Box { + let boxed: Box<[u8]> = self.inner.into(); + unsafe { mem::transmute(boxed) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn empty_box() -> Box { + let boxed: Box<[u8]> = Default::default(); + unsafe { mem::transmute(boxed) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_arc(&self) -> Arc { + let arc: Arc<[u8]> = Arc::from(&self.inner); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Slice) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_rc(&self) -> Rc { + let rc: Rc<[u8]> = Rc::from(&self.inner); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn to_ascii_lowercase(&self) -> Buf { + Buf { inner: self.inner.to_ascii_lowercase() } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn to_ascii_uppercase(&self) -> Buf { + Buf { inner: self.inner.to_ascii_uppercase() } + } +} diff --git a/library/alloc/src/ffi/os_str/os_str_ext_unix.rs b/library/alloc/src/ffi/os_str/os_str_ext_unix.rs new file mode 100644 index 0000000000000..fd8891624b8f3 --- /dev/null +++ b/library/alloc/src/ffi/os_str/os_str_ext_unix.rs @@ -0,0 +1,35 @@ +//! [`OsStringExt`] for unix. + +use super::{private, Buf, OsString}; +use crate::vec::Vec; + +/// Platform-specific extensions to [`OsString`]. +/// +/// This trait is sealed: it cannot be implemented outside the standard library. +/// This is so that future additional methods are not breaking changes. +#[stable(feature = "rust1", since = "1.0.0")] +pub trait OsStringExt: private::Sealed { + /// Creates an [`OsString`] from a byte vector. + /// + /// See the module documentation for an example. + #[stable(feature = "rust1", since = "1.0.0")] + fn from_vec(vec: Vec) -> Self; + + /// Yields the underlying byte vector of this [`OsString`]. + /// + /// See the module documentation for an example. + #[stable(feature = "rust1", since = "1.0.0")] + fn into_vec(self) -> Vec; +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl OsStringExt for OsString { + #[inline] + fn from_vec(vec: Vec) -> OsString { + From::from(Buf { inner: vec }) + } + #[inline] + fn into_vec(self) -> Vec { + self.inner.inner + } +} diff --git a/library/alloc/src/ffi/os_str/os_str_ext_windows.rs b/library/alloc/src/ffi/os_str/os_str_ext_windows.rs new file mode 100644 index 0000000000000..fa2e0a8d25f26 --- /dev/null +++ b/library/alloc/src/ffi/os_str/os_str_ext_windows.rs @@ -0,0 +1,38 @@ +//! [`OsStringExt`] for windows. + +use super::{private, Buf, OsString}; +use crate::ffi::wtf8::Wtf8Buf; + +/// Windows-specific extensions to [`OsString`]. +/// +/// This trait is sealed: it cannot be implemented outside the standard library. +/// This is so that future additional methods are not breaking changes. +#[stable(feature = "rust1", since = "1.0.0")] +pub trait OsStringExt: private::Sealed { + /// Creates an `OsString` from a potentially ill-formed UTF-16 slice of + /// 16-bit code units. + /// + /// This is lossless: calling [`OsStrExt::encode_wide`] on the resulting string + /// will always return the original code units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// use std::os::windows::prelude::*; + /// + /// // UTF-16 encoding for "Unicode". + /// let source = [0x0055, 0x006E, 0x0069, 0x0063, 0x006F, 0x0064, 0x0065]; + /// + /// let string = OsString::from_wide(&source[..]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + fn from_wide(wide: &[u16]) -> Self; +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl OsStringExt for OsString { + fn from_wide(wide: &[u16]) -> OsString { + From::from(Buf { inner: Wtf8Buf::from_wide(wide) }) + } +} diff --git a/library/std/src/ffi/os_str/tests.rs b/library/alloc/src/ffi/os_str/tests.rs similarity index 80% rename from library/std/src/ffi/os_str/tests.rs rename to library/alloc/src/ffi/os_str/tests.rs index 67147934b4db3..5faac75621f29 100644 --- a/library/std/src/ffi/os_str/tests.rs +++ b/library/alloc/src/ffi/os_str/tests.rs @@ -185,49 +185,6 @@ fn into_rc() { assert_eq!(&*arc2, os_str); } -#[test] -fn slice_encoded_bytes() { - let os_str = OsStr::new("123θგ🦀"); - // ASCII - let digits = os_str.slice_encoded_bytes(..3); - assert_eq!(digits, "123"); - let three = os_str.slice_encoded_bytes(2..3); - assert_eq!(three, "3"); - // 2-byte UTF-8 - let theta = os_str.slice_encoded_bytes(3..5); - assert_eq!(theta, "θ"); - // 3-byte UTF-8 - let gani = os_str.slice_encoded_bytes(5..8); - assert_eq!(gani, "გ"); - // 4-byte UTF-8 - let crab = os_str.slice_encoded_bytes(8..); - assert_eq!(crab, "🦀"); -} - -#[test] -#[should_panic] -fn slice_out_of_bounds() { - let crab = OsStr::new("🦀"); - let _ = crab.slice_encoded_bytes(..5); -} - -#[test] -#[should_panic] -fn slice_mid_char() { - let crab = OsStr::new("🦀"); - let _ = crab.slice_encoded_bytes(..2); -} - -#[cfg(unix)] -#[test] -#[should_panic(expected = "byte index 1 is not an OsStr boundary")] -fn slice_invalid_data() { - use crate::os::unix::ffi::OsStrExt; - - let os_string = OsStr::from_bytes(b"\xFF\xFF"); - let _ = os_string.slice_encoded_bytes(1..); -} - #[cfg(unix)] #[test] #[should_panic(expected = "byte index 1 is not an OsStr boundary")] @@ -288,18 +245,3 @@ fn slice_surrogate_edge() { assert_eq!(post_crab.slice_encoded_bytes(..4), "🦀"); assert_eq!(post_crab.slice_encoded_bytes(4..), surrogate); } - -#[test] -fn clone_to_uninit() { - let a = OsStr::new("hello.txt"); - - let mut storage = vec![MaybeUninit::::uninit(); size_of_val::(a)]; - unsafe { a.clone_to_uninit(ptr::from_mut::<[_]>(storage.as_mut_slice()) as *mut OsStr) }; - assert_eq!(a.as_encoded_bytes(), unsafe { MaybeUninit::slice_assume_init_ref(&storage) }); - - let mut b: Box = OsStr::new("world.exe").into(); - assert_eq!(size_of_val::(a), size_of_val::(&b)); - assert_ne!(a, &*b); - unsafe { a.clone_to_uninit(ptr::from_mut::(&mut b)) }; - assert_eq!(a, &*b); -} diff --git a/library/alloc/src/ffi/os_str/wtf8.rs b/library/alloc/src/ffi/os_str/wtf8.rs new file mode 100644 index 0000000000000..004ba6ca9094f --- /dev/null +++ b/library/alloc/src/ffi/os_str/wtf8.rs @@ -0,0 +1,416 @@ +#![allow(missing_docs)] +#![allow(missing_debug_implementations)] + +//! The underlying OsString/OsStr implementation on Windows is a +//! wrapper around the "WTF-8" encoding; see the `wtf8` module for more. +use core::ffi::os_str::Slice; +use core::ffi::wtf8::Wtf8; +use core::{fmt, mem}; + +use crate::borrow::Cow; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::ffi::wtf8::Wtf8Buf; +use crate::rc::Rc; +use crate::string::String; +use crate::sync::Arc; +use crate::vec::Vec; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[derive(Clone, Hash)] +pub struct Buf { + pub inner: Wtf8Buf, +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl Into for Buf { + fn into(self) -> Wtf8Buf { + self.inner + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl From for Buf { + fn from(inner: Wtf8Buf) -> Self { + Buf { inner } + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl AsRef for Buf { + #[inline] + fn as_ref(&self) -> &Wtf8 { + &self.inner + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Debug for Buf { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(self.as_slice(), formatter) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Display for Buf { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self.as_slice(), formatter) + } +} + +impl Buf { + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_encoded_bytes(self) -> Vec { + self.inner.into_bytes() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub unsafe fn from_encoded_bytes_unchecked(s: Vec) -> Self { + unsafe { Self { inner: Wtf8Buf::from_bytes_unchecked(s) } } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn with_capacity(capacity: usize) -> Buf { + Buf { inner: Wtf8Buf::with_capacity(capacity) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn clear(&mut self) { + self.inner.clear() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn capacity(&self) -> usize { + self.inner.capacity() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn from_string(s: String) -> Buf { + Buf { inner: Wtf8Buf::from_string(s) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn as_slice(&self) -> &Slice { + // SAFETY: Slice is just a wrapper for Wtf8, + // and self.inner.as_slice() returns &Wtf8. + // Therefore, transmuting &Wtf8 to &Slice is safe. + unsafe { mem::transmute(self.inner.as_slice()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn as_mut_slice(&mut self) -> &mut Slice { + // SAFETY: Slice is just a wrapper for Wtf8, + // and self.inner.as_mut_slice() returns &mut Wtf8. + // Therefore, transmuting &mut Wtf8 to &mut Slice is safe. + // Additionally, care should be taken to ensure the slice + // is always valid Wtf8. + unsafe { mem::transmute(self.inner.as_mut_slice()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn into_string(self) -> Result { + self.inner.into_string().map_err(|buf| Buf { inner: buf }) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn push_slice(&mut self, s: &Slice) { + self.inner.push_wtf8(&s.inner) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn reserve(&mut self, additional: usize) { + self.inner.reserve(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.inner.try_reserve(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn reserve_exact(&mut self, additional: usize) { + self.inner.reserve_exact(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.inner.try_reserve_exact(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn shrink_to_fit(&mut self) { + self.inner.shrink_to_fit() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn shrink_to(&mut self, min_capacity: usize) { + self.inner.shrink_to(min_capacity) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn leak<'a>(self) -> &'a mut Slice { + unsafe { mem::transmute(self.inner.leak()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_box(self) -> Box { + unsafe { mem::transmute(self.inner.into_box()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn from_box(boxed: Box) -> Buf { + let inner: Box = unsafe { mem::transmute(boxed) }; + Buf { inner: Wtf8Buf::from_box(inner) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_arc(&self) -> Arc { + self.as_slice().into_arc() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_rc(&self) -> Rc { + self.as_slice().into_rc() + } + + /// Provides plumbing to core `Vec::truncate`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub(crate) fn truncate(&mut self, len: usize) { + self.inner.truncate(len); + } + + /// Provides plumbing to core `Vec::extend_from_slice`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { + self.inner.extend_from_slice(other); + } +} + +impl Slice { + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_string_lossy(&self) -> Cow<'_, str> { + self.inner.to_string_lossy() + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_owned(&self) -> Buf { + Buf { inner: self.inner.to_owned() } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn clone_into(&self, buf: &mut Buf) { + self.inner.clone_into(&mut buf.inner) + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_box(&self) -> Box { + unsafe { mem::transmute(self.inner.into_box()) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn empty_box() -> Box { + unsafe { mem::transmute(Wtf8::empty_box()) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_arc(&self) -> Arc { + let arc = self.inner.into_arc(); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Slice) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_rc(&self) -> Rc { + let rc = self.inner.into_rc(); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn to_ascii_lowercase(&self) -> Buf { + Buf { inner: self.inner.to_ascii_lowercase() } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn to_ascii_uppercase(&self) -> Buf { + Buf { inner: self.inner.to_ascii_uppercase() } + } +} diff --git a/library/alloc/src/ffi/wtf8.rs b/library/alloc/src/ffi/wtf8.rs new file mode 100644 index 0000000000000..c9d9f0dc0c0fb --- /dev/null +++ b/library/alloc/src/ffi/wtf8.rs @@ -0,0 +1,703 @@ +#![allow(missing_docs)] +#![allow(missing_debug_implementations)] + +//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). +//! +//! This library uses Rust’s type system to maintain +//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), +//! like the `String` and `&str` types do for UTF-8. +//! +//! Since [WTF-8 must not be used +//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), +//! this library deliberately does not provide access to the underlying bytes +//! of WTF-8 strings, +//! nor can it decode WTF-8 from arbitrary bytes. +//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. + +// this module is imported from @SimonSapin's repo and has tons of dead code on +// unix (it's mostly used on windows), so don't worry about dead code here. +#![allow(dead_code)] + +#[cfg(test)] +mod tests; + +use core::char::encode_utf8_raw; +use core::ffi::wtf8::*; +use core::hash::{Hash, Hasher}; +use core::{fmt, mem, ops, str}; + +use crate::borrow::{Cow, ToOwned}; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::rc::Rc; +use crate::string::String; +use crate::sync::Arc; +use crate::vec::Vec; + +/// An owned, growable string of well-formed WTF-8 data. +/// +/// Similar to `String`, but can additionally contain surrogate code points +/// if they’re not in a surrogate pair. +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] +pub struct Wtf8Buf { + bytes: Vec, + + /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily + /// know this if we're constructed from a `String` or `&str`. + /// + /// It is possible for `bytes` to have valid UTF-8 without this being + /// set, such as when we're concatenating `&Wtf8`'s and surrogates become + /// paired, as we don't bother to rescan the entire string. + is_known_utf8: bool, +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl ops::Deref for Wtf8Buf { + type Target = Wtf8; + + fn deref(&self) -> &Wtf8 { + self.as_slice() + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl ops::DerefMut for Wtf8Buf { + fn deref_mut(&mut self) -> &mut Wtf8 { + self.as_mut_slice() + } +} + +/// Format the string with double quotes, +/// and surrogates as `\u` followed by four hexadecimal digits. +/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Debug for Wtf8Buf { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, formatter) + } +} + +impl Wtf8Buf { + /// Creates a new, empty WTF-8 string. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn new() -> Wtf8Buf { + Wtf8Buf { bytes: Vec::new(), is_known_utf8: true } + } + + /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn with_capacity(capacity: usize) -> Wtf8Buf { + Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true } + } + + /// Creates a WTF-8 string from a WTF-8 byte vec. + /// + /// Since the byte vec is not checked for valid WTF-8, this functions is + /// marked unsafe. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub unsafe fn from_bytes_unchecked(value: Vec) -> Wtf8Buf { + Wtf8Buf { bytes: value, is_known_utf8: false } + } + + /// Creates a WTF-8 string from a UTF-8 `String`. + /// + /// This takes ownership of the `String` and does not copy. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + pub fn from_string(string: String) -> Wtf8Buf { + Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true } + } + + /// Creates a WTF-8 string from a UTF-8 `&str` slice. + /// + /// This copies the content of the slice. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + pub fn from_str(str: &str) -> Wtf8Buf { + Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true } + } + + pub fn clear(&mut self) { + self.bytes.clear(); + self.is_known_utf8 = true; + } + + /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. + /// + /// This is lossless: calling `.encode_wide()` on the resulting string + /// will always return the original code units. + pub fn from_wide(v: &[u16]) -> Wtf8Buf { + let mut string = Wtf8Buf::with_capacity(v.len()); + for item in char::decode_utf16(v.iter().cloned()) { + match item { + Ok(ch) => string.push_char(ch), + Err(surrogate) => { + let surrogate = surrogate.unpaired_surrogate(); + // Surrogates are known to be in the code point range. + let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; + // The string will now contain an unpaired surrogate. + string.is_known_utf8 = false; + // Skip the WTF-8 concatenation check, + // surrogate pairs are already decoded by decode_utf16 + string.push_code_point_unchecked(code_point); + } + } + } + string + } + + /// Copied from String::push + /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check. + fn push_code_point_unchecked(&mut self, code_point: CodePoint) { + let mut bytes = [0; 4]; + let bytes = encode_utf8_raw(code_point.to_u32(), &mut bytes); + self.bytes.extend_from_slice(bytes) + } + + #[inline] + pub fn as_slice(&self) -> &Wtf8 { + unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } + } + + #[inline] + pub fn as_mut_slice(&mut self) -> &mut Wtf8 { + // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would + // cause them to change from well-formed UTF-8 to ill-formed UTF-8, + // which would break the assumptions of the `is_known_utf8` field. + unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `Wtf8Buf`. + /// The collection may reserve more space to avoid frequent reallocations. + /// + /// # Panics + /// + /// Panics if the new capacity overflows `usize`. + #[inline] + pub fn reserve(&mut self, additional: usize) { + self.bytes.reserve(additional) + } + + /// Tries to reserve capacity for at least `additional` more length units + /// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid + /// frequent reallocations. After calling `try_reserve`, capacity will be + /// greater than or equal to `self.len() + additional`. Does nothing if + /// capacity is already sufficient. This method preserves the contents even + /// if an error occurs. + /// + /// # Errors + /// + /// If the capacity overflows, or the allocator reports a failure, then an error + /// is returned. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.bytes.try_reserve(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn reserve_exact(&mut self, additional: usize) { + self.bytes.reserve_exact(additional) + } + + /// Tries to reserve the minimum capacity for exactly `additional` + /// length units in the given `Wtf8Buf`. After calling + /// `try_reserve_exact`, capacity will be greater than or equal to + /// `self.len() + additional` if it returns `Ok(())`. + /// Does nothing if the capacity is already sufficient. + /// + /// Note that the allocator may give the `Wtf8Buf` more space than it + /// requests. Therefore, capacity can not be relied upon to be precisely + /// minimal. Prefer [`try_reserve`] if future insertions are expected. + /// + /// [`try_reserve`]: Wtf8Buf::try_reserve + /// + /// # Errors + /// + /// If the capacity overflows, or the allocator reports a failure, then an error + /// is returned. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.bytes.try_reserve_exact(additional) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn shrink_to_fit(&mut self) { + self.bytes.shrink_to_fit() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn shrink_to(&mut self, min_capacity: usize) { + self.bytes.shrink_to(min_capacity) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn leak<'a>(self) -> &'a mut Wtf8 { + unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) } + } + + /// Returns the number of bytes that this string buffer can hold without reallocating. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn capacity(&self) -> usize { + self.bytes.capacity() + } + + /// Append a UTF-8 slice at the end of the string. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn push_str(&mut self, other: &str) { + self.bytes.extend_from_slice(other.as_bytes()) + } + + /// Append a WTF-8 slice at the end of the string. + /// + /// This replaces newly paired surrogates at the boundary + /// with a supplementary code point, + /// like concatenating ill-formed UTF-16 strings effectively would. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn push_wtf8(&mut self, other: &Wtf8) { + match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { + // Replace newly paired surrogates by a supplementary code point. + (Some(lead), Some(trail)) => { + let len_without_lead_surrogate = self.len() - 3; + self.bytes.truncate(len_without_lead_surrogate); + let other_without_trail_surrogate = &other.as_bytes()[3..]; + // 4 bytes for the supplementary code point + self.bytes.reserve(4 + other_without_trail_surrogate.len()); + self.push_char(decode_surrogate_pair(lead, trail)); + self.bytes.extend_from_slice(other_without_trail_surrogate); + } + _ => { + // If we'll be pushing a string containing a surrogate, we may + // no longer have UTF-8. + if other.next_surrogate(0).is_some() { + self.is_known_utf8 = false; + } + + self.bytes.extend_from_slice(other.as_bytes()); + } + } + } + + /// Append a Unicode scalar value at the end of the string. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn push_char(&mut self, c: char) { + self.push_code_point_unchecked(CodePoint::from_char(c)) + } + + /// Append a code point at the end of the string. + /// + /// This replaces newly paired surrogates at the boundary + /// with a supplementary code point, + /// like concatenating ill-formed UTF-16 strings effectively would. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn push(&mut self, code_point: CodePoint) { + if let Some(trail) = code_point.to_trail_surrogate() { + if let Some(lead) = (&*self).final_lead_surrogate() { + let len_without_lead_surrogate = self.len() - 3; + self.bytes.truncate(len_without_lead_surrogate); + self.push_char(decode_surrogate_pair(lead, trail)); + return; + } + + // We're pushing a trailing surrogate. + self.is_known_utf8 = false; + } else if code_point.to_lead_surrogate().is_some() { + // We're pushing a leading surrogate. + self.is_known_utf8 = false; + } + + // No newly paired surrogates at the boundary. + self.push_code_point_unchecked(code_point) + } + + /// Shortens a string to the specified length. + /// + /// # Panics + /// + /// Panics if `new_len` > current length, + /// or if `new_len` is not a code point boundary. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn truncate(&mut self, new_len: usize) { + assert!(is_code_point_boundary(self, new_len)); + self.bytes.truncate(new_len) + } + + /// Consumes the WTF-8 string and tries to convert it to a vec of bytes. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_bytes(self) -> Vec { + self.bytes + } + + /// Consumes the WTF-8 string and tries to convert it to UTF-8. + /// + /// This does not copy the data. + /// + /// If the contents are not well-formed UTF-8 + /// (that is, if the string contains surrogates), + /// the original WTF-8 string is returned instead. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn into_string(self) -> Result { + if self.is_known_utf8 || self.next_surrogate(0).is_none() { + Ok(unsafe { String::from_utf8_unchecked(self.bytes) }) + } else { + Err(self) + } + } + + /// Consumes the WTF-8 string and converts it lossily to UTF-8. + /// + /// This does not copy the data (but may overwrite parts of it in place). + /// + /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn into_string_lossy(mut self) -> String { + // Fast path: If we already have UTF-8, we can return it immediately. + if self.is_known_utf8 { + return unsafe { String::from_utf8_unchecked(self.bytes) }; + } + + let mut pos = 0; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + pos = surrogate_pos + 3; + self.bytes[surrogate_pos..pos] + .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); + } + None => return unsafe { String::from_utf8_unchecked(self.bytes) }, + } + } + } + + /// Converts this `Wtf8Buf` into a boxed `Wtf8`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_box(self) -> Box { + // SAFETY: relies on `Wtf8` being `repr(transparent)`. + unsafe { mem::transmute(self.bytes.into_boxed_slice()) } + } + + /// Converts a `Box` into a `Wtf8Buf`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn from_box(boxed: Box) -> Wtf8Buf { + let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; + Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false } + } + + /// Provides plumbing to core `Vec::extend_from_slice`. + /// More well behaving alternative to allowing outer types + /// full mutable access to the core `Vec`. + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { + self.bytes.extend_from_slice(other); + self.is_known_utf8 = false; + } +} + +/// Creates a new WTF-8 string from an iterator of code points. +/// +/// This replaces surrogate code point pairs with supplementary code points, +/// like concatenating ill-formed UTF-16 strings effectively would. +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl FromIterator for Wtf8Buf { + fn from_iter>(iter: T) -> Wtf8Buf { + let mut string = Wtf8Buf::new(); + string.extend(iter); + string + } +} + +/// Append code points from an iterator to the string. +/// +/// This replaces surrogate code point pairs with supplementary code points, +/// like concatenating ill-formed UTF-16 strings effectively would. +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl Extend for Wtf8Buf { + fn extend>(&mut self, iter: T) { + let iterator = iter.into_iter(); + let (low, _high) = iterator.size_hint(); + // Lower bound of one byte per code point (ASCII only) + self.bytes.reserve(low); + iterator.for_each(move |code_point| self.push(code_point)); + } + + #[inline] + fn extend_one(&mut self, code_point: CodePoint) { + self.push(code_point); + } + + #[inline] + fn extend_reserve(&mut self, additional: usize) { + // Lower bound of one byte per code point (ASCII only) + self.bytes.reserve(additional); + } +} + +impl Wtf8 { + /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_owned(&self) -> Wtf8Buf { + Wtf8Buf { bytes: self.as_bytes().to_vec(), is_known_utf8: false } + } + + /// Lossily converts the string to UTF-8. + /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. + /// + /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). + /// + /// This only copies the data if necessary (if it contains any surrogate). + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_string_lossy(&self) -> Cow<'_, str> { + let surrogate_pos = match self.next_surrogate(0) { + None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(self.as_bytes()) }), + Some((pos, _)) => pos, + }; + let wtf8_bytes = self.as_bytes(); + let mut utf8_bytes = Vec::with_capacity(self.len()); + utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); + utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); + let mut pos = surrogate_pos + 3; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); + utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); + pos = surrogate_pos + 3; + } + None => { + utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); + return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); + } + } + } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn clone_into(&self, buf: &mut Wtf8Buf) { + buf.is_known_utf8 = false; + self.as_bytes().clone_into(&mut buf.bytes); + } + + /// Boxes this `Wtf8`. + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_box(&self) -> Box { + let boxed: Box<[u8]> = self.as_bytes().into(); + unsafe { mem::transmute(boxed) } + } + + /// Creates a boxed, empty `Wtf8`. + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn empty_box() -> Box { + let boxed: Box<[u8]> = Default::default(); + unsafe { mem::transmute(boxed) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_arc(&self) -> Arc { + let arc: Arc<[u8]> = Arc::from(self.as_bytes()); + unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn into_rc(&self) -> Rc { + let rc: Rc<[u8]> = Rc::from(self.as_bytes()); + unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn to_ascii_lowercase(&self) -> Wtf8Buf { + Wtf8Buf { bytes: self.as_bytes().to_ascii_lowercase(), is_known_utf8: false } + } + + #[rustc_allow_incoherent_impl] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn to_ascii_uppercase(&self) -> Wtf8Buf { + Wtf8Buf { bytes: self.as_bytes().to_ascii_uppercase(), is_known_utf8: false } + } +} + +impl Hash for Wtf8Buf { + #[inline] + fn hash(&self, state: &mut H) { + state.write(&self.bytes); + 0xfeu8.hash(state) + } +} diff --git a/library/alloc/src/ffi/wtf8/tests.rs b/library/alloc/src/ffi/wtf8/tests.rs new file mode 100644 index 0000000000000..8f9c11a1c9ea8 --- /dev/null +++ b/library/alloc/src/ffi/wtf8/tests.rs @@ -0,0 +1,616 @@ +use super::*; + +#[test] +fn wtf8buf_new() { + assert_eq!(Wtf8Buf::new().bytes, b""); +} + +#[test] +fn wtf8buf_from_str() { + assert_eq!(Wtf8Buf::from_str("").bytes, b""); + assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); +} + +#[test] +fn wtf8buf_from_string() { + assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b""); + assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); +} + +#[test] +fn wtf8buf_from_wide() { + let buf = Wtf8Buf::from_wide(&[]); + assert_eq!(buf.bytes, b""); + assert!(buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xDCA9]); + assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]); + assert_eq!(buf.bytes, b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xD800]); + assert_eq!(buf.bytes, b"\xED\xA0\x80"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xDBFF]); + assert_eq!(buf.bytes, b"\xED\xAF\xBF"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xDC00]); + assert_eq!(buf.bytes, b"\xED\xB0\x80"); + assert!(!buf.is_known_utf8); + + let buf = Wtf8Buf::from_wide(&[0xDFFF]); + assert_eq!(buf.bytes, b"\xED\xBF\xBF"); + assert!(!buf.is_known_utf8); +} + +#[test] +fn wtf8buf_push_str() { + let mut string = Wtf8Buf::new(); + assert_eq!(string.bytes, b""); + assert!(string.is_known_utf8); + + string.push_str("aé 💩"); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); +} + +#[test] +fn wtf8buf_push_char() { + let mut string = Wtf8Buf::from_str("aé "); + assert_eq!(string.bytes, b"a\xC3\xA9 "); + assert!(string.is_known_utf8); + + string.push_char('💩'); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); +} + +#[test] +fn wtf8buf_push() { + let mut string = Wtf8Buf::from_str("aé "); + assert_eq!(string.bytes, b"a\xC3\xA9 "); + assert!(string.is_known_utf8); + + string.push(CodePoint::from_char('💩')); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); + + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + + let mut string = Wtf8Buf::new(); + string.push(c(0xD83D)); // lead + assert!(!string.is_known_utf8); + string.push(c(0xDCA9)); // trail + assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! + + let mut string = Wtf8Buf::new(); + string.push(c(0xD83D)); // lead + assert!(!string.is_known_utf8); + string.push(c(0x20)); // not surrogate + string.push(c(0xDCA9)); // trail + assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xD800)); // lead + assert!(!string.is_known_utf8); + string.push(c(0xDBFF)); // lead + assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xD800)); // lead + assert!(!string.is_known_utf8); + string.push(c(0xE000)); // not surrogate + assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xD7FF)); // not surrogate + assert!(string.is_known_utf8); + string.push(c(0xDC00)); // trail + assert!(!string.is_known_utf8); + assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + + let mut string = Wtf8Buf::new(); + string.push(c(0x61)); // not surrogate, < 3 bytes + assert!(string.is_known_utf8); + string.push(c(0xDC00)); // trail + assert!(!string.is_known_utf8); + assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + + let mut string = Wtf8Buf::new(); + string.push(c(0xDC00)); // trail + assert!(!string.is_known_utf8); + assert_eq!(string.bytes, b"\xED\xB0\x80"); +} + +#[test] +fn wtf8buf_push_wtf8() { + let mut string = Wtf8Buf::from_str("aé"); + assert_eq!(string.bytes, b"a\xC3\xA9"); + string.push_wtf8(Wtf8::from_str(" 💩")); + assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); + + fn w(v: &[u8]) -> &Wtf8 { + unsafe { Wtf8::from_bytes_unchecked(v) } + } + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\xBD")); // lead + string.push_wtf8(w(b"\xED\xB2\xA9")); // trail + assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\xBD")); // lead + string.push_wtf8(w(b" ")); // not surrogate + string.push_wtf8(w(b"\xED\xB2\xA9")); // trail + assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); + assert!(!string.is_known_utf8); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\x80")); // lead + string.push_wtf8(w(b"\xED\xAF\xBF")); // lead + assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); + assert!(!string.is_known_utf8); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xA0\x80")); // lead + string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate + assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); + assert!(!string.is_known_utf8); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate + string.push_wtf8(w(b"\xED\xB0\x80")); // trail + assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); + assert!(!string.is_known_utf8); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes + string.push_wtf8(w(b"\xED\xB0\x80")); // trail + assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); + assert!(!string.is_known_utf8); + + let mut string = Wtf8Buf::new(); + string.push_wtf8(w(b"\xED\xB0\x80")); // trail + assert_eq!(string.bytes, b"\xED\xB0\x80"); + assert!(!string.is_known_utf8); +} + +#[test] +fn wtf8buf_truncate() { + let mut string = Wtf8Buf::from_str("aé"); + assert!(string.is_known_utf8); + + string.truncate(3); + assert_eq!(string.bytes, b"a\xC3\xA9"); + assert!(string.is_known_utf8); + + string.truncate(1); + assert_eq!(string.bytes, b"a"); + assert!(string.is_known_utf8); + + string.truncate(0); + assert_eq!(string.bytes, b""); + assert!(string.is_known_utf8); +} + +#[test] +fn wtf8buf_truncate_around_non_bmp() { + let mut string = Wtf8Buf::from_str("💩"); + assert!(string.is_known_utf8); + + string.truncate(4); + assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); + assert!(string.is_known_utf8); + + string.truncate(0); + assert_eq!(string.bytes, b""); + assert!(string.is_known_utf8); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_fail_code_point_boundary() { + let mut string = Wtf8Buf::from_str("aé"); + string.truncate(2); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_fail_longer() { + let mut string = Wtf8Buf::from_str("aé"); + string.truncate(4); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_splitting_non_bmp3() { + let mut string = Wtf8Buf::from_str("💩"); + assert!(string.is_known_utf8); + string.truncate(3); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_splitting_non_bmp2() { + let mut string = Wtf8Buf::from_str("💩"); + assert!(string.is_known_utf8); + string.truncate(2); +} + +#[test] +#[should_panic] +fn wtf8buf_truncate_splitting_non_bmp1() { + let mut string = Wtf8Buf::from_str("💩"); + assert!(string.is_known_utf8); + string.truncate(1); +} + +#[test] +fn wtf8buf_into_string() { + let mut string = Wtf8Buf::from_str("aé 💩"); + assert!(string.is_known_utf8); + assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩"))); + string.push(CodePoint::from_u32(0xD800).unwrap()); + assert!(!string.is_known_utf8); + assert_eq!(string.clone().into_string(), Err(string)); +} + +#[test] +fn wtf8buf_into_string_lossy() { + let mut string = Wtf8Buf::from_str("aé 💩"); + assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩")); + string.push(CodePoint::from_u32(0xD800).unwrap()); + assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�")); +} + +#[test] +fn wtf8buf_from_iterator() { + fn f(values: &[u32]) -> Wtf8Buf { + values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::() + } + assert_eq!( + f(&[0x61, 0xE9, 0x20, 0x1F4A9]), + Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true } + ); + + assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!( + f(&[0xD83D, 0x20, 0xDCA9]), + Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0xD800, 0xDBFF]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xED\xAF\xBF".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0xD800, 0xE000]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xEE\x80\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0xD7FF, 0xDC00]), + Wtf8Buf { bytes: b"\xED\x9F\xBF\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + f(&[0x61, 0xDC00]), + Wtf8Buf { bytes: b"\x61\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!(f(&[0xDC00]), Wtf8Buf { bytes: b"\xED\xB0\x80".to_vec(), is_known_utf8: false }); +} + +#[test] +fn wtf8buf_extend() { + fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf { + fn c(value: &u32) -> CodePoint { + CodePoint::from_u32(*value).unwrap() + } + let mut string = initial.iter().map(c).collect::(); + string.extend(extended.iter().map(c)); + string + } + + assert_eq!( + e(&[0x61, 0xE9], &[0x20, 0x1F4A9]), + Wtf8Buf { bytes: b"a\xC3\xA9 \xF0\x9F\x92\xA9".to_vec(), is_known_utf8: true } + ); + + assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! + assert_eq!( + e(&[0xD83D, 0x20], &[0xDCA9]), + Wtf8Buf { bytes: b"\xED\xA0\xBD \xED\xB2\xA9".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0xD800], &[0xDBFF]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xED\xAF\xBF".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0xD800], &[0xE000]), + Wtf8Buf { bytes: b"\xED\xA0\x80\xEE\x80\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0xD7FF], &[0xDC00]), + Wtf8Buf { bytes: b"\xED\x9F\xBF\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[0x61], &[0xDC00]), + Wtf8Buf { bytes: b"\x61\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); + assert_eq!( + e(&[], &[0xDC00]), + Wtf8Buf { bytes: b"\xED\xB0\x80".to_vec(), is_known_utf8: false } + ); +} + +#[test] +fn wtf8buf_show() { + let mut string = Wtf8Buf::from_str("a\té \u{7f}💩\r"); + string.push(CodePoint::from_u32(0xD800).unwrap()); + assert_eq!(format!("{string:?}"), "\"a\\té \\u{7f}\u{1f4a9}\\r\\u{d800}\""); +} + +#[test] +fn wtf8buf_as_slice() { + assert_eq!(Wtf8Buf::from_str("aé").as_slice(), Wtf8::from_str("aé")); +} + +#[test] +fn wtf8buf_show_str() { + let text = "a\té 💩\r"; + let string = Wtf8Buf::from_str(text); + assert_eq!(format!("{text:?}"), format!("{string:?}")); +} + +#[test] +fn wtf8_code_points() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + fn cp(string: &Wtf8Buf) -> Vec> { + string.code_points().map(|c| c.to_char()).collect::>() + } + let mut string = Wtf8Buf::from_str("é "); + assert_eq!(cp(&string), [Some('é'), Some(' ')]); + string.push(c(0xD83D)); + assert_eq!(cp(&string), [Some('é'), Some(' '), None]); + string.push(c(0xDCA9)); + assert_eq!(cp(&string), [Some('é'), Some(' '), Some('💩')]); +} + +#[test] +fn wtf8_as_str() { + assert_eq!(Wtf8::from_str("").as_str(), Ok("")); + assert_eq!(Wtf8::from_str("aé 💩").as_str(), Ok("aé 💩")); + let mut string = Wtf8Buf::new(); + string.push(CodePoint::from_u32(0xD800).unwrap()); + assert!(string.as_str().is_err()); +} + +#[test] +fn wtf8_to_string_lossy() { + assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed("")); + assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩")); + let mut string = Wtf8Buf::from_str("aé 💩"); + string.push(CodePoint::from_u32(0xD800).unwrap()); + let expected: Cow<'_, str> = Cow::Owned(String::from("aé 💩�")); + assert_eq!(string.to_string_lossy(), expected); +} + +#[test] +fn wtf8_display() { + fn d(b: &[u8]) -> String { + (&unsafe { Wtf8::from_bytes_unchecked(b) }).to_string() + } + + assert_eq!("", d("".as_bytes())); + assert_eq!("aé 💩", d("aé 💩".as_bytes())); + + let mut string = Wtf8Buf::from_str("aé 💩"); + string.push(CodePoint::from_u32(0xD800).unwrap()); + assert_eq!("aé 💩�", d(string.as_inner())); +} + +#[test] +fn wtf8_encode_wide() { + let mut string = Wtf8Buf::from_str("aé "); + string.push(CodePoint::from_u32(0xD83D).unwrap()); + string.push_char('💩'); + assert_eq!( + string.encode_wide().collect::>(), + vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9] + ); +} + +#[test] +fn wtf8_encode_wide_size_hint() { + let string = Wtf8Buf::from_str("\u{12345}"); + let mut iter = string.encode_wide(); + assert_eq!((1, Some(8)), iter.size_hint()); + iter.next().unwrap(); + assert_eq!((1, Some(1)), iter.size_hint()); + iter.next().unwrap(); + assert_eq!((0, Some(0)), iter.size_hint()); + assert!(iter.next().is_none()); +} + +#[test] +fn wtf8_clone_into() { + let mut string = Wtf8Buf::new(); + Wtf8::from_str("green").clone_into(&mut string); + assert_eq!(string.bytes, b"green"); + + let mut string = Wtf8Buf::from_str("green"); + Wtf8::from_str("").clone_into(&mut string); + assert_eq!(string.bytes, b""); + + let mut string = Wtf8Buf::from_str("red"); + Wtf8::from_str("green").clone_into(&mut string); + assert_eq!(string.bytes, b"green"); + + let mut string = Wtf8Buf::from_str("green"); + Wtf8::from_str("red").clone_into(&mut string); + assert_eq!(string.bytes, b"red"); + + let mut string = Wtf8Buf::from_str("green"); + assert!(string.is_known_utf8); + unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").clone_into(&mut string) }; + assert_eq!(string.bytes, b"\xED\xA0\x80"); + assert!(!string.is_known_utf8); +} + +#[test] +fn wtf8_to_ascii_lowercase() { + let lowercase = Wtf8::from_str("").to_ascii_lowercase(); + assert_eq!(lowercase.bytes, b""); + + let lowercase = Wtf8::from_str("GrEeN gRaPeS! 🍇").to_ascii_lowercase(); + assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); + + let lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_lowercase() }; + assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); + assert!(!lowercase.is_known_utf8); +} + +#[test] +fn wtf8_to_ascii_uppercase() { + let uppercase = Wtf8::from_str("").to_ascii_uppercase(); + assert_eq!(uppercase.bytes, b""); + + let uppercase = Wtf8::from_str("GrEeN gRaPeS! 🍇").to_ascii_uppercase(); + assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); + + let uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_ascii_uppercase() }; + assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); + assert!(!uppercase.is_known_utf8); +} + +#[test] +fn wtf8_make_ascii_lowercase() { + let mut lowercase = Wtf8Buf::from_str(""); + lowercase.make_ascii_lowercase(); + assert_eq!(lowercase.bytes, b""); + + let mut lowercase = Wtf8Buf::from_str("GrEeN gRaPeS! 🍇"); + lowercase.make_ascii_lowercase(); + assert_eq!(lowercase.bytes, b"green grapes! \xf0\x9f\x8d\x87"); + + let mut lowercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + lowercase.make_ascii_lowercase(); + assert_eq!(lowercase.bytes, b"\xED\xA0\x80"); + assert!(!lowercase.is_known_utf8); +} + +#[test] +fn wtf8_make_ascii_uppercase() { + let mut uppercase = Wtf8Buf::from_str(""); + uppercase.make_ascii_uppercase(); + assert_eq!(uppercase.bytes, b""); + + let mut uppercase = Wtf8Buf::from_str("GrEeN gRaPeS! 🍇"); + uppercase.make_ascii_uppercase(); + assert_eq!(uppercase.bytes, b"GREEN GRAPES! \xf0\x9f\x8d\x87"); + + let mut uppercase = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + uppercase.make_ascii_uppercase(); + assert_eq!(uppercase.bytes, b"\xED\xA0\x80"); + assert!(!uppercase.is_known_utf8); +} + +#[test] +fn wtf8_to_owned() { + let string = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + assert_eq!(string.bytes, b"\xED\xA0\x80"); + assert!(!string.is_known_utf8); +} + +#[test] +fn wtf8_valid_utf8_boundaries() { + let mut string = Wtf8Buf::from_str("aé 💩"); + string.push(CodePoint::from_u32(0xD800).unwrap()); + string.push(CodePoint::from_u32(0xD800).unwrap()); + check_utf8_boundary(&string, 0); + check_utf8_boundary(&string, 1); + check_utf8_boundary(&string, 3); + check_utf8_boundary(&string, 4); + check_utf8_boundary(&string, 8); + check_utf8_boundary(&string, 14); + assert_eq!(string.len(), 14); + + string.push_char('a'); + check_utf8_boundary(&string, 14); + check_utf8_boundary(&string, 15); + + let mut string = Wtf8Buf::from_str("a"); + string.push(CodePoint::from_u32(0xD800).unwrap()); + check_utf8_boundary(&string, 1); + + let mut string = Wtf8Buf::from_str("\u{D7FF}"); + string.push(CodePoint::from_u32(0xD800).unwrap()); + check_utf8_boundary(&string, 3); + + let mut string = Wtf8Buf::new(); + string.push(CodePoint::from_u32(0xD800).unwrap()); + string.push_char('\u{D7FF}'); + check_utf8_boundary(&string, 3); +} + +#[test] +#[should_panic(expected = "byte index 4 is out of bounds")] +fn wtf8_utf8_boundary_out_of_bounds() { + let string = Wtf8::from_str("aé"); + check_utf8_boundary(&string, 4); +} + +#[test] +#[should_panic(expected = "byte index 1 is not a codepoint boundary")] +fn wtf8_utf8_boundary_inside_codepoint() { + let string = Wtf8::from_str("é"); + check_utf8_boundary(&string, 1); +} + +#[test] +#[should_panic(expected = "byte index 1 is not a codepoint boundary")] +fn wtf8_utf8_boundary_inside_surrogate() { + let mut string = Wtf8Buf::new(); + string.push(CodePoint::from_u32(0xD800).unwrap()); + check_utf8_boundary(&string, 1); +} + +#[test] +#[should_panic(expected = "byte index 3 lies between surrogate codepoints")] +fn wtf8_utf8_boundary_between_surrogates() { + let mut string = Wtf8Buf::new(); + string.push(CodePoint::from_u32(0xD800).unwrap()); + string.push(CodePoint::from_u32(0xD800).unwrap()); + check_utf8_boundary(&string, 3); +} + +#[test] +fn wobbled_wtf8_plus_bytes_isnt_utf8() { + let mut string: Wtf8Buf = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + assert!(!string.is_known_utf8); + string.extend_from_slice(b"some utf-8"); + assert!(!string.is_known_utf8); +} + +#[test] +fn wobbled_wtf8_plus_str_isnt_utf8() { + let mut string: Wtf8Buf = unsafe { Wtf8::from_bytes_unchecked(b"\xED\xA0\x80").to_owned() }; + assert!(!string.is_known_utf8); + string.push_str("some utf-8"); + assert!(!string.is_known_utf8); +} + +#[test] +fn unwobbly_wtf8_plus_utf8_is_utf8() { + let mut string: Wtf8Buf = Wtf8Buf::from_str("hello world"); + assert!(string.is_known_utf8); + string.push_str("some utf-8"); + assert!(string.is_known_utf8); +} diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index 5e4b08df6cb55..45810cb4fc95f 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -104,6 +104,7 @@ #![feature(async_closure)] #![feature(async_fn_traits)] #![feature(async_iterator)] +#![feature(char_internals)] #![feature(clone_to_uninit)] #![feature(coerce_unsized)] #![feature(const_align_of_val)] @@ -135,6 +136,7 @@ #![feature(local_waker)] #![feature(maybe_uninit_slice)] #![feature(maybe_uninit_uninit_array_transpose)] +#![feature(os_str_internals)] #![feature(panic_internals)] #![feature(pattern)] #![feature(pin_coerce_unsized_trait)] diff --git a/library/core/src/ffi/mod.rs b/library/core/src/ffi/mod.rs index ec1f9052a1564..b0d7993530118 100644 --- a/library/core/src/ffi/mod.rs +++ b/library/core/src/ffi/mod.rs @@ -23,6 +23,22 @@ use crate::fmt; #[unstable(feature = "c_str_module", issue = "112134")] pub mod c_str; +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub mod os_str; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub mod wtf8; + #[unstable( feature = "c_variadic", issue = "44930", diff --git a/library/core/src/ffi/os_str.rs b/library/core/src/ffi/os_str.rs new file mode 100644 index 0000000000000..f010d7f0e7f0c --- /dev/null +++ b/library/core/src/ffi/os_str.rs @@ -0,0 +1,626 @@ +//! [`OsStr`] abd their related types. + +use crate::clone::CloneToUninit; +use crate::hash::{Hash, Hasher}; +use crate::ops::{self, Range}; +use crate::ptr::addr_of_mut; +use crate::{cmp, fmt, slice}; + +mod private { + /// This trait being unreachable from outside the crate + /// prevents outside implementations of our extension traits. + /// This allows adding more trait methods in the future. + #[unstable(feature = "sealed", issue = "none")] + pub trait Sealed {} +} + +#[cfg(any(target_os = "windows", target_os = "uefi"))] +mod wtf8; + +#[cfg(any(target_os = "windows", target_os = "uefi"))] +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub use wtf8::Slice; + +#[cfg(not(any(target_os = "windows", target_os = "uefi")))] +mod bytes; + +#[cfg(not(any(target_os = "windows", target_os = "uefi")))] +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +pub use bytes::Slice; + +#[cfg(any(target_os = "windows", target_os = "uefi"))] +#[stable(feature = "rust1", since = "1.0.0")] +pub mod os_str_ext_windows; + +#[cfg(not(any(target_os = "windows", target_os = "uefi")))] +#[stable(feature = "rust1", since = "1.0.0")] +pub mod os_str_ext_unix; + +/// Borrowed reference to an OS string (see [`OsString`]). +/// +/// This type represents a borrowed reference to a string in the operating system's preferred +/// representation. +/// +/// `&OsStr` is to [`OsString`] as &[str] is to [`String`]: the +/// former in each pair are borrowed references; the latter are owned strings. +/// +/// See the [module's toplevel documentation about conversions][conversions] for a discussion on +/// the traits which `OsStr` implements for [conversions] from/to native representations. +/// +/// [conversions]: super#conversions +#[cfg_attr(not(test), rustc_diagnostic_item = "OsStr")] +#[stable(feature = "rust1", since = "1.0.0")] +// `OsStr::from_inner` current implementation relies +// on `OsStr` being layout-compatible with `Slice`. +// However, `OsStr` layout is considered an implementation detail and must not be relied upon. +#[repr(transparent)] +#[rustc_has_incoherent_inherent_impls] +pub struct OsStr { + inner: Slice, +} + +/// Allows extension traits within `std`. +#[unstable(feature = "sealed", issue = "none")] +impl private::Sealed for OsStr {} + +impl OsStr { + /// Coerces into an `OsStr` slice. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("foo"); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn new + ?Sized>(s: &S) -> &OsStr { + s.as_ref() + } + + /// Converts a slice of bytes to an OS string slice without checking that the string contains + /// valid `OsStr`-encoded data. + /// + /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. + /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit + /// ASCII. + /// + /// See the [module's toplevel documentation about conversions][conversions] for safe, + /// cross-platform [conversions] from/to native representations. + /// + /// # Safety + /// + /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of + /// validated UTF-8 and bytes from [`OsStr::as_encoded_bytes`] from within the same Rust version + /// built for the same target platform. For example, reconstructing an `OsStr` from bytes sent + /// over the network or stored in a file will likely violate these safety rules. + /// + /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_encoded_bytes`] can be + /// split either immediately before or immediately after any valid non-empty UTF-8 substring. + /// + /// # Example + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("Mary had a little lamb"); + /// let bytes = os_str.as_encoded_bytes(); + /// let words = bytes.split(|b| *b == b' '); + /// let words: Vec<&OsStr> = words.map(|word| { + /// // SAFETY: + /// // - Each `word` only contains content that originated from `OsStr::as_encoded_bytes` + /// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring + /// unsafe { OsStr::from_encoded_bytes_unchecked(word) } + /// }).collect(); + /// ``` + /// + /// [conversions]: super#conversions + #[inline] + #[stable(feature = "os_str_bytes", since = "1.74.0")] + pub unsafe fn from_encoded_bytes_unchecked(bytes: &[u8]) -> &Self { + // SAFETY: unsafe fn + Self::from_inner(unsafe { Slice::from_encoded_bytes_unchecked(bytes) }) + } + + /// Create immutable [`OsStr`] from [`Slice`] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + #[doc(hidden)] + pub fn from_inner(inner: &Slice) -> &OsStr { + // SAFETY: OsStr is just a wrapper of Slice, + // therefore converting &Slice to &OsStr is safe. + unsafe { &*(inner as *const Slice as *const OsStr) } + } + + /// Create mutable [`OsStr`] from [`Slice`] + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + #[doc(hidden)] + pub fn from_inner_mut(inner: &mut Slice) -> &mut OsStr { + // SAFETY: OsStr is just a wrapper of Slice, + // therefore converting &mut Slice to &mut OsStr is safe. + // Any method that mutates OsStr must be careful not to + // break platform-specific encoding, in particular Wtf8 on Windows. + unsafe { &mut *(inner as *mut Slice as *mut OsStr) } + } + + /// Yields a &[str] slice if the `OsStr` is valid Unicode. + /// + /// This conversion may entail doing a check for UTF-8 validity. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("foo"); + /// assert_eq!(os_str.to_str(), Some("foo")); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use = "this returns the result of the operation, \ + without modifying the original"] + #[inline] + pub fn to_str(&self) -> Option<&str> { + self.inner.to_str().ok() + } + + /// Checks whether the `OsStr` is empty. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new(""); + /// assert!(os_str.is_empty()); + /// + /// let os_str = OsStr::new("foo"); + /// assert!(!os_str.is_empty()); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[must_use] + #[inline] + pub fn is_empty(&self) -> bool { + self.inner.inner.is_empty() + } + + /// Returns the length of this `OsStr`. + /// + /// Note that this does **not** return the number of bytes in the string in + /// OS string form. + /// + /// The length returned is that of the underlying storage used by `OsStr`. + /// As discussed in the [`OsString`] introduction, [`OsString`] and `OsStr` + /// store strings in a form best suited for cheap inter-conversion between + /// native-platform and Rust string forms, which may differ significantly + /// from both of them, including in storage size and encoding. + /// + /// This number is simply useful for passing to other methods, like + /// [`OsString::with_capacity`] to avoid reallocations. + /// + /// See the main `OsString` documentation information about encoding and capacity units. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new(""); + /// assert_eq!(os_str.len(), 0); + /// + /// let os_str = OsStr::new("foo"); + /// assert_eq!(os_str.len(), 3); + /// ``` + #[stable(feature = "osstring_simple_functions", since = "1.9.0")] + #[must_use] + #[inline] + pub fn len(&self) -> usize { + self.inner.inner.len() + } + + /// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS + /// string slice, use the [`OsStr::from_encoded_bytes_unchecked`] function. + /// + /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. + /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit + /// ASCII. + /// + /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should + /// be treated as opaque and only comparable within the same Rust version built for the same + /// target platform. For example, sending the slice over the network or storing it in a file + /// will likely result in incompatible byte slices. See [`OsString`] for more encoding details + /// and [`std::ffi`] for platform-specific, specified conversions. + /// + /// [`std::ffi`]: crate::ffi + #[inline] + #[stable(feature = "os_str_bytes", since = "1.74.0")] + pub fn as_encoded_bytes(&self) -> &[u8] { + self.inner.as_encoded_bytes() + } + + /// Takes a substring based on a range that corresponds to the return value of + /// [`OsStr::as_encoded_bytes`]. + /// + /// The range's start and end must lie on valid `OsStr` boundaries. + /// A valid `OsStr` boundary is one of: + /// - The start of the string + /// - The end of the string + /// - Immediately before a valid non-empty UTF-8 substring + /// - Immediately after a valid non-empty UTF-8 substring + /// + /// # Panics + /// + /// Panics if `range` does not lie on valid `OsStr` boundaries or if it + /// exceeds the end of the string. + /// + /// # Example + /// + /// ``` + /// #![feature(os_str_slice)] + /// + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("foo=bar"); + /// let bytes = os_str.as_encoded_bytes(); + /// if let Some(index) = bytes.iter().position(|b| *b == b'=') { + /// let key = os_str.slice_encoded_bytes(..index); + /// let value = os_str.slice_encoded_bytes(index + 1..); + /// assert_eq!(key, "foo"); + /// assert_eq!(value, "bar"); + /// } + /// ``` + #[unstable(feature = "os_str_slice", issue = "118485")] + pub fn slice_encoded_bytes>(&self, range: R) -> &Self { + let encoded_bytes = self.as_encoded_bytes(); + let Range { start, end } = slice::range(range, ..encoded_bytes.len()); + + // `check_public_boundary` should panic if the index does not lie on an + // `OsStr` boundary as described above. It's possible to do this in an + // encoding-agnostic way, but details of the internal encoding might + // permit a more efficient implementation. + self.inner.check_public_boundary(start); + self.inner.check_public_boundary(end); + + // SAFETY: `slice::range` ensures that `start` and `end` are valid + let slice = unsafe { encoded_bytes.get_unchecked(start..end) }; + + // SAFETY: `slice` comes from `self` and we validated the boundaries + unsafe { Self::from_encoded_bytes_unchecked(slice) } + } + + /// Converts this string to its ASCII lower case equivalent in-place. + /// + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', + /// but non-ASCII letters are unchanged. + /// + /// To return a new lowercased value without modifying the existing one, use + /// [`OsStr::to_ascii_lowercase`]. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut s = OsString::from("GRÜßE, JÜRGEN ❤"); + /// + /// s.make_ascii_lowercase(); + /// + /// assert_eq!("grÜße, jÜrgen ❤", s); + /// ``` + #[stable(feature = "osstring_ascii", since = "1.53.0")] + #[inline] + pub fn make_ascii_lowercase(&mut self) { + self.inner.make_ascii_lowercase() + } + + /// Converts this string to its ASCII upper case equivalent in-place. + /// + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', + /// but non-ASCII letters are unchanged. + /// + /// To return a new uppercased value without modifying the existing one, use + /// [`OsStr::to_ascii_uppercase`]. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let mut s = OsString::from("Grüße, Jürgen ❤"); + /// + /// s.make_ascii_uppercase(); + /// + /// assert_eq!("GRüßE, JüRGEN ❤", s); + /// ``` + #[stable(feature = "osstring_ascii", since = "1.53.0")] + #[inline] + pub fn make_ascii_uppercase(&mut self) { + self.inner.make_ascii_uppercase() + } + + /// Checks if all characters in this string are within the ASCII range. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// let ascii = OsString::from("hello!\n"); + /// let non_ascii = OsString::from("Grüße, Jürgen ❤"); + /// + /// assert!(ascii.is_ascii()); + /// assert!(!non_ascii.is_ascii()); + /// ``` + #[stable(feature = "osstring_ascii", since = "1.53.0")] + #[must_use] + #[inline] + pub fn is_ascii(&self) -> bool { + self.inner.is_ascii() + } + + /// Checks that two strings are an ASCII case-insensitive match. + /// + /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, + /// but without allocating and copying temporaries. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// + /// assert!(OsString::from("Ferris").eq_ignore_ascii_case("FERRIS")); + /// assert!(OsString::from("Ferrös").eq_ignore_ascii_case("FERRöS")); + /// assert!(!OsString::from("Ferrös").eq_ignore_ascii_case("FERRÖS")); + /// ``` + #[stable(feature = "osstring_ascii", since = "1.53.0")] + pub fn eq_ignore_ascii_case>(&self, other: S) -> bool { + self.inner.eq_ignore_ascii_case(&other.as_ref().inner) + } + + /// Returns an object that implements [`Display`] for safely printing an + /// [`OsStr`] that may contain non-Unicode data. This may perform lossy + /// conversion, depending on the platform. If you would like an + /// implementation which escapes the [`OsStr`] please use [`Debug`] + /// instead. + /// + /// [`Display`]: fmt::Display + /// [`Debug`]: fmt::Debug + /// + /// # Examples + /// + /// ``` + /// #![feature(os_str_display)] + /// use std::ffi::OsStr; + /// + /// let s = OsStr::new("Hello, world!"); + /// println!("{}", s.display()); + /// ``` + #[unstable(feature = "os_str_display", issue = "120048")] + #[must_use = "this does not display the `OsStr`; \ + it returns an object that can be displayed"] + #[inline] + pub fn display(&self) -> Display<'_> { + Display { os_str: self } + } + + /// Get inner slice + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + #[doc(hidden)] + pub fn as_inner(&self) -> &Slice { + &self.inner + } +} + +#[unstable(feature = "clone_to_uninit", issue = "126799")] +unsafe impl CloneToUninit for OsStr { + #[inline] + #[cfg_attr(debug_assertions, track_caller)] + unsafe fn clone_to_uninit(&self, dst: *mut Self) { + // SAFETY: we're just a wrapper around a platform-specific Slice + unsafe { self.inner.clone_to_uninit(addr_of_mut!((*dst).inner)) } + } +} + +#[stable(feature = "str_tryfrom_osstr_impl", since = "1.72.0")] +impl<'a> TryFrom<&'a OsStr> for &'a str { + type Error = crate::str::Utf8Error; + + /// Tries to convert an `&OsStr` to a `&str`. + /// + /// ``` + /// use std::ffi::OsStr; + /// + /// let os_str = OsStr::new("foo"); + /// let as_str = <&str>::try_from(os_str).unwrap(); + /// assert_eq!(as_str, "foo"); + /// ``` + fn try_from(value: &'a OsStr) -> Result { + value.inner.to_str() + } +} + +#[stable(feature = "osstring_default", since = "1.9.0")] +impl Default for &OsStr { + /// Creates an empty `OsStr`. + #[inline] + fn default() -> Self { + OsStr::new("") + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for OsStr { + #[inline] + fn eq(&self, other: &OsStr) -> bool { + self.as_encoded_bytes().eq(other.as_encoded_bytes()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for OsStr { + #[inline] + fn eq(&self, other: &str) -> bool { + *self == *OsStr::new(other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for str { + #[inline] + fn eq(&self, other: &OsStr) -> bool { + *other == *OsStr::new(self) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for OsStr {} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for OsStr { + #[inline] + fn partial_cmp(&self, other: &OsStr) -> Option { + self.as_encoded_bytes().partial_cmp(other.as_encoded_bytes()) + } + #[inline] + fn lt(&self, other: &OsStr) -> bool { + self.as_encoded_bytes().lt(other.as_encoded_bytes()) + } + #[inline] + fn le(&self, other: &OsStr) -> bool { + self.as_encoded_bytes().le(other.as_encoded_bytes()) + } + #[inline] + fn gt(&self, other: &OsStr) -> bool { + self.as_encoded_bytes().gt(other.as_encoded_bytes()) + } + #[inline] + fn ge(&self, other: &OsStr) -> bool { + self.as_encoded_bytes().ge(other.as_encoded_bytes()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for OsStr { + #[inline] + fn partial_cmp(&self, other: &str) -> Option { + self.partial_cmp(OsStr::new(other)) + } +} + +// FIXME (#19470): cannot provide PartialOrd for str until we +// have more flexible coherence rules. + +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for OsStr { + #[inline] + fn cmp(&self, other: &OsStr) -> cmp::Ordering { + self.as_encoded_bytes().cmp(other.as_encoded_bytes()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Hash for OsStr { + #[inline] + fn hash(&self, state: &mut H) { + self.as_encoded_bytes().hash(state) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Debug for OsStr { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.inner, formatter) + } +} + +/// Helper struct for safely printing an [`OsStr`] with [`format!`] and `{}`. +/// +/// An [`OsStr`] might contain non-Unicode data. This `struct` implements the +/// [`Display`] trait in a way that mitigates that. It is created by the +/// [`display`](OsStr::display) method on [`OsStr`]. This may perform lossy +/// conversion, depending on the platform. If you would like an implementation +/// which escapes the [`OsStr`] please use [`Debug`] instead. +/// +/// # Examples +/// +/// ``` +/// #![feature(os_str_display)] +/// use std::ffi::OsStr; +/// +/// let s = OsStr::new("Hello, world!"); +/// println!("{}", s.display()); +/// ``` +/// +/// [`Display`]: fmt::Display +/// [`format!`]: crate::format +#[unstable(feature = "os_str_display", issue = "120048")] +pub struct Display<'a> { + os_str: &'a OsStr, +} + +#[unstable(feature = "os_str_display", issue = "120048")] +impl fmt::Debug for Display<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.os_str, f) + } +} + +#[unstable(feature = "os_str_display", issue = "120048")] +impl fmt::Display for Display<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.os_str.inner, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef for OsStr { + #[inline] + fn as_ref(&self) -> &OsStr { + self + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef for str { + #[inline] + fn as_ref(&self) -> &OsStr { + OsStr::from_inner(Slice::from_str(self)) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[doc(hidden)] +impl AsRef for OsStr { + #[inline] + fn as_ref(&self) -> &Slice { + &self.inner + } +} diff --git a/library/core/src/ffi/os_str/bytes.rs b/library/core/src/ffi/os_str/bytes.rs new file mode 100644 index 0000000000000..d51045878cd23 --- /dev/null +++ b/library/core/src/ffi/os_str/bytes.rs @@ -0,0 +1,202 @@ +#![allow(missing_docs)] +#![allow(missing_debug_implementations)] + +//! The underlying OsString/OsStr implementation on Unix and many other +//! systems: just a `Vec`/`[u8]`. + +use crate::clone::CloneToUninit; +use crate::fmt::Write; +use crate::ptr::addr_of_mut; +use crate::{fmt, mem, str}; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[repr(transparent)] +#[rustc_has_incoherent_inherent_impls] +pub struct Slice { + pub inner: [u8], +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Debug for Slice { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Display for Slice { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // If we're the empty string then our iterator won't actually yield + // anything, so perform the formatting manually + if self.inner.is_empty() { + return "".fmt(f); + } + + for chunk in self.inner.utf8_chunks() { + let valid = chunk.valid(); + // If we successfully decoded the whole chunk as a valid string then + // we can return a direct formatting of the string which will also + // respect various formatting flags if possible. + if chunk.invalid().is_empty() { + return valid.fmt(f); + } + + f.write_str(valid)?; + f.write_char(char::REPLACEMENT_CHARACTER)?; + } + Ok(()) + } +} + +impl Slice { + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn as_encoded_bytes(&self) -> &[u8] { + &self.inner + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub unsafe fn from_encoded_bytes_unchecked(s: &[u8]) -> &Slice { + // SAFETY: Slice is just a wrapper of [u8] + unsafe { mem::transmute(s) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[track_caller] + #[inline] + pub fn check_public_boundary(&self, index: usize) { + if index == 0 || index == self.inner.len() { + return; + } + if index < self.inner.len() + && (self.inner[index - 1].is_ascii() || self.inner[index].is_ascii()) + { + return; + } + + slow_path(&self.inner, index); + + /// We're betting that typical splits will involve an ASCII character. + /// + /// Putting the expensive checks in a separate function generates notably + /// better assembly. + #[track_caller] + #[inline(never)] + fn slow_path(bytes: &[u8], index: usize) { + let (before, after) = bytes.split_at(index); + + // UTF-8 takes at most 4 bytes per codepoint, so we don't + // need to check more than that. + let after = after.get(..4).unwrap_or(after); + match str::from_utf8(after) { + Ok(_) => return, + Err(err) if err.valid_up_to() != 0 => return, + Err(_) => (), + } + + for len in 2..=4.min(index) { + let before = &before[index - len..]; + if str::from_utf8(before).is_ok() { + return; + } + } + + panic!("byte index {index} is not an OsStr boundary"); + } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn from_str(s: &str) -> &Slice { + // SAFETY: Slice is just a wrapper of [u8] + unsafe { Slice::from_encoded_bytes_unchecked(s.as_bytes()) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> { + str::from_utf8(&self.inner) + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn make_ascii_lowercase(&mut self) { + self.inner.make_ascii_lowercase() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn make_ascii_uppercase(&mut self) { + self.inner.make_ascii_uppercase() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn is_ascii(&self) -> bool { + self.inner.is_ascii() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + self.inner.eq_ignore_ascii_case(&other.inner) + } +} + +#[unstable(feature = "clone_to_uninit", issue = "126799")] +unsafe impl CloneToUninit for Slice { + #[inline] + #[cfg_attr(debug_assertions, track_caller)] + unsafe fn clone_to_uninit(&self, dst: *mut Self) { + // SAFETY: we're just a wrapper around [u8] + unsafe { self.inner.clone_to_uninit(addr_of_mut!((*dst).inner)) } + } +} diff --git a/library/core/src/ffi/os_str/os_str_ext_unix.rs b/library/core/src/ffi/os_str/os_str_ext_unix.rs new file mode 100644 index 0000000000000..f2040c62ff77c --- /dev/null +++ b/library/core/src/ffi/os_str/os_str_ext_unix.rs @@ -0,0 +1,36 @@ +//! [`OsStrExt`] for unix. + +use super::{private, OsStr}; +use crate::mem; + +/// Platform-specific extensions to [`OsStr`]. +/// +/// This trait is sealed: it cannot be implemented outside the standard library. +/// This is so that future additional methods are not breaking changes. +#[stable(feature = "rust1", since = "1.0.0")] +pub trait OsStrExt: private::Sealed { + #[stable(feature = "rust1", since = "1.0.0")] + /// Creates an [`OsStr`] from a byte slice. + /// + /// See the module documentation for an example. + fn from_bytes(slice: &[u8]) -> &Self; + + /// Gets the underlying byte view of the [`OsStr`] slice. + /// + /// See the module documentation for an example. + #[stable(feature = "rust1", since = "1.0.0")] + fn as_bytes(&self) -> &[u8]; +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl OsStrExt for OsStr { + #[inline] + fn from_bytes(slice: &[u8]) -> &OsStr { + // SAFETY: OsStr is just a wrapper of [u8] + unsafe { mem::transmute(slice) } + } + #[inline] + fn as_bytes(&self) -> &[u8] { + &self.inner.inner + } +} diff --git a/library/core/src/ffi/os_str/os_str_ext_windows.rs b/library/core/src/ffi/os_str/os_str_ext_windows.rs new file mode 100644 index 0000000000000..6205b74872e55 --- /dev/null +++ b/library/core/src/ffi/os_str/os_str_ext_windows.rs @@ -0,0 +1,44 @@ +//! [`OsStrExt`] for windows + +use super::{private, OsStr}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use crate::ffi::wtf8::EncodeWide; + +/// Windows-specific extensions to [`OsStr`]. +/// +/// This trait is sealed: it cannot be implemented outside the standard library. +/// This is so that future additional methods are not breaking changes. +#[stable(feature = "rust1", since = "1.0.0")] +pub trait OsStrExt: private::Sealed { + /// Re-encodes an `OsStr` as a wide character sequence, i.e., potentially + /// ill-formed UTF-16. + /// + /// This is lossless: calling [`OsStringExt::from_wide`] and then + /// `encode_wide` on the result will yield the original code units. + /// Note that the encoding does not add a final null terminator. + /// + /// # Examples + /// + /// ``` + /// use std::ffi::OsString; + /// use std::os::windows::prelude::*; + /// + /// // UTF-16 encoding for "Unicode". + /// let source = [0x0055, 0x006E, 0x0069, 0x0063, 0x006F, 0x0064, 0x0065]; + /// + /// let string = OsString::from_wide(&source[..]); + /// + /// let result: Vec = string.encode_wide().collect(); + /// assert_eq!(&source[..], &result[..]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + fn encode_wide(&self) -> EncodeWide<'_>; +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl OsStrExt for OsStr { + #[inline] + fn encode_wide(&self) -> EncodeWide<'_> { + self.inner.inner.encode_wide() + } +} diff --git a/library/core/src/ffi/os_str/wtf8.rs b/library/core/src/ffi/os_str/wtf8.rs new file mode 100644 index 0000000000000..4e3ff8d2400bd --- /dev/null +++ b/library/core/src/ffi/os_str/wtf8.rs @@ -0,0 +1,145 @@ +#![allow(missing_docs)] +#![allow(missing_debug_implementations)] + +//! The underlying OsString/OsStr implementation on Windows is a +//! wrapper around the "WTF-8" encoding; see the `wtf8` module for more. +use crate::clone::CloneToUninit; +use crate::ffi::wtf8::{check_utf8_boundary, Wtf8}; +use crate::ptr::addr_of_mut; +use crate::{fmt, mem}; + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +#[repr(transparent)] +#[rustc_has_incoherent_inherent_impls] +pub struct Slice { + pub inner: Wtf8, +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Debug for Slice { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.inner, formatter) + } +} + +#[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" +)] +impl fmt::Display for Slice { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.inner, formatter) + } +} + +impl Slice { + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn as_encoded_bytes(&self) -> &[u8] { + self.inner.as_bytes() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub unsafe fn from_encoded_bytes_unchecked(s: &[u8]) -> &Slice { + // SAFETY:: Slice is just a wrapper of Wtf8 + unsafe { mem::transmute(Wtf8::from_bytes_unchecked(s)) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[track_caller] + pub fn check_public_boundary(&self, index: usize) { + check_utf8_boundary(&self.inner, index); + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn from_str(s: &str) -> &Slice { + // SAFETY: Slice is just a wrapper of wtf8 + unsafe { mem::transmute(Wtf8::from_str(s)) } + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> { + self.inner.as_str() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn make_ascii_lowercase(&mut self) { + self.inner.make_ascii_lowercase() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn make_ascii_uppercase(&mut self) { + self.inner.make_ascii_uppercase() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn is_ascii(&self) -> bool { + self.inner.is_ascii() + } + + #[unstable( + feature = "os_str_internals", + reason = "internal details of the implementation of os str", + issue = "none" + )] + #[inline] + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + self.inner.eq_ignore_ascii_case(&other.inner) + } +} + +#[unstable(feature = "clone_to_uninit", issue = "126799")] +unsafe impl CloneToUninit for Slice { + #[inline] + #[cfg_attr(debug_assertions, track_caller)] + unsafe fn clone_to_uninit(&self, dst: *mut Self) { + // SAFETY: we're just a wrapper around Wtf8 + unsafe { self.inner.clone_to_uninit(addr_of_mut!((*dst).inner)) } + } +} diff --git a/library/core/src/ffi/wtf8.rs b/library/core/src/ffi/wtf8.rs new file mode 100644 index 0000000000000..00b7abfbac1a8 --- /dev/null +++ b/library/core/src/ffi/wtf8.rs @@ -0,0 +1,595 @@ +#![allow(missing_docs)] +#![allow(missing_debug_implementations)] + +//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). +//! +//! This library uses Rust’s type system to maintain +//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), +//! like the `String` and `&str` types do for UTF-8. +//! +//! Since [WTF-8 must not be used +//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), +//! this library deliberately does not provide access to the underlying bytes +//! of WTF-8 strings, +//! nor can it decode WTF-8 from arbitrary bytes. +//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. + +// this module is imported from @SimonSapin's repo and has tons of dead code on +// unix (it's mostly used on windows), so don't worry about dead code here. +#![allow(dead_code)] + +use crate::char::encode_utf16_raw; +use crate::clone::CloneToUninit; +use crate::hash::{Hash, Hasher}; +use crate::iter::FusedIterator; +use crate::ptr::addr_of_mut; +use crate::str::next_code_point; +use crate::{fmt, ops, slice, str}; + +pub const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; + +/// A Unicode code point: from U+0000 to U+10FFFF. +/// +/// Compares with the `char` type, +/// which represents a Unicode scalar value: +/// a code point that is not a surrogate (U+D800 to U+DFFF). +#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] +pub struct CodePoint { + value: u32, +} + +/// Format the code point as `U+` followed by four to six hexadecimal digits. +/// Example: `U+1F4A9` +impl fmt::Debug for CodePoint { + #[inline] + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(formatter, "U+{:04X}", self.value) + } +} + +impl CodePoint { + /// Unsafely creates a new `CodePoint` without checking the value. + /// + /// Only use when `value` is known to be less than or equal to 0x10FFFF. + #[inline] + pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { + CodePoint { value } + } + + /// Creates a new `CodePoint` if the value is a valid code point. + /// + /// Returns `None` if `value` is above 0x10FFFF. + #[inline] + pub fn from_u32(value: u32) -> Option { + match value { + 0..=0x10FFFF => Some(CodePoint { value }), + _ => None, + } + } + + /// Creates a new `CodePoint` from a `char`. + /// + /// Since all Unicode scalar values are code points, this always succeeds. + #[inline] + pub fn from_char(value: char) -> CodePoint { + CodePoint { value: value as u32 } + } + + /// Returns the numeric value of the code point. + #[inline] + pub fn to_u32(&self) -> u32 { + self.value + } + + /// Returns the numeric value of the code point if it is a leading surrogate. + #[inline] + pub fn to_lead_surrogate(&self) -> Option { + match self.value { + lead @ 0xD800..=0xDBFF => Some(lead as u16), + _ => None, + } + } + + /// Returns the numeric value of the code point if it is a trailing surrogate. + #[inline] + pub fn to_trail_surrogate(&self) -> Option { + match self.value { + trail @ 0xDC00..=0xDFFF => Some(trail as u16), + _ => None, + } + } + + /// Optionally returns a Unicode scalar value for the code point. + /// + /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). + #[inline] + pub fn to_char(&self) -> Option { + match self.value { + 0xD800..=0xDFFF => None, + // SAFETY: self.value is valid char + _ => Some(unsafe { char::from_u32_unchecked(self.value) }), + } + } + + /// Returns a Unicode scalar value for the code point. + /// + /// Returns `'\u{FFFD}'` (the replacement character “�”) + /// if the code point is a surrogate (from U+D800 to U+DFFF). + #[inline] + pub fn to_char_lossy(&self) -> char { + self.to_char().unwrap_or('\u{FFFD}') + } +} + +/// A borrowed slice of well-formed WTF-8 data. +/// +/// Similar to `&str`, but can additionally contain surrogate code points +/// if they’re not in a surrogate pair. +#[derive(Eq, Ord, PartialEq, PartialOrd)] +#[repr(transparent)] +#[rustc_has_incoherent_inherent_impls] +pub struct Wtf8 { + bytes: [u8], +} + +impl AsRef<[u8]> for Wtf8 { + #[inline] + fn as_ref(&self) -> &[u8] { + &self.bytes + } +} + +/// Format the slice with double quotes, +/// and surrogates as `\u` followed by four hexadecimal digits. +/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] +impl fmt::Debug for Wtf8 { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result { + use crate::fmt::Write; + for c in s.chars().flat_map(|c| c.escape_debug()) { + f.write_char(c)? + } + Ok(()) + } + + formatter.write_str("\"")?; + let mut pos = 0; + while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) { + // SAFETY: self.bytes[pos..surrogate_pos] is valid utf-8 + write_str_escaped(formatter, unsafe { + str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) + })?; + write!(formatter, "\\u{{{:x}}}", surrogate)?; + pos = surrogate_pos + 3; + } + // SAFETY: self.bytes[pos..] is valid utf-8 + write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; + formatter.write_str("\"") + } +} + +impl fmt::Display for Wtf8 { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + let wtf8_bytes = &self.bytes; + let mut pos = 0; + loop { + match self.next_surrogate(pos) { + Some((surrogate_pos, _)) => { + // SAFETY: self.bytes[pos..surrogate_pos] is valid utf-8 + formatter.write_str(unsafe { + str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) + })?; + formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?; + pos = surrogate_pos + 3; + } + None => { + // SAFETY: self.bytes[pos..] is valid utf-8 + let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) }; + if pos == 0 { + return s.fmt(formatter); + } else { + return formatter.write_str(s); + } + } + } + } + } +} + +impl Wtf8 { + /// Creates a WTF-8 slice from a UTF-8 `&str` slice. + /// + /// Since WTF-8 is a superset of UTF-8, this always succeeds. + #[inline] + pub fn from_str(value: &str) -> &Wtf8 { + // SAFETY: value is valid utf-8 + unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } + } + + /// Creates a WTF-8 slice from a WTF-8 byte slice. + /// + /// Since the byte slice is not checked for valid WTF-8, this functions is + /// marked unsafe. + #[inline] + pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { + // SAFETY: start with &[u8], end with fancy &[u8] + unsafe { &*(value as *const [u8] as *const Wtf8) } + } + + /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice. + /// + /// Since the byte slice is not checked for valid WTF-8, this functions is + /// marked unsafe. + #[inline] + pub unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { + // SAFETY: start with &mut [u8], end with fancy &mut [u8] + unsafe { &mut *(value as *mut [u8] as *mut Wtf8) } + } + + /// Returns the length, in WTF-8 bytes. + #[inline] + pub fn len(&self) -> usize { + self.bytes.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.bytes.is_empty() + } + + /// Returns the code point at `position` if it is in the ASCII range, + /// or `b'\xFF'` otherwise. + /// + /// # Panics + /// + /// Panics if `position` is beyond the end of the string. + #[inline] + pub fn ascii_byte_at(&self, position: usize) -> u8 { + match self.bytes[position] { + ascii_byte @ 0x00..=0x7F => ascii_byte, + _ => 0xFF, + } + } + + /// Returns an iterator for the string’s code points. + #[inline] + pub fn code_points(&self) -> Wtf8CodePoints<'_> { + Wtf8CodePoints { bytes: self.bytes.iter() } + } + + /// Access raw bytes of WTF-8 data + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes + } + + /// Tries to convert the string to UTF-8 and return a `&str` slice. + /// + /// Returns `None` if the string contains surrogates. + /// + /// This does not copy the data. + #[inline] + pub fn as_str(&self) -> Result<&str, str::Utf8Error> { + str::from_utf8(&self.bytes) + } + + /// Converts the WTF-8 string to potentially ill-formed UTF-16 + /// and return an iterator of 16-bit code units. + /// + /// This is lossless: + /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units + /// would always return the original WTF-8 string. + #[inline] + pub fn encode_wide(&self) -> EncodeWide<'_> { + EncodeWide { code_points: self.code_points(), extra: 0 } + } + + /// Next suroogate + #[inline] + pub fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { + let mut iter = self.bytes[pos..].iter(); + loop { + let b = *iter.next()?; + if b < 0x80 { + pos += 1; + } else if b < 0xE0 { + iter.next(); + pos += 2; + } else if b == 0xED { + match (iter.next(), iter.next()) { + (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { + return Some((pos, decode_surrogate(b2, b3))); + } + _ => pos += 3, + } + } else if b < 0xF0 { + iter.next(); + iter.next(); + pos += 3; + } else { + iter.next(); + iter.next(); + iter.next(); + pos += 4; + } + } + } + + /// Get final suroogate + #[inline] + pub fn final_lead_surrogate(&self) -> Option { + match self.bytes { + [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), + _ => None, + } + } + + /// Get initial trai suroogate + #[inline] + pub fn initial_trail_surrogate(&self) -> Option { + match self.bytes { + [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)), + _ => None, + } + } + + #[inline] + pub fn make_ascii_lowercase(&mut self) { + self.bytes.make_ascii_lowercase() + } + + #[inline] + pub fn make_ascii_uppercase(&mut self) { + self.bytes.make_ascii_uppercase() + } + + #[inline] + pub fn is_ascii(&self) -> bool { + self.bytes.is_ascii() + } + + #[inline] + pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { + self.bytes.eq_ignore_ascii_case(&other.bytes) + } +} + +/// Returns a slice of the given string for the byte range \[`begin`..`end`). +/// +/// # Panics +/// +/// Panics when `begin` and `end` do not point to code point boundaries, +/// or point beyond the end of the string. +impl ops::Index> for Wtf8 { + type Output = Wtf8; + + #[inline] + fn index(&self, range: ops::Range) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if range.start <= range.end + && is_code_point_boundary(self, range.start) + && is_code_point_boundary(self, range.end) + { + // SAFETY: start and end is at boundary + unsafe { slice_unchecked(self, range.start, range.end) } + } else { + slice_error_fail(self, range.start, range.end) + } + } +} + +/// Returns a slice of the given string from byte `begin` to its end. +/// +/// # Panics +/// +/// Panics when `begin` is not at a code point boundary, +/// or is beyond the end of the string. +impl ops::Index> for Wtf8 { + type Output = Wtf8; + + #[inline] + fn index(&self, range: ops::RangeFrom) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if is_code_point_boundary(self, range.start) { + // SAFETY: start is at boundary + unsafe { slice_unchecked(self, range.start, self.len()) } + } else { + slice_error_fail(self, range.start, self.len()) + } + } +} + +/// Returns a slice of the given string from its beginning to byte `end`. +/// +/// # Panics +/// +/// Panics when `end` is not at a code point boundary, +/// or is beyond the end of the string. +impl ops::Index> for Wtf8 { + type Output = Wtf8; + + #[inline] + fn index(&self, range: ops::RangeTo) -> &Wtf8 { + // is_code_point_boundary checks that the index is in [0, .len()] + if is_code_point_boundary(self, range.end) { + // SAFETY: end is at boundary + unsafe { slice_unchecked(self, 0, range.end) } + } else { + slice_error_fail(self, 0, range.end) + } + } +} + +impl ops::Index for Wtf8 { + type Output = Wtf8; + + #[inline] + fn index(&self, _range: ops::RangeFull) -> &Wtf8 { + self + } +} + +#[inline] +pub fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { + // The first byte is assumed to be 0xED + 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F +} + +#[inline] +pub fn decode_surrogate_pair(lead: u16, trail: u16) -> char { + let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32); + // SAFETY: code_point is valid char + unsafe { char::from_u32_unchecked(code_point) } +} + +/// Copied from str::is_char_boundary +#[inline] +pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { + if index == 0 { + return true; + } + match slice.bytes.get(index) { + None => index == slice.len(), + Some(&b) => (b as i8) >= -0x40, + } +} + +/// Verify that `index` is at the edge of either a valid UTF-8 codepoint +/// (i.e. a codepoint that's not a surrogate) or of the whole string. +/// +/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`. +/// Splitting between surrogates is valid as far as WTF-8 is concerned, but +/// we do not permit it in the public API because WTF-8 is considered an +/// implementation detail. +#[track_caller] +#[inline] +pub fn check_utf8_boundary(slice: &Wtf8, index: usize) { + if index == 0 { + return; + } + match slice.bytes.get(index) { + Some(0xED) => (), // Might be a surrogate + Some(&b) if (b as i8) >= -0x40 => return, + Some(_) => panic!("byte index {index} is not a codepoint boundary"), + None if index == slice.len() => return, + None => panic!("byte index {index} is out of bounds"), + } + if slice.bytes[index + 1] >= 0xA0 { + // There's a surrogate after index. Now check before index. + if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 { + panic!("byte index {index} lies between surrogate codepoints"); + } + } +} + +/// Copied from core::str::raw::slice_unchecked +#[inline] +pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { + // SAFETY: memory layout of a &[u8] and &Wtf8 are the same + unsafe { + let len = end - begin; + let start = s.as_bytes().as_ptr().add(begin); + Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len)) + } +} + +/// Copied from core::str::raw::slice_error_fail +#[inline(never)] +pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { + assert!(begin <= end); + panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); +} + +/// Iterator for the code points of a WTF-8 string. +/// +/// Created with the method `.code_points()`. +#[derive(Clone)] +pub struct Wtf8CodePoints<'a> { + bytes: slice::Iter<'a, u8>, +} + +impl<'a> Iterator for Wtf8CodePoints<'a> { + type Item = CodePoint; + + #[inline] + fn next(&mut self) -> Option { + // SAFETY: `self.bytes` has been created from a WTF-8 string + unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.bytes.len(); + (len.saturating_add(3) / 4, Some(len)) + } +} + +/// Generates a wide character sequence for potentially ill-formed UTF-16. +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Clone)] +pub struct EncodeWide<'a> { + code_points: Wtf8CodePoints<'a>, + extra: u16, +} + +// Copied from libunicode/u_str.rs +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Iterator for EncodeWide<'a> { + type Item = u16; + + #[inline] + fn next(&mut self) -> Option { + if self.extra != 0 { + let tmp = self.extra; + self.extra = 0; + return Some(tmp); + } + + let mut buf = [0; 2]; + self.code_points.next().map(|code_point| { + let n = encode_utf16_raw(code_point.value, &mut buf).len(); + if n == 2 { + self.extra = buf[1]; + } + buf[0] + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.code_points.size_hint(); + let ext = (self.extra != 0) as usize; + // every code point gets either one u16 or two u16, + // so this iterator is between 1 or 2 times as + // long as the underlying iterator. + (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext))) + } +} + +#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")] +impl FusedIterator for EncodeWide<'_> {} + +impl Hash for CodePoint { + #[inline] + fn hash(&self, state: &mut H) { + self.value.hash(state) + } +} + +impl Hash for Wtf8 { + #[inline] + fn hash(&self, state: &mut H) { + state.write(&self.bytes); + 0xfeu8.hash(state) + } +} + +#[unstable(feature = "clone_to_uninit", issue = "126799")] +unsafe impl CloneToUninit for Wtf8 { + #[inline] + #[cfg_attr(debug_assertions, track_caller)] + unsafe fn clone_to_uninit(&self, dst: *mut Self) { + // SAFETY: we're just a wrapper around [u8] + unsafe { self.bytes.clone_to_uninit(addr_of_mut!((*dst).bytes)) } + } +} diff --git a/library/core/tests/ffi.rs b/library/core/tests/ffi.rs index 2b33fbd95f073..4ee1c2754761e 100644 --- a/library/core/tests/ffi.rs +++ b/library/core/tests/ffi.rs @@ -1 +1,4 @@ +mod bytes; mod cstr; +mod os_str; +mod wtf8; diff --git a/library/std/src/sys/os_str/bytes/tests.rs b/library/core/tests/ffi/bytes.rs similarity index 93% rename from library/std/src/sys/os_str/bytes/tests.rs rename to library/core/tests/ffi/bytes.rs index e2a99045e41f6..3311943803a0b 100644 --- a/library/std/src/sys/os_str/bytes/tests.rs +++ b/library/core/tests/ffi/bytes.rs @@ -1,4 +1,4 @@ -use super::*; +use core::ffi::os_str::Slice; #[test] fn slice_debug_output() { diff --git a/library/core/tests/ffi/os_str.rs b/library/core/tests/ffi/os_str.rs new file mode 100644 index 0000000000000..d24c316ecc158 --- /dev/null +++ b/library/core/tests/ffi/os_str.rs @@ -0,0 +1,43 @@ +use core::ffi::OsStr; +use core::mem::MaybeUninit; +use core::ptr; + +#[test] +fn test_os_str_default() { + let os_str: &OsStr = Default::default(); + assert_eq!("", os_str); +} + +#[test] +fn slice_encoded_bytes() { + let os_str = OsStr::new("123θგ🦀"); + // ASCII + let digits = os_str.slice_encoded_bytes(..3); + assert_eq!(digits, "123"); + let three = os_str.slice_encoded_bytes(2..3); + assert_eq!(three, "3"); + // 2-byte UTF-8 + let theta = os_str.slice_encoded_bytes(3..5); + assert_eq!(theta, "θ"); + // 3-byte UTF-8 + let gani = os_str.slice_encoded_bytes(5..8); + assert_eq!(gani, "გ"); + // 4-byte UTF-8 + let crab = os_str.slice_encoded_bytes(8..); + assert_eq!(crab, "🦀"); +} + +#[test] +fn clone_to_uninit() { + let a = OsStr::new("hello.txt"); + + let mut storage = vec![MaybeUninit::::uninit(); size_of_val::(a)]; + unsafe { a.clone_to_uninit(ptr::from_mut::<[_]>(storage.as_mut_slice()) as *mut OsStr) }; + assert_eq!(a.as_encoded_bytes(), unsafe { MaybeUninit::slice_assume_init_ref(&storage) }); + + let mut b: Box = OsStr::new("world.exe").into(); + assert_eq!(size_of_val::(a), size_of_val::(&b)); + assert_ne!(a, &*b); + unsafe { a.clone_to_uninit(ptr::from_mut::(&mut b)) }; + assert_eq!(a, &*b); +} diff --git a/library/core/tests/ffi/wtf8.rs b/library/core/tests/ffi/wtf8.rs new file mode 100644 index 0000000000000..a9c1a6c592734 --- /dev/null +++ b/library/core/tests/ffi/wtf8.rs @@ -0,0 +1,150 @@ +use core::ffi::wtf8::*; + +#[test] +fn code_point_from_u32() { + assert!(CodePoint::from_u32(0).is_some()); + assert!(CodePoint::from_u32(0xD800).is_some()); + assert!(CodePoint::from_u32(0x10FFFF).is_some()); + assert!(CodePoint::from_u32(0x110000).is_none()); +} + +#[test] +fn code_point_to_u32() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0).to_u32(), 0); + assert_eq!(c(0xD800).to_u32(), 0xD800); + assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF); +} + +#[test] +fn code_point_to_lead_surrogate() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0).to_lead_surrogate(), None); + assert_eq!(c(0xE9).to_lead_surrogate(), None); + assert_eq!(c(0xD800).to_lead_surrogate(), Some(0xD800)); + assert_eq!(c(0xDBFF).to_lead_surrogate(), Some(0xDBFF)); + assert_eq!(c(0xDC00).to_lead_surrogate(), None); + assert_eq!(c(0xDFFF).to_lead_surrogate(), None); + assert_eq!(c(0x1F4A9).to_lead_surrogate(), None); + assert_eq!(c(0x10FFFF).to_lead_surrogate(), None); +} + +#[test] +fn code_point_to_trail_surrogate() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0).to_trail_surrogate(), None); + assert_eq!(c(0xE9).to_trail_surrogate(), None); + assert_eq!(c(0xD800).to_trail_surrogate(), None); + assert_eq!(c(0xDBFF).to_trail_surrogate(), None); + assert_eq!(c(0xDC00).to_trail_surrogate(), Some(0xDC00)); + assert_eq!(c(0xDFFF).to_trail_surrogate(), Some(0xDFFF)); + assert_eq!(c(0x1F4A9).to_trail_surrogate(), None); + assert_eq!(c(0x10FFFF).to_trail_surrogate(), None); +} + +#[test] +fn code_point_from_char() { + assert_eq!(CodePoint::from_char('a').to_u32(), 0x61); + assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9); +} + +#[test] +fn code_point_to_string() { + assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061"); + assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9"); +} + +#[test] +fn code_point_to_char() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0x61).to_char(), Some('a')); + assert_eq!(c(0x1F4A9).to_char(), Some('💩')); + assert_eq!(c(0xD800).to_char(), None); +} + +#[test] +fn code_point_to_char_lossy() { + fn c(value: u32) -> CodePoint { + CodePoint::from_u32(value).unwrap() + } + assert_eq!(c(0x61).to_char_lossy(), 'a'); + assert_eq!(c(0x1F4A9).to_char_lossy(), '💩'); + assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}'); +} + +#[test] +fn wtf8_from_str() { + assert_eq!(&Wtf8::from_str("").bytes, b""); + assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); +} + +#[test] +fn wtf8_len() { + assert_eq!(Wtf8::from_str("").len(), 0); + assert_eq!(Wtf8::from_str("aé 💩").len(), 8); +} + +#[test] +fn wtf8_slice() { + assert_eq!(&Wtf8::from_str("aé 💩")[1..4].bytes, b"\xC3\xA9 "); +} + +#[test] +#[should_panic] +fn wtf8_slice_not_code_point_boundary() { + let _ = &Wtf8::from_str("aé 💩")[2..4]; +} + +#[test] +fn wtf8_slice_from() { + assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9"); +} + +#[test] +#[should_panic] +fn wtf8_slice_from_not_code_point_boundary() { + let _ = &Wtf8::from_str("aé 💩")[2..]; +} + +#[test] +fn wtf8_slice_to() { + assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 "); +} + +#[test] +#[should_panic] +fn wtf8_slice_to_not_code_point_boundary() { + let _ = &Wtf8::from_str("aé 💩")[5..]; +} + +#[test] +fn wtf8_ascii_byte_at() { + let slice = Wtf8::from_str("aé 💩"); + assert_eq!(slice.ascii_byte_at(0), b'a'); + assert_eq!(slice.ascii_byte_at(1), b'\xFF'); + assert_eq!(slice.ascii_byte_at(2), b'\xFF'); + assert_eq!(slice.ascii_byte_at(3), b' '); + assert_eq!(slice.ascii_byte_at(4), b'\xFF'); +} + +#[test] +#[should_panic(expected = "byte index 4 is out of bounds")] +fn wtf8_utf8_boundary_out_of_bounds() { + let string = Wtf8::from_str("aé"); + check_utf8_boundary(&string, 4); +} + +#[test] +#[should_panic(expected = "byte index 1 is not a codepoint boundary")] +fn wtf8_utf8_boundary_inside_codepoint() { + let string = Wtf8::from_str("é"); + check_utf8_boundary(&string, 1); +} diff --git a/library/std/src/ffi/os_str.rs b/library/std/src/ffi/os_str.rs index 918eec2d0d8ef..a8e7b65e5b8de 100644 --- a/library/std/src/ffi/os_str.rs +++ b/library/std/src/ffi/os_str.rs @@ -1,1742 +1,4 @@ //! The [`OsStr`] and [`OsString`] types and associated utilities. -#[cfg(test)] -mod tests; - -use core::clone::CloneToUninit; - -use crate::borrow::{Borrow, Cow}; -use crate::collections::TryReserveError; -use crate::hash::{Hash, Hasher}; -use crate::ops::{self, Range}; -use crate::ptr::addr_of_mut; -use crate::rc::Rc; -use crate::str::FromStr; -use crate::sync::Arc; -use crate::sys::os_str::{Buf, Slice}; -use crate::sys_common::{AsInner, FromInner, IntoInner}; -use crate::{cmp, fmt, slice}; - -/// A type that can represent owned, mutable platform-native strings, but is -/// cheaply inter-convertible with Rust strings. -/// -/// The need for this type arises from the fact that: -/// -/// * On Unix systems, strings are often arbitrary sequences of non-zero -/// bytes, in many cases interpreted as UTF-8. -/// -/// * On Windows, strings are often arbitrary sequences of non-zero 16-bit -/// values, interpreted as UTF-16 when it is valid to do so. -/// -/// * In Rust, strings are always valid UTF-8, which may contain zeros. -/// -/// `OsString` and [`OsStr`] bridge this gap by simultaneously representing Rust -/// and platform-native string values, and in particular allowing a Rust string -/// to be converted into an "OS" string with no cost if possible. A consequence -/// of this is that `OsString` instances are *not* `NUL` terminated; in order -/// to pass to e.g., Unix system call, you should create a [`CStr`]. -/// -/// `OsString` is to &[OsStr] as [`String`] is to &[str]: the former -/// in each pair are owned strings; the latter are borrowed -/// references. -/// -/// Note, `OsString` and [`OsStr`] internally do not necessarily hold strings in -/// the form native to the platform; While on Unix, strings are stored as a -/// sequence of 8-bit values, on Windows, where strings are 16-bit value based -/// as just discussed, strings are also actually stored as a sequence of 8-bit -/// values, encoded in a less-strict variant of UTF-8. This is useful to -/// understand when handling capacity and length values. -/// -/// # Capacity of `OsString` -/// -/// Capacity uses units of UTF-8 bytes for OS strings which were created from valid unicode, and -/// uses units of bytes in an unspecified encoding for other contents. On a given target, all -/// `OsString` and `OsStr` values use the same units for capacity, so the following will work: -/// ``` -/// use std::ffi::{OsStr, OsString}; -/// -/// fn concat_os_strings(a: &OsStr, b: &OsStr) -> OsString { -/// let mut ret = OsString::with_capacity(a.len() + b.len()); // This will allocate -/// ret.push(a); // This will not allocate further -/// ret.push(b); // This will not allocate further -/// ret -/// } -/// ``` -/// -/// # Creating an `OsString` -/// -/// **From a Rust string**: `OsString` implements -/// [From]<[String]>, so you can use my_string.[into]\() to -/// create an `OsString` from a normal Rust string. -/// -/// **From slices:** Just like you can start with an empty Rust -/// [`String`] and then [`String::push_str`] some &[str] -/// sub-string slices into it, you can create an empty `OsString` with -/// the [`OsString::new`] method and then push string slices into it with the -/// [`OsString::push`] method. -/// -/// # Extracting a borrowed reference to the whole OS string -/// -/// You can use the [`OsString::as_os_str`] method to get an &[OsStr] from -/// an `OsString`; this is effectively a borrowed reference to the -/// whole string. -/// -/// # Conversions -/// -/// See the [module's toplevel documentation about conversions][conversions] for a discussion on -/// the traits which `OsString` implements for [conversions] from/to native representations. -/// -/// [`CStr`]: crate::ffi::CStr -/// [conversions]: super#conversions -/// [into]: Into::into -#[cfg_attr(not(test), rustc_diagnostic_item = "OsString")] -#[stable(feature = "rust1", since = "1.0.0")] -pub struct OsString { - inner: Buf, -} - -/// Allows extension traits within `std`. -#[unstable(feature = "sealed", issue = "none")] -impl crate::sealed::Sealed for OsString {} - -/// Borrowed reference to an OS string (see [`OsString`]). -/// -/// This type represents a borrowed reference to a string in the operating system's preferred -/// representation. -/// -/// `&OsStr` is to [`OsString`] as &[str] is to [`String`]: the -/// former in each pair are borrowed references; the latter are owned strings. -/// -/// See the [module's toplevel documentation about conversions][conversions] for a discussion on -/// the traits which `OsStr` implements for [conversions] from/to native representations. -/// -/// [conversions]: super#conversions -#[cfg_attr(not(test), rustc_diagnostic_item = "OsStr")] -#[stable(feature = "rust1", since = "1.0.0")] -// `OsStr::from_inner` current implementation relies -// on `OsStr` being layout-compatible with `Slice`. -// However, `OsStr` layout is considered an implementation detail and must not be relied upon. -#[repr(transparent)] -pub struct OsStr { - inner: Slice, -} - -/// Allows extension traits within `std`. -#[unstable(feature = "sealed", issue = "none")] -impl crate::sealed::Sealed for OsStr {} - -impl OsString { - /// Constructs a new empty `OsString`. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let os_string = OsString::new(); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use] - #[inline] - pub fn new() -> OsString { - OsString { inner: Buf::from_string(String::new()) } - } - - /// Converts bytes to an `OsString` without checking that the bytes contains - /// valid [`OsStr`]-encoded data. - /// - /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. - /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit - /// ASCII. - /// - /// See the [module's toplevel documentation about conversions][conversions] for safe, - /// cross-platform [conversions] from/to native representations. - /// - /// # Safety - /// - /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of - /// validated UTF-8 and bytes from [`OsStr::as_encoded_bytes`] from within the same Rust version - /// built for the same target platform. For example, reconstructing an `OsString` from bytes sent - /// over the network or stored in a file will likely violate these safety rules. - /// - /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_encoded_bytes`] can be - /// split either immediately before or immediately after any valid non-empty UTF-8 substring. - /// - /// # Example - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new("Mary had a little lamb"); - /// let bytes = os_str.as_encoded_bytes(); - /// let words = bytes.split(|b| *b == b' '); - /// let words: Vec<&OsStr> = words.map(|word| { - /// // SAFETY: - /// // - Each `word` only contains content that originated from `OsStr::as_encoded_bytes` - /// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring - /// unsafe { OsStr::from_encoded_bytes_unchecked(word) } - /// }).collect(); - /// ``` - /// - /// [conversions]: super#conversions - #[inline] - #[stable(feature = "os_str_bytes", since = "1.74.0")] - pub unsafe fn from_encoded_bytes_unchecked(bytes: Vec) -> Self { - OsString { inner: unsafe { Buf::from_encoded_bytes_unchecked(bytes) } } - } - - /// Converts to an [`OsStr`] slice. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::{OsString, OsStr}; - /// - /// let os_string = OsString::from("foo"); - /// let os_str = OsStr::new("foo"); - /// assert_eq!(os_string.as_os_str(), os_str); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use] - #[inline] - pub fn as_os_str(&self) -> &OsStr { - self - } - - /// Converts the `OsString` into a byte slice. To convert the byte slice back into an - /// `OsString`, use the [`OsStr::from_encoded_bytes_unchecked`] function. - /// - /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. - /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit - /// ASCII. - /// - /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should - /// be treated as opaque and only comparable within the same Rust version built for the same - /// target platform. For example, sending the bytes over the network or storing it in a file - /// will likely result in incompatible data. See [`OsString`] for more encoding details - /// and [`std::ffi`] for platform-specific, specified conversions. - /// - /// [`std::ffi`]: crate::ffi - #[inline] - #[stable(feature = "os_str_bytes", since = "1.74.0")] - pub fn into_encoded_bytes(self) -> Vec { - self.inner.into_encoded_bytes() - } - - /// Converts the `OsString` into a [`String`] if it contains valid Unicode data. - /// - /// On failure, ownership of the original `OsString` is returned. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let os_string = OsString::from("foo"); - /// let string = os_string.into_string(); - /// assert_eq!(string, Ok(String::from("foo"))); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[inline] - pub fn into_string(self) -> Result { - self.inner.into_string().map_err(|buf| OsString { inner: buf }) - } - - /// Extends the string with the given &[OsStr] slice. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut os_string = OsString::from("foo"); - /// os_string.push("bar"); - /// assert_eq!(&os_string, "foobar"); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[inline] - #[rustc_confusables("append", "put")] - pub fn push>(&mut self, s: T) { - self.inner.push_slice(&s.as_ref().inner) - } - - /// Creates a new `OsString` with at least the given capacity. - /// - /// The string will be able to hold at least `capacity` length units of other - /// OS strings without reallocating. This method is allowed to allocate for - /// more units than `capacity`. If `capacity` is 0, the string will not - /// allocate. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut os_string = OsString::with_capacity(10); - /// let capacity = os_string.capacity(); - /// - /// // This push is done without reallocating - /// os_string.push("foo"); - /// - /// assert_eq!(capacity, os_string.capacity()); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[must_use] - #[inline] - pub fn with_capacity(capacity: usize) -> OsString { - OsString { inner: Buf::with_capacity(capacity) } - } - - /// Truncates the `OsString` to zero length. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut os_string = OsString::from("foo"); - /// assert_eq!(&os_string, "foo"); - /// - /// os_string.clear(); - /// assert_eq!(&os_string, ""); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[inline] - pub fn clear(&mut self) { - self.inner.clear() - } - - /// Returns the capacity this `OsString` can hold without reallocating. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let os_string = OsString::with_capacity(10); - /// assert!(os_string.capacity() >= 10); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[must_use] - #[inline] - pub fn capacity(&self) -> usize { - self.inner.capacity() - } - - /// Reserves capacity for at least `additional` more capacity to be inserted - /// in the given `OsString`. Does nothing if the capacity is - /// already sufficient. - /// - /// The collection may reserve more space to speculatively avoid frequent reallocations. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut s = OsString::new(); - /// s.reserve(10); - /// assert!(s.capacity() >= 10); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[inline] - pub fn reserve(&mut self, additional: usize) { - self.inner.reserve(additional) - } - - /// Tries to reserve capacity for at least `additional` more length units - /// in the given `OsString`. The string may reserve more space to speculatively avoid - /// frequent reallocations. After calling `try_reserve`, capacity will be - /// greater than or equal to `self.len() + additional` if it returns `Ok(())`. - /// Does nothing if capacity is already sufficient. This method preserves - /// the contents even if an error occurs. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Errors - /// - /// If the capacity overflows, or the allocator reports a failure, then an error - /// is returned. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::{OsStr, OsString}; - /// use std::collections::TryReserveError; - /// - /// fn process_data(data: &str) -> Result { - /// let mut s = OsString::new(); - /// - /// // Pre-reserve the memory, exiting if we can't - /// s.try_reserve(OsStr::new(data).len())?; - /// - /// // Now we know this can't OOM in the middle of our complex work - /// s.push(data); - /// - /// Ok(s) - /// } - /// # process_data("123").expect("why is the test harness OOMing on 3 bytes?"); - /// ``` - #[stable(feature = "try_reserve_2", since = "1.63.0")] - #[inline] - pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.inner.try_reserve(additional) - } - - /// Reserves the minimum capacity for at least `additional` more capacity to - /// be inserted in the given `OsString`. Does nothing if the capacity is - /// already sufficient. - /// - /// Note that the allocator may give the collection more space than it - /// requests. Therefore, capacity can not be relied upon to be precisely - /// minimal. Prefer [`reserve`] if future insertions are expected. - /// - /// [`reserve`]: OsString::reserve - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut s = OsString::new(); - /// s.reserve_exact(10); - /// assert!(s.capacity() >= 10); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[inline] - pub fn reserve_exact(&mut self, additional: usize) { - self.inner.reserve_exact(additional) - } - - /// Tries to reserve the minimum capacity for at least `additional` - /// more length units in the given `OsString`. After calling - /// `try_reserve_exact`, capacity will be greater than or equal to - /// `self.len() + additional` if it returns `Ok(())`. - /// Does nothing if the capacity is already sufficient. - /// - /// Note that the allocator may give the `OsString` more space than it - /// requests. Therefore, capacity can not be relied upon to be precisely - /// minimal. Prefer [`try_reserve`] if future insertions are expected. - /// - /// [`try_reserve`]: OsString::try_reserve - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Errors - /// - /// If the capacity overflows, or the allocator reports a failure, then an error - /// is returned. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::{OsStr, OsString}; - /// use std::collections::TryReserveError; - /// - /// fn process_data(data: &str) -> Result { - /// let mut s = OsString::new(); - /// - /// // Pre-reserve the memory, exiting if we can't - /// s.try_reserve_exact(OsStr::new(data).len())?; - /// - /// // Now we know this can't OOM in the middle of our complex work - /// s.push(data); - /// - /// Ok(s) - /// } - /// # process_data("123").expect("why is the test harness OOMing on 3 bytes?"); - /// ``` - #[stable(feature = "try_reserve_2", since = "1.63.0")] - #[inline] - pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.inner.try_reserve_exact(additional) - } - - /// Shrinks the capacity of the `OsString` to match its length. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut s = OsString::from("foo"); - /// - /// s.reserve(100); - /// assert!(s.capacity() >= 100); - /// - /// s.shrink_to_fit(); - /// assert_eq!(3, s.capacity()); - /// ``` - #[stable(feature = "osstring_shrink_to_fit", since = "1.19.0")] - #[inline] - pub fn shrink_to_fit(&mut self) { - self.inner.shrink_to_fit() - } - - /// Shrinks the capacity of the `OsString` with a lower bound. - /// - /// The capacity will remain at least as large as both the length - /// and the supplied value. - /// - /// If the current capacity is less than the lower limit, this is a no-op. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut s = OsString::from("foo"); - /// - /// s.reserve(100); - /// assert!(s.capacity() >= 100); - /// - /// s.shrink_to(10); - /// assert!(s.capacity() >= 10); - /// s.shrink_to(0); - /// assert!(s.capacity() >= 3); - /// ``` - #[inline] - #[stable(feature = "shrink_to", since = "1.56.0")] - pub fn shrink_to(&mut self, min_capacity: usize) { - self.inner.shrink_to(min_capacity) - } - - /// Converts this `OsString` into a boxed [`OsStr`]. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::{OsString, OsStr}; - /// - /// let s = OsString::from("hello"); - /// - /// let b: Box = s.into_boxed_os_str(); - /// ``` - #[must_use = "`self` will be dropped if the result is not used"] - #[stable(feature = "into_boxed_os_str", since = "1.20.0")] - pub fn into_boxed_os_str(self) -> Box { - let rw = Box::into_raw(self.inner.into_box()) as *mut OsStr; - unsafe { Box::from_raw(rw) } - } - - /// Consumes and leaks the `OsString`, returning a mutable reference to the contents, - /// `&'a mut OsStr`. - /// - /// The caller has free choice over the returned lifetime, including 'static. - /// Indeed, this function is ideally used for data that lives for the remainder of - /// the program’s life, as dropping the returned reference will cause a memory leak. - /// - /// It does not reallocate or shrink the `OsString`, so the leaked allocation may include - /// unused capacity that is not part of the returned slice. If you want to discard excess - /// capacity, call [`into_boxed_os_str`], and then [`Box::leak`] instead. - /// However, keep in mind that trimming the capacity may result in a reallocation and copy. - /// - /// [`into_boxed_os_str`]: Self::into_boxed_os_str - #[unstable(feature = "os_string_pathbuf_leak", issue = "125965")] - #[inline] - pub fn leak<'a>(self) -> &'a mut OsStr { - OsStr::from_inner_mut(self.inner.leak()) - } - - /// Provides plumbing to core `Vec::truncate`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn truncate(&mut self, len: usize) { - self.inner.truncate(len); - } - - /// Provides plumbing to core `Vec::extend_from_slice`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { - self.inner.extend_from_slice(other); - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl From for OsString { - /// Converts a [`String`] into an [`OsString`]. - /// - /// This conversion does not allocate or copy memory. - #[inline] - fn from(s: String) -> OsString { - OsString { inner: Buf::from_string(s) } - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl> From<&T> for OsString { - /// Copies any value implementing [AsRef]<[OsStr]> - /// into a newly allocated [`OsString`]. - fn from(s: &T) -> OsString { - s.as_ref().to_os_string() - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl ops::Index for OsString { - type Output = OsStr; - - #[inline] - fn index(&self, _index: ops::RangeFull) -> &OsStr { - OsStr::from_inner(self.inner.as_slice()) - } -} - -#[stable(feature = "mut_osstr", since = "1.44.0")] -impl ops::IndexMut for OsString { - #[inline] - fn index_mut(&mut self, _index: ops::RangeFull) -> &mut OsStr { - OsStr::from_inner_mut(self.inner.as_mut_slice()) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl ops::Deref for OsString { - type Target = OsStr; - - #[inline] - fn deref(&self) -> &OsStr { - &self[..] - } -} - -#[stable(feature = "mut_osstr", since = "1.44.0")] -impl ops::DerefMut for OsString { - #[inline] - fn deref_mut(&mut self) -> &mut OsStr { - &mut self[..] - } -} - -#[stable(feature = "osstring_default", since = "1.9.0")] -impl Default for OsString { - /// Constructs an empty `OsString`. - #[inline] - fn default() -> OsString { - OsString::new() - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Clone for OsString { - #[inline] - fn clone(&self) -> Self { - OsString { inner: self.inner.clone() } - } - - /// Clones the contents of `source` into `self`. - /// - /// This method is preferred over simply assigning `source.clone()` to `self`, - /// as it avoids reallocation if possible. - #[inline] - fn clone_from(&mut self, source: &Self) { - self.inner.clone_from(&source.inner) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl fmt::Debug for OsString { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&**self, formatter) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialEq for OsString { - #[inline] - fn eq(&self, other: &OsString) -> bool { - &**self == &**other - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialEq for OsString { - #[inline] - fn eq(&self, other: &str) -> bool { - &**self == other - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialEq for str { - #[inline] - fn eq(&self, other: &OsString) -> bool { - &**other == self - } -} - -#[stable(feature = "os_str_str_ref_eq", since = "1.29.0")] -impl PartialEq<&str> for OsString { - #[inline] - fn eq(&self, other: &&str) -> bool { - **self == **other - } -} - -#[stable(feature = "os_str_str_ref_eq", since = "1.29.0")] -impl<'a> PartialEq for &'a str { - #[inline] - fn eq(&self, other: &OsString) -> bool { - **other == **self - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Eq for OsString {} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialOrd for OsString { - #[inline] - fn partial_cmp(&self, other: &OsString) -> Option { - (&**self).partial_cmp(&**other) - } - #[inline] - fn lt(&self, other: &OsString) -> bool { - &**self < &**other - } - #[inline] - fn le(&self, other: &OsString) -> bool { - &**self <= &**other - } - #[inline] - fn gt(&self, other: &OsString) -> bool { - &**self > &**other - } - #[inline] - fn ge(&self, other: &OsString) -> bool { - &**self >= &**other - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialOrd for OsString { - #[inline] - fn partial_cmp(&self, other: &str) -> Option { - (&**self).partial_cmp(other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Ord for OsString { - #[inline] - fn cmp(&self, other: &OsString) -> cmp::Ordering { - (&**self).cmp(&**other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Hash for OsString { - #[inline] - fn hash(&self, state: &mut H) { - (&**self).hash(state) - } -} - -#[stable(feature = "os_string_fmt_write", since = "1.64.0")] -impl fmt::Write for OsString { - fn write_str(&mut self, s: &str) -> fmt::Result { - self.push(s); - Ok(()) - } -} - -impl OsStr { - /// Coerces into an `OsStr` slice. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new("foo"); - /// ``` - #[inline] - #[stable(feature = "rust1", since = "1.0.0")] - pub fn new + ?Sized>(s: &S) -> &OsStr { - s.as_ref() - } - - /// Converts a slice of bytes to an OS string slice without checking that the string contains - /// valid `OsStr`-encoded data. - /// - /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. - /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit - /// ASCII. - /// - /// See the [module's toplevel documentation about conversions][conversions] for safe, - /// cross-platform [conversions] from/to native representations. - /// - /// # Safety - /// - /// As the encoding is unspecified, callers must pass in bytes that originated as a mixture of - /// validated UTF-8 and bytes from [`OsStr::as_encoded_bytes`] from within the same Rust version - /// built for the same target platform. For example, reconstructing an `OsStr` from bytes sent - /// over the network or stored in a file will likely violate these safety rules. - /// - /// Due to the encoding being self-synchronizing, the bytes from [`OsStr::as_encoded_bytes`] can be - /// split either immediately before or immediately after any valid non-empty UTF-8 substring. - /// - /// # Example - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new("Mary had a little lamb"); - /// let bytes = os_str.as_encoded_bytes(); - /// let words = bytes.split(|b| *b == b' '); - /// let words: Vec<&OsStr> = words.map(|word| { - /// // SAFETY: - /// // - Each `word` only contains content that originated from `OsStr::as_encoded_bytes` - /// // - Only split with ASCII whitespace which is a non-empty UTF-8 substring - /// unsafe { OsStr::from_encoded_bytes_unchecked(word) } - /// }).collect(); - /// ``` - /// - /// [conversions]: super#conversions - #[inline] - #[stable(feature = "os_str_bytes", since = "1.74.0")] - pub unsafe fn from_encoded_bytes_unchecked(bytes: &[u8]) -> &Self { - Self::from_inner(unsafe { Slice::from_encoded_bytes_unchecked(bytes) }) - } - - #[inline] - fn from_inner(inner: &Slice) -> &OsStr { - // SAFETY: OsStr is just a wrapper of Slice, - // therefore converting &Slice to &OsStr is safe. - unsafe { &*(inner as *const Slice as *const OsStr) } - } - - #[inline] - fn from_inner_mut(inner: &mut Slice) -> &mut OsStr { - // SAFETY: OsStr is just a wrapper of Slice, - // therefore converting &mut Slice to &mut OsStr is safe. - // Any method that mutates OsStr must be careful not to - // break platform-specific encoding, in particular Wtf8 on Windows. - unsafe { &mut *(inner as *mut Slice as *mut OsStr) } - } - - /// Yields a &[str] slice if the `OsStr` is valid Unicode. - /// - /// This conversion may entail doing a check for UTF-8 validity. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new("foo"); - /// assert_eq!(os_str.to_str(), Some("foo")); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use = "this returns the result of the operation, \ - without modifying the original"] - #[inline] - pub fn to_str(&self) -> Option<&str> { - self.inner.to_str().ok() - } - - /// Converts an `OsStr` to a [Cow]<[str]>. - /// - /// Any non-Unicode sequences are replaced with - /// [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]. - /// - /// [U+FFFD]: crate::char::REPLACEMENT_CHARACTER - /// - /// # Examples - /// - /// Calling `to_string_lossy` on an `OsStr` with invalid unicode: - /// - /// ``` - /// // Note, due to differences in how Unix and Windows represent strings, - /// // we are forced to complicate this example, setting up example `OsStr`s - /// // with different source data and via different platform extensions. - /// // Understand that in reality you could end up with such example invalid - /// // sequences simply through collecting user command line arguments, for - /// // example. - /// - /// #[cfg(unix)] { - /// use std::ffi::OsStr; - /// use std::os::unix::ffi::OsStrExt; - /// - /// // Here, the values 0x66 and 0x6f correspond to 'f' and 'o' - /// // respectively. The value 0x80 is a lone continuation byte, invalid - /// // in a UTF-8 sequence. - /// let source = [0x66, 0x6f, 0x80, 0x6f]; - /// let os_str = OsStr::from_bytes(&source[..]); - /// - /// assert_eq!(os_str.to_string_lossy(), "fo�o"); - /// } - /// #[cfg(windows)] { - /// use std::ffi::OsString; - /// use std::os::windows::prelude::*; - /// - /// // Here the values 0x0066 and 0x006f correspond to 'f' and 'o' - /// // respectively. The value 0xD800 is a lone surrogate half, invalid - /// // in a UTF-16 sequence. - /// let source = [0x0066, 0x006f, 0xD800, 0x006f]; - /// let os_string = OsString::from_wide(&source[..]); - /// let os_str = os_string.as_os_str(); - /// - /// assert_eq!(os_str.to_string_lossy(), "fo�o"); - /// } - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use = "this returns the result of the operation, \ - without modifying the original"] - #[inline] - pub fn to_string_lossy(&self) -> Cow<'_, str> { - self.inner.to_string_lossy() - } - - /// Copies the slice into an owned [`OsString`]. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::{OsStr, OsString}; - /// - /// let os_str = OsStr::new("foo"); - /// let os_string = os_str.to_os_string(); - /// assert_eq!(os_string, OsString::from("foo")); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - #[must_use = "this returns the result of the operation, \ - without modifying the original"] - #[inline] - pub fn to_os_string(&self) -> OsString { - OsString { inner: self.inner.to_owned() } - } - - /// Checks whether the `OsStr` is empty. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new(""); - /// assert!(os_str.is_empty()); - /// - /// let os_str = OsStr::new("foo"); - /// assert!(!os_str.is_empty()); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[must_use] - #[inline] - pub fn is_empty(&self) -> bool { - self.inner.inner.is_empty() - } - - /// Returns the length of this `OsStr`. - /// - /// Note that this does **not** return the number of bytes in the string in - /// OS string form. - /// - /// The length returned is that of the underlying storage used by `OsStr`. - /// As discussed in the [`OsString`] introduction, [`OsString`] and `OsStr` - /// store strings in a form best suited for cheap inter-conversion between - /// native-platform and Rust string forms, which may differ significantly - /// from both of them, including in storage size and encoding. - /// - /// This number is simply useful for passing to other methods, like - /// [`OsString::with_capacity`] to avoid reallocations. - /// - /// See the main `OsString` documentation information about encoding and capacity units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new(""); - /// assert_eq!(os_str.len(), 0); - /// - /// let os_str = OsStr::new("foo"); - /// assert_eq!(os_str.len(), 3); - /// ``` - #[stable(feature = "osstring_simple_functions", since = "1.9.0")] - #[must_use] - #[inline] - pub fn len(&self) -> usize { - self.inner.inner.len() - } - - /// Converts a [Box]<[OsStr]> into an [`OsString`] without copying or allocating. - #[stable(feature = "into_boxed_os_str", since = "1.20.0")] - #[must_use = "`self` will be dropped if the result is not used"] - pub fn into_os_string(self: Box) -> OsString { - let boxed = unsafe { Box::from_raw(Box::into_raw(self) as *mut Slice) }; - OsString { inner: Buf::from_box(boxed) } - } - - /// Converts an OS string slice to a byte slice. To convert the byte slice back into an OS - /// string slice, use the [`OsStr::from_encoded_bytes_unchecked`] function. - /// - /// The byte encoding is an unspecified, platform-specific, self-synchronizing superset of UTF-8. - /// By being a self-synchronizing superset of UTF-8, this encoding is also a superset of 7-bit - /// ASCII. - /// - /// Note: As the encoding is unspecified, any sub-slice of bytes that is not valid UTF-8 should - /// be treated as opaque and only comparable within the same Rust version built for the same - /// target platform. For example, sending the slice over the network or storing it in a file - /// will likely result in incompatible byte slices. See [`OsString`] for more encoding details - /// and [`std::ffi`] for platform-specific, specified conversions. - /// - /// [`std::ffi`]: crate::ffi - #[inline] - #[stable(feature = "os_str_bytes", since = "1.74.0")] - pub fn as_encoded_bytes(&self) -> &[u8] { - self.inner.as_encoded_bytes() - } - - /// Takes a substring based on a range that corresponds to the return value of - /// [`OsStr::as_encoded_bytes`]. - /// - /// The range's start and end must lie on valid `OsStr` boundaries. - /// A valid `OsStr` boundary is one of: - /// - The start of the string - /// - The end of the string - /// - Immediately before a valid non-empty UTF-8 substring - /// - Immediately after a valid non-empty UTF-8 substring - /// - /// # Panics - /// - /// Panics if `range` does not lie on valid `OsStr` boundaries or if it - /// exceeds the end of the string. - /// - /// # Example - /// - /// ``` - /// #![feature(os_str_slice)] - /// - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new("foo=bar"); - /// let bytes = os_str.as_encoded_bytes(); - /// if let Some(index) = bytes.iter().position(|b| *b == b'=') { - /// let key = os_str.slice_encoded_bytes(..index); - /// let value = os_str.slice_encoded_bytes(index + 1..); - /// assert_eq!(key, "foo"); - /// assert_eq!(value, "bar"); - /// } - /// ``` - #[unstable(feature = "os_str_slice", issue = "118485")] - pub fn slice_encoded_bytes>(&self, range: R) -> &Self { - let encoded_bytes = self.as_encoded_bytes(); - let Range { start, end } = slice::range(range, ..encoded_bytes.len()); - - // `check_public_boundary` should panic if the index does not lie on an - // `OsStr` boundary as described above. It's possible to do this in an - // encoding-agnostic way, but details of the internal encoding might - // permit a more efficient implementation. - self.inner.check_public_boundary(start); - self.inner.check_public_boundary(end); - - // SAFETY: `slice::range` ensures that `start` and `end` are valid - let slice = unsafe { encoded_bytes.get_unchecked(start..end) }; - - // SAFETY: `slice` comes from `self` and we validated the boundaries - unsafe { Self::from_encoded_bytes_unchecked(slice) } - } - - /// Converts this string to its ASCII lower case equivalent in-place. - /// - /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', - /// but non-ASCII letters are unchanged. - /// - /// To return a new lowercased value without modifying the existing one, use - /// [`OsStr::to_ascii_lowercase`]. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut s = OsString::from("GRÜßE, JÜRGEN ❤"); - /// - /// s.make_ascii_lowercase(); - /// - /// assert_eq!("grÜße, jÜrgen ❤", s); - /// ``` - #[stable(feature = "osstring_ascii", since = "1.53.0")] - #[inline] - pub fn make_ascii_lowercase(&mut self) { - self.inner.make_ascii_lowercase() - } - - /// Converts this string to its ASCII upper case equivalent in-place. - /// - /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', - /// but non-ASCII letters are unchanged. - /// - /// To return a new uppercased value without modifying the existing one, use - /// [`OsStr::to_ascii_uppercase`]. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let mut s = OsString::from("Grüße, Jürgen ❤"); - /// - /// s.make_ascii_uppercase(); - /// - /// assert_eq!("GRüßE, JüRGEN ❤", s); - /// ``` - #[stable(feature = "osstring_ascii", since = "1.53.0")] - #[inline] - pub fn make_ascii_uppercase(&mut self) { - self.inner.make_ascii_uppercase() - } - - /// Returns a copy of this string where each character is mapped to its - /// ASCII lower case equivalent. - /// - /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', - /// but non-ASCII letters are unchanged. - /// - /// To lowercase the value in-place, use [`OsStr::make_ascii_lowercase`]. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// let s = OsString::from("Grüße, Jürgen ❤"); - /// - /// assert_eq!("grüße, jürgen ❤", s.to_ascii_lowercase()); - /// ``` - #[must_use = "to lowercase the value in-place, use `make_ascii_lowercase`"] - #[stable(feature = "osstring_ascii", since = "1.53.0")] - pub fn to_ascii_lowercase(&self) -> OsString { - OsString::from_inner(self.inner.to_ascii_lowercase()) - } - - /// Returns a copy of this string where each character is mapped to its - /// ASCII upper case equivalent. - /// - /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', - /// but non-ASCII letters are unchanged. - /// - /// To uppercase the value in-place, use [`OsStr::make_ascii_uppercase`]. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// let s = OsString::from("Grüße, Jürgen ❤"); - /// - /// assert_eq!("GRüßE, JüRGEN ❤", s.to_ascii_uppercase()); - /// ``` - #[must_use = "to uppercase the value in-place, use `make_ascii_uppercase`"] - #[stable(feature = "osstring_ascii", since = "1.53.0")] - pub fn to_ascii_uppercase(&self) -> OsString { - OsString::from_inner(self.inner.to_ascii_uppercase()) - } - - /// Checks if all characters in this string are within the ASCII range. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// let ascii = OsString::from("hello!\n"); - /// let non_ascii = OsString::from("Grüße, Jürgen ❤"); - /// - /// assert!(ascii.is_ascii()); - /// assert!(!non_ascii.is_ascii()); - /// ``` - #[stable(feature = "osstring_ascii", since = "1.53.0")] - #[must_use] - #[inline] - pub fn is_ascii(&self) -> bool { - self.inner.is_ascii() - } - - /// Checks that two strings are an ASCII case-insensitive match. - /// - /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, - /// but without allocating and copying temporaries. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// - /// assert!(OsString::from("Ferris").eq_ignore_ascii_case("FERRIS")); - /// assert!(OsString::from("Ferrös").eq_ignore_ascii_case("FERRöS")); - /// assert!(!OsString::from("Ferrös").eq_ignore_ascii_case("FERRÖS")); - /// ``` - #[stable(feature = "osstring_ascii", since = "1.53.0")] - pub fn eq_ignore_ascii_case>(&self, other: S) -> bool { - self.inner.eq_ignore_ascii_case(&other.as_ref().inner) - } - - /// Returns an object that implements [`Display`] for safely printing an - /// [`OsStr`] that may contain non-Unicode data. This may perform lossy - /// conversion, depending on the platform. If you would like an - /// implementation which escapes the [`OsStr`] please use [`Debug`] - /// instead. - /// - /// [`Display`]: fmt::Display - /// [`Debug`]: fmt::Debug - /// - /// # Examples - /// - /// ``` - /// #![feature(os_str_display)] - /// use std::ffi::OsStr; - /// - /// let s = OsStr::new("Hello, world!"); - /// println!("{}", s.display()); - /// ``` - #[unstable(feature = "os_str_display", issue = "120048")] - #[must_use = "this does not display the `OsStr`; \ - it returns an object that can be displayed"] - #[inline] - pub fn display(&self) -> Display<'_> { - Display { os_str: self } - } -} - -#[stable(feature = "box_from_os_str", since = "1.17.0")] -impl From<&OsStr> for Box { - /// Copies the string into a newly allocated [Box]<[OsStr]>. - #[inline] - fn from(s: &OsStr) -> Box { - let rw = Box::into_raw(s.inner.into_box()) as *mut OsStr; - unsafe { Box::from_raw(rw) } - } -} - -#[stable(feature = "box_from_cow", since = "1.45.0")] -impl From> for Box { - /// Converts a `Cow<'a, OsStr>` into a [Box]<[OsStr]>, - /// by copying the contents if they are borrowed. - #[inline] - fn from(cow: Cow<'_, OsStr>) -> Box { - match cow { - Cow::Borrowed(s) => Box::from(s), - Cow::Owned(s) => Box::from(s), - } - } -} - -#[stable(feature = "os_string_from_box", since = "1.18.0")] -impl From> for OsString { - /// Converts a [Box]<[OsStr]> into an [`OsString`] without copying or - /// allocating. - #[inline] - fn from(boxed: Box) -> OsString { - boxed.into_os_string() - } -} - -#[stable(feature = "box_from_os_string", since = "1.20.0")] -impl From for Box { - /// Converts an [`OsString`] into a [Box]<[OsStr]> without copying or allocating. - #[inline] - fn from(s: OsString) -> Box { - s.into_boxed_os_str() - } -} - -#[stable(feature = "more_box_slice_clone", since = "1.29.0")] -impl Clone for Box { - #[inline] - fn clone(&self) -> Self { - self.to_os_string().into_boxed_os_str() - } -} - -#[unstable(feature = "clone_to_uninit", issue = "126799")] -unsafe impl CloneToUninit for OsStr { - #[inline] - #[cfg_attr(debug_assertions, track_caller)] - unsafe fn clone_to_uninit(&self, dst: *mut Self) { - // SAFETY: we're just a wrapper around a platform-specific Slice - unsafe { self.inner.clone_to_uninit(addr_of_mut!((*dst).inner)) } - } -} - -#[stable(feature = "shared_from_slice2", since = "1.24.0")] -impl From for Arc { - /// Converts an [`OsString`] into an [Arc]<[OsStr]> by moving the [`OsString`] - /// data into a new [`Arc`] buffer. - #[inline] - fn from(s: OsString) -> Arc { - let arc = s.inner.into_arc(); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const OsStr) } - } -} - -#[stable(feature = "shared_from_slice2", since = "1.24.0")] -impl From<&OsStr> for Arc { - /// Copies the string into a newly allocated [Arc]<[OsStr]>. - #[inline] - fn from(s: &OsStr) -> Arc { - let arc = s.inner.into_arc(); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const OsStr) } - } -} - -#[stable(feature = "shared_from_slice2", since = "1.24.0")] -impl From for Rc { - /// Converts an [`OsString`] into an [Rc]<[OsStr]> by moving the [`OsString`] - /// data into a new [`Rc`] buffer. - #[inline] - fn from(s: OsString) -> Rc { - let rc = s.inner.into_rc(); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const OsStr) } - } -} - -#[stable(feature = "shared_from_slice2", since = "1.24.0")] -impl From<&OsStr> for Rc { - /// Copies the string into a newly allocated [Rc]<[OsStr]>. - #[inline] - fn from(s: &OsStr) -> Rc { - let rc = s.inner.into_rc(); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const OsStr) } - } -} - -#[stable(feature = "cow_from_osstr", since = "1.28.0")] -impl<'a> From for Cow<'a, OsStr> { - /// Moves the string into a [`Cow::Owned`]. - #[inline] - fn from(s: OsString) -> Cow<'a, OsStr> { - Cow::Owned(s) - } -} - -#[stable(feature = "cow_from_osstr", since = "1.28.0")] -impl<'a> From<&'a OsStr> for Cow<'a, OsStr> { - /// Converts the string reference into a [`Cow::Borrowed`]. - #[inline] - fn from(s: &'a OsStr) -> Cow<'a, OsStr> { - Cow::Borrowed(s) - } -} - -#[stable(feature = "cow_from_osstr", since = "1.28.0")] -impl<'a> From<&'a OsString> for Cow<'a, OsStr> { - /// Converts the string reference into a [`Cow::Borrowed`]. - #[inline] - fn from(s: &'a OsString) -> Cow<'a, OsStr> { - Cow::Borrowed(s.as_os_str()) - } -} - -#[stable(feature = "osstring_from_cow_osstr", since = "1.28.0")] -impl<'a> From> for OsString { - /// Converts a `Cow<'a, OsStr>` into an [`OsString`], - /// by copying the contents if they are borrowed. - #[inline] - fn from(s: Cow<'a, OsStr>) -> Self { - s.into_owned() - } -} - -#[stable(feature = "str_tryfrom_osstr_impl", since = "1.72.0")] -impl<'a> TryFrom<&'a OsStr> for &'a str { - type Error = crate::str::Utf8Error; - - /// Tries to convert an `&OsStr` to a `&str`. - /// - /// ``` - /// use std::ffi::OsStr; - /// - /// let os_str = OsStr::new("foo"); - /// let as_str = <&str>::try_from(os_str).unwrap(); - /// assert_eq!(as_str, "foo"); - /// ``` - fn try_from(value: &'a OsStr) -> Result { - value.inner.to_str() - } -} - -#[stable(feature = "box_default_extra", since = "1.17.0")] -impl Default for Box { - #[inline] - fn default() -> Box { - let rw = Box::into_raw(Slice::empty_box()) as *mut OsStr; - unsafe { Box::from_raw(rw) } - } -} - -#[stable(feature = "osstring_default", since = "1.9.0")] -impl Default for &OsStr { - /// Creates an empty `OsStr`. - #[inline] - fn default() -> Self { - OsStr::new("") - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialEq for OsStr { - #[inline] - fn eq(&self, other: &OsStr) -> bool { - self.as_encoded_bytes().eq(other.as_encoded_bytes()) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialEq for OsStr { - #[inline] - fn eq(&self, other: &str) -> bool { - *self == *OsStr::new(other) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialEq for str { - #[inline] - fn eq(&self, other: &OsStr) -> bool { - *other == *OsStr::new(self) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Eq for OsStr {} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialOrd for OsStr { - #[inline] - fn partial_cmp(&self, other: &OsStr) -> Option { - self.as_encoded_bytes().partial_cmp(other.as_encoded_bytes()) - } - #[inline] - fn lt(&self, other: &OsStr) -> bool { - self.as_encoded_bytes().lt(other.as_encoded_bytes()) - } - #[inline] - fn le(&self, other: &OsStr) -> bool { - self.as_encoded_bytes().le(other.as_encoded_bytes()) - } - #[inline] - fn gt(&self, other: &OsStr) -> bool { - self.as_encoded_bytes().gt(other.as_encoded_bytes()) - } - #[inline] - fn ge(&self, other: &OsStr) -> bool { - self.as_encoded_bytes().ge(other.as_encoded_bytes()) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl PartialOrd for OsStr { - #[inline] - fn partial_cmp(&self, other: &str) -> Option { - self.partial_cmp(OsStr::new(other)) - } -} - -// FIXME (#19470): cannot provide PartialOrd for str until we -// have more flexible coherence rules. - -#[stable(feature = "rust1", since = "1.0.0")] -impl Ord for OsStr { - #[inline] - fn cmp(&self, other: &OsStr) -> cmp::Ordering { - self.as_encoded_bytes().cmp(other.as_encoded_bytes()) - } -} - -macro_rules! impl_cmp { - ($lhs:ty, $rhs: ty) => { - #[stable(feature = "cmp_os_str", since = "1.8.0")] - impl<'a, 'b> PartialEq<$rhs> for $lhs { - #[inline] - fn eq(&self, other: &$rhs) -> bool { - ::eq(self, other) - } - } - - #[stable(feature = "cmp_os_str", since = "1.8.0")] - impl<'a, 'b> PartialEq<$lhs> for $rhs { - #[inline] - fn eq(&self, other: &$lhs) -> bool { - ::eq(self, other) - } - } - - #[stable(feature = "cmp_os_str", since = "1.8.0")] - impl<'a, 'b> PartialOrd<$rhs> for $lhs { - #[inline] - fn partial_cmp(&self, other: &$rhs) -> Option { - ::partial_cmp(self, other) - } - } - - #[stable(feature = "cmp_os_str", since = "1.8.0")] - impl<'a, 'b> PartialOrd<$lhs> for $rhs { - #[inline] - fn partial_cmp(&self, other: &$lhs) -> Option { - ::partial_cmp(self, other) - } - } - }; -} - -impl_cmp!(OsString, OsStr); -impl_cmp!(OsString, &'a OsStr); -impl_cmp!(Cow<'a, OsStr>, OsStr); -impl_cmp!(Cow<'a, OsStr>, &'b OsStr); -impl_cmp!(Cow<'a, OsStr>, OsString); - -#[stable(feature = "rust1", since = "1.0.0")] -impl Hash for OsStr { - #[inline] - fn hash(&self, state: &mut H) { - self.as_encoded_bytes().hash(state) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl fmt::Debug for OsStr { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&self.inner, formatter) - } -} - -/// Helper struct for safely printing an [`OsStr`] with [`format!`] and `{}`. -/// -/// An [`OsStr`] might contain non-Unicode data. This `struct` implements the -/// [`Display`] trait in a way that mitigates that. It is created by the -/// [`display`](OsStr::display) method on [`OsStr`]. This may perform lossy -/// conversion, depending on the platform. If you would like an implementation -/// which escapes the [`OsStr`] please use [`Debug`] instead. -/// -/// # Examples -/// -/// ``` -/// #![feature(os_str_display)] -/// use std::ffi::OsStr; -/// -/// let s = OsStr::new("Hello, world!"); -/// println!("{}", s.display()); -/// ``` -/// -/// [`Display`]: fmt::Display -/// [`format!`]: crate::format -#[unstable(feature = "os_str_display", issue = "120048")] -pub struct Display<'a> { - os_str: &'a OsStr, -} - -#[unstable(feature = "os_str_display", issue = "120048")] -impl fmt::Debug for Display<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&self.os_str, f) - } -} - -#[unstable(feature = "os_str_display", issue = "120048")] -impl fmt::Display for Display<'_> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&self.os_str.inner, f) - } -} - -#[unstable(feature = "slice_concat_ext", issue = "27747")] -impl> alloc::slice::Join<&OsStr> for [S] { - type Output = OsString; - - fn join(slice: &Self, sep: &OsStr) -> OsString { - let Some((first, suffix)) = slice.split_first() else { - return OsString::new(); - }; - let first_owned = first.borrow().to_owned(); - suffix.iter().fold(first_owned, |mut a, b| { - a.push(sep); - a.push(b.borrow()); - a - }) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl Borrow for OsString { - #[inline] - fn borrow(&self) -> &OsStr { - &self[..] - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl ToOwned for OsStr { - type Owned = OsString; - #[inline] - fn to_owned(&self) -> OsString { - self.to_os_string() - } - #[inline] - fn clone_into(&self, target: &mut OsString) { - self.inner.clone_into(&mut target.inner) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl AsRef for OsStr { - #[inline] - fn as_ref(&self) -> &OsStr { - self - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl AsRef for OsString { - #[inline] - fn as_ref(&self) -> &OsStr { - self - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl AsRef for str { - #[inline] - fn as_ref(&self) -> &OsStr { - OsStr::from_inner(Slice::from_str(self)) - } -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl AsRef for String { - #[inline] - fn as_ref(&self) -> &OsStr { - (&**self).as_ref() - } -} - -impl FromInner for OsString { - #[inline] - fn from_inner(buf: Buf) -> OsString { - OsString { inner: buf } - } -} - -impl IntoInner for OsString { - #[inline] - fn into_inner(self) -> Buf { - self.inner - } -} - -impl AsInner for OsStr { - #[inline] - fn as_inner(&self) -> &Slice { - &self.inner - } -} - -#[stable(feature = "osstring_from_str", since = "1.45.0")] -impl FromStr for OsString { - type Err = core::convert::Infallible; - - #[inline] - fn from_str(s: &str) -> Result { - Ok(OsString::from(s)) - } -} - -#[stable(feature = "osstring_extend", since = "1.52.0")] -impl Extend for OsString { - #[inline] - fn extend>(&mut self, iter: T) { - for s in iter { - self.push(&s); - } - } -} - -#[stable(feature = "osstring_extend", since = "1.52.0")] -impl<'a> Extend<&'a OsStr> for OsString { - #[inline] - fn extend>(&mut self, iter: T) { - for s in iter { - self.push(s); - } - } -} - -#[stable(feature = "osstring_extend", since = "1.52.0")] -impl<'a> Extend> for OsString { - #[inline] - fn extend>>(&mut self, iter: T) { - for s in iter { - self.push(&s); - } - } -} - -#[stable(feature = "osstring_extend", since = "1.52.0")] -impl FromIterator for OsString { - #[inline] - fn from_iter>(iter: I) -> Self { - let mut iterator = iter.into_iter(); - - // Because we're iterating over `OsString`s, we can avoid at least - // one allocation by getting the first string from the iterator - // and appending to it all the subsequent strings. - match iterator.next() { - None => OsString::new(), - Some(mut buf) => { - buf.extend(iterator); - buf - } - } - } -} - -#[stable(feature = "osstring_extend", since = "1.52.0")] -impl<'a> FromIterator<&'a OsStr> for OsString { - #[inline] - fn from_iter>(iter: I) -> Self { - let mut buf = Self::new(); - for s in iter { - buf.push(s); - } - buf - } -} - -#[stable(feature = "osstring_extend", since = "1.52.0")] -impl<'a> FromIterator> for OsString { - #[inline] - fn from_iter>>(iter: I) -> Self { - let mut iterator = iter.into_iter(); - - // Because we're iterating over `OsString`s, we can avoid at least - // one allocation by getting the first owned string from the iterator - // and appending to it all the subsequent strings. - match iterator.next() { - None => OsString::new(), - Some(Cow::Owned(mut buf)) => { - buf.extend(iterator); - buf - } - Some(Cow::Borrowed(buf)) => { - let mut buf = OsString::from(buf); - buf.extend(iterator); - buf - } - } - } -} +pub use alloc::ffi::os_str::OsString; +pub use core::ffi::os_str::{Display, OsStr}; diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 2530a37638757..4ac7a0b7e1111 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -306,6 +306,9 @@ #![feature(negative_impls)] #![feature(never_type)] #![feature(no_sanitize)] +#![feature(os_str_display)] +#![feature(os_str_internals)] +#![feature(os_string_pathbuf_leak)] #![feature(prelude_import)] #![feature(rustc_attrs)] #![feature(rustdoc_internals)] diff --git a/library/std/src/os/unix/ffi/os_str.rs b/library/std/src/os/unix/ffi/os_str.rs index 650f712bc6eef..68974e8874d2f 100644 --- a/library/std/src/os/unix/ffi/os_str.rs +++ b/library/std/src/os/unix/ffi/os_str.rs @@ -1,70 +1,7 @@ -use crate::ffi::{OsStr, OsString}; -use crate::mem; -use crate::sealed::Sealed; -use crate::sys::os_str::Buf; -use crate::sys_common::{AsInner, FromInner, IntoInner}; - // Note: this file is currently reused in other `std::os::{platform}::ffi` modules to reduce duplication. // Keep this in mind when applying changes to this file that only apply to `unix`. -/// Platform-specific extensions to [`OsString`]. -/// -/// This trait is sealed: it cannot be implemented outside the standard library. -/// This is so that future additional methods are not breaking changes. -#[stable(feature = "rust1", since = "1.0.0")] -pub trait OsStringExt: Sealed { - /// Creates an [`OsString`] from a byte vector. - /// - /// See the module documentation for an example. - #[stable(feature = "rust1", since = "1.0.0")] - fn from_vec(vec: Vec) -> Self; - - /// Yields the underlying byte vector of this [`OsString`]. - /// - /// See the module documentation for an example. - #[stable(feature = "rust1", since = "1.0.0")] - fn into_vec(self) -> Vec; -} - #[stable(feature = "rust1", since = "1.0.0")] -impl OsStringExt for OsString { - #[inline] - fn from_vec(vec: Vec) -> OsString { - FromInner::from_inner(Buf { inner: vec }) - } - #[inline] - fn into_vec(self) -> Vec { - self.into_inner().inner - } -} - -/// Platform-specific extensions to [`OsStr`]. -/// -/// This trait is sealed: it cannot be implemented outside the standard library. -/// This is so that future additional methods are not breaking changes. -#[stable(feature = "rust1", since = "1.0.0")] -pub trait OsStrExt: Sealed { - #[stable(feature = "rust1", since = "1.0.0")] - /// Creates an [`OsStr`] from a byte slice. - /// - /// See the module documentation for an example. - fn from_bytes(slice: &[u8]) -> &Self; - - /// Gets the underlying byte view of the [`OsStr`] slice. - /// - /// See the module documentation for an example. - #[stable(feature = "rust1", since = "1.0.0")] - fn as_bytes(&self) -> &[u8]; -} - +pub use alloc::ffi::os_str::os_str_ext_unix::OsStringExt; #[stable(feature = "rust1", since = "1.0.0")] -impl OsStrExt for OsStr { - #[inline] - fn from_bytes(slice: &[u8]) -> &OsStr { - unsafe { mem::transmute(slice) } - } - #[inline] - fn as_bytes(&self) -> &[u8] { - &self.as_inner().inner - } -} +pub use core::ffi::os_str::os_str_ext_unix::OsStrExt; diff --git a/library/std/src/os/unix/net/addr.rs b/library/std/src/os/unix/net/addr.rs index 79f2c365025b7..1021de40a43ba 100644 --- a/library/std/src/os/unix/net/addr.rs +++ b/library/std/src/os/unix/net/addr.rs @@ -1,3 +1,5 @@ +use core::ffi::os_str::os_str_ext_unix::OsStrExt; + use crate::ffi::OsStr; #[cfg(any(doc, target_os = "android", target_os = "linux"))] use crate::os::net::linux_ext; diff --git a/library/std/src/os/windows/ffi.rs b/library/std/src/os/windows/ffi.rs index 496443dbbc3ac..31e7e674a21cf 100644 --- a/library/std/src/os/windows/ffi.rs +++ b/library/std/src/os/windows/ffi.rs @@ -53,83 +53,7 @@ #![stable(feature = "rust1", since = "1.0.0")] -use crate::ffi::{OsStr, OsString}; -use crate::sealed::Sealed; -use crate::sys::os_str::Buf; #[stable(feature = "rust1", since = "1.0.0")] -pub use crate::sys_common::wtf8::EncodeWide; -use crate::sys_common::wtf8::Wtf8Buf; -use crate::sys_common::{AsInner, FromInner}; - -/// Windows-specific extensions to [`OsString`]. -/// -/// This trait is sealed: it cannot be implemented outside the standard library. -/// This is so that future additional methods are not breaking changes. -#[stable(feature = "rust1", since = "1.0.0")] -pub trait OsStringExt: Sealed { - /// Creates an `OsString` from a potentially ill-formed UTF-16 slice of - /// 16-bit code units. - /// - /// This is lossless: calling [`OsStrExt::encode_wide`] on the resulting string - /// will always return the original code units. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// use std::os::windows::prelude::*; - /// - /// // UTF-16 encoding for "Unicode". - /// let source = [0x0055, 0x006E, 0x0069, 0x0063, 0x006F, 0x0064, 0x0065]; - /// - /// let string = OsString::from_wide(&source[..]); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - fn from_wide(wide: &[u16]) -> Self; -} - -#[stable(feature = "rust1", since = "1.0.0")] -impl OsStringExt for OsString { - fn from_wide(wide: &[u16]) -> OsString { - FromInner::from_inner(Buf { inner: Wtf8Buf::from_wide(wide) }) - } -} - -/// Windows-specific extensions to [`OsStr`]. -/// -/// This trait is sealed: it cannot be implemented outside the standard library. -/// This is so that future additional methods are not breaking changes. -#[stable(feature = "rust1", since = "1.0.0")] -pub trait OsStrExt: Sealed { - /// Re-encodes an `OsStr` as a wide character sequence, i.e., potentially - /// ill-formed UTF-16. - /// - /// This is lossless: calling [`OsStringExt::from_wide`] and then - /// `encode_wide` on the result will yield the original code units. - /// Note that the encoding does not add a final null terminator. - /// - /// # Examples - /// - /// ``` - /// use std::ffi::OsString; - /// use std::os::windows::prelude::*; - /// - /// // UTF-16 encoding for "Unicode". - /// let source = [0x0055, 0x006E, 0x0069, 0x0063, 0x006F, 0x0064, 0x0065]; - /// - /// let string = OsString::from_wide(&source[..]); - /// - /// let result: Vec = string.encode_wide().collect(); - /// assert_eq!(&source[..], &result[..]); - /// ``` - #[stable(feature = "rust1", since = "1.0.0")] - fn encode_wide(&self) -> EncodeWide<'_>; -} - +pub use alloc::ffi::os_str::os_str_ext_windows::OsStringExt; #[stable(feature = "rust1", since = "1.0.0")] -impl OsStrExt for OsStr { - #[inline] - fn encode_wide(&self) -> EncodeWide<'_> { - self.as_inner().inner.encode_wide() - } -} +pub use core::ffi::os_str::os_str_ext_windows::{EncodeWide, OsStrExt}; diff --git a/library/std/src/sys/mod.rs b/library/std/src/sys/mod.rs index a86b3628f249a..ebf6121abeb84 100644 --- a/library/std/src/sys/mod.rs +++ b/library/std/src/sys/mod.rs @@ -11,7 +11,6 @@ pub mod anonymous_pipe; pub mod backtrace; pub mod cmath; pub mod exit_guard; -pub mod os_str; pub mod path; pub mod sync; pub mod thread_local; diff --git a/library/std/src/sys/os_str/bytes.rs b/library/std/src/sys/os_str/bytes.rs deleted file mode 100644 index 992767211d083..0000000000000 --- a/library/std/src/sys/os_str/bytes.rs +++ /dev/null @@ -1,360 +0,0 @@ -//! The underlying OsString/OsStr implementation on Unix and many other -//! systems: just a `Vec`/`[u8]`. - -use core::clone::CloneToUninit; -use core::ptr::addr_of_mut; - -use crate::borrow::Cow; -use crate::collections::TryReserveError; -use crate::fmt::Write; -use crate::rc::Rc; -use crate::sync::Arc; -use crate::sys_common::{AsInner, IntoInner}; -use crate::{fmt, mem, str}; - -#[cfg(test)] -mod tests; - -#[derive(Hash)] -#[repr(transparent)] -pub struct Buf { - pub inner: Vec, -} - -#[repr(transparent)] -pub struct Slice { - pub inner: [u8], -} - -impl fmt::Debug for Slice { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f) - } -} - -impl fmt::Display for Slice { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // If we're the empty string then our iterator won't actually yield - // anything, so perform the formatting manually - if self.inner.is_empty() { - return "".fmt(f); - } - - for chunk in self.inner.utf8_chunks() { - let valid = chunk.valid(); - // If we successfully decoded the whole chunk as a valid string then - // we can return a direct formatting of the string which will also - // respect various formatting flags if possible. - if chunk.invalid().is_empty() { - return valid.fmt(f); - } - - f.write_str(valid)?; - f.write_char(char::REPLACEMENT_CHARACTER)?; - } - Ok(()) - } -} - -impl fmt::Debug for Buf { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(self.as_slice(), formatter) - } -} - -impl fmt::Display for Buf { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self.as_slice(), formatter) - } -} - -impl Clone for Buf { - #[inline] - fn clone(&self) -> Self { - Buf { inner: self.inner.clone() } - } - - #[inline] - fn clone_from(&mut self, source: &Self) { - self.inner.clone_from(&source.inner) - } -} - -impl IntoInner> for Buf { - fn into_inner(self) -> Vec { - self.inner - } -} - -impl AsInner<[u8]> for Buf { - #[inline] - fn as_inner(&self) -> &[u8] { - &self.inner - } -} - -impl Buf { - #[inline] - pub fn into_encoded_bytes(self) -> Vec { - self.inner - } - - #[inline] - pub unsafe fn from_encoded_bytes_unchecked(s: Vec) -> Self { - Self { inner: s } - } - - pub fn from_string(s: String) -> Buf { - Buf { inner: s.into_bytes() } - } - - #[inline] - pub fn with_capacity(capacity: usize) -> Buf { - Buf { inner: Vec::with_capacity(capacity) } - } - - #[inline] - pub fn clear(&mut self) { - self.inner.clear() - } - - #[inline] - pub fn capacity(&self) -> usize { - self.inner.capacity() - } - - #[inline] - pub fn reserve(&mut self, additional: usize) { - self.inner.reserve(additional) - } - - #[inline] - pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.inner.try_reserve(additional) - } - - #[inline] - pub fn reserve_exact(&mut self, additional: usize) { - self.inner.reserve_exact(additional) - } - - #[inline] - pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.inner.try_reserve_exact(additional) - } - - #[inline] - pub fn shrink_to_fit(&mut self) { - self.inner.shrink_to_fit() - } - - #[inline] - pub fn shrink_to(&mut self, min_capacity: usize) { - self.inner.shrink_to(min_capacity) - } - - #[inline] - pub fn as_slice(&self) -> &Slice { - // SAFETY: Slice just wraps [u8], - // and &*self.inner is &[u8], therefore - // transmuting &[u8] to &Slice is safe. - unsafe { mem::transmute(&*self.inner) } - } - - #[inline] - pub fn as_mut_slice(&mut self) -> &mut Slice { - // SAFETY: Slice just wraps [u8], - // and &mut *self.inner is &mut [u8], therefore - // transmuting &mut [u8] to &mut Slice is safe. - unsafe { mem::transmute(&mut *self.inner) } - } - - pub fn into_string(self) -> Result { - String::from_utf8(self.inner).map_err(|p| Buf { inner: p.into_bytes() }) - } - - pub fn push_slice(&mut self, s: &Slice) { - self.inner.extend_from_slice(&s.inner) - } - - #[inline] - pub fn leak<'a>(self) -> &'a mut Slice { - unsafe { mem::transmute(self.inner.leak()) } - } - - #[inline] - pub fn into_box(self) -> Box { - unsafe { mem::transmute(self.inner.into_boxed_slice()) } - } - - #[inline] - pub fn from_box(boxed: Box) -> Buf { - let inner: Box<[u8]> = unsafe { mem::transmute(boxed) }; - Buf { inner: inner.into_vec() } - } - - #[inline] - pub fn into_arc(&self) -> Arc { - self.as_slice().into_arc() - } - - #[inline] - pub fn into_rc(&self) -> Rc { - self.as_slice().into_rc() - } - - /// Provides plumbing to core `Vec::truncate`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn truncate(&mut self, len: usize) { - self.inner.truncate(len); - } - - /// Provides plumbing to core `Vec::extend_from_slice`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { - self.inner.extend_from_slice(other); - } -} - -impl Slice { - #[inline] - pub fn as_encoded_bytes(&self) -> &[u8] { - &self.inner - } - - #[inline] - pub unsafe fn from_encoded_bytes_unchecked(s: &[u8]) -> &Slice { - unsafe { mem::transmute(s) } - } - - #[track_caller] - #[inline] - pub fn check_public_boundary(&self, index: usize) { - if index == 0 || index == self.inner.len() { - return; - } - if index < self.inner.len() - && (self.inner[index - 1].is_ascii() || self.inner[index].is_ascii()) - { - return; - } - - slow_path(&self.inner, index); - - /// We're betting that typical splits will involve an ASCII character. - /// - /// Putting the expensive checks in a separate function generates notably - /// better assembly. - #[track_caller] - #[inline(never)] - fn slow_path(bytes: &[u8], index: usize) { - let (before, after) = bytes.split_at(index); - - // UTF-8 takes at most 4 bytes per codepoint, so we don't - // need to check more than that. - let after = after.get(..4).unwrap_or(after); - match str::from_utf8(after) { - Ok(_) => return, - Err(err) if err.valid_up_to() != 0 => return, - Err(_) => (), - } - - for len in 2..=4.min(index) { - let before = &before[index - len..]; - if str::from_utf8(before).is_ok() { - return; - } - } - - panic!("byte index {index} is not an OsStr boundary"); - } - } - - #[inline] - pub fn from_str(s: &str) -> &Slice { - unsafe { Slice::from_encoded_bytes_unchecked(s.as_bytes()) } - } - - pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> { - str::from_utf8(&self.inner) - } - - pub fn to_string_lossy(&self) -> Cow<'_, str> { - String::from_utf8_lossy(&self.inner) - } - - pub fn to_owned(&self) -> Buf { - Buf { inner: self.inner.to_vec() } - } - - pub fn clone_into(&self, buf: &mut Buf) { - self.inner.clone_into(&mut buf.inner) - } - - #[inline] - pub fn into_box(&self) -> Box { - let boxed: Box<[u8]> = self.inner.into(); - unsafe { mem::transmute(boxed) } - } - - pub fn empty_box() -> Box { - let boxed: Box<[u8]> = Default::default(); - unsafe { mem::transmute(boxed) } - } - - #[inline] - pub fn into_arc(&self) -> Arc { - let arc: Arc<[u8]> = Arc::from(&self.inner); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Slice) } - } - - #[inline] - pub fn into_rc(&self) -> Rc { - let rc: Rc<[u8]> = Rc::from(&self.inner); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } - } - - #[inline] - pub fn make_ascii_lowercase(&mut self) { - self.inner.make_ascii_lowercase() - } - - #[inline] - pub fn make_ascii_uppercase(&mut self) { - self.inner.make_ascii_uppercase() - } - - #[inline] - pub fn to_ascii_lowercase(&self) -> Buf { - Buf { inner: self.inner.to_ascii_lowercase() } - } - - #[inline] - pub fn to_ascii_uppercase(&self) -> Buf { - Buf { inner: self.inner.to_ascii_uppercase() } - } - - #[inline] - pub fn is_ascii(&self) -> bool { - self.inner.is_ascii() - } - - #[inline] - pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - self.inner.eq_ignore_ascii_case(&other.inner) - } -} - -#[unstable(feature = "clone_to_uninit", issue = "126799")] -unsafe impl CloneToUninit for Slice { - #[inline] - #[cfg_attr(debug_assertions, track_caller)] - unsafe fn clone_to_uninit(&self, dst: *mut Self) { - // SAFETY: we're just a wrapper around [u8] - unsafe { self.inner.clone_to_uninit(addr_of_mut!((*dst).inner)) } - } -} diff --git a/library/std/src/sys/os_str/mod.rs b/library/std/src/sys/os_str/mod.rs deleted file mode 100644 index 345e661586d03..0000000000000 --- a/library/std/src/sys/os_str/mod.rs +++ /dev/null @@ -1,14 +0,0 @@ -#![forbid(unsafe_op_in_unsafe_fn)] - -cfg_if::cfg_if! { - if #[cfg(any( - target_os = "windows", - target_os = "uefi", - ))] { - mod wtf8; - pub use wtf8::{Buf, Slice}; - } else { - mod bytes; - pub use bytes::{Buf, Slice}; - } -} diff --git a/library/std/src/sys/os_str/wtf8.rs b/library/std/src/sys/os_str/wtf8.rs deleted file mode 100644 index 433237aa6e7bf..0000000000000 --- a/library/std/src/sys/os_str/wtf8.rs +++ /dev/null @@ -1,283 +0,0 @@ -//! The underlying OsString/OsStr implementation on Windows is a -//! wrapper around the "WTF-8" encoding; see the `wtf8` module for more. -use core::clone::CloneToUninit; -use core::ptr::addr_of_mut; - -use crate::borrow::Cow; -use crate::collections::TryReserveError; -use crate::rc::Rc; -use crate::sync::Arc; -use crate::sys_common::wtf8::{check_utf8_boundary, Wtf8, Wtf8Buf}; -use crate::sys_common::{AsInner, FromInner, IntoInner}; -use crate::{fmt, mem}; - -#[derive(Clone, Hash)] -pub struct Buf { - pub inner: Wtf8Buf, -} - -impl IntoInner for Buf { - fn into_inner(self) -> Wtf8Buf { - self.inner - } -} - -impl FromInner for Buf { - fn from_inner(inner: Wtf8Buf) -> Self { - Buf { inner } - } -} - -impl AsInner for Buf { - #[inline] - fn as_inner(&self) -> &Wtf8 { - &self.inner - } -} - -impl fmt::Debug for Buf { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(self.as_slice(), formatter) - } -} - -impl fmt::Display for Buf { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self.as_slice(), formatter) - } -} - -#[repr(transparent)] -pub struct Slice { - pub inner: Wtf8, -} - -impl fmt::Debug for Slice { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&self.inner, formatter) - } -} - -impl fmt::Display for Slice { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(&self.inner, formatter) - } -} - -impl Buf { - #[inline] - pub fn into_encoded_bytes(self) -> Vec { - self.inner.into_bytes() - } - - #[inline] - pub unsafe fn from_encoded_bytes_unchecked(s: Vec) -> Self { - unsafe { Self { inner: Wtf8Buf::from_bytes_unchecked(s) } } - } - - pub fn with_capacity(capacity: usize) -> Buf { - Buf { inner: Wtf8Buf::with_capacity(capacity) } - } - - pub fn clear(&mut self) { - self.inner.clear() - } - - pub fn capacity(&self) -> usize { - self.inner.capacity() - } - - pub fn from_string(s: String) -> Buf { - Buf { inner: Wtf8Buf::from_string(s) } - } - - pub fn as_slice(&self) -> &Slice { - // SAFETY: Slice is just a wrapper for Wtf8, - // and self.inner.as_slice() returns &Wtf8. - // Therefore, transmuting &Wtf8 to &Slice is safe. - unsafe { mem::transmute(self.inner.as_slice()) } - } - - pub fn as_mut_slice(&mut self) -> &mut Slice { - // SAFETY: Slice is just a wrapper for Wtf8, - // and self.inner.as_mut_slice() returns &mut Wtf8. - // Therefore, transmuting &mut Wtf8 to &mut Slice is safe. - // Additionally, care should be taken to ensure the slice - // is always valid Wtf8. - unsafe { mem::transmute(self.inner.as_mut_slice()) } - } - - pub fn into_string(self) -> Result { - self.inner.into_string().map_err(|buf| Buf { inner: buf }) - } - - pub fn push_slice(&mut self, s: &Slice) { - self.inner.push_wtf8(&s.inner) - } - - pub fn reserve(&mut self, additional: usize) { - self.inner.reserve(additional) - } - - pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.inner.try_reserve(additional) - } - - pub fn reserve_exact(&mut self, additional: usize) { - self.inner.reserve_exact(additional) - } - - pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.inner.try_reserve_exact(additional) - } - - pub fn shrink_to_fit(&mut self) { - self.inner.shrink_to_fit() - } - - #[inline] - pub fn shrink_to(&mut self, min_capacity: usize) { - self.inner.shrink_to(min_capacity) - } - - #[inline] - pub fn leak<'a>(self) -> &'a mut Slice { - unsafe { mem::transmute(self.inner.leak()) } - } - - #[inline] - pub fn into_box(self) -> Box { - unsafe { mem::transmute(self.inner.into_box()) } - } - - #[inline] - pub fn from_box(boxed: Box) -> Buf { - let inner: Box = unsafe { mem::transmute(boxed) }; - Buf { inner: Wtf8Buf::from_box(inner) } - } - - #[inline] - pub fn into_arc(&self) -> Arc { - self.as_slice().into_arc() - } - - #[inline] - pub fn into_rc(&self) -> Rc { - self.as_slice().into_rc() - } - - /// Provides plumbing to core `Vec::truncate`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn truncate(&mut self, len: usize) { - self.inner.truncate(len); - } - - /// Provides plumbing to core `Vec::extend_from_slice`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { - self.inner.extend_from_slice(other); - } -} - -impl Slice { - #[inline] - pub fn as_encoded_bytes(&self) -> &[u8] { - self.inner.as_bytes() - } - - #[inline] - pub unsafe fn from_encoded_bytes_unchecked(s: &[u8]) -> &Slice { - unsafe { mem::transmute(Wtf8::from_bytes_unchecked(s)) } - } - - #[track_caller] - pub fn check_public_boundary(&self, index: usize) { - check_utf8_boundary(&self.inner, index); - } - - #[inline] - pub fn from_str(s: &str) -> &Slice { - unsafe { mem::transmute(Wtf8::from_str(s)) } - } - - pub fn to_str(&self) -> Result<&str, crate::str::Utf8Error> { - self.inner.as_str() - } - - pub fn to_string_lossy(&self) -> Cow<'_, str> { - self.inner.to_string_lossy() - } - - pub fn to_owned(&self) -> Buf { - Buf { inner: self.inner.to_owned() } - } - - pub fn clone_into(&self, buf: &mut Buf) { - self.inner.clone_into(&mut buf.inner) - } - - #[inline] - pub fn into_box(&self) -> Box { - unsafe { mem::transmute(self.inner.into_box()) } - } - - pub fn empty_box() -> Box { - unsafe { mem::transmute(Wtf8::empty_box()) } - } - - #[inline] - pub fn into_arc(&self) -> Arc { - let arc = self.inner.into_arc(); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Slice) } - } - - #[inline] - pub fn into_rc(&self) -> Rc { - let rc = self.inner.into_rc(); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Slice) } - } - - #[inline] - pub fn make_ascii_lowercase(&mut self) { - self.inner.make_ascii_lowercase() - } - - #[inline] - pub fn make_ascii_uppercase(&mut self) { - self.inner.make_ascii_uppercase() - } - - #[inline] - pub fn to_ascii_lowercase(&self) -> Buf { - Buf { inner: self.inner.to_ascii_lowercase() } - } - - #[inline] - pub fn to_ascii_uppercase(&self) -> Buf { - Buf { inner: self.inner.to_ascii_uppercase() } - } - - #[inline] - pub fn is_ascii(&self) -> bool { - self.inner.is_ascii() - } - - #[inline] - pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - self.inner.eq_ignore_ascii_case(&other.inner) - } -} - -#[unstable(feature = "clone_to_uninit", issue = "126799")] -unsafe impl CloneToUninit for Slice { - #[inline] - #[cfg_attr(debug_assertions, track_caller)] - unsafe fn clone_to_uninit(&self, dst: *mut Self) { - // SAFETY: we're just a wrapper around Wtf8 - unsafe { self.inner.clone_to_uninit(addr_of_mut!((*dst).inner)) } - } -} diff --git a/library/std/src/sys/pal/unix/fs.rs b/library/std/src/sys/pal/unix/fs.rs index fc9d7e988835e..69c76fb4d9fc6 100644 --- a/library/std/src/sys/pal/unix/fs.rs +++ b/library/std/src/sys/pal/unix/fs.rs @@ -4,6 +4,9 @@ #[cfg(test)] mod tests; +use alloc::ffi::os_str::os_str_ext_unix::OsStringExt; +use core::ffi::os_str::os_str_ext_unix::OsStrExt; + #[cfg(all(target_os = "linux", target_env = "gnu"))] use libc::c_char; #[cfg(any( diff --git a/library/std/src/sys/pal/unix/os.rs b/library/std/src/sys/pal/unix/os.rs index a785b97ac8dc5..8514a1fbb9c8b 100644 --- a/library/std/src/sys/pal/unix/os.rs +++ b/library/std/src/sys/pal/unix/os.rs @@ -5,6 +5,8 @@ #[cfg(test)] mod tests; +use alloc::ffi::os_str::os_str_ext_unix::OsStringExt; +use core::ffi::os_str::os_str_ext_unix::OsStrExt; use core::slice::memchr; use libc::{c_char, c_int, c_void}; diff --git a/library/std/src/sys/pal/unix/process/process_common.rs b/library/std/src/sys/pal/unix/process/process_common.rs index fec825054195a..af96db64da2f9 100644 --- a/library/std/src/sys/pal/unix/process/process_common.rs +++ b/library/std/src/sys/pal/unix/process/process_common.rs @@ -1,6 +1,9 @@ #[cfg(all(test, not(target_os = "emscripten")))] mod tests; +use alloc::ffi::os_str::os_str_ext_unix::OsStringExt; +use core::ffi::os_str::os_str_ext_unix::OsStrExt; + use libc::{c_char, c_int, gid_t, pid_t, uid_t, EXIT_FAILURE, EXIT_SUCCESS}; use crate::collections::BTreeMap; diff --git a/library/std/src/sys_common/mod.rs b/library/std/src/sys_common/mod.rs index 1c884f107beeb..66e4208e1f2ab 100644 --- a/library/std/src/sys_common/mod.rs +++ b/library/std/src/sys_common/mod.rs @@ -25,7 +25,6 @@ pub mod io; pub mod lazy_box; pub mod process; pub mod wstr; -pub mod wtf8; cfg_if::cfg_if! { if #[cfg(any( diff --git a/library/std/src/sys_common/wtf8.rs b/library/std/src/sys_common/wtf8.rs deleted file mode 100644 index 063451ad54e1c..0000000000000 --- a/library/std/src/sys_common/wtf8.rs +++ /dev/null @@ -1,1060 +0,0 @@ -//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). -//! -//! This library uses Rust’s type system to maintain -//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), -//! like the `String` and `&str` types do for UTF-8. -//! -//! Since [WTF-8 must not be used -//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), -//! this library deliberately does not provide access to the underlying bytes -//! of WTF-8 strings, -//! nor can it decode WTF-8 from arbitrary bytes. -//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. - -// this module is imported from @SimonSapin's repo and has tons of dead code on -// unix (it's mostly used on windows), so don't worry about dead code here. -#![allow(dead_code)] - -#[cfg(test)] -mod tests; - -use core::char::{encode_utf16_raw, encode_utf8_raw}; -use core::clone::CloneToUninit; -use core::str::next_code_point; - -use crate::borrow::Cow; -use crate::collections::TryReserveError; -use crate::hash::{Hash, Hasher}; -use crate::iter::FusedIterator; -use crate::ptr::addr_of_mut; -use crate::rc::Rc; -use crate::sync::Arc; -use crate::sys_common::AsInner; -use crate::{fmt, mem, ops, slice, str}; - -const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}"; - -/// A Unicode code point: from U+0000 to U+10FFFF. -/// -/// Compares with the `char` type, -/// which represents a Unicode scalar value: -/// a code point that is not a surrogate (U+D800 to U+DFFF). -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)] -pub struct CodePoint { - value: u32, -} - -/// Format the code point as `U+` followed by four to six hexadecimal digits. -/// Example: `U+1F4A9` -impl fmt::Debug for CodePoint { - #[inline] - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(formatter, "U+{:04X}", self.value) - } -} - -impl CodePoint { - /// Unsafely creates a new `CodePoint` without checking the value. - /// - /// Only use when `value` is known to be less than or equal to 0x10FFFF. - #[inline] - pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { - CodePoint { value } - } - - /// Creates a new `CodePoint` if the value is a valid code point. - /// - /// Returns `None` if `value` is above 0x10FFFF. - #[inline] - pub fn from_u32(value: u32) -> Option { - match value { - 0..=0x10FFFF => Some(CodePoint { value }), - _ => None, - } - } - - /// Creates a new `CodePoint` from a `char`. - /// - /// Since all Unicode scalar values are code points, this always succeeds. - #[inline] - pub fn from_char(value: char) -> CodePoint { - CodePoint { value: value as u32 } - } - - /// Returns the numeric value of the code point. - #[inline] - pub fn to_u32(&self) -> u32 { - self.value - } - - /// Returns the numeric value of the code point if it is a leading surrogate. - #[inline] - pub fn to_lead_surrogate(&self) -> Option { - match self.value { - lead @ 0xD800..=0xDBFF => Some(lead as u16), - _ => None, - } - } - - /// Returns the numeric value of the code point if it is a trailing surrogate. - #[inline] - pub fn to_trail_surrogate(&self) -> Option { - match self.value { - trail @ 0xDC00..=0xDFFF => Some(trail as u16), - _ => None, - } - } - - /// Optionally returns a Unicode scalar value for the code point. - /// - /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). - #[inline] - pub fn to_char(&self) -> Option { - match self.value { - 0xD800..=0xDFFF => None, - _ => Some(unsafe { char::from_u32_unchecked(self.value) }), - } - } - - /// Returns a Unicode scalar value for the code point. - /// - /// Returns `'\u{FFFD}'` (the replacement character “�”) - /// if the code point is a surrogate (from U+D800 to U+DFFF). - #[inline] - pub fn to_char_lossy(&self) -> char { - self.to_char().unwrap_or('\u{FFFD}') - } -} - -/// An owned, growable string of well-formed WTF-8 data. -/// -/// Similar to `String`, but can additionally contain surrogate code points -/// if they’re not in a surrogate pair. -#[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] -pub struct Wtf8Buf { - bytes: Vec, - - /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily - /// know this if we're constructed from a `String` or `&str`. - /// - /// It is possible for `bytes` to have valid UTF-8 without this being - /// set, such as when we're concatenating `&Wtf8`'s and surrogates become - /// paired, as we don't bother to rescan the entire string. - is_known_utf8: bool, -} - -impl ops::Deref for Wtf8Buf { - type Target = Wtf8; - - fn deref(&self) -> &Wtf8 { - self.as_slice() - } -} - -impl ops::DerefMut for Wtf8Buf { - fn deref_mut(&mut self) -> &mut Wtf8 { - self.as_mut_slice() - } -} - -/// Format the string with double quotes, -/// and surrogates as `\u` followed by four hexadecimal digits. -/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] -impl fmt::Debug for Wtf8Buf { - #[inline] - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&**self, formatter) - } -} - -impl Wtf8Buf { - /// Creates a new, empty WTF-8 string. - #[inline] - pub fn new() -> Wtf8Buf { - Wtf8Buf { bytes: Vec::new(), is_known_utf8: true } - } - - /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes. - #[inline] - pub fn with_capacity(capacity: usize) -> Wtf8Buf { - Wtf8Buf { bytes: Vec::with_capacity(capacity), is_known_utf8: true } - } - - /// Creates a WTF-8 string from a WTF-8 byte vec. - /// - /// Since the byte vec is not checked for valid WTF-8, this functions is - /// marked unsafe. - #[inline] - pub unsafe fn from_bytes_unchecked(value: Vec) -> Wtf8Buf { - Wtf8Buf { bytes: value, is_known_utf8: false } - } - - /// Creates a WTF-8 string from a UTF-8 `String`. - /// - /// This takes ownership of the `String` and does not copy. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. - #[inline] - pub fn from_string(string: String) -> Wtf8Buf { - Wtf8Buf { bytes: string.into_bytes(), is_known_utf8: true } - } - - /// Creates a WTF-8 string from a UTF-8 `&str` slice. - /// - /// This copies the content of the slice. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. - #[inline] - pub fn from_str(str: &str) -> Wtf8Buf { - Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()), is_known_utf8: true } - } - - pub fn clear(&mut self) { - self.bytes.clear(); - self.is_known_utf8 = true; - } - - /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. - /// - /// This is lossless: calling `.encode_wide()` on the resulting string - /// will always return the original code units. - pub fn from_wide(v: &[u16]) -> Wtf8Buf { - let mut string = Wtf8Buf::with_capacity(v.len()); - for item in char::decode_utf16(v.iter().cloned()) { - match item { - Ok(ch) => string.push_char(ch), - Err(surrogate) => { - let surrogate = surrogate.unpaired_surrogate(); - // Surrogates are known to be in the code point range. - let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; - // The string will now contain an unpaired surrogate. - string.is_known_utf8 = false; - // Skip the WTF-8 concatenation check, - // surrogate pairs are already decoded by decode_utf16 - string.push_code_point_unchecked(code_point); - } - } - } - string - } - - /// Copied from String::push - /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check. - fn push_code_point_unchecked(&mut self, code_point: CodePoint) { - let mut bytes = [0; 4]; - let bytes = encode_utf8_raw(code_point.value, &mut bytes); - self.bytes.extend_from_slice(bytes) - } - - #[inline] - pub fn as_slice(&self) -> &Wtf8 { - unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } - } - - #[inline] - pub fn as_mut_slice(&mut self) -> &mut Wtf8 { - // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would - // cause them to change from well-formed UTF-8 to ill-formed UTF-8, - // which would break the assumptions of the `is_known_utf8` field. - unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } - } - - /// Reserves capacity for at least `additional` more bytes to be inserted - /// in the given `Wtf8Buf`. - /// The collection may reserve more space to avoid frequent reallocations. - /// - /// # Panics - /// - /// Panics if the new capacity overflows `usize`. - #[inline] - pub fn reserve(&mut self, additional: usize) { - self.bytes.reserve(additional) - } - - /// Tries to reserve capacity for at least `additional` more length units - /// in the given `Wtf8Buf`. The `Wtf8Buf` may reserve more space to avoid - /// frequent reallocations. After calling `try_reserve`, capacity will be - /// greater than or equal to `self.len() + additional`. Does nothing if - /// capacity is already sufficient. This method preserves the contents even - /// if an error occurs. - /// - /// # Errors - /// - /// If the capacity overflows, or the allocator reports a failure, then an error - /// is returned. - #[inline] - pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.bytes.try_reserve(additional) - } - - #[inline] - pub fn reserve_exact(&mut self, additional: usize) { - self.bytes.reserve_exact(additional) - } - - /// Tries to reserve the minimum capacity for exactly `additional` - /// length units in the given `Wtf8Buf`. After calling - /// `try_reserve_exact`, capacity will be greater than or equal to - /// `self.len() + additional` if it returns `Ok(())`. - /// Does nothing if the capacity is already sufficient. - /// - /// Note that the allocator may give the `Wtf8Buf` more space than it - /// requests. Therefore, capacity can not be relied upon to be precisely - /// minimal. Prefer [`try_reserve`] if future insertions are expected. - /// - /// [`try_reserve`]: Wtf8Buf::try_reserve - /// - /// # Errors - /// - /// If the capacity overflows, or the allocator reports a failure, then an error - /// is returned. - #[inline] - pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { - self.bytes.try_reserve_exact(additional) - } - - #[inline] - pub fn shrink_to_fit(&mut self) { - self.bytes.shrink_to_fit() - } - - #[inline] - pub fn shrink_to(&mut self, min_capacity: usize) { - self.bytes.shrink_to(min_capacity) - } - - #[inline] - pub fn leak<'a>(self) -> &'a mut Wtf8 { - unsafe { Wtf8::from_mut_bytes_unchecked(self.bytes.leak()) } - } - - /// Returns the number of bytes that this string buffer can hold without reallocating. - #[inline] - pub fn capacity(&self) -> usize { - self.bytes.capacity() - } - - /// Append a UTF-8 slice at the end of the string. - #[inline] - pub fn push_str(&mut self, other: &str) { - self.bytes.extend_from_slice(other.as_bytes()) - } - - /// Append a WTF-8 slice at the end of the string. - /// - /// This replaces newly paired surrogates at the boundary - /// with a supplementary code point, - /// like concatenating ill-formed UTF-16 strings effectively would. - #[inline] - pub fn push_wtf8(&mut self, other: &Wtf8) { - match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { - // Replace newly paired surrogates by a supplementary code point. - (Some(lead), Some(trail)) => { - let len_without_lead_surrogate = self.len() - 3; - self.bytes.truncate(len_without_lead_surrogate); - let other_without_trail_surrogate = &other.bytes[3..]; - // 4 bytes for the supplementary code point - self.bytes.reserve(4 + other_without_trail_surrogate.len()); - self.push_char(decode_surrogate_pair(lead, trail)); - self.bytes.extend_from_slice(other_without_trail_surrogate); - } - _ => { - // If we'll be pushing a string containing a surrogate, we may - // no longer have UTF-8. - if other.next_surrogate(0).is_some() { - self.is_known_utf8 = false; - } - - self.bytes.extend_from_slice(&other.bytes); - } - } - } - - /// Append a Unicode scalar value at the end of the string. - #[inline] - pub fn push_char(&mut self, c: char) { - self.push_code_point_unchecked(CodePoint::from_char(c)) - } - - /// Append a code point at the end of the string. - /// - /// This replaces newly paired surrogates at the boundary - /// with a supplementary code point, - /// like concatenating ill-formed UTF-16 strings effectively would. - #[inline] - pub fn push(&mut self, code_point: CodePoint) { - if let Some(trail) = code_point.to_trail_surrogate() { - if let Some(lead) = (&*self).final_lead_surrogate() { - let len_without_lead_surrogate = self.len() - 3; - self.bytes.truncate(len_without_lead_surrogate); - self.push_char(decode_surrogate_pair(lead, trail)); - return; - } - - // We're pushing a trailing surrogate. - self.is_known_utf8 = false; - } else if code_point.to_lead_surrogate().is_some() { - // We're pushing a leading surrogate. - self.is_known_utf8 = false; - } - - // No newly paired surrogates at the boundary. - self.push_code_point_unchecked(code_point) - } - - /// Shortens a string to the specified length. - /// - /// # Panics - /// - /// Panics if `new_len` > current length, - /// or if `new_len` is not a code point boundary. - #[inline] - pub fn truncate(&mut self, new_len: usize) { - assert!(is_code_point_boundary(self, new_len)); - self.bytes.truncate(new_len) - } - - /// Consumes the WTF-8 string and tries to convert it to a vec of bytes. - #[inline] - pub fn into_bytes(self) -> Vec { - self.bytes - } - - /// Consumes the WTF-8 string and tries to convert it to UTF-8. - /// - /// This does not copy the data. - /// - /// If the contents are not well-formed UTF-8 - /// (that is, if the string contains surrogates), - /// the original WTF-8 string is returned instead. - pub fn into_string(self) -> Result { - if self.is_known_utf8 || self.next_surrogate(0).is_none() { - Ok(unsafe { String::from_utf8_unchecked(self.bytes) }) - } else { - Err(self) - } - } - - /// Consumes the WTF-8 string and converts it lossily to UTF-8. - /// - /// This does not copy the data (but may overwrite parts of it in place). - /// - /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) - pub fn into_string_lossy(mut self) -> String { - // Fast path: If we already have UTF-8, we can return it immediately. - if self.is_known_utf8 { - return unsafe { String::from_utf8_unchecked(self.bytes) }; - } - - let mut pos = 0; - loop { - match self.next_surrogate(pos) { - Some((surrogate_pos, _)) => { - pos = surrogate_pos + 3; - self.bytes[surrogate_pos..pos] - .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); - } - None => return unsafe { String::from_utf8_unchecked(self.bytes) }, - } - } - } - - /// Converts this `Wtf8Buf` into a boxed `Wtf8`. - #[inline] - pub fn into_box(self) -> Box { - // SAFETY: relies on `Wtf8` being `repr(transparent)`. - unsafe { mem::transmute(self.bytes.into_boxed_slice()) } - } - - /// Converts a `Box` into a `Wtf8Buf`. - pub fn from_box(boxed: Box) -> Wtf8Buf { - let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; - Wtf8Buf { bytes: bytes.into_vec(), is_known_utf8: false } - } - - /// Provides plumbing to core `Vec::extend_from_slice`. - /// More well behaving alternative to allowing outer types - /// full mutable access to the core `Vec`. - #[inline] - pub(crate) fn extend_from_slice(&mut self, other: &[u8]) { - self.bytes.extend_from_slice(other); - self.is_known_utf8 = false; - } -} - -/// Creates a new WTF-8 string from an iterator of code points. -/// -/// This replaces surrogate code point pairs with supplementary code points, -/// like concatenating ill-formed UTF-16 strings effectively would. -impl FromIterator for Wtf8Buf { - fn from_iter>(iter: T) -> Wtf8Buf { - let mut string = Wtf8Buf::new(); - string.extend(iter); - string - } -} - -/// Append code points from an iterator to the string. -/// -/// This replaces surrogate code point pairs with supplementary code points, -/// like concatenating ill-formed UTF-16 strings effectively would. -impl Extend for Wtf8Buf { - fn extend>(&mut self, iter: T) { - let iterator = iter.into_iter(); - let (low, _high) = iterator.size_hint(); - // Lower bound of one byte per code point (ASCII only) - self.bytes.reserve(low); - iterator.for_each(move |code_point| self.push(code_point)); - } - - #[inline] - fn extend_one(&mut self, code_point: CodePoint) { - self.push(code_point); - } - - #[inline] - fn extend_reserve(&mut self, additional: usize) { - // Lower bound of one byte per code point (ASCII only) - self.bytes.reserve(additional); - } -} - -/// A borrowed slice of well-formed WTF-8 data. -/// -/// Similar to `&str`, but can additionally contain surrogate code points -/// if they’re not in a surrogate pair. -#[derive(Eq, Ord, PartialEq, PartialOrd)] -#[repr(transparent)] -pub struct Wtf8 { - bytes: [u8], -} - -impl AsInner<[u8]> for Wtf8 { - #[inline] - fn as_inner(&self) -> &[u8] { - &self.bytes - } -} - -/// Format the slice with double quotes, -/// and surrogates as `\u` followed by four hexadecimal digits. -/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] -impl fmt::Debug for Wtf8 { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result { - use crate::fmt::Write; - for c in s.chars().flat_map(|c| c.escape_debug()) { - f.write_char(c)? - } - Ok(()) - } - - formatter.write_str("\"")?; - let mut pos = 0; - while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) { - write_str_escaped(formatter, unsafe { - str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) - })?; - write!(formatter, "\\u{{{:x}}}", surrogate)?; - pos = surrogate_pos + 3; - } - write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; - formatter.write_str("\"") - } -} - -impl fmt::Display for Wtf8 { - fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { - let wtf8_bytes = &self.bytes; - let mut pos = 0; - loop { - match self.next_surrogate(pos) { - Some((surrogate_pos, _)) => { - formatter.write_str(unsafe { - str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) - })?; - formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?; - pos = surrogate_pos + 3; - } - None => { - let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) }; - if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) } - } - } - } - } -} - -impl Wtf8 { - /// Creates a WTF-8 slice from a UTF-8 `&str` slice. - /// - /// Since WTF-8 is a superset of UTF-8, this always succeeds. - #[inline] - pub fn from_str(value: &str) -> &Wtf8 { - unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } - } - - /// Creates a WTF-8 slice from a WTF-8 byte slice. - /// - /// Since the byte slice is not checked for valid WTF-8, this functions is - /// marked unsafe. - #[inline] - pub unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { - // SAFETY: start with &[u8], end with fancy &[u8] - unsafe { &*(value as *const [u8] as *const Wtf8) } - } - - /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice. - /// - /// Since the byte slice is not checked for valid WTF-8, this functions is - /// marked unsafe. - #[inline] - unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { - // SAFETY: start with &mut [u8], end with fancy &mut [u8] - unsafe { &mut *(value as *mut [u8] as *mut Wtf8) } - } - - /// Returns the length, in WTF-8 bytes. - #[inline] - pub fn len(&self) -> usize { - self.bytes.len() - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.bytes.is_empty() - } - - /// Returns the code point at `position` if it is in the ASCII range, - /// or `b'\xFF'` otherwise. - /// - /// # Panics - /// - /// Panics if `position` is beyond the end of the string. - #[inline] - pub fn ascii_byte_at(&self, position: usize) -> u8 { - match self.bytes[position] { - ascii_byte @ 0x00..=0x7F => ascii_byte, - _ => 0xFF, - } - } - - /// Returns an iterator for the string’s code points. - #[inline] - pub fn code_points(&self) -> Wtf8CodePoints<'_> { - Wtf8CodePoints { bytes: self.bytes.iter() } - } - - /// Access raw bytes of WTF-8 data - #[inline] - pub fn as_bytes(&self) -> &[u8] { - &self.bytes - } - - /// Tries to convert the string to UTF-8 and return a `&str` slice. - /// - /// Returns `None` if the string contains surrogates. - /// - /// This does not copy the data. - #[inline] - pub fn as_str(&self) -> Result<&str, str::Utf8Error> { - str::from_utf8(&self.bytes) - } - - /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`. - pub fn to_owned(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_vec(), is_known_utf8: false } - } - - /// Lossily converts the string to UTF-8. - /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. - /// - /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). - /// - /// This only copies the data if necessary (if it contains any surrogate). - pub fn to_string_lossy(&self) -> Cow<'_, str> { - let surrogate_pos = match self.next_surrogate(0) { - None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), - Some((pos, _)) => pos, - }; - let wtf8_bytes = &self.bytes; - let mut utf8_bytes = Vec::with_capacity(self.len()); - utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); - utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); - let mut pos = surrogate_pos + 3; - loop { - match self.next_surrogate(pos) { - Some((surrogate_pos, _)) => { - utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); - utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); - pos = surrogate_pos + 3; - } - None => { - utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); - return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); - } - } - } - } - - /// Converts the WTF-8 string to potentially ill-formed UTF-16 - /// and return an iterator of 16-bit code units. - /// - /// This is lossless: - /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units - /// would always return the original WTF-8 string. - #[inline] - pub fn encode_wide(&self) -> EncodeWide<'_> { - EncodeWide { code_points: self.code_points(), extra: 0 } - } - - #[inline] - fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { - let mut iter = self.bytes[pos..].iter(); - loop { - let b = *iter.next()?; - if b < 0x80 { - pos += 1; - } else if b < 0xE0 { - iter.next(); - pos += 2; - } else if b == 0xED { - match (iter.next(), iter.next()) { - (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { - return Some((pos, decode_surrogate(b2, b3))); - } - _ => pos += 3, - } - } else if b < 0xF0 { - iter.next(); - iter.next(); - pos += 3; - } else { - iter.next(); - iter.next(); - iter.next(); - pos += 4; - } - } - } - - #[inline] - fn final_lead_surrogate(&self) -> Option { - match self.bytes { - [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), - _ => None, - } - } - - #[inline] - fn initial_trail_surrogate(&self) -> Option { - match self.bytes { - [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)), - _ => None, - } - } - - pub fn clone_into(&self, buf: &mut Wtf8Buf) { - buf.is_known_utf8 = false; - self.bytes.clone_into(&mut buf.bytes); - } - - /// Boxes this `Wtf8`. - #[inline] - pub fn into_box(&self) -> Box { - let boxed: Box<[u8]> = self.bytes.into(); - unsafe { mem::transmute(boxed) } - } - - /// Creates a boxed, empty `Wtf8`. - pub fn empty_box() -> Box { - let boxed: Box<[u8]> = Default::default(); - unsafe { mem::transmute(boxed) } - } - - #[inline] - pub fn into_arc(&self) -> Arc { - let arc: Arc<[u8]> = Arc::from(&self.bytes); - unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } - } - - #[inline] - pub fn into_rc(&self) -> Rc { - let rc: Rc<[u8]> = Rc::from(&self.bytes); - unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } - } - - #[inline] - pub fn make_ascii_lowercase(&mut self) { - self.bytes.make_ascii_lowercase() - } - - #[inline] - pub fn make_ascii_uppercase(&mut self) { - self.bytes.make_ascii_uppercase() - } - - #[inline] - pub fn to_ascii_lowercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_lowercase(), is_known_utf8: false } - } - - #[inline] - pub fn to_ascii_uppercase(&self) -> Wtf8Buf { - Wtf8Buf { bytes: self.bytes.to_ascii_uppercase(), is_known_utf8: false } - } - - #[inline] - pub fn is_ascii(&self) -> bool { - self.bytes.is_ascii() - } - - #[inline] - pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { - self.bytes.eq_ignore_ascii_case(&other.bytes) - } -} - -/// Returns a slice of the given string for the byte range \[`begin`..`end`). -/// -/// # Panics -/// -/// Panics when `begin` and `end` do not point to code point boundaries, -/// or point beyond the end of the string. -impl ops::Index> for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, range: ops::Range) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if range.start <= range.end - && is_code_point_boundary(self, range.start) - && is_code_point_boundary(self, range.end) - { - unsafe { slice_unchecked(self, range.start, range.end) } - } else { - slice_error_fail(self, range.start, range.end) - } - } -} - -/// Returns a slice of the given string from byte `begin` to its end. -/// -/// # Panics -/// -/// Panics when `begin` is not at a code point boundary, -/// or is beyond the end of the string. -impl ops::Index> for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, range: ops::RangeFrom) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.start) { - unsafe { slice_unchecked(self, range.start, self.len()) } - } else { - slice_error_fail(self, range.start, self.len()) - } - } -} - -/// Returns a slice of the given string from its beginning to byte `end`. -/// -/// # Panics -/// -/// Panics when `end` is not at a code point boundary, -/// or is beyond the end of the string. -impl ops::Index> for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, range: ops::RangeTo) -> &Wtf8 { - // is_code_point_boundary checks that the index is in [0, .len()] - if is_code_point_boundary(self, range.end) { - unsafe { slice_unchecked(self, 0, range.end) } - } else { - slice_error_fail(self, 0, range.end) - } - } -} - -impl ops::Index for Wtf8 { - type Output = Wtf8; - - #[inline] - fn index(&self, _range: ops::RangeFull) -> &Wtf8 { - self - } -} - -#[inline] -fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { - // The first byte is assumed to be 0xED - 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F -} - -#[inline] -fn decode_surrogate_pair(lead: u16, trail: u16) -> char { - let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32); - unsafe { char::from_u32_unchecked(code_point) } -} - -/// Copied from str::is_char_boundary -#[inline] -pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { - if index == 0 { - return true; - } - match slice.bytes.get(index) { - None => index == slice.len(), - Some(&b) => (b as i8) >= -0x40, - } -} - -/// Verify that `index` is at the edge of either a valid UTF-8 codepoint -/// (i.e. a codepoint that's not a surrogate) or of the whole string. -/// -/// These are the cases currently permitted by `OsStr::slice_encoded_bytes`. -/// Splitting between surrogates is valid as far as WTF-8 is concerned, but -/// we do not permit it in the public API because WTF-8 is considered an -/// implementation detail. -#[track_caller] -#[inline] -pub fn check_utf8_boundary(slice: &Wtf8, index: usize) { - if index == 0 { - return; - } - match slice.bytes.get(index) { - Some(0xED) => (), // Might be a surrogate - Some(&b) if (b as i8) >= -0x40 => return, - Some(_) => panic!("byte index {index} is not a codepoint boundary"), - None if index == slice.len() => return, - None => panic!("byte index {index} is out of bounds"), - } - if slice.bytes[index + 1] >= 0xA0 { - // There's a surrogate after index. Now check before index. - if index >= 3 && slice.bytes[index - 3] == 0xED && slice.bytes[index - 2] >= 0xA0 { - panic!("byte index {index} lies between surrogate codepoints"); - } - } -} - -/// Copied from core::str::raw::slice_unchecked -#[inline] -pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { - // SAFETY: memory layout of a &[u8] and &Wtf8 are the same - unsafe { - let len = end - begin; - let start = s.as_bytes().as_ptr().add(begin); - Wtf8::from_bytes_unchecked(slice::from_raw_parts(start, len)) - } -} - -/// Copied from core::str::raw::slice_error_fail -#[inline(never)] -pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { - assert!(begin <= end); - panic!("index {begin} and/or {end} in `{s:?}` do not lie on character boundary"); -} - -/// Iterator for the code points of a WTF-8 string. -/// -/// Created with the method `.code_points()`. -#[derive(Clone)] -pub struct Wtf8CodePoints<'a> { - bytes: slice::Iter<'a, u8>, -} - -impl<'a> Iterator for Wtf8CodePoints<'a> { - type Item = CodePoint; - - #[inline] - fn next(&mut self) -> Option { - // SAFETY: `self.bytes` has been created from a WTF-8 string - unsafe { next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) } - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let len = self.bytes.len(); - (len.saturating_add(3) / 4, Some(len)) - } -} - -/// Generates a wide character sequence for potentially ill-formed UTF-16. -#[stable(feature = "rust1", since = "1.0.0")] -#[derive(Clone)] -pub struct EncodeWide<'a> { - code_points: Wtf8CodePoints<'a>, - extra: u16, -} - -// Copied from libunicode/u_str.rs -#[stable(feature = "rust1", since = "1.0.0")] -impl<'a> Iterator for EncodeWide<'a> { - type Item = u16; - - #[inline] - fn next(&mut self) -> Option { - if self.extra != 0 { - let tmp = self.extra; - self.extra = 0; - return Some(tmp); - } - - let mut buf = [0; 2]; - self.code_points.next().map(|code_point| { - let n = encode_utf16_raw(code_point.value, &mut buf).len(); - if n == 2 { - self.extra = buf[1]; - } - buf[0] - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.code_points.size_hint(); - let ext = (self.extra != 0) as usize; - // every code point gets either one u16 or two u16, - // so this iterator is between 1 or 2 times as - // long as the underlying iterator. - (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext))) - } -} - -#[stable(feature = "encode_wide_fused_iterator", since = "1.62.0")] -impl FusedIterator for EncodeWide<'_> {} - -impl Hash for CodePoint { - #[inline] - fn hash(&self, state: &mut H) { - self.value.hash(state) - } -} - -impl Hash for Wtf8Buf { - #[inline] - fn hash(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) - } -} - -impl Hash for Wtf8 { - #[inline] - fn hash(&self, state: &mut H) { - state.write(&self.bytes); - 0xfeu8.hash(state) - } -} - -#[unstable(feature = "clone_to_uninit", issue = "126799")] -unsafe impl CloneToUninit for Wtf8 { - #[inline] - #[cfg_attr(debug_assertions, track_caller)] - unsafe fn clone_to_uninit(&self, dst: *mut Self) { - // SAFETY: we're just a wrapper around [u8] - unsafe { self.bytes.clone_to_uninit(addr_of_mut!((*dst).bytes)) } - } -} diff --git a/src/tools/tidy/src/pal.rs b/src/tools/tidy/src/pal.rs index c650fd0eec6d8..ed68890a1ef88 100644 --- a/src/tools/tidy/src/pal.rs +++ b/src/tools/tidy/src/pal.rs @@ -50,6 +50,8 @@ const EXCEPTION_PATHS: &[&str] = &[ "library/core/src/ffi/va_list.rs", // We placed a linkage against Windows libraries here "library/core/src/ffi/mod.rs", + "library/core/src/ffi/os_str.rs", + "library/alloc/src/ffi/os_str.rs", "library/std/src/sys", // Platform-specific code for std lives here. "library/std/src/os", // Platform-specific public interfaces // Temporary `std` exceptions