potential_utf/
ustr.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "alloc")]
6use alloc::boxed::Box;
7use core::cmp::Ordering;
8use core::fmt;
9use core::ops::Deref;
10
/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant.
///
/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For
/// example, strings that are keys of a map don't need to ever be reified as `str`s.
///
/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`].
///
/// The main advantage of this type over `[u8]` is that it serializes as a string in
/// human-readable formats like JSON.
///
/// # Examples
///
/// Using a [`PotentialUtf8`] as the key of a [`ZeroMap`]:
///
/// ```
/// use potential_utf::PotentialUtf8;
/// use zerovec::ZeroMap;
///
/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation.
/// let map: ZeroMap<PotentialUtf8, u8> = [
///     (PotentialUtf8::from_bytes(b"abc"), 11),
///     (PotentialUtf8::from_bytes(b"def"), 22),
///     (PotentialUtf8::from_bytes(b"ghi"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key = "abc";
/// let value = map.get_copied(PotentialUtf8::from_str(key));
/// assert_eq!(Some(11), value);
/// ```
///
/// [`ZeroMap`]: zerovec::ZeroMap
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf8(pub [u8]);

/// Formats like a `str` when the bytes are valid UTF-8, and like a `[u8]` otherwise.
impl fmt::Debug for PotentialUtf8 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a string if possible
        match self.try_as_str() {
            Ok(s) => fmt::Debug::fmt(s, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PotentialUtf8 {
    /// Create a [`PotentialUtf8`] from a byte slice.
    ///
    /// No validation is performed and nothing is copied; the result borrows `other`.
    #[inline]
    pub const fn from_bytes(other: &[u8]) -> &Self {
        // SAFETY: `PotentialUtf8` is `#[repr(transparent)]` over `[u8]`, so both pointer
        // types share layout and slice metadata. The returned reference is tied to the
        // lifetime of `other` by the function signature.
        unsafe { &*(other as *const [u8] as *const Self) }
    }

    /// Create a [`PotentialUtf8`] from a string slice.
    ///
    /// The result views the UTF-8 bytes of `s` without copying them.
    #[inline]
    pub const fn from_str(s: &str) -> &Self {
        Self::from_bytes(s.as_bytes())
    }

    /// Create a [`PotentialUtf8`] from boxed bytes.
    ///
    /// The existing allocation is reused; no validation is performed.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> {
        // SAFETY: `PotentialUtf8` is `#[repr(transparent)]` over `[u8]`, so the allocation
        // owned by `other` has exactly the layout of a `PotentialUtf8`. Ownership is handed
        // from the old box to the new one without an intermediate drop.
        unsafe { Box::from_raw(Box::into_raw(other) as *mut Self) }
    }

    /// Create a [`PotentialUtf8`] from a boxed `str`.
    ///
    /// The existing allocation is reused.
    #[inline]
    #[cfg(feature = "alloc")]
    pub fn from_boxed_str(other: Box<str>) -> Box<Self> {
        Self::from_boxed_bytes(other.into_boxed_bytes())
    }

    /// Get the bytes from a [`PotentialUtf8`].
    #[inline]
    pub const fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Attempt to convert a [`PotentialUtf8`] to a `str`.
    ///
    /// # Errors
    ///
    /// Returns the [`core::str::Utf8Error`] reported by [`core::str::from_utf8`] if the
    /// bytes are not valid UTF-8.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialUtf8;
    ///
    /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc");
    ///
    /// let b = A.try_as_str().unwrap();
    /// assert_eq!(b, "abc");
    /// ```
    // Note: `core::str::from_utf8` is const starting in Rust 1.63, so this method could
    // become a `const fn` if the crate's minimum supported Rust version permits.
    #[inline]
    pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> {
        core::str::from_utf8(&self.0)
    }
}
112
113impl<'a> From<&'a str> for &'a PotentialUtf8 {
114    #[inline]
115    fn from(other: &'a str) -> Self {
116        PotentialUtf8::from_str(other)
117    }
118}
119
120impl PartialEq<str> for PotentialUtf8 {
121    fn eq(&self, other: &str) -> bool {
122        self.eq(Self::from_str(other))
123    }
124}
125
126impl PartialOrd<str> for PotentialUtf8 {
127    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
128        self.partial_cmp(Self::from_str(other))
129    }
130}
131
132impl PartialEq<PotentialUtf8> for str {
133    fn eq(&self, other: &PotentialUtf8) -> bool {
134        PotentialUtf8::from_str(self).eq(other)
135    }
136}
137
138impl PartialOrd<PotentialUtf8> for str {
139    fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> {
140        PotentialUtf8::from_str(self).partial_cmp(other)
141    }
142}
143
/// Converts a boxed `str` into a boxed [`PotentialUtf8`] by way of its boxed bytes.
#[cfg(feature = "alloc")]
impl From<Box<str>> for Box<PotentialUtf8> {
    #[inline]
    fn from(boxed: Box<str>) -> Self {
        PotentialUtf8::from_boxed_bytes(boxed.into_boxed_bytes())
    }
}
151
152impl Deref for PotentialUtf8 {
153    type Target = [u8];
154    fn deref(&self) -> &Self::Target {
155        &self.0
156    }
157}
158
/// Allows [`PotentialUtf8`] to be used as a key or value type of a `ZeroMap`, stored in
/// a `VarZeroVec`.
///
/// This impl requires enabling both the optional `zerovec` and `alloc` Cargo features
#[cfg(all(feature = "zerovec", feature = "alloc"))]
impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 {
    type Container = zerovec::VarZeroVec<'a, PotentialUtf8>;
    type Slice = zerovec::VarZeroSlice<PotentialUtf8>;
    type GetType = PotentialUtf8;
    type OwnedType = Box<PotentialUtf8>;
}
167
// Safety (following the safety checklist on the VarULE trait):
//  1. No uninitialized or padding bytes: PotentialUtf8 is transparent over a slice of a ULE type.
//  2. Alignment is 1 byte: PotentialUtf8 is transparent over a slice of a ULE type.
//  3. `validate_bytes()` would have to reject invalid bytes, but every byte is valid, so it
//     never needs to fail.
//  4. `validate_bytes()` would have to reject slices that cannot be used in full, but every
//     length is usable, so it never needs to fail.
//  5. `from_bytes_unchecked()` returns a reference to the same data (the argument is
//     reinterpreted as-is).
//  6. All other methods are left at their defaults.
//  7. Byte equality of `[T]` is semantic equality (transparent over a ULE).
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::VarULE for PotentialUtf8 {
    /// Accepts every byte slice: no validation is done on the contents.
    #[inline]
    fn validate_bytes(_bytes: &[u8]) -> Result<(), zerovec::ule::UleError> {
        Ok(())
    }

    /// Reinterprets `bytes` as a [`PotentialUtf8`] without copying.
    #[inline]
    unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
        Self::from_bytes(bytes)
    }
}
188
/// Serializes as a string for human-readable serializers and as bytes otherwise.
///
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialUtf8 {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // UTF-8 is checked before the format is inspected, so invalid UTF-8 is reported
        // as an error for both human-readable and binary serializers.
        let text = match self.try_as_str() {
            Ok(text) => text,
            Err(_) => return Err(S::Error::custom("invalid UTF-8 in PotentialUtf8")),
        };
        if !serializer.is_human_readable() {
            return serializer.serialize_bytes(text.as_bytes());
        }
        serializer.serialize_str(text)
    }
}
207
/// Deserializes an owned [`PotentialUtf8`]: from a string for human-readable
/// deserializers, and from a byte sequence (without UTF-8 validation) otherwise.
///
/// This impl requires enabling both the optional `serde` and `alloc` Cargo features
#[cfg(all(feature = "serde", feature = "alloc"))]
impl<'de> serde::Deserialize<'de> for Box<PotentialUtf8> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let boxed_str = Box::<str>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_boxed_str(boxed_str))
        } else {
            let boxed_bytes = Box::<[u8]>::deserialize(deserializer)?;
            Ok(PotentialUtf8::from_boxed_bytes(boxed_bytes))
        }
    }
}
224
/// Deserializes a borrowed [`PotentialUtf8`]: from a `&str` for human-readable
/// deserializers, and from a `&[u8]` (without UTF-8 validation) otherwise. The result
/// borrows from the deserializer input (`'de: 'a`).
///
/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a PotentialUtf8
where
    'de: 'a,
{
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if !deserializer.is_human_readable() {
            let raw = <&[u8]>::deserialize(deserializer)?;
            return Ok(PotentialUtf8::from_bytes(raw));
        }
        let text = <&str>::deserialize(deserializer)?;
        Ok(PotentialUtf8::from_str(text))
    }
}
244
/// A `u16` slice that is expected to be a UTF-16 string but does not enforce that invariant.
///
/// The `Debug` impl decodes the code units as UTF-16 and prints the resulting characters;
/// unpaired surrogates are printed as `\0x` followed by their lowercase hex value.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
pub struct PotentialUtf16(pub [u16]);

impl fmt::Debug for PotentialUtf16 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Decode code unit by code unit: well-formed characters are written as-is
        // (unquoted, unescaped), and each unpaired surrogate is written as `\0x<hex>`.
        for c in char::decode_utf16(self.0.iter().copied()) {
            match c {
                Ok(c) => write!(f, "{c}")?,
                Err(e) => write!(f, "\\0x{:x}", e.unpaired_surrogate())?,
            }
        }
        Ok(())
    }
}

impl PotentialUtf16 {
    /// Create a [`PotentialUtf16`] from a u16 slice.
    ///
    /// No validation is performed and nothing is copied; the result borrows `other`.
    #[inline]
    pub const fn from_slice(other: &[u16]) -> &Self {
        // SAFETY: `PotentialUtf16` is `#[repr(transparent)]` over `[u16]`, so both pointer
        // types share layout and slice metadata. The returned reference is tied to the
        // lifetime of `other` by the function signature.
        unsafe { &*(other as *const [u16] as *const Self) }
    }
}
270}