icu_collections/codepointinvliststringlist/
mod.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! This module provides functionality for querying of sets of Unicode code points and strings.
6//!
7//! It depends on [`CodePointInversionList`] to efficiently represent Unicode code points, while
8//! it also maintains a list of strings in the set.
9//!
10//! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html).
11
12#[cfg(feature = "alloc")]
13use crate::codepointinvlist::CodePointInversionListBuilder;
14use crate::codepointinvlist::{CodePointInversionList, CodePointInversionListULE};
15#[cfg(feature = "alloc")]
16use alloc::string::{String, ToString};
17#[cfg(feature = "alloc")]
18use alloc::vec::Vec;
19use displaydoc::Display;
20use yoke::Yokeable;
21use zerofrom::ZeroFrom;
22use zerovec::{VarZeroSlice, VarZeroVec};
23
24/// A data structure providing a concrete implementation of a set of code points and strings,
25/// using an inversion list for the code points.
26///
27/// This is what ICU4C calls a `UnicodeSet`.
28#[zerovec::make_varule(CodePointInversionListAndStringListULE)]
29#[zerovec::skip_derive(Ord)]
30#[zerovec::derive(Debug)]
31#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
32#[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))]
33// Valid to auto-derive Deserialize because the invariants are weakly held
34#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
35#[cfg_attr(feature = "serde", zerovec::derive(Serialize, Deserialize, Debug))]
36pub struct CodePointInversionListAndStringList<'data> {
37    #[cfg_attr(feature = "serde", serde(borrow))]
38    #[zerovec::varule(CodePointInversionListULE)]
39    cp_inv_list: CodePointInversionList<'data>,
40    // Invariants (weakly held):
41    //   - no input string is length 1 (a length 1 string should be a single code point)
42    //   - the string list is sorted
43    //   - the elements in the string list are unique
44    #[cfg_attr(feature = "serde", serde(borrow))]
45    str_list: VarZeroVec<'data, str>,
46}
47
48#[cfg(feature = "databake")]
49impl databake::Bake for CodePointInversionListAndStringList<'_> {
50    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
51        env.insert("icu_collections");
52        let cp_inv_list = self.cp_inv_list.bake(env);
53        let str_list = self.str_list.bake(env);
54        // Safe because our parts are safe.
55        databake::quote! {
56            icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList::from_parts_unchecked(#cp_inv_list, #str_list)
57        }
58    }
59}
60
61#[cfg(feature = "databake")]
62impl databake::BakeSize for CodePointInversionListAndStringList<'_> {
63    fn borrows_size(&self) -> usize {
64        self.cp_inv_list.borrows_size() + self.str_list.borrows_size()
65    }
66}
67
68impl<'data> CodePointInversionListAndStringList<'data> {
69    /// Returns a new [`CodePointInversionListAndStringList`] from both a [`CodePointInversionList`] for the
70    /// code points and a [`VarZeroVec`]`<`[`str`]`>` of strings.
71    pub fn try_from(
72        cp_inv_list: CodePointInversionList<'data>,
73        str_list: VarZeroVec<'data, str>,
74    ) -> Result<Self, InvalidStringList> {
75        // Verify invariants:
76        // Do so by using the equivalent of str_list.iter().windows(2) to get
77        // overlapping windows of size 2. The above putative code is not possible
78        // because `.windows()` exists on a slice, but VarZeroVec cannot return a slice
79        // because the non-fixed size elements necessitate at least some type
80        // of allocation.
81        {
82            let mut it = str_list.iter();
83            if let Some(mut x) = it.next() {
84                if x.len() == 1 {
85                    return Err(InvalidStringList::InvalidStringLength(
86                        #[cfg(feature = "alloc")]
87                        x.to_string(),
88                    ));
89                }
90                for y in it {
91                    if x.len() == 1 {
92                        return Err(InvalidStringList::InvalidStringLength(
93                            #[cfg(feature = "alloc")]
94                            x.to_string(),
95                        ));
96                    } else if x == y {
97                        return Err(InvalidStringList::StringListNotUnique(
98                            #[cfg(feature = "alloc")]
99                            x.to_string(),
100                        ));
101                    } else if x > y {
102                        return Err(InvalidStringList::StringListNotSorted(
103                            #[cfg(feature = "alloc")]
104                            x.to_string(),
105                            #[cfg(feature = "alloc")]
106                            y.to_string(),
107                        ));
108                    }
109
110                    // Next window begins. Update `x` here, `y` will be updated in next loop iteration.
111                    x = y;
112                }
113            }
114        }
115
116        Ok(CodePointInversionListAndStringList {
117            cp_inv_list,
118            str_list,
119        })
120    }
121
122    #[doc(hidden)] // databake internal
123    pub const fn from_parts_unchecked(
124        cp_inv_list: CodePointInversionList<'data>,
125        str_list: VarZeroVec<'data, str>,
126    ) -> Self {
127        CodePointInversionListAndStringList {
128            cp_inv_list,
129            str_list,
130        }
131    }
132
133    /// Returns the number of elements in this set (its cardinality).
134    /// Note than the elements of a set may include both individual
135    /// codepoints and strings.
136    pub fn size(&self) -> usize {
137        self.cp_inv_list.size() + self.str_list.len()
138    }
139
140    /// Return true if this set contains multi-code point strings or the empty string.
141    pub fn has_strings(&self) -> bool {
142        !self.str_list.is_empty()
143    }
144
145    ///
146    /// # Examples
147    /// ```
148    /// use icu::collections::codepointinvlist::CodePointInversionList;
149    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
150    /// use zerovec::VarZeroVec;
151    ///
152    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
153    /// let cp_list =
154    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
155    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
156    /// let str_list = VarZeroVec::<str>::from(str_slice);
157    ///
158    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
159    ///
160    /// assert!(cpilsl.contains_str("bmp_max"));
161    /// assert!(cpilsl.contains_str(""));
162    /// assert!(cpilsl.contains_str("A"));
163    /// assert!(cpilsl.contains_str("ቔ"));  // U+1254 ETHIOPIC SYLLABLE QHEE
164    /// assert!(!cpilsl.contains_str("bazinga!"));
165    /// ```
166    pub fn contains_str(&self, s: &str) -> bool {
167        let mut chars = s.chars();
168        if let Some(first_char) = chars.next() {
169            if chars.next().is_none() {
170                return self.contains(first_char);
171            }
172        }
173        self.str_list.binary_search(s).is_ok()
174    }
175
176    ///
177    /// # Examples
178    /// ```
179    /// use icu::collections::codepointinvlist::CodePointInversionList;
180    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
181    /// use zerovec::VarZeroVec;
182    ///
183    /// let cp_slice = &[0, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
184    /// let cp_list =
185    ///     CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
186    /// let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
187    /// let str_list = VarZeroVec::<str>::from(str_slice);
188    ///
189    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
190    ///
191    /// assert!(cpilsl.contains32(0));
192    /// assert!(cpilsl.contains32(0x0042));
193    /// assert!(!cpilsl.contains32(0x0080));
194    /// ```
195    pub fn contains32(&self, cp: u32) -> bool {
196        self.cp_inv_list.contains32(cp)
197    }
198
199    ///
200    /// # Examples
201    /// ```
202    /// use icu::collections::codepointinvlist::CodePointInversionList;
203    /// use icu::collections::codepointinvliststringlist::CodePointInversionListAndStringList;
204    /// use zerovec::VarZeroVec;
205    ///
206    /// let cp_slice = &[0, 0x1_0000, 0x10_FFFF, 0x11_0000];
207    /// let cp_list =
208    ///    CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
209    /// let str_slice = &["", "bmp_max", "unicode_max", "zero"];
210    /// let str_list = VarZeroVec::<str>::from(str_slice);
211    ///
212    /// let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
213    ///
214    /// assert!(cpilsl.contains('A'));
215    /// assert!(cpilsl.contains('ቔ'));  // U+1254 ETHIOPIC SYLLABLE QHEE
216    /// assert!(!cpilsl.contains('\u{1_0000}'));
217    /// assert!(!cpilsl.contains('🨫'));  // U+1FA2B NEUTRAL CHESS TURNED QUEEN
218    pub fn contains(&self, ch: char) -> bool {
219        self.contains32(ch as u32)
220    }
221
222    /// Access the underlying [`CodePointInversionList`].
223    pub fn code_points(&self) -> &CodePointInversionList<'data> {
224        &self.cp_inv_list
225    }
226
227    /// Access the contained strings.
228    pub fn strings(&self) -> &VarZeroSlice<str> {
229        &self.str_list
230    }
231}
232
233#[cfg(feature = "alloc")]
234impl<'a> FromIterator<&'a str> for CodePointInversionListAndStringList<'_> {
235    fn from_iter<I>(it: I) -> Self
236    where
237        I: IntoIterator<Item = &'a str>,
238    {
239        let mut builder = CodePointInversionListBuilder::new();
240        let mut strings = Vec::<&str>::new();
241        for s in it {
242            let mut chars = s.chars();
243            if let Some(first_char) = chars.next() {
244                if chars.next().is_none() {
245                    builder.add_char(first_char);
246                    continue;
247                }
248            }
249            strings.push(s);
250        }
251
252        // Ensure that the string list is sorted. If not, the binary search that
253        // is used for `.contains(&str)` will return garbage output.
254        strings.sort_unstable();
255        strings.dedup();
256
257        let cp_inv_list = builder.build();
258        let str_list = VarZeroVec::<str>::from(&strings);
259
260        CodePointInversionListAndStringList {
261            cp_inv_list,
262            str_list,
263        }
264    }
265}
266
267/// Custom Errors for [`CodePointInversionListAndStringList`].
268#[derive(Display, Debug)]
269pub enum InvalidStringList {
270    /// A string in the string list had an invalid length
271    #[cfg_attr(feature = "alloc", displaydoc("Invalid string length for string: {0}"))]
272    InvalidStringLength(#[cfg(feature = "alloc")] String),
273    /// A string in the string list appears more than once
274    #[cfg_attr(feature = "alloc", displaydoc("String list has duplicate: {0}"))]
275    StringListNotUnique(#[cfg(feature = "alloc")] String),
276    /// Two strings in the string list compare to each other opposite of sorted order
277    #[cfg_attr(
278        feature = "alloc",
279        displaydoc("Strings in string list not in sorted order: ({0}, {1})")
280    )]
281    StringListNotSorted(
282        #[cfg(feature = "alloc")] String,
283        #[cfg(feature = "alloc")] String,
284    ),
285}
286
287#[cfg(test)]
288mod tests {
289    use super::*;
290
291    #[test]
292    fn test_size_has_strings() {
293        let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
294        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
295        let str_slice = &["ascii_max", "bmp_max", "unicode_max", "zero"];
296        let str_list = VarZeroVec::<str>::from(str_slice);
297
298        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
299
300        assert!(cpilsl.has_strings());
301        assert_eq!(8, cpilsl.size());
302    }
303
304    #[test]
305    fn test_empty_string_allowed() {
306        let cp_slice = &[0, 1, 0x7F, 0x80, 0xFFFF, 0x1_0000, 0x10_FFFF, 0x11_0000];
307        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
308        let str_slice = &["", "ascii_max", "bmp_max", "unicode_max", "zero"];
309        let str_list = VarZeroVec::<str>::from(str_slice);
310
311        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list).unwrap();
312
313        assert!(cpilsl.has_strings());
314        assert_eq!(9, cpilsl.size());
315    }
316
317    #[test]
318    fn test_invalid_string() {
319        let cp_slice = &[0, 1];
320        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
321        let str_slice = &["a"];
322        let str_list = VarZeroVec::<str>::from(str_slice);
323
324        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
325
326        assert!(matches!(
327            cpilsl,
328            Err(InvalidStringList::InvalidStringLength(_))
329        ));
330    }
331
332    #[test]
333    fn test_invalid_string_list_has_duplicate() {
334        let cp_slice = &[0, 1];
335        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
336        let str_slice = &["abc", "abc"];
337        let str_list = VarZeroVec::<str>::from(str_slice);
338
339        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
340
341        assert!(matches!(
342            cpilsl,
343            Err(InvalidStringList::StringListNotUnique(_))
344        ));
345    }
346
347    #[test]
348    fn test_invalid_string_list_not_sorted() {
349        let cp_slice = &[0, 1];
350        let cp_list = CodePointInversionList::try_from_u32_inversion_list_slice(cp_slice).unwrap();
351        let str_slice = &["xyz", "abc"];
352        let str_list = VarZeroVec::<str>::from(str_slice);
353
354        let cpilsl = CodePointInversionListAndStringList::try_from(cp_list, str_list);
355
356        assert!(matches!(
357            cpilsl,
358            Err(InvalidStringList::StringListNotSorted(_, _))
359        ));
360    }
361
362    #[test]
363    fn test_from_iter_invariants() {
364        let in_strs_1 = ["a", "abc", "xyz", "abc"];
365        let in_strs_2 = ["xyz", "abc", "a", "abc"];
366
367        let cpilsl_1 = CodePointInversionListAndStringList::from_iter(in_strs_1);
368        let cpilsl_2 = CodePointInversionListAndStringList::from_iter(in_strs_2);
369
370        assert_eq!(cpilsl_1, cpilsl_2);
371
372        assert!(cpilsl_1.has_strings());
373        assert!(cpilsl_1.contains_str("abc"));
374        assert!(cpilsl_1.contains_str("xyz"));
375        assert!(!cpilsl_1.contains_str("def"));
376
377        assert_eq!(1, cpilsl_1.cp_inv_list.size());
378        assert!(cpilsl_1.contains('a'));
379        assert!(!cpilsl_1.contains('0'));
380        assert!(!cpilsl_1.contains('q'));
381
382        assert_eq!(3, cpilsl_1.size());
383    }
384}