icu_collections/codepointinvlist/
cpinvlist.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "serde")]
6use alloc::format;
7#[cfg(feature = "serde")]
8use alloc::string::String;
9#[cfg(feature = "alloc")]
10use alloc::vec::Vec;
11use core::{char, ops::RangeBounds, ops::RangeInclusive};
12use potential_utf::PotentialCodePoint;
13use yoke::Yokeable;
14use zerofrom::ZeroFrom;
15use zerovec::{ule::AsULE, zerovec, ZeroVec};
16
17use super::InvalidSetError;
18use crate::codepointinvlist::utils::{deconstruct_range, is_valid_zv};
19
20/// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0, inclusive
21const BMP_MAX: u32 = 0xFFFF;
22
23/// Represents the inversion list for a set of all code points in the Basic Multilingual Plane.
24const BMP_INV_LIST_VEC: ZeroVec<PotentialCodePoint> = zerovec!(PotentialCodePoint; PotentialCodePoint::to_unaligned; [PotentialCodePoint::from_u24(0x0), PotentialCodePoint::from_u24(BMP_MAX + 1)]);
25
26/// Represents the inversion list for all of the code points in the Unicode range.
27const ALL_VEC: ZeroVec<PotentialCodePoint> = zerovec!(PotentialCodePoint; PotentialCodePoint::to_unaligned; [PotentialCodePoint::from_u24(0x0), PotentialCodePoint::from_u24((char::MAX as u32) + 1)]);
28
29/// A membership wrapper for [`CodePointInversionList`].
30///
31/// Provides exposure to membership functions and constructors from serialized `CodePointSet`s (sets of code points)
32/// and predefined ranges.
33#[zerovec::make_varule(CodePointInversionListULE)]
34#[zerovec::skip_derive(Ord)]
35#[zerovec::derive(Debug)]
36#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
37#[cfg_attr(not(feature = "alloc"), zerovec::skip_derive(ZeroMapKV, ToOwned))]
38pub struct CodePointInversionList<'data> {
39    // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature
40    // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html
41    // Allows for traits of fixed size arrays
42
43    // Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list)
44    inv_list: ZeroVec<'data, PotentialCodePoint>,
45    size: u32,
46}
47
48#[cfg(feature = "serde")]
49impl<'de: 'a, 'a> serde::Deserialize<'de> for CodePointInversionList<'a> {
50    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
51    where
52        D: serde::Deserializer<'de>,
53    {
54        use serde::de::Error;
55
56        let parsed_inv_list = if deserializer.is_human_readable() {
57            let parsed_strings = Vec::<alloc::borrow::Cow<'de, str>>::deserialize(deserializer)?;
58            let mut inv_list = ZeroVec::new_owned(Vec::with_capacity(parsed_strings.len() * 2));
59            for range in parsed_strings {
60                fn internal(range: &str) -> Option<(u32, u32)> {
61                    let (start, range) = UnicodeCodePoint::parse(range)?;
62                    if range.is_empty() {
63                        return Some((start.0, start.0));
64                    }
65                    let (hyphen, range) = UnicodeCodePoint::parse(range)?;
66                    if hyphen.0 != '-' as u32 {
67                        return None;
68                    }
69                    let (end, range) = UnicodeCodePoint::parse(range)?;
70                    range.is_empty().then_some((start.0, end.0))
71                }
72                let (start, end) = internal(&range).ok_or_else(|| Error::custom(format!(
73                    "Cannot deserialize invalid inversion list for CodePointInversionList: {range:?}"
74                )))?;
75                inv_list.with_mut(|v| {
76                    v.push(PotentialCodePoint::from_u24(start).to_unaligned());
77                    v.push(PotentialCodePoint::from_u24(end + 1).to_unaligned());
78                });
79            }
80            inv_list
81        } else {
82            ZeroVec::<PotentialCodePoint>::deserialize(deserializer)?
83        };
84        CodePointInversionList::try_from_inversion_list(parsed_inv_list).map_err(|e| {
85            Error::custom(format!(
86                "Cannot deserialize invalid inversion list for CodePointInversionList: {e:?}"
87            ))
88        })
89    }
90}
91
92#[cfg(feature = "databake")]
93impl databake::Bake for CodePointInversionList<'_> {
94    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
95        env.insert("icu_collections");
96        let inv_list = self.inv_list.bake(env);
97        let size = self.size.bake(env);
98        // Safe because our parts are safe.
99        databake::quote! { unsafe {
100            #[allow(unused_unsafe)]
101            icu_collections::codepointinvlist::CodePointInversionList::from_parts_unchecked(#inv_list, #size)
102        }}
103    }
104}
105
106#[cfg(feature = "databake")]
107impl databake::BakeSize for CodePointInversionList<'_> {
108    fn borrows_size(&self) -> usize {
109        self.inv_list.borrows_size()
110    }
111}
112
113#[cfg(feature = "serde")]
114#[derive(Debug, Copy, Clone)]
115struct UnicodeCodePoint(u32);
116
117#[cfg(feature = "serde")]
118impl UnicodeCodePoint {
119    fn from_u32(cp: u32) -> Result<Self, String> {
120        if cp <= char::MAX as u32 {
121            Ok(Self(cp))
122        } else {
123            Err(format!("Not a Unicode code point {}", cp))
124        }
125    }
126
127    fn parse(value: &str) -> Option<(Self, &str)> {
128        Some(if let Some(hex) = value.strip_prefix("U+") {
129            let (escape, remainder) = (hex.get(..4)?, hex.get(4..)?);
130            (Self(u32::from_str_radix(escape, 16).ok()?), remainder)
131        } else {
132            let c = value.chars().next()?;
133            (Self(c as u32), value.get(c.len_utf8()..)?)
134        })
135    }
136}
137
138#[cfg(feature = "serde")]
139impl core::fmt::Display for UnicodeCodePoint {
140    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
141        match self.0 {
142            s @ 0xD800..=0xDFFF => write!(f, "U+{s:X}"),
143            // char should be in range by construction but this code is not so performance-sensitive
144            // so we just use the replacement character
145            c => write!(
146                f,
147                "{}",
148                char::from_u32(c).unwrap_or(char::REPLACEMENT_CHARACTER)
149            ),
150        }
151    }
152}
153
154#[cfg(feature = "serde")]
155impl serde::Serialize for CodePointInversionList<'_> {
156    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
157    where
158        S: serde::Serializer,
159    {
160        if serializer.is_human_readable() {
161            use serde::ser::Error;
162            use serde::ser::SerializeSeq;
163            let mut seq = serializer.serialize_seq(Some(self.inv_list.len() / 2))?;
164            for range in self.iter_ranges() {
165                let start = UnicodeCodePoint::from_u32(*range.start()).map_err(S::Error::custom)?;
166                if range.start() == range.end() {
167                    seq.serialize_element(&format!("{start}"))?;
168                } else {
169                    let end = UnicodeCodePoint::from_u32(*range.end()).map_err(S::Error::custom)?;
170                    seq.serialize_element(&format!("{start}-{end}",))?;
171                }
172            }
173            seq.end()
174        } else {
175            // Note: serde(flatten) currently does not promote a struct field of type Vec
176            // to replace the struct when serializing. The error message from the default
177            // serialization is: "can only flatten structs and maps (got a sequence)".
178            self.inv_list.serialize(serializer)
179        }
180    }
181}
182
183impl<'data> CodePointInversionList<'data> {
184    /// Returns a new [`CodePointInversionList`] from an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
185    /// represented as a [`ZeroVec`]`<`[`PotentialCodePoint`]`>` of code points.
186    ///
187    /// The inversion list must be of even length, sorted ascending non-overlapping,
188    /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
189    ///
190    /// # Examples
191    ///
192    /// ```
193    /// use icu::collections::codepointinvlist::CodePointInversionList;
194    /// use icu::collections::codepointinvlist::InvalidSetError;
195    /// use potential_utf::PotentialCodePoint;
196    /// use zerovec::ZeroVec;
197    ///
198    /// let valid = [0x0, 0x10000];
199    /// let inv_list: ZeroVec<PotentialCodePoint> = valid
200    ///     .into_iter()
201    ///     .map(PotentialCodePoint::from_u24)
202    ///     .collect();
203    /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
204    /// assert!(matches!(result, CodePointInversionList));
205    ///
206    /// let invalid = vec![0x0, 0x80, 0x3];
207    /// let inv_list: ZeroVec<PotentialCodePoint> = invalid
208    ///     .iter()
209    ///     .copied()
210    ///     .map(PotentialCodePoint::from_u24)
211    ///     .collect();
212    /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
213    /// assert!(matches!(result, Err(InvalidSetError(_))));
214    /// if let Err(InvalidSetError(actual)) = result {
215    ///     assert_eq!(
216    ///         &invalid,
217    ///         &actual.into_iter().map(u32::from).collect::<Vec<_>>()
218    ///     );
219    /// }
220    /// ```
221    pub fn try_from_inversion_list(
222        inv_list: ZeroVec<'data, PotentialCodePoint>,
223    ) -> Result<Self, InvalidSetError> {
224        #[allow(clippy::indexing_slicing)] // chunks
225        if is_valid_zv(&inv_list) {
226            let size = inv_list
227                .as_ule_slice()
228                .chunks(2)
229                .map(|end_points| {
230                    u32::from(<PotentialCodePoint as AsULE>::from_unaligned(end_points[1]))
231                        - u32::from(<PotentialCodePoint as AsULE>::from_unaligned(end_points[0]))
232                })
233                .sum::<u32>();
234            Ok(Self { inv_list, size })
235        } else {
236            Err(InvalidSetError(
237                #[cfg(feature = "alloc")]
238                inv_list.to_vec(),
239            ))
240        }
241    }
242
243    /// Safety: no actual safety invariants, however has correctness invariants
244    #[doc(hidden)] // databake internal
245    pub const unsafe fn from_parts_unchecked(
246        inv_list: ZeroVec<'data, PotentialCodePoint>,
247        size: u32,
248    ) -> Self {
249        Self { inv_list, size }
250    }
251
252    /// Returns a new, fully-owned [`CodePointInversionList`] by cloning an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
253    /// represented as a slice of [`PotentialCodePoint`] code points.
254    ///
255    /// The inversion list must be of even length, sorted ascending non-overlapping,
256    /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
257    ///
258    /// # Examples
259    ///
260    /// ```
261    /// use icu::collections::codepointinvlist::CodePointInversionList;
262    ///
263    /// let bmp_list = &[0x0, 0x10000];
264    /// let smp_list = &[0x10000, 0x20000];
265    /// let sip_list = &[0x20000, 0x30000];
266    ///
267    /// let lists: Vec<CodePointInversionList> =
268    ///     [&bmp_list[..], smp_list, sip_list]
269    ///         .into_iter()
270    ///         .map(|l| {
271    ///             CodePointInversionList::try_from_u32_inversion_list_slice(l)
272    ///                 .unwrap()
273    ///         })
274    ///         .collect();
275    ///
276    /// let bmp = &lists[0];
277    /// assert!(bmp.contains32(0xFFFF));
278    /// assert!(!bmp.contains32(0x10000));
279    ///
280    /// assert!(!lists.iter().any(|set| set.contains32(0x40000)));
281    /// ```
282    #[cfg(feature = "alloc")]
283    pub fn try_from_u32_inversion_list_slice(inv_list: &[u32]) -> Result<Self, InvalidSetError> {
284        let inv_list_zv: ZeroVec<PotentialCodePoint> = inv_list
285            .iter()
286            .copied()
287            .map(PotentialCodePoint::from_u24)
288            .collect();
289        CodePointInversionList::try_from_inversion_list(inv_list_zv)
290    }
291
292    /// Attempts to convert this list into a fully-owned one. No-op if already fully owned
293    #[cfg(feature = "alloc")]
294    pub fn into_owned(self) -> CodePointInversionList<'static> {
295        CodePointInversionList {
296            inv_list: self.inv_list.into_owned(),
297            size: self.size,
298        }
299    }
300
301    /// Returns an owned inversion list representing the current [`CodePointInversionList`]
302    #[cfg(feature = "alloc")]
303    pub fn get_inversion_list_vec(&self) -> Vec<u32> {
304        self.as_inversion_list().iter().map(u32::from).collect()
305    }
306
307    /// Returns [`CodePointInversionList`] spanning entire Unicode range
308    ///
309    /// The range spans from `0x0 -> 0x10FFFF` inclusive.
310    ///  
311    /// # Examples
312    ///
313    /// ```
314    /// use icu::collections::codepointinvlist::CodePointInversionList;
315    ///
316    /// let expected = [0x0, (char::MAX as u32) + 1];
317    /// assert_eq!(
318    ///     CodePointInversionList::all().get_inversion_list_vec(),
319    ///     expected
320    /// );
321    /// assert_eq!(
322    ///     CodePointInversionList::all().size(),
323    ///     (expected[1] - expected[0]) as usize
324    /// );
325    /// ```
326    pub fn all() -> Self {
327        Self {
328            inv_list: ALL_VEC,
329            size: (char::MAX as u32) + 1,
330        }
331    }
332
333    /// Returns [`CodePointInversionList`] spanning BMP range
334    ///
335    /// The range spans from `0x0 -> 0xFFFF` inclusive.
336    ///
337    /// # Examples
338    ///
339    /// ```
340    /// use icu::collections::codepointinvlist::CodePointInversionList;
341    ///
342    /// const BMP_MAX: u32 = 0xFFFF;
343    ///
344    /// let expected = [0x0, BMP_MAX + 1];
345    /// assert_eq!(
346    ///     CodePointInversionList::bmp().get_inversion_list_vec(),
347    ///     expected
348    /// );
349    /// assert_eq!(
350    ///     CodePointInversionList::bmp().size(),
351    ///     (expected[1] - expected[0]) as usize
352    /// );
353    /// ```
354    pub fn bmp() -> Self {
355        Self {
356            inv_list: BMP_INV_LIST_VEC,
357            size: BMP_MAX + 1,
358        }
359    }
360
361    /// Returns the inversion list as a slice
362    ///
363    /// Public only to the crate, not exposed to public
364    #[cfg(feature = "alloc")]
365    pub(crate) fn as_inversion_list(&self) -> &ZeroVec<PotentialCodePoint> {
366        &self.inv_list
367    }
368
369    /// Yields an [`Iterator`] going through the character set in the [`CodePointInversionList`]
370    ///
371    /// # Examples
372    ///
373    /// ```
374    /// use icu::collections::codepointinvlist::CodePointInversionList;
375    /// let example_list = [0x41, 0x44, 0x45, 0x46];
376    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
377    ///     &example_list,
378    /// )
379    /// .unwrap();
380    /// let mut ex_iter_chars = example.iter_chars();
381    /// assert_eq!(Some('A'), ex_iter_chars.next());
382    /// assert_eq!(Some('B'), ex_iter_chars.next());
383    /// assert_eq!(Some('C'), ex_iter_chars.next());
384    /// assert_eq!(Some('E'), ex_iter_chars.next());
385    /// assert_eq!(None, ex_iter_chars.next());
386    /// ```
387    pub fn iter_chars(&self) -> impl Iterator<Item = char> + '_ {
388        #[allow(clippy::indexing_slicing)] // chunks
389        self.inv_list
390            .as_ule_slice()
391            .chunks(2)
392            .flat_map(|pair| {
393                u32::from(PotentialCodePoint::from_unaligned(pair[0]))
394                    ..u32::from(PotentialCodePoint::from_unaligned(pair[1]))
395            })
396            .filter_map(char::from_u32)
397    }
398
399    /// Yields an [`Iterator`] returning the ranges of the code points that are
400    /// included in the [`CodePointInversionList`]
401    ///
402    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
403    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
404    /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
405    ///
406    /// # Example
407    ///
408    /// ```
409    /// use icu::collections::codepointinvlist::CodePointInversionList;
410    /// let example_list = [0x41, 0x44, 0x45, 0x46];
411    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
412    ///     &example_list,
413    /// )
414    /// .unwrap();
415    /// let mut example_iter_ranges = example.iter_ranges();
416    /// assert_eq!(Some(0x41..=0x43), example_iter_ranges.next());
417    /// assert_eq!(Some(0x45..=0x45), example_iter_ranges.next());
418    /// assert_eq!(None, example_iter_ranges.next());
419    /// ```
420    pub fn iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_ {
421        #[allow(clippy::indexing_slicing)] // chunks
422        self.inv_list.as_ule_slice().chunks(2).map(|pair| {
423            let range_start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
424            let range_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
425            range_start..=(range_limit - 1)
426        })
427    }
428
429    /// Yields an [`Iterator`] returning the ranges of the code points that are
430    /// *not* included in the [`CodePointInversionList`]
431    ///
432    /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
433    /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
434    /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
435    ///
436    /// # Example
437    ///
438    /// ```
439    /// use icu::collections::codepointinvlist::CodePointInversionList;
440    /// let example_list = [0x41, 0x44, 0x45, 0x46];
441    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
442    ///     &example_list,
443    /// )
444    /// .unwrap();
445    /// let mut example_iter_ranges = example.iter_ranges_complemented();
446    /// assert_eq!(Some(0..=0x40), example_iter_ranges.next());
447    /// assert_eq!(Some(0x44..=0x44), example_iter_ranges.next());
448    /// assert_eq!(Some(0x46..=char::MAX as u32), example_iter_ranges.next());
449    /// assert_eq!(None, example_iter_ranges.next());
450    /// ```
451    pub fn iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
452        let inv_ule = self.inv_list.as_ule_slice();
453        let middle = inv_ule.get(1..inv_ule.len() - 1).unwrap_or(&[]);
454        let beginning = if let Some(first) = self.inv_list.first() {
455            let first = u32::from(first);
456            if first == 0 {
457                None
458            } else {
459                Some(0..=first - 1)
460            }
461        } else {
462            None
463        };
464        let end = if let Some(last) = self.inv_list.last() {
465            let last = u32::from(last);
466            if last == char::MAX as u32 {
467                None
468            } else {
469                Some(last..=char::MAX as u32)
470            }
471        } else {
472            None
473        };
474        #[allow(clippy::indexing_slicing)] // chunks
475        let chunks = middle.chunks(2).map(|pair| {
476            let range_start = u32::from(PotentialCodePoint::from_unaligned(pair[0]));
477            let range_limit = u32::from(PotentialCodePoint::from_unaligned(pair[1]));
478            range_start..=(range_limit - 1)
479        });
480        beginning.into_iter().chain(chunks).chain(end)
481    }
482
483    /// Returns the number of ranges contained in this [`CodePointInversionList`]
484    pub fn get_range_count(&self) -> usize {
485        self.inv_list.len() / 2
486    }
487
488    /// Returns a specific range contained in this [`CodePointInversionList`] by index.
489    /// Intended for use in FFI.
490    pub fn get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>> {
491        let start_idx = idx * 2;
492        let end_idx = start_idx + 1;
493        let start = u32::from(self.inv_list.get(start_idx)?);
494        let end = u32::from(self.inv_list.get(end_idx)?);
495        Some(start..=(end - 1))
496    }
497
498    /// Returns the number of elements of the [`CodePointInversionList`]
499    pub fn size(&self) -> usize {
500        if self.is_empty() {
501            return 0;
502        }
503        self.size as usize
504    }
505
506    /// Returns whether or not the [`CodePointInversionList`] is empty
507    pub fn is_empty(&self) -> bool {
508        self.inv_list.is_empty()
509    }
510
511    /// Wrapper for contains
512    ///
513    /// Returns an [`Option`] as to whether or not it is possible for the query to be contained.
514    /// The value in the [`Option`] is the start index of the range that contains the query.
515    fn contains_query(&self, query: u32) -> Option<usize> {
516        let query = PotentialCodePoint::try_from(query).ok()?;
517        match self.inv_list.binary_search(&query) {
518            Ok(pos) => {
519                if pos % 2 == 0 {
520                    Some(pos)
521                } else {
522                    None
523                }
524            }
525            Err(pos) => {
526                if pos % 2 != 0 && pos < self.inv_list.len() {
527                    Some(pos - 1)
528                } else {
529                    None
530                }
531            }
532        }
533    }
534
535    /// Checks to see the query is in the [`CodePointInversionList`]
536    ///
537    /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
538    /// in the set using [`core`] implementation
539    ///
540    /// # Examples
541    ///
542    /// ```
543    /// use icu::collections::codepointinvlist::CodePointInversionList;
544    /// let example_list = [0x41, 0x43, 0x44, 0x45];
545    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
546    ///     &example_list,
547    /// )
548    /// .unwrap();
549    /// assert!(example.contains('A'));
550    /// assert!(!example.contains('C'));
551    /// ```
552    pub fn contains(&self, query: char) -> bool {
553        self.contains_query(query as u32).is_some()
554    }
555
556    /// Checks to see the unsigned int is in the [`CodePointInversionList::all()`](CodePointInversionList::all())
557    ///
558    /// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte
559    /// values, there is an important difference. A [`u32`] can take values up to
560    /// a very large integer value, while a [`prim@char`] in Rust is defined to be in
561    /// the range from 0 to the maximum valid Unicode Scalar Value.
562    ///
563    /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
564    /// in the set using [`core`] implementation
565    ///
566    /// # Examples
567    ///
568    /// ```
569    /// use icu::collections::codepointinvlist::CodePointInversionList;
570    /// let example_list = [0x41, 0x43, 0x44, 0x45];
571    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
572    ///     &example_list,
573    /// )
574    /// .unwrap();
575    /// assert!(example.contains32(0x41));
576    /// assert!(!example.contains32(0x43));
577    /// ```
578    pub fn contains32(&self, query: u32) -> bool {
579        self.contains_query(query).is_some()
580    }
581
582    /// Checks to see if the range is in the [`CodePointInversionList`]
583    ///
584    /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
585    /// in the set using [`Vec`] implementation. Only runs the search once on the `start`
586    /// parameter, while the `end` parameter is checked in a single `O(1)` step.
587    ///
588    /// # Examples
589    ///
590    /// ```
591    /// use icu::collections::codepointinvlist::CodePointInversionList;
592    /// let example_list = [0x41, 0x43, 0x44, 0x45];
593    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
594    ///     &example_list,
595    /// )
596    /// .unwrap();
597    /// assert!(example.contains_range('A'..'C'));
598    /// assert!(example.contains_range('A'..='B'));
599    /// assert!(!example.contains_range('A'..='C'));
600    /// ```
601    ///
602    /// Surrogate points (`0xD800 -> 0xDFFF`) will return [`false`] if the Range contains them but the
603    /// [`CodePointInversionList`] does not.
604    ///
605    /// Note: when comparing to ICU4C/J, keep in mind that `Range`s in Rust are
606    /// constructed inclusive of start boundary and exclusive of end boundary.
607    /// The ICU4C/J `CodePointInversionList::contains(UChar32 start, UChar32 end)` method
608    /// differs by including the end boundary.
609    ///
610    /// # Examples
611    ///
612    /// ```
613    /// use icu::collections::codepointinvlist::CodePointInversionList;
614    /// use std::char;
615    /// let check =
616    ///     char::from_u32(0xD7FE).unwrap()..char::from_u32(0xE001).unwrap();
617    /// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001];
618    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
619    ///     &example_list,
620    /// )
621    /// .unwrap();
622    /// assert!(!example.contains_range(check));
623    /// ```
624    pub fn contains_range(&self, range: impl RangeBounds<char>) -> bool {
625        let (from, till) = deconstruct_range(range);
626        if from >= till {
627            return false;
628        }
629        match self.contains_query(from) {
630            Some(pos) => {
631                if let Some(x) = self.inv_list.get(pos + 1) {
632                    (till) <= x.into()
633                } else {
634                    debug_assert!(
635                        false,
636                        "Inversion list query should not return out of bounds index"
637                    );
638                    false
639                }
640            }
641            None => false,
642        }
643    }
644
645    /// Check if the calling [`CodePointInversionList`] contains all the characters of the given [`CodePointInversionList`]
646    ///
647    /// # Examples
648    ///
649    /// ```
650    /// use icu::collections::codepointinvlist::CodePointInversionList;
651    /// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z
652    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
653    ///     &example_list,
654    /// )
655    /// .unwrap();
656    /// let a_to_d = CodePointInversionList::try_from_u32_inversion_list_slice(&[
657    ///     0x41, 0x45,
658    /// ])
659    /// .unwrap();
660    /// let f_to_t = CodePointInversionList::try_from_u32_inversion_list_slice(&[
661    ///     0x46, 0x55,
662    /// ])
663    /// .unwrap();
664    /// let r_to_x = CodePointInversionList::try_from_u32_inversion_list_slice(&[
665    ///     0x52, 0x58,
666    /// ])
667    /// .unwrap();
668    /// assert!(example.contains_set(&a_to_d)); // contains all
669    /// assert!(!example.contains_set(&f_to_t)); // contains none
670    /// assert!(!example.contains_set(&r_to_x)); // contains some
671    /// ```
672    pub fn contains_set(&self, set: &Self) -> bool {
673        if set.size() > self.size() {
674            return false;
675        }
676
677        let mut set_ranges = set.iter_ranges();
678        let mut check_elem = set_ranges.next();
679
680        let ranges = self.iter_ranges();
681        for range in ranges {
682            match check_elem {
683                Some(ref check_range) => {
684                    if check_range.start() >= range.start()
685                        && check_range.end() <= &(range.end() + 1)
686                    {
687                        check_elem = set_ranges.next();
688                    }
689                }
690                _ => break,
691            }
692        }
693        check_elem.is_none()
694    }
695
696    /// Returns the end of the initial substring where the characters are either contained/not contained
697    /// in the set.
698    ///
699    /// # Examples
700    ///
701    /// ```
702    /// use icu::collections::codepointinvlist::CodePointInversionList;
703    /// let example_list = [0x41, 0x44]; // {A, B, C}
704    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
705    ///     &example_list,
706    /// )
707    /// .unwrap();
708    /// assert_eq!(example.span("CABXYZ", true), 3);
709    /// assert_eq!(example.span("XYZC", false), 3);
710    /// assert_eq!(example.span("XYZ", true), 0);
711    /// assert_eq!(example.span("ABC", false), 0);
712    /// ```
713    pub fn span(&self, span_str: &str, contained: bool) -> usize {
714        span_str
715            .chars()
716            .take_while(|&x| self.contains(x) == contained)
717            .count()
718    }
719
720    /// Returns the start of the trailing substring (starting from end of string) where the characters are
721    /// either contained/not contained in the set. Returns the length of the string if no valid return.
722    ///
723    /// # Examples
724    ///
725    /// ```
726    /// use icu::collections::codepointinvlist::CodePointInversionList;
727    /// let example_list = [0x41, 0x44]; // {A, B, C}
728    /// let example = CodePointInversionList::try_from_u32_inversion_list_slice(
729    ///     &example_list,
730    /// )
731    /// .unwrap();
732    /// assert_eq!(example.span_back("XYZCAB", true), 3);
733    /// assert_eq!(example.span_back("ABCXYZ", true), 6);
734    /// assert_eq!(example.span_back("CABXYZ", false), 3);
735    /// ```
736    pub fn span_back(&self, span_str: &str, contained: bool) -> usize {
737        span_str.len()
738            - span_str
739                .chars()
740                .rev()
741                .take_while(|&x| self.contains(x) == contained)
742                .count()
743    }
744}
745
746#[cfg(test)]
747mod tests {
748    use super::{CodePointInversionList, InvalidSetError};
749    use std::{char, vec::Vec};
750    use zerovec::ZeroVec;
751
752    #[test]
753    fn test_codepointinversionlist_try_from_vec() {
754        let ex = vec![0x2, 0x3, 0x4, 0x5];
755        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
756        assert_eq!(ex, check.get_inversion_list_vec());
757        assert_eq!(2, check.size());
758    }
759
760    #[test]
761    fn test_codepointinversionlist_try_from_vec_error() {
762        let check = vec![0x1, 0x1, 0x2, 0x3, 0x4];
763        let set = CodePointInversionList::try_from_u32_inversion_list_slice(&check);
764        assert!(matches!(set, Err(InvalidSetError(_))));
765        if let Err(InvalidSetError(actual)) = set {
766            assert_eq!(
767                &check,
768                &actual.into_iter().map(u32::from).collect::<Vec<_>>()
769            );
770        }
771    }
772
773    // CodePointInversionList membership functions
774    #[test]
775    fn test_codepointinversionlist_contains_query() {
776        let ex = vec![0x41, 0x46, 0x4B, 0x55];
777        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
778        assert!(check.contains_query(0x40).is_none());
779        assert_eq!(check.contains_query(0x41).unwrap(), 0);
780        assert_eq!(check.contains_query(0x44).unwrap(), 0);
781        assert!(check.contains_query(0x46).is_none());
782        assert_eq!(check.contains_query(0x4C).unwrap(), 2);
783        assert!(check.contains_query(0x56).is_none());
784    }
785
786    #[test]
787    fn test_codepointinversionlist_contains() {
788        let ex = vec![0x2, 0x5, 0xA, 0xF];
789        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
790        assert!(check.contains(0x2 as char));
791        assert!(check.contains(0x4 as char));
792        assert!(check.contains(0xA as char));
793        assert!(check.contains(0xE as char));
794    }
795
796    #[test]
797    fn test_codepointinversionlist_contains_false() {
798        let ex = vec![0x2, 0x5, 0xA, 0xF];
799        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
800        assert!(!check.contains(0x1 as char));
801        assert!(!check.contains(0x5 as char));
802        assert!(!check.contains(0x9 as char));
803        assert!(!check.contains(0xF as char));
804        assert!(!check.contains(0x10 as char));
805    }
806
807    #[test]
808    fn test_codepointinversionlist_contains_range() {
809        let ex = vec![0x41, 0x46, 0x4B, 0x55];
810        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
811        assert!(check.contains_range('A'..='E')); // 65 - 69
812        assert!(check.contains_range('C'..'D')); // 67 - 67
813        assert!(check.contains_range('L'..'P')); // 76 - 80
814        assert!(!check.contains_range('L'..='U')); // 76 - 85
815    }
816
817    #[test]
818    fn test_codepointinversionlist_contains_range_false() {
819        let ex = vec![0x41, 0x46, 0x4B, 0x55];
820        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
821        assert!(!check.contains_range('!'..'A')); // 33 - 65
822        assert!(!check.contains_range('F'..'K')); // 70 - 74
823        assert!(!check.contains_range('U'..)); // 85 - ..
824    }
825
826    #[test]
827    fn test_codepointinversionlist_contains_range_invalid() {
828        let check = CodePointInversionList::all();
829        assert!(!check.contains_range('A'..'!')); // 65 - 33
830        assert!(!check.contains_range('A'..'A')); // 65 - 65
831    }
832
833    #[test]
834    fn test_codepointinversionlist_contains_set_u() {
835        let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x6E];
836        let u = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
837        let inside = vec![0xF, 0x14, 0x2C, 0x31, 0x46, 0x50, 0x64, 0x6D];
838        let s = CodePointInversionList::try_from_u32_inversion_list_slice(&inside).unwrap();
839        assert!(u.contains_set(&s));
840    }
841
842    #[test]
843    fn test_codepointinversionlist_contains_set_u_false() {
844        let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x78];
845        let u = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
846        let outside = vec![0x0, 0xA, 0x16, 0x2C, 0x32, 0x46, 0x4F, 0x51, 0x6D, 0x6F];
847        let s = CodePointInversionList::try_from_u32_inversion_list_slice(&outside).unwrap();
848        assert!(!u.contains_set(&s));
849    }
850
851    #[test]
852    fn test_codepointinversionlist_size() {
853        let ex = vec![0x2, 0x5, 0xA, 0xF];
854        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
855        assert_eq!(8, check.size());
856        let check = CodePointInversionList::all();
857        let expected = (char::MAX as u32) + 1;
858        assert_eq!(expected as usize, check.size());
859        let inv_list_vec = vec![];
860        let check = CodePointInversionList {
861            inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
862            size: 0,
863        };
864        assert_eq!(check.size(), 0);
865    }
866
867    #[test]
868    fn test_codepointinversionlist_is_empty() {
869        let inv_list_vec = vec![];
870        let check = CodePointInversionList {
871            inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
872            size: 0,
873        };
874        assert!(check.is_empty());
875    }
876
877    #[test]
878    fn test_codepointinversionlist_is_not_empty() {
879        let check = CodePointInversionList::all();
880        assert!(!check.is_empty());
881    }
882
883    #[test]
884    fn test_codepointinversionlist_iter_chars() {
885        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
886        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
887        let mut iter = check.iter_chars();
888        assert_eq!(Some('A'), iter.next());
889        assert_eq!(Some('B'), iter.next());
890        assert_eq!(Some('C'), iter.next());
891        assert_eq!(Some('E'), iter.next());
892        assert_eq!(None, iter.next());
893    }
894
895    #[test]
896    fn test_codepointinversionlist_iter_ranges() {
897        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
898        let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
899        let mut ranges = set.iter_ranges();
900        assert_eq!(Some(0x41..=0x43), ranges.next());
901        assert_eq!(Some(0x45..=0x45), ranges.next());
902        assert_eq!(Some(0xD800..=0xD800), ranges.next());
903        assert_eq!(None, ranges.next());
904    }
905
906    #[test]
907    fn test_codepointinversionlist_iter_ranges_exactsizeiter_trait() {
908        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
909        let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
910        let ranges = set.iter_ranges();
911        assert_eq!(3, ranges.len());
912    }
913
914    #[test]
915    fn test_codepointinversionlist_range_count() {
916        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
917        let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
918        assert_eq!(3, set.get_range_count());
919    }
920
921    #[test]
922    fn test_codepointinversionlist_get_nth_range() {
923        let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
924        let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
925        assert_eq!(Some(0x41..=0x43), set.get_nth_range(0));
926        assert_eq!(Some(0x45..=0x45), set.get_nth_range(1));
927        assert_eq!(Some(0xD800..=0xD800), set.get_nth_range(2));
928        assert_eq!(None, set.get_nth_range(3));
929    }
930
931    // Range<char> cannot represent the upper bound (non-inclusive) for
932    // char::MAX, whereas Range<u32> can.
933    #[test]
934    fn test_codepointinversionlist_iter_ranges_with_max_code_point() {
935        let ex = vec![0x80, (char::MAX as u32) + 1];
936        let set = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
937        let mut ranges = set.iter_ranges();
938        assert_eq!(Some(0x80..=(char::MAX as u32)), ranges.next());
939        assert_eq!(None, ranges.next());
940    }
941
942    #[test]
943    fn test_codepointinversionlist_span_contains() {
944        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
945        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
946        assert_eq!(check.span("ABCDE", true), 3);
947        assert_eq!(check.span("E", true), 0);
948    }
949
950    #[test]
951    fn test_codepointinversionlist_span_does_not_contain() {
952        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
953        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
954        assert_eq!(check.span("DEF", false), 2);
955        assert_eq!(check.span("KLMA", false), 3);
956    }
957
958    #[test]
959    fn test_codepointinversionlist_span_back_contains() {
960        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
961        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
962        assert_eq!(check.span_back("XYZABFH", true), 3);
963        assert_eq!(check.span_back("ABCXYZ", true), 6);
964    }
965
966    #[test]
967    fn test_codepointinversionlist_span_back_does_not_contain() {
968        let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
969        let check = CodePointInversionList::try_from_u32_inversion_list_slice(&ex).unwrap();
970        assert_eq!(check.span_back("ABCXYZ", false), 3);
971        assert_eq!(check.span_back("XYZABC", false), 6);
972    }
973
974    #[test]
975    fn test_uniset_to_inv_list() {
976        let inv_list = [
977            0x9, 0xE, 0x20, 0x21, 0x85, 0x86, 0xA0, 0xA1, 0x1626, 0x1627, 0x2000, 0x2003, 0x2028,
978            0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
979        ];
980        let s: CodePointInversionList =
981            CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap();
982        let round_trip_inv_list = s.get_inversion_list_vec();
983        assert_eq!(
984            round_trip_inv_list.into_iter().collect::<ZeroVec<u32>>(),
985            inv_list
986        );
987    }
988
989    #[test]
990    fn test_serde_serialize() {
991        let inv_list = [0x41, 0x46, 0x4B, 0x55];
992        let uniset = CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap();
993        let json_str = serde_json::to_string(&uniset).unwrap();
994        assert_eq!(json_str, r#"["A-E","K-T"]"#);
995    }
996
997    #[test]
998    fn test_serde_serialize_surrogates() {
999        let inv_list = [0xDFAB, 0xDFFF];
1000        let uniset = CodePointInversionList::try_from_u32_inversion_list_slice(&inv_list).unwrap();
1001        let json_str = serde_json::to_string(&uniset).unwrap();
1002        assert_eq!(json_str, r#"["U+DFAB-U+DFFE"]"#);
1003    }
1004
1005    #[test]
1006    fn test_serde_deserialize() {
1007        let inv_list_str = r#"["A-E","K-T"]"#;
1008        let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
1009        let exp_uniset =
1010            CodePointInversionList::try_from_u32_inversion_list_slice(&exp_inv_list).unwrap();
1011        let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1012        assert_eq!(act_uniset, exp_uniset);
1013    }
1014
1015    #[test]
1016    fn test_serde_deserialize_surrogates() {
1017        let inv_list_str = r#"["U+DFAB-U+DFFE"]"#;
1018        let exp_inv_list = [0xDFAB, 0xDFFF];
1019        let exp_uniset =
1020            CodePointInversionList::try_from_u32_inversion_list_slice(&exp_inv_list).unwrap();
1021        let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1022        assert_eq!(act_uniset, exp_uniset);
1023    }
1024
1025    #[test]
1026    fn test_serde_deserialize_invalid() {
1027        assert!(serde_json::from_str::<CodePointInversionList>("[65,70,98775,85]").is_err());
1028        assert!(serde_json::from_str::<CodePointInversionList>("[65,70,U+FFFFFFFFFF,85]").is_err());
1029    }
1030
1031    #[test]
1032    fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1033        let set = CodePointInversionList::bmp();
1034        let set_serialized: Vec<u8> = postcard::to_allocvec(&set).unwrap();
1035        let set_deserialized: CodePointInversionList =
1036            postcard::from_bytes::<CodePointInversionList>(&set_serialized)?;
1037
1038        assert_eq!(&set, &set_deserialized);
1039        assert!(!set_deserialized.inv_list.is_owned());
1040
1041        Ok(())
1042    }
1043
1044    #[test]
1045    fn databake() {
1046        databake::test_bake!(
1047            CodePointInversionList<'static>,
1048            const,
1049            unsafe {
1050                #[allow(unused_unsafe)]
1051                crate::codepointinvlist::CodePointInversionList::from_parts_unchecked(
1052                    unsafe {
1053                        zerovec::ZeroVec::from_bytes_unchecked(
1054                            b"0\0\0\0:\0\0\0A\0\0\0G\0\0\0a\0\0\0g\0\0\0",
1055                        )
1056                    },
1057                    22u32,
1058                )
1059            },
1060            icu_collections,
1061            [zerovec],
1062        );
1063    }
1064}