icu_properties/
code_point_map.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "alloc")]
6use crate::code_point_set::CodePointSetData;
7use crate::props::GeneralCategory;
8use crate::props::GeneralCategoryGroup;
9use crate::provider::*;
10use core::ops::RangeInclusive;
11use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
12use icu_provider::marker::ErasedMarker;
13use icu_provider::prelude::*;
14
15/// A wrapper around code point map data.
16///
17/// It is returned by APIs that return Unicode
18/// property data in a map-like form, ex: enumerated property value data keyed
19/// by code point. Access its data via the borrowed version,
20/// [`CodePointMapDataBorrowed`].
21#[derive(Debug, Clone)]
22pub struct CodePointMapData<T: TrieValue> {
23    data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
24}
25
26impl<T: TrieValue> CodePointMapData<T> {
27    /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28    ///
29    /// See the documentation on [`EnumeratedProperty`] implementations for details.
30    ///
31    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
32    ///
33    /// [📚 Help choosing a constructor](icu_provider::constructors)
34    #[cfg(feature = "compiled_data")]
35    #[allow(clippy::new_ret_no_self)]
36    pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37    where
38        T: EnumeratedProperty,
39    {
40        CodePointMapDataBorrowed::new()
41    }
42
43    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44    pub fn try_new_unstable(
45        provider: &(impl DataProvider<T::DataMarker> + ?Sized),
46    ) -> Result<Self, DataError>
47    where
48        T: EnumeratedProperty,
49    {
50        Ok(Self {
51            data: provider.load(Default::default())?.payload.cast(),
52        })
53    }
54
55    /// Construct a borrowed version of this type that can be queried.
56    ///
57    /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
58    /// up front.
59    ///
60    /// This owned version if returned by functions that use a runtime data provider.
61    #[inline]
62    pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
63        CodePointMapDataBorrowed {
64            map: self.data.get(),
65        }
66    }
67
68    /// Convert this map to a map around another type
69    ///
70    /// Typically useful for type-erasing maps into maps around integers.
71    ///
72    /// # Panics
73    /// Will panic if T and P are different sizes
74    ///
75    /// # Example
76    ///
77    /// ```
78    /// use icu::properties::CodePointMapData;
79    /// use icu::properties::props::GeneralCategory;
80    ///
81    /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
82    ///
83    /// let gc = data.try_into_converted::<u8>().unwrap();
84    /// let gc = gc.as_borrowed();
85    ///
86    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8);  // U+6728
87    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8);  // U+1F383 JACK-O-LANTERN
88    /// ```
89    #[cfg(feature = "alloc")]
90    pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
91    where
92        P: TrieValue,
93    {
94        self.data
95            .try_map_project(|data, _| data.try_into_converted())
96            .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
97    }
98
99    /// Construct a new one from loaded data
100    ///
101    /// Typically it is preferable to use getters like [`load_general_category()`] instead
102    pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
103    where
104        M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
105    {
106        Self { data: data.cast() }
107    }
108
109    /// Construct a new one an owned [`CodePointTrie`]
110    pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
111        let set = PropertyCodePointMap::from_code_point_trie(trie);
112        CodePointMapData::from_data(
113            DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
114        )
115    }
116
117    /// Convert this type to a [`CodePointTrie`] as a borrowed value.
118    ///
119    /// The data backing this is extensible and supports multiple implementations.
120    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
121    /// added, and users may select which at data generation time.
122    ///
123    /// This method returns an `Option` in order to return `None` when the backing data provider
124    /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
125    /// constraint.
126    pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
127        self.data.get().as_code_point_trie()
128    }
129
130    /// Convert this type to a [`CodePointTrie`], borrowing if possible,
131    /// otherwise allocating a new [`CodePointTrie`].
132    ///
133    /// The data backing this is extensible and supports multiple implementations.
134    /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
135    /// added, and users may select which at data generation time.
136    ///
137    /// The performance of the conversion to this specific return type will vary
138    /// depending on the data structure that is backing `self`.
139    pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
140        self.data.get().to_code_point_trie()
141    }
142}
143
144/// A borrowed wrapper around code point set data, returned by
145/// [`CodePointSetData::as_borrowed()`]. More efficient to query.
146#[derive(Clone, Copy, Debug)]
147pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
148    map: &'a PropertyCodePointMap<'a, T>,
149}
150
151impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
152    /// Get the value this map has associated with code point `ch`
153    ///
154    /// # Example
155    ///
156    /// ```
157    /// use icu::properties::CodePointMapData;
158    /// use icu::properties::props::GeneralCategory;
159    ///
160    /// let gc = CodePointMapData::<GeneralCategory>::new();
161    ///
162    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter);  // U+6728
163    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol);  // U+1F383 JACK-O-LANTERN
164    /// ```
165    pub fn get(self, ch: char) -> T {
166        self.map.get32(ch as u32)
167    }
168
169    /// See [`Self::get`].
170    pub fn get32(self, ch: u32) -> T {
171        self.map.get32(ch)
172    }
173
174    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
175    ///
176    /// # Example
177    ///
178    /// ```
179    /// use icu::properties::props::GeneralCategory;
180    /// use icu::properties::CodePointMapData;
181    ///
182    /// let gc = CodePointMapData::<GeneralCategory>::new();
183    ///
184    /// let other_letter_set_data =
185    ///     gc.get_set_for_value(GeneralCategory::OtherLetter);
186    /// let other_letter_set = other_letter_set_data.as_borrowed();
187    ///
188    /// assert!(other_letter_set.contains('木')); // U+6728
189    /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
190    /// ```
191    #[cfg(feature = "alloc")]
192    pub fn get_set_for_value(self, value: T) -> CodePointSetData {
193        let set = self.map.get_set_for_value(value);
194        CodePointSetData::from_code_point_inversion_list(set)
195    }
196
197    /// Yields an [`Iterator`] returning ranges of consecutive code points that
198    /// share the same value in the [`CodePointMapData`].
199    ///
200    /// # Examples
201    ///
202    /// ```
203    /// use icu::properties::props::GeneralCategory;
204    /// use icu::properties::CodePointMapData;
205    ///
206    /// let gc = CodePointMapData::<GeneralCategory>::new();
207    /// let mut ranges = gc.iter_ranges();
208    /// let next = ranges.next().unwrap();
209    /// assert_eq!(next.range, 0..=31);
210    /// assert_eq!(next.value, GeneralCategory::Control);
211    /// let next = ranges.next().unwrap();
212    /// assert_eq!(next.range, 32..=32);
213    /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
214    /// ```
215    pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
216        self.map.iter_ranges()
217    }
218
219    /// Yields an [`Iterator`] returning ranges of consecutive code points that
220    /// share the same value `v` in the [`CodePointMapData`].
221    ///
222    /// # Examples
223    ///
224    ///
225    /// ```
226    /// use icu::properties::props::GeneralCategory;
227    /// use icu::properties::CodePointMapData;
228    ///
229    /// let gc = CodePointMapData::<GeneralCategory>::new();
230    /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
231    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
232    /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
233    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
234    /// ```
235    pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
236        self.map
237            .iter_ranges()
238            .filter(move |r| r.value == val)
239            .map(|r| r.range)
240    }
241
242    /// Yields an [`Iterator`] returning ranges of consecutive code points that
243    /// do *not* have the value `v` in the [`CodePointMapData`].
244    pub fn iter_ranges_for_value_complemented(
245        self,
246        val: T,
247    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
248        self.map
249            .iter_ranges_mapped(move |value| value != val)
250            .filter(|v| v.value)
251            .map(|v| v.range)
252    }
253
254    /// Exposed for FFI needs, could be exposed in general in the future but we should
255    /// have a use case first.
256    ///
257    /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
258    #[doc(hidden)] // used by FFI code
259    pub fn iter_ranges_mapped<U: Eq + 'a>(
260        self,
261        predicate: impl FnMut(T) -> U + Copy + 'a,
262    ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
263        self.map.iter_ranges_mapped(predicate)
264    }
265}
266
267impl CodePointMapDataBorrowed<'_, GeneralCategory> {
268    /// Get a [`CodePointSetData`] for all elements corresponding to a particular value group
269    ///
270    /// # Example
271    ///
272    /// ```
273    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
274    /// use icu::properties::CodePointMapData;
275    ///
276    /// let gc = CodePointMapData::<GeneralCategory>::new();
277    ///
278    /// let other_letter_set_data =
279    ///     gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
280    /// let other_letter_set = other_letter_set_data.as_borrowed();
281    ///
282    /// assert!(other_letter_set.contains('木')); // U+6728
283    /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
284    /// ```
285    #[cfg(feature = "alloc")]
286    pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
287        let matching_gc_ranges = self
288            .iter_ranges()
289            .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
290            .map(|cpm_range| cpm_range.range);
291        CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
292    }
293}
294
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    /// Equivalent to [`CodePointMapDataBorrowed::new()`].
    fn default() -> Self {
        CodePointMapDataBorrowed::new()
    }
}
301
302impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
303    /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
304    ///
305    /// See the documentation on [`EnumeratedProperty`] implementations for details.
306    ///
307    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
308    ///
309    /// [📚 Help choosing a constructor](icu_provider::constructors)
310    #[cfg(feature = "compiled_data")]
311    pub const fn new() -> Self
312    where
313        T: EnumeratedProperty,
314    {
315        CodePointMapDataBorrowed { map: T::SINGLETON }
316    }
317
318    /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
319    ///
320    /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
321    /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
322    pub const fn static_to_owned(self) -> CodePointMapData<T> {
323        CodePointMapData {
324            data: DataPayload::from_static_ref(self.map),
325        }
326    }
327}
328
329impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
330    /// Yields an [`Iterator`] returning ranges of consecutive code points that
331    /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
332    ///
333    /// # Examples
334    ///
335    /// ```
336    /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
337    /// use icu::properties::CodePointMapData;
338    ///
339    /// let gc = CodePointMapData::<GeneralCategory>::new();
340    /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
341    /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
342    /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
343    /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32);
344    /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32);
345    /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32);
346    /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
347    /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32);
348    /// ```
349    pub fn iter_ranges_for_group(
350        self,
351        group: GeneralCategoryGroup,
352    ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
353        self.map
354            .iter_ranges_mapped(move |value| group.contains(value))
355            .filter(|v| v.value)
356            .map(|v| v.range)
357    }
358}
359
360/// A Unicode character property that assigns a value to each code point.
361///
362/// The descriptions of most properties are taken from [`TR44`], the documentation for the
363/// Unicode Character Database.
364///
365/// <div class="stab unstable">
366/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
367/// trait, please consider using a type from the implementors listed below.
368/// </div>
369///
370/// [`TR44`]: https://www.unicode.org/reports/tr44
371pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
372    #[doc(hidden)]
373    type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
374    #[doc(hidden)]
375    #[cfg(feature = "compiled_data")]
376    const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
377    /// The name of this property
378    const NAME: &'static [u8];
379    /// The abbreviated name of this property, if it exists, otherwise the name
380    const SHORT_NAME: &'static [u8];
381
382    /// Convenience method for `CodePointMapData::new().get(ch)`
383    ///
384    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
385    #[cfg(feature = "compiled_data")]
386    fn for_char(ch: char) -> Self {
387        CodePointMapData::new().get(ch)
388    }
389}