icu_properties/code_point_map.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "alloc")]
6use crate::code_point_set::CodePointSetData;
7use crate::props::GeneralCategory;
8use crate::props::GeneralCategoryGroup;
9use crate::provider::*;
10use core::ops::RangeInclusive;
11use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
12use icu_provider::marker::ErasedMarker;
13use icu_provider::prelude::*;
14
/// A wrapper around code point map data.
///
/// It is returned by APIs that return Unicode
/// property data in a map-like form, ex: enumerated property value data keyed
/// by code point. Access its data via the borrowed version,
/// [`CodePointMapDataBorrowed`], which is obtained from
/// [`CodePointMapData::as_borrowed()`].
#[derive(Debug, Clone)]
pub struct CodePointMapData<T: TrieValue> {
    // Type-erased payload holding the `PropertyCodePointMap` that backs this wrapper.
    data: DataPayload<ErasedMarker<PropertyCodePointMap<'static, T>>>,
}
25
26impl<T: TrieValue> CodePointMapData<T> {
27 /// Creates a new [`CodePointMapData`] for a [`EnumeratedProperty`].
28 ///
29 /// See the documentation on [`EnumeratedProperty`] implementations for details.
30 ///
31 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
32 ///
33 /// [📚 Help choosing a constructor](icu_provider::constructors)
34 #[cfg(feature = "compiled_data")]
35 #[allow(clippy::new_ret_no_self)]
36 pub const fn new() -> CodePointMapDataBorrowed<'static, T>
37 where
38 T: EnumeratedProperty,
39 {
40 CodePointMapDataBorrowed::new()
41 }
42
43 #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
44 pub fn try_new_unstable(
45 provider: &(impl DataProvider<T::DataMarker> + ?Sized),
46 ) -> Result<Self, DataError>
47 where
48 T: EnumeratedProperty,
49 {
50 Ok(Self {
51 data: provider.load(Default::default())?.payload.cast(),
52 })
53 }
54
55 /// Construct a borrowed version of this type that can be queried.
56 ///
57 /// This avoids a potential small underlying cost per API call (like `get()`) by consolidating it
58 /// up front.
59 ///
60 /// This owned version if returned by functions that use a runtime data provider.
61 #[inline]
62 pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
63 CodePointMapDataBorrowed {
64 map: self.data.get(),
65 }
66 }
67
68 /// Convert this map to a map around another type
69 ///
70 /// Typically useful for type-erasing maps into maps around integers.
71 ///
72 /// # Panics
73 /// Will panic if T and P are different sizes
74 ///
75 /// # Example
76 ///
77 /// ```
78 /// use icu::properties::CodePointMapData;
79 /// use icu::properties::props::GeneralCategory;
80 ///
81 /// let data = CodePointMapData::<GeneralCategory>::new().static_to_owned();
82 ///
83 /// let gc = data.try_into_converted::<u8>().unwrap();
84 /// let gc = gc.as_borrowed();
85 ///
86 /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728
87 /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
88 /// ```
89 #[cfg(feature = "alloc")]
90 pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, zerovec::ule::UleError>
91 where
92 P: TrieValue,
93 {
94 self.data
95 .try_map_project(|data, _| data.try_into_converted())
96 .map(CodePointMapData::from_data::<ErasedMarker<PropertyCodePointMap<'static, P>>>)
97 }
98
99 /// Construct a new one from loaded data
100 ///
101 /// Typically it is preferable to use getters like [`load_general_category()`] instead
102 pub(crate) fn from_data<M>(data: DataPayload<M>) -> Self
103 where
104 M: DynamicDataMarker<DataStruct = PropertyCodePointMap<'static, T>>,
105 {
106 Self { data: data.cast() }
107 }
108
109 /// Construct a new one an owned [`CodePointTrie`]
110 pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
111 let set = PropertyCodePointMap::from_code_point_trie(trie);
112 CodePointMapData::from_data(
113 DataPayload::<ErasedMarker<PropertyCodePointMap<'static, T>>>::from_owned(set),
114 )
115 }
116
117 /// Convert this type to a [`CodePointTrie`] as a borrowed value.
118 ///
119 /// The data backing this is extensible and supports multiple implementations.
120 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
121 /// added, and users may select which at data generation time.
122 ///
123 /// This method returns an `Option` in order to return `None` when the backing data provider
124 /// cannot return a [`CodePointTrie`], or cannot do so within the expected constant time
125 /// constraint.
126 pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
127 self.data.get().as_code_point_trie()
128 }
129
130 /// Convert this type to a [`CodePointTrie`], borrowing if possible,
131 /// otherwise allocating a new [`CodePointTrie`].
132 ///
133 /// The data backing this is extensible and supports multiple implementations.
134 /// Currently it is always [`CodePointTrie`]; however in the future more backends may be
135 /// added, and users may select which at data generation time.
136 ///
137 /// The performance of the conversion to this specific return type will vary
138 /// depending on the data structure that is backing `self`.
139 pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
140 self.data.get().to_code_point_trie()
141 }
142}
143
/// A borrowed wrapper around code point map data, returned by
/// [`CodePointMapData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy, Debug)]
pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
    // Reference to the underlying map; the query methods on this type read through it.
    map: &'a PropertyCodePointMap<'a, T>,
}
150
151impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
152 /// Get the value this map has associated with code point `ch`
153 ///
154 /// # Example
155 ///
156 /// ```
157 /// use icu::properties::CodePointMapData;
158 /// use icu::properties::props::GeneralCategory;
159 ///
160 /// let gc = CodePointMapData::<GeneralCategory>::new();
161 ///
162 /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
163 /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
164 /// ```
165 pub fn get(self, ch: char) -> T {
166 self.map.get32(ch as u32)
167 }
168
169 /// See [`Self::get`].
170 pub fn get32(self, ch: u32) -> T {
171 self.map.get32(ch)
172 }
173
174 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value
175 ///
176 /// # Example
177 ///
178 /// ```
179 /// use icu::properties::props::GeneralCategory;
180 /// use icu::properties::CodePointMapData;
181 ///
182 /// let gc = CodePointMapData::<GeneralCategory>::new();
183 ///
184 /// let other_letter_set_data =
185 /// gc.get_set_for_value(GeneralCategory::OtherLetter);
186 /// let other_letter_set = other_letter_set_data.as_borrowed();
187 ///
188 /// assert!(other_letter_set.contains('木')); // U+6728
189 /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
190 /// ```
191 #[cfg(feature = "alloc")]
192 pub fn get_set_for_value(self, value: T) -> CodePointSetData {
193 let set = self.map.get_set_for_value(value);
194 CodePointSetData::from_code_point_inversion_list(set)
195 }
196
197 /// Yields an [`Iterator`] returning ranges of consecutive code points that
198 /// share the same value in the [`CodePointMapData`].
199 ///
200 /// # Examples
201 ///
202 /// ```
203 /// use icu::properties::props::GeneralCategory;
204 /// use icu::properties::CodePointMapData;
205 ///
206 /// let gc = CodePointMapData::<GeneralCategory>::new();
207 /// let mut ranges = gc.iter_ranges();
208 /// let next = ranges.next().unwrap();
209 /// assert_eq!(next.range, 0..=31);
210 /// assert_eq!(next.value, GeneralCategory::Control);
211 /// let next = ranges.next().unwrap();
212 /// assert_eq!(next.range, 32..=32);
213 /// assert_eq!(next.value, GeneralCategory::SpaceSeparator);
214 /// ```
215 pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
216 self.map.iter_ranges()
217 }
218
219 /// Yields an [`Iterator`] returning ranges of consecutive code points that
220 /// share the same value `v` in the [`CodePointMapData`].
221 ///
222 /// # Examples
223 ///
224 ///
225 /// ```
226 /// use icu::properties::props::GeneralCategory;
227 /// use icu::properties::CodePointMapData;
228 ///
229 /// let gc = CodePointMapData::<GeneralCategory>::new();
230 /// let mut ranges = gc.iter_ranges_for_value(GeneralCategory::UppercaseLetter);
231 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
232 /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
233 /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='Þ' as u32);
234 /// ```
235 pub fn iter_ranges_for_value(self, val: T) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
236 self.map
237 .iter_ranges()
238 .filter(move |r| r.value == val)
239 .map(|r| r.range)
240 }
241
242 /// Yields an [`Iterator`] returning ranges of consecutive code points that
243 /// do *not* have the value `v` in the [`CodePointMapData`].
244 pub fn iter_ranges_for_value_complemented(
245 self,
246 val: T,
247 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
248 self.map
249 .iter_ranges_mapped(move |value| value != val)
250 .filter(|v| v.value)
251 .map(|v| v.range)
252 }
253
254 /// Exposed for FFI needs, could be exposed in general in the future but we should
255 /// have a use case first.
256 ///
257 /// FFI needs this since it operates on erased maps and can't use `iter_ranges_for_group()`
258 #[doc(hidden)] // used by FFI code
259 pub fn iter_ranges_mapped<U: Eq + 'a>(
260 self,
261 predicate: impl FnMut(T) -> U + Copy + 'a,
262 ) -> impl Iterator<Item = CodePointMapRange<U>> + 'a {
263 self.map.iter_ranges_mapped(predicate)
264 }
265}
266
267impl CodePointMapDataBorrowed<'_, GeneralCategory> {
268 /// Get a [`CodePointSetData`] for all elements corresponding to a particular value group
269 ///
270 /// # Example
271 ///
272 /// ```
273 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
274 /// use icu::properties::CodePointMapData;
275 ///
276 /// let gc = CodePointMapData::<GeneralCategory>::new();
277 ///
278 /// let other_letter_set_data =
279 /// gc.get_set_for_value_group(GeneralCategoryGroup::OtherLetter);
280 /// let other_letter_set = other_letter_set_data.as_borrowed();
281 ///
282 /// assert!(other_letter_set.contains('木')); // U+6728
283 /// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
284 /// ```
285 #[cfg(feature = "alloc")]
286 pub fn get_set_for_value_group(self, value: GeneralCategoryGroup) -> crate::CodePointSetData {
287 let matching_gc_ranges = self
288 .iter_ranges()
289 .filter(|cpm_range| (1 << cpm_range.value as u32) & value.0 != 0)
290 .map(|cpm_range| cpm_range.range);
291 CodePointSetData::from_code_point_inversion_list(matching_gc_ranges.collect())
292 }
293}
294
#[cfg(feature = "compiled_data")]
impl<T: EnumeratedProperty> Default for CodePointMapDataBorrowed<'static, T> {
    /// Equivalent to [`CodePointMapDataBorrowed::new()`].
    fn default() -> Self {
        CodePointMapDataBorrowed::new()
    }
}
301
302impl<T: TrieValue> CodePointMapDataBorrowed<'static, T> {
303 /// Creates a new [`CodePointMapDataBorrowed`] for a [`EnumeratedProperty`].
304 ///
305 /// See the documentation on [`EnumeratedProperty`] implementations for details.
306 ///
307 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
308 ///
309 /// [📚 Help choosing a constructor](icu_provider::constructors)
310 #[cfg(feature = "compiled_data")]
311 pub const fn new() -> Self
312 where
313 T: EnumeratedProperty,
314 {
315 CodePointMapDataBorrowed { map: T::SINGLETON }
316 }
317
318 /// Cheaply converts a [`CodePointMapDataBorrowed<'static>`] into a [`CodePointMapData`].
319 ///
320 /// Note: Due to branching and indirection, using [`CodePointMapData`] might inhibit some
321 /// compile-time optimizations that are possible with [`CodePointMapDataBorrowed`].
322 pub const fn static_to_owned(self) -> CodePointMapData<T> {
323 CodePointMapData {
324 data: DataPayload::from_static_ref(self.map),
325 }
326 }
327}
328
329impl<'a> CodePointMapDataBorrowed<'a, GeneralCategory> {
330 /// Yields an [`Iterator`] returning ranges of consecutive code points that
331 /// have a `General_Category` value belonging to the specified [`GeneralCategoryGroup`]
332 ///
333 /// # Examples
334 ///
335 /// ```
336 /// use icu::properties::props::{GeneralCategory, GeneralCategoryGroup};
337 /// use icu::properties::CodePointMapData;
338 ///
339 /// let gc = CodePointMapData::<GeneralCategory>::new();
340 /// let mut ranges = gc.iter_ranges_for_group(GeneralCategoryGroup::Letter);
341 /// assert_eq!(ranges.next().unwrap(), 'A' as u32..='Z' as u32);
342 /// assert_eq!(ranges.next().unwrap(), 'a' as u32..='z' as u32);
343 /// assert_eq!(ranges.next().unwrap(), 'ª' as u32..='ª' as u32);
344 /// assert_eq!(ranges.next().unwrap(), 'µ' as u32..='µ' as u32);
345 /// assert_eq!(ranges.next().unwrap(), 'º' as u32..='º' as u32);
346 /// assert_eq!(ranges.next().unwrap(), 'À' as u32..='Ö' as u32);
347 /// assert_eq!(ranges.next().unwrap(), 'Ø' as u32..='ö' as u32);
348 /// ```
349 pub fn iter_ranges_for_group(
350 self,
351 group: GeneralCategoryGroup,
352 ) -> impl Iterator<Item = RangeInclusive<u32>> + 'a {
353 self.map
354 .iter_ranges_mapped(move |value| group.contains(value))
355 .filter(|v| v.value)
356 .map(|v| v.range)
357 }
358}
359
360/// A Unicode character property that assigns a value to each code point.
361///
362/// The descriptions of most properties are taken from [`TR44`], the documentation for the
363/// Unicode Character Database.
364///
365/// <div class="stab unstable">
366/// 🚫 This trait is sealed; it cannot be implemented by user code. If an API requests an item that implements this
367/// trait, please consider using a type from the implementors listed below.
368/// </div>
369///
370/// [`TR44`]: https://www.unicode.org/reports/tr44
371pub trait EnumeratedProperty: crate::private::Sealed + TrieValue {
372 #[doc(hidden)]
373 type DataMarker: DataMarker<DataStruct = PropertyCodePointMap<'static, Self>>;
374 #[doc(hidden)]
375 #[cfg(feature = "compiled_data")]
376 const SINGLETON: &'static PropertyCodePointMap<'static, Self>;
377 /// The name of this property
378 const NAME: &'static [u8];
379 /// The abbreviated name of this property, if it exists, otherwise the name
380 const SHORT_NAME: &'static [u8];
381
382 /// Convenience method for `CodePointMapData::new().get(ch)`
383 ///
384 /// ✨ *Enabled with the `compiled_data` Cargo feature.*
385 #[cfg(feature = "compiled_data")]
386 fn for_char(ch: char) -> Self {
387 CodePointMapData::new().get(ch)
388 }
389}