icu_locale_core/
data.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::extensions::unicode as unicode_ext;
6use crate::subtags::{Language, Region, Script, Subtag, Variant};
7#[cfg(feature = "alloc")]
8use crate::ParseError;
9use crate::{LanguageIdentifier, Locale};
10use core::cmp::Ordering;
11use core::default::Default;
12use core::fmt;
13use core::hash::Hash;
14#[cfg(feature = "alloc")]
15use core::str::FromStr;
16
17/// A locale type optimized for use in fallbacking and the ICU4X data pipeline.
18///
19/// [`DataLocale`] contains less functionality than [`Locale`] but more than
20/// [`LanguageIdentifier`] for better size and performance while still meeting
21/// the needs of the ICU4X data pipeline.
22///
23/// You can create a [`DataLocale`] from a borrowed [`Locale`], which is more
24/// efficient than cloning the [`Locale`], but less efficient than converting an owned
25/// [`Locale`]:
26///
27/// ```
28/// use icu_locale_core::locale;
29/// use icu_provider::DataLocale;
30///
31/// let locale1 = locale!("en-u-ca-buddhist");
32/// let data_locale = DataLocale::from(&locale1);
33/// ```
34///
35/// [`DataLocale`] only supports `-u-sd` keywords, to reflect the current state of CLDR data
36/// lookup and fallback. This may change in the future.
37///
38/// ```
39/// use icu_locale_core::{locale, Locale};
40/// use icu_provider::DataLocale;
41///
42/// let locale = "hi-IN-t-en-h0-hybrid-u-attr-ca-buddhist-sd-inas"
43///     .parse::<Locale>()
44///     .unwrap();
45///
46/// assert_eq!(
47///     DataLocale::from(locale),
48///     DataLocale::from(locale!("hi-IN-u-sd-inas"))
49/// );
50/// ```
51#[derive(Clone, Copy, PartialEq, Hash, Eq)]
52#[non_exhaustive]
53pub struct DataLocale {
54    /// Language subtag
55    pub language: Language,
56    /// Script subtag
57    pub script: Option<Script>,
58    /// Region subtag
59    pub region: Option<Region>,
60    /// Variant subtag
61    pub variant: Option<Variant>,
62    /// Subivision (-u-sd-) subtag
63    pub subdivision: Option<Subtag>,
64}
65
66impl Default for DataLocale {
67    fn default() -> Self {
68        Self {
69            language: Language::UNKNOWN,
70            script: None,
71            region: None,
72            variant: None,
73            subdivision: None,
74        }
75    }
76}
77
78impl DataLocale {
79    /// `const` version of `Default::default`
80    pub const fn default() -> Self {
81        DataLocale {
82            language: Language::UNKNOWN,
83            script: None,
84            region: None,
85            variant: None,
86            subdivision: None,
87        }
88    }
89}
90
91impl Default for &DataLocale {
92    fn default() -> Self {
93        static DEFAULT: DataLocale = DataLocale::default();
94        &DEFAULT
95    }
96}
97
98impl fmt::Debug for DataLocale {
99    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
100        write!(f, "DataLocale{{{self}}}")
101    }
102}
103
104impl_writeable_for_each_subtag_str_no_test!(DataLocale, selff, selff.script.is_none() && selff.region.is_none() && selff.variant.is_none() && selff.subdivision.is_none() => selff.language.write_to_string());
105
106impl From<LanguageIdentifier> for DataLocale {
107    fn from(langid: LanguageIdentifier) -> Self {
108        Self::from(&langid)
109    }
110}
111
112impl From<Locale> for DataLocale {
113    fn from(locale: Locale) -> Self {
114        Self::from(&locale)
115    }
116}
117
118impl From<&LanguageIdentifier> for DataLocale {
119    fn from(langid: &LanguageIdentifier) -> Self {
120        Self {
121            language: langid.language,
122            script: langid.script,
123            region: langid.region,
124            variant: langid.variants.iter().copied().next(),
125            subdivision: None,
126        }
127    }
128}
129
130impl From<&Locale> for DataLocale {
131    fn from(locale: &Locale) -> Self {
132        let mut r = Self::from(&locale.id);
133
134        r.subdivision = locale
135            .extensions
136            .unicode
137            .keywords
138            .get(&unicode_ext::key!("sd"))
139            .and_then(|v| v.as_single_subtag().copied());
140        r
141    }
142}
143
#[cfg(feature = "alloc")]
impl FromStr for DataLocale {
    type Err = ParseError;

    /// Parses a [`DataLocale`] from a string; equivalent to [`DataLocale::try_from_str`].
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        DataLocale::try_from_str(s)
    }
}
152
153impl DataLocale {
154    #[inline]
155    /// Parses a [`DataLocale`].
156    #[cfg(feature = "alloc")]
157    pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
158        Self::try_from_utf8(s.as_bytes())
159    }
160
161    /// Parses a [`DataLocale`] from a UTF-8 byte slice.
162    #[cfg(feature = "alloc")]
163    pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
164        let locale = Locale::try_from_utf8(code_units)?;
165        if locale.id.variants.len() > 1
166            || !locale.extensions.transform.is_empty()
167            || !locale.extensions.private.is_empty()
168            || !locale.extensions.other.is_empty()
169            || !locale.extensions.unicode.attributes.is_empty()
170        {
171            return Err(ParseError::InvalidExtension);
172        }
173
174        let unicode_extensions_count = locale.extensions.unicode.keywords.iter().count();
175
176        if unicode_extensions_count != 0
177            && (unicode_extensions_count != 1
178                || !locale
179                    .extensions
180                    .unicode
181                    .keywords
182                    .contains_key(&unicode_ext::key!("sd")))
183        {
184            return Err(ParseError::InvalidExtension);
185        }
186
187        Ok(locale.into())
188    }
189
190    pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
191    where
192        F: FnMut(&str) -> Result<(), E>,
193    {
194        f(self.language.as_str())?;
195        if let Some(ref script) = self.script {
196            f(script.as_str())?;
197        }
198        if let Some(ref region) = self.region {
199            f(region.as_str())?;
200        }
201        if let Some(ref single_variant) = self.variant {
202            f(single_variant.as_str())?;
203        }
204        if let Some(ref subdivision) = self.subdivision {
205            f("u")?;
206            f("sd")?;
207            f(subdivision.as_str())?;
208        }
209        Ok(())
210    }
211
212    fn as_tuple(
213        &self,
214    ) -> (
215        Language,
216        Option<Script>,
217        Option<Region>,
218        Option<Variant>,
219        Option<Subtag>,
220    ) {
221        (
222            self.language,
223            self.script,
224            self.region,
225            self.variant,
226            self.subdivision,
227        )
228    }
229
230    /// Returns an ordering suitable for use in [`BTreeSet`].
231    ///
232    /// [`BTreeSet`]: alloc::collections::BTreeSet
233    pub fn total_cmp(&self, other: &Self) -> Ordering {
234        self.as_tuple().cmp(&other.as_tuple())
235    }
236
237    /// Compare this [`DataLocale`] with BCP-47 bytes.
238    ///
239    /// The return value is equivalent to what would happen if you first converted this
240    /// [`DataLocale`] to a BCP-47 string and then performed a byte comparison.
241    ///
242    /// This function is case-sensitive and results in a *total order*, so it is appropriate for
243    /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
244    ///
245    /// # Examples
246    ///
247    /// ```
248    /// use core::cmp::Ordering;
249    /// use icu_provider::DataLocale;
250    ///
251    /// let bcp47_strings: &[&str] = &[
252    ///     "ca",
253    ///     "ca-ES",
254    ///     "ca-ES-u-sd-esct",
255    ///     "ca-ES-valencia",
256    ///     "cat",
257    ///     "pl-Latn-PL",
258    ///     "und",
259    ///     "und-fonipa",
260    ///     "zh",
261    /// ];
262    ///
263    /// for ab in bcp47_strings.windows(2) {
264    ///     let a = ab[0];
265    ///     let b = ab[1];
266    ///     assert_eq!(a.cmp(b), Ordering::Less, "strings: {} < {}", a, b);
267    ///     let a_loc: DataLocale = a.parse().unwrap();
268    ///     assert_eq!(
269    ///         a_loc.strict_cmp(a.as_bytes()),
270    ///         Ordering::Equal,
271    ///         "strict_cmp: {} == {}",
272    ///         a_loc,
273    ///         a
274    ///     );
275    ///     assert_eq!(
276    ///         a_loc.strict_cmp(b.as_bytes()),
277    ///         Ordering::Less,
278    ///         "strict_cmp: {} < {}",
279    ///         a_loc,
280    ///         b
281    ///     );
282    ///     let b_loc: DataLocale = b.parse().unwrap();
283    ///     assert_eq!(
284    ///         b_loc.strict_cmp(b.as_bytes()),
285    ///         Ordering::Equal,
286    ///         "strict_cmp: {} == {}",
287    ///         b_loc,
288    ///         b
289    ///     );
290    ///     assert_eq!(
291    ///         b_loc.strict_cmp(a.as_bytes()),
292    ///         Ordering::Greater,
293    ///         "strict_cmp: {} > {}",
294    ///         b_loc,
295    ///         a
296    ///     );
297    /// }
298    /// ```
299    ///
300    /// Comparison against invalid strings:
301    ///
302    /// ```
303    /// use icu_provider::DataLocale;
304    ///
305    /// let invalid_strings: &[&str] = &[
306    ///     // Less than "ca-ES"
307    ///     "CA",
308    ///     "ar-x-gbp-FOO",
309    ///     // Greater than "ca-AR"
310    ///     "ca_ES",
311    ///     "ca-ES-x-gbp-FOO",
312    /// ];
313    ///
314    /// let data_locale = "ca-ES".parse::<DataLocale>().unwrap();
315    ///
316    /// for s in invalid_strings.iter() {
317    ///     let expected_ordering = "ca-AR".cmp(s);
318    ///     let actual_ordering = data_locale.strict_cmp(s.as_bytes());
319    ///     assert_eq!(expected_ordering, actual_ordering, "{}", s);
320    /// }
321    /// ```
322    pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
323        writeable::cmp_utf8(self, other)
324    }
325
326    /// Returns whether this [`DataLocale`] is `und` in the locale and extensions portion.
327    ///
328    /// # Examples
329    ///
330    /// ```
331    /// use icu_provider::DataLocale;
332    ///
333    /// assert!("und".parse::<DataLocale>().unwrap().is_unknown());
334    /// assert!(!"de-u-sd-denw".parse::<DataLocale>().unwrap().is_unknown());
335    /// assert!(!"und-ES".parse::<DataLocale>().unwrap().is_unknown());
336    /// ```
337    pub fn is_unknown(&self) -> bool {
338        self.language.is_unknown()
339            && self.script.is_none()
340            && self.region.is_none()
341            && self.variant.is_none()
342            && self.subdivision.is_none()
343    }
344
345    /// Converts this `DataLocale` into a [`Locale`].
346    pub fn into_locale(self) -> Locale {
347        Locale {
348            id: LanguageIdentifier {
349                language: self.language,
350                script: self.script,
351                region: self.region,
352                variants: self
353                    .variant
354                    .map(crate::subtags::Variants::from_variant)
355                    .unwrap_or_default(),
356            },
357            extensions: {
358                let mut extensions = crate::extensions::Extensions::default();
359                if let Some(sd) = self.subdivision {
360                    extensions.unicode = unicode_ext::Unicode {
361                        keywords: unicode_ext::Keywords::new_single(
362                            unicode_ext::key!("sd"),
363                            unicode_ext::Value::from_subtag(Some(sd)),
364                        ),
365                        ..Default::default()
366                    }
367                }
368                extensions
369            },
370        }
371    }
372}
373
// Parsing a `DataLocale` from a string relies on the `FromStr` impl, which only exists
// with the `alloc` feature, so the test is gated on the same feature.
#[cfg(feature = "alloc")]
#[test]
fn test_data_locale_to_string() {
    /// An input string and the string the parsed locale is expected to write.
    struct TestCase {
        pub locale: &'static str,
        pub expected: &'static str,
    }

    for cas in [
        TestCase {
            locale: "und",
            expected: "und",
        },
        TestCase {
            locale: "und-u-sd-sdd",
            expected: "und-u-sd-sdd",
        },
        TestCase {
            locale: "en-ZA-u-sd-zaa",
            expected: "en-ZA-u-sd-zaa",
        },
    ] {
        let locale = cas.locale.parse::<DataLocale>().unwrap();
        writeable::assert_writeable_eq!(locale, cas.expected);
    }
}
399
// `FromStr` is only imported and implemented for `DataLocale` with the `alloc` feature,
// so the test is gated on the same feature.
#[cfg(feature = "alloc")]
#[test]
fn test_data_locale_from_string() {
    /// An input string and whether parsing it as a `DataLocale` should succeed.
    #[derive(Debug)]
    struct TestCase {
        pub input: &'static str,
        pub success: bool,
    }

    for cas in [
        TestCase {
            input: "und",
            success: true,
        },
        // Unicode keywords other than `sd` are rejected.
        TestCase {
            input: "und-u-cu-gbp",
            success: false,
        },
        TestCase {
            input: "en-ZA-u-sd-zaa",
            success: true,
        },
        // Not a well-formed locale at all.
        TestCase {
            input: "en...",
            success: false,
        },
    ] {
        let data_locale = match (DataLocale::from_str(cas.input), cas.success) {
            (Ok(l), true) => l,
            (Err(_), false) => {
                continue;
            }
            (Ok(_), false) => {
                panic!("DataLocale parsed but it was supposed to fail: {cas:?}");
            }
            (Err(_), true) => {
                panic!("DataLocale was supposed to parse but it failed: {cas:?}");
            }
        };
        // Successfully parsed locales must round-trip to the exact input string.
        writeable::assert_writeable_eq!(data_locale, cas.input);
    }
}