icu_locale_core/data.rs
1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::extensions::unicode as unicode_ext;
6use crate::subtags::{Language, Region, Script, Subtag, Variant};
7#[cfg(feature = "alloc")]
8use crate::ParseError;
9use crate::{LanguageIdentifier, Locale};
10use core::cmp::Ordering;
11use core::default::Default;
12use core::fmt;
13use core::hash::Hash;
14#[cfg(feature = "alloc")]
15use core::str::FromStr;
16
17/// A locale type optimized for use in fallbacking and the ICU4X data pipeline.
18///
19/// [`DataLocale`] contains less functionality than [`Locale`] but more than
20/// [`LanguageIdentifier`] for better size and performance while still meeting
21/// the needs of the ICU4X data pipeline.
22///
23/// You can create a [`DataLocale`] from a borrowed [`Locale`], which is more
24/// efficient than cloning the [`Locale`], but less efficient than converting an owned
25/// [`Locale`]:
26///
27/// ```
28/// use icu_locale_core::locale;
29/// use icu_provider::DataLocale;
30///
31/// let locale1 = locale!("en-u-ca-buddhist");
32/// let data_locale = DataLocale::from(&locale1);
33/// ```
34///
35/// [`DataLocale`] only supports `-u-sd` keywords, to reflect the current state of CLDR data
36/// lookup and fallback. This may change in the future.
37///
38/// ```
39/// use icu_locale_core::{locale, Locale};
40/// use icu_provider::DataLocale;
41///
42/// let locale = "hi-IN-t-en-h0-hybrid-u-attr-ca-buddhist-sd-inas"
43/// .parse::<Locale>()
44/// .unwrap();
45///
46/// assert_eq!(
47/// DataLocale::from(locale),
48/// DataLocale::from(locale!("hi-IN-u-sd-inas"))
49/// );
50/// ```
51#[derive(Clone, Copy, PartialEq, Hash, Eq)]
52#[non_exhaustive]
53pub struct DataLocale {
54 /// Language subtag
55 pub language: Language,
56 /// Script subtag
57 pub script: Option<Script>,
58 /// Region subtag
59 pub region: Option<Region>,
60 /// Variant subtag
61 pub variant: Option<Variant>,
62 /// Subivision (-u-sd-) subtag
63 pub subdivision: Option<Subtag>,
64}
65
66impl Default for DataLocale {
67 fn default() -> Self {
68 Self {
69 language: Language::UNKNOWN,
70 script: None,
71 region: None,
72 variant: None,
73 subdivision: None,
74 }
75 }
76}
77
78impl DataLocale {
79 /// `const` version of `Default::default`
80 pub const fn default() -> Self {
81 DataLocale {
82 language: Language::UNKNOWN,
83 script: None,
84 region: None,
85 variant: None,
86 subdivision: None,
87 }
88 }
89}
90
91impl Default for &DataLocale {
92 fn default() -> Self {
93 static DEFAULT: DataLocale = DataLocale::default();
94 &DEFAULT
95 }
96}
97
98impl fmt::Debug for DataLocale {
99 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
100 write!(f, "DataLocale{{{self}}}")
101 }
102}
103
104impl_writeable_for_each_subtag_str_no_test!(DataLocale, selff, selff.script.is_none() && selff.region.is_none() && selff.variant.is_none() && selff.subdivision.is_none() => selff.language.write_to_string());
105
106impl From<LanguageIdentifier> for DataLocale {
107 fn from(langid: LanguageIdentifier) -> Self {
108 Self::from(&langid)
109 }
110}
111
112impl From<Locale> for DataLocale {
113 fn from(locale: Locale) -> Self {
114 Self::from(&locale)
115 }
116}
117
118impl From<&LanguageIdentifier> for DataLocale {
119 fn from(langid: &LanguageIdentifier) -> Self {
120 Self {
121 language: langid.language,
122 script: langid.script,
123 region: langid.region,
124 variant: langid.variants.iter().copied().next(),
125 subdivision: None,
126 }
127 }
128}
129
130impl From<&Locale> for DataLocale {
131 fn from(locale: &Locale) -> Self {
132 let mut r = Self::from(&locale.id);
133
134 r.subdivision = locale
135 .extensions
136 .unicode
137 .keywords
138 .get(&unicode_ext::key!("sd"))
139 .and_then(|v| v.as_single_subtag().copied());
140 r
141 }
142}
143
#[cfg(feature = "alloc")]
impl FromStr for DataLocale {
    type Err = ParseError;

    /// Parses a [`DataLocale`] from a string; see [`DataLocale::try_from_str`].
    #[inline]
    fn from_str(s: &str) -> Result<Self, ParseError> {
        DataLocale::try_from_str(s)
    }
}
152
153impl DataLocale {
/// Parses a [`DataLocale`] from a string slice.
///
/// This forwards the string's bytes to [`DataLocale::try_from_utf8`] and returns its result.
#[cfg(feature = "alloc")]
#[inline]
pub fn try_from_str(s: &str) -> Result<Self, ParseError> {
    let code_units = s.as_bytes();
    Self::try_from_utf8(code_units)
}
160
/// Parses a [`DataLocale`] from a UTF-8 byte slice.
///
/// The input is first parsed as a full [`Locale`] and then checked to make sure that
/// converting it to a [`DataLocale`] would not lose any information.
///
/// # Errors
///
/// Propagates the error of [`Locale::try_from_utf8`] if the input is not a valid locale.
///
/// Returns [`ParseError::InvalidExtension`] if the locale contains anything that a
/// [`DataLocale`] cannot hold:
///
/// - more than one variant,
/// - a transform, private-use or other extension,
/// - Unicode extension attributes,
/// - any Unicode extension keyword other than `sd`,
/// - an `sd` keyword whose value the conversion cannot store as the subdivision subtag.
#[cfg(feature = "alloc")]
pub fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
    let locale = Locale::try_from_utf8(code_units)?;
    let extensions = &locale.extensions;

    if locale.id.variants.len() > 1
        || !extensions.transform.is_empty()
        || !extensions.private.is_empty()
        || !extensions.other.is_empty()
        || !extensions.unicode.attributes.is_empty()
    {
        return Err(ParseError::InvalidExtension);
    }

    // The only keyword set we accept is either empty or exactly `{sd}`.
    let keywords = &extensions.unicode.keywords;
    let keyword_count = keywords.iter().count();
    let has_only_sd_keyword =
        keyword_count == 1 && keywords.contains_key(&unicode_ext::key!("sd"));
    if keyword_count != 0 && !has_only_sd_keyword {
        return Err(ParseError::InvalidExtension);
    }

    let data_locale = Self::from(&locale);

    // The conversion leaves `subdivision` unset when the `sd` value cannot be read as a
    // single subtag. Accepting such input would silently drop the keyword, so reject it
    // like every other piece of information a `DataLocale` cannot represent.
    if has_only_sd_keyword && data_locale.subdivision.is_none() {
        return Err(ParseError::InvalidExtension);
    }

    Ok(data_locale)
}
189
190 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
191 where
192 F: FnMut(&str) -> Result<(), E>,
193 {
194 f(self.language.as_str())?;
195 if let Some(ref script) = self.script {
196 f(script.as_str())?;
197 }
198 if let Some(ref region) = self.region {
199 f(region.as_str())?;
200 }
201 if let Some(ref single_variant) = self.variant {
202 f(single_variant.as_str())?;
203 }
204 if let Some(ref subdivision) = self.subdivision {
205 f("u")?;
206 f("sd")?;
207 f(subdivision.as_str())?;
208 }
209 Ok(())
210 }
211
212 fn as_tuple(
213 &self,
214 ) -> (
215 Language,
216 Option<Script>,
217 Option<Region>,
218 Option<Variant>,
219 Option<Subtag>,
220 ) {
221 (
222 self.language,
223 self.script,
224 self.region,
225 self.variant,
226 self.subdivision,
227 )
228 }
229
230 /// Returns an ordering suitable for use in [`BTreeSet`].
231 ///
232 /// [`BTreeSet`]: alloc::collections::BTreeSet
233 pub fn total_cmp(&self, other: &Self) -> Ordering {
234 self.as_tuple().cmp(&other.as_tuple())
235 }
236
237 /// Compare this [`DataLocale`] with BCP-47 bytes.
238 ///
239 /// The return value is equivalent to what would happen if you first converted this
240 /// [`DataLocale`] to a BCP-47 string and then performed a byte comparison.
241 ///
242 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
243 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
244 ///
245 /// # Examples
246 ///
247 /// ```
248 /// use core::cmp::Ordering;
249 /// use icu_provider::DataLocale;
250 ///
251 /// let bcp47_strings: &[&str] = &[
252 /// "ca",
253 /// "ca-ES",
254 /// "ca-ES-u-sd-esct",
255 /// "ca-ES-valencia",
256 /// "cat",
257 /// "pl-Latn-PL",
258 /// "und",
259 /// "und-fonipa",
260 /// "zh",
261 /// ];
262 ///
263 /// for ab in bcp47_strings.windows(2) {
264 /// let a = ab[0];
265 /// let b = ab[1];
266 /// assert_eq!(a.cmp(b), Ordering::Less, "strings: {} < {}", a, b);
267 /// let a_loc: DataLocale = a.parse().unwrap();
268 /// assert_eq!(
269 /// a_loc.strict_cmp(a.as_bytes()),
270 /// Ordering::Equal,
271 /// "strict_cmp: {} == {}",
272 /// a_loc,
273 /// a
274 /// );
275 /// assert_eq!(
276 /// a_loc.strict_cmp(b.as_bytes()),
277 /// Ordering::Less,
278 /// "strict_cmp: {} < {}",
279 /// a_loc,
280 /// b
281 /// );
282 /// let b_loc: DataLocale = b.parse().unwrap();
283 /// assert_eq!(
284 /// b_loc.strict_cmp(b.as_bytes()),
285 /// Ordering::Equal,
286 /// "strict_cmp: {} == {}",
287 /// b_loc,
288 /// b
289 /// );
290 /// assert_eq!(
291 /// b_loc.strict_cmp(a.as_bytes()),
292 /// Ordering::Greater,
293 /// "strict_cmp: {} > {}",
294 /// b_loc,
295 /// a
296 /// );
297 /// }
298 /// ```
299 ///
300 /// Comparison against invalid strings:
301 ///
302 /// ```
303 /// use icu_provider::DataLocale;
304 ///
305 /// let invalid_strings: &[&str] = &[
306 /// // Less than "ca-ES"
307 /// "CA",
308 /// "ar-x-gbp-FOO",
309 /// // Greater than "ca-AR"
310 /// "ca_ES",
311 /// "ca-ES-x-gbp-FOO",
312 /// ];
313 ///
314 /// let data_locale = "ca-ES".parse::<DataLocale>().unwrap();
315 ///
316 /// for s in invalid_strings.iter() {
317 /// let expected_ordering = "ca-AR".cmp(s);
318 /// let actual_ordering = data_locale.strict_cmp(s.as_bytes());
319 /// assert_eq!(expected_ordering, actual_ordering, "{}", s);
320 /// }
321 /// ```
322 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
323 writeable::cmp_utf8(self, other)
324 }
325
326 /// Returns whether this [`DataLocale`] is `und` in the locale and extensions portion.
327 ///
328 /// # Examples
329 ///
330 /// ```
331 /// use icu_provider::DataLocale;
332 ///
333 /// assert!("und".parse::<DataLocale>().unwrap().is_unknown());
334 /// assert!(!"de-u-sd-denw".parse::<DataLocale>().unwrap().is_unknown());
335 /// assert!(!"und-ES".parse::<DataLocale>().unwrap().is_unknown());
336 /// ```
337 pub fn is_unknown(&self) -> bool {
338 self.language.is_unknown()
339 && self.script.is_none()
340 && self.region.is_none()
341 && self.variant.is_none()
342 && self.subdivision.is_none()
343 }
344
345 /// Converts this `DataLocale` into a [`Locale`].
346 pub fn into_locale(self) -> Locale {
347 Locale {
348 id: LanguageIdentifier {
349 language: self.language,
350 script: self.script,
351 region: self.region,
352 variants: self
353 .variant
354 .map(crate::subtags::Variants::from_variant)
355 .unwrap_or_default(),
356 },
357 extensions: {
358 let mut extensions = crate::extensions::Extensions::default();
359 if let Some(sd) = self.subdivision {
360 extensions.unicode = unicode_ext::Unicode {
361 keywords: unicode_ext::Keywords::new_single(
362 unicode_ext::key!("sd"),
363 unicode_ext::Value::from_subtag(Some(sd)),
364 ),
365 ..Default::default()
366 }
367 }
368 extensions
369 },
370 }
371 }
372}
373
/// Checks that a parsed [`DataLocale`] is written back out as the expected BCP-47 string.
// Parsing goes through the `FromStr` impl, which only exists with the `alloc` feature,
// so the test is gated on it to keep alloc-less test builds compiling.
#[cfg(feature = "alloc")]
#[test]
fn test_data_locale_to_string() {
    // (input locale, expected written form)
    const CASES: [(&str, &str); 3] = [
        ("und", "und"),
        ("und-u-sd-sdd", "und-u-sd-sdd"),
        ("en-ZA-u-sd-zaa", "en-ZA-u-sd-zaa"),
    ];

    for (input, expected) in CASES {
        let locale = input.parse::<DataLocale>().unwrap();
        writeable::assert_writeable_eq!(locale, expected);
    }
}
399
/// Checks which inputs [`DataLocale`] accepts, and that accepted inputs round-trip.
// `FromStr` (both the import and the impl) is only available with the `alloc` feature,
// so the test is gated on it to keep alloc-less test builds compiling.
#[cfg(feature = "alloc")]
#[test]
fn test_data_locale_from_string() {
    #[derive(Debug)]
    struct TestCase {
        pub input: &'static str,
        pub success: bool,
    }

    let cases = [
        TestCase {
            input: "und",
            success: true,
        },
        // Keywords other than `sd` are not supported.
        TestCase {
            input: "und-u-cu-gbp",
            success: false,
        },
        TestCase {
            input: "en-ZA-u-sd-zaa",
            success: true,
        },
        // Not a well-formed locale at all.
        TestCase {
            input: "en...",
            success: false,
        },
    ];

    for cas in cases {
        match DataLocale::from_str(cas.input) {
            Ok(data_locale) => {
                assert!(
                    cas.success,
                    "DataLocale parsed but it was supposed to fail: {cas:?}"
                );
                writeable::assert_writeable_eq!(data_locale, cas.input);
            }
            Err(_) => {
                assert!(
                    !cas.success,
                    "DataLocale was supposed to parse but it failed: {cas:?}"
                );
            }
        }
    }
}
440}