1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
45pub mod errors;
6mod langid;
7mod locale;
89pub use errors::ParseError;
10pub use langid::*;
1112pub use locale::*;
// Returns the leading run of bytes of `slice` that precedes the first `-` separator
// (the whole slice when it contains no separator).
//
// Safety-usable invariant: the returned slice is always a prefix of `slice`
const fn skip_before_separator(slice: &[u8]) -> &[u8] {
    let len = slice.len();
    let mut idx = 0;
    // Invariant: idx ≤ len, because idx starts at 0 and len is never negative

    #[allow(clippy::indexing_slicing)] // index is checked against `len` right before use
    loop {
        if idx >= len {
            // Ran off the end without meeting a separator; idx == len here.
            break;
        }
        // Here idx < len, so the indexing below cannot go out of bounds.
        if slice[idx] == b'-' {
            break;
        }
        idx += 1;
        // idx ≤ len still holds, since idx < len held before the increment
    }

    // Notice: the prefix may be empty for inputs like `"-US"`, or for the remainder of
    // `"en-"` / `"en--US"` after the first subtag has been consumed
    // SAFETY: idx ≤ len is maintained by the loop above
    // Safety-usable invariant upheld: the first half of a split is a prefix of the slice
    unsafe { slice.split_at_unchecked(idx).0 }
}
/// `SubtagIterator` is a helper iterator for `LanguageIdentifier` and `Locale` parsing.
///
/// It walks a byte slice and yields the subtags found between `-` separators.
///
/// It is quite extraordinary due to focus on performance and Rust limitations for `const`
/// functions.
///
/// The iterator is eager: the upcoming subtag is located ahead of time and kept in `current`,
/// so peeking is a plain field read. The iterator itself never fails; malformed slices such as
/// `"-"`, `"-en"` or `"en-"` are not rejected here but show up as empty subtags.
///
/// The iterator provides methods available for `const` users - `next_const` and `peek` -
/// as well as the regular `Iterator` API (`next`) for runtime users.
///
/// Subtags are reported as `Option<&[u8]>`, where `None` means the input is exhausted.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    // Input that has not been consumed yet; it starts with `current` whenever `current` is `Some`.
    remaining: &'a [u8],
    // The subtag that will be handed out next, or `None` once the input is exhausted.
    // Safety invariant: current is a prefix of remaining
    current: Option<&'a [u8]>,
}
5152impl<'a> SubtagIterator<'a> {
53pub const fn new(rest: &'a [u8]) -> Self {
54Self {
55 remaining: rest,
56// Safety invariant upheld: skip_before_separator() returns a prefix of `rest`
57current: Some(skip_before_separator(rest)),
58 }
59 }
6061pub const fn next_const(mut self) -> (Self, Option<&'a [u8]>) {
62let Some(result) = self.current else {
63return (self, None);
64 };
6566self.current = if result.len() < self.remaining.len() {
67// If there is more after `result`, by construction `current` starts with a separator
68 // SAFETY: `self.remaining` is strictly longer than `result` due to `result` being a prefix (from the safety invariant)
69self.remaining = unsafe { self.remaining.split_at_unchecked(result.len() + 1).1 };
70// Safety invariant upheld: skip_before_separator() returns a prefix of `rest`, and we don't
71 // mutate self.remaining after this
72Some(skip_before_separator(self.remaining))
73 } else {
74None
75};
76 (self, Some(result))
77 }
7879pub const fn peek(&self) -> Option<&'a [u8]> {
80self.current
81 }
82}
8384impl<'a> Iterator for SubtagIterator<'a> {
85type Item = &'a [u8];
8687fn next(&mut self) -> Option<Self::Item> {
88let (s, res) = self.next_const();
89*self = s;
90 res
91 }
92}
#[cfg(test)]
mod test {
    use super::*;

    /// Test helper: views a subtag as `&str`, panicking if it is not valid UTF-8.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    /// `peek` must be repeatable without advancing, must agree with the following `next`,
    /// and must report `None` once the input is exhausted.
    #[test]
    fn subtag_iterator_peek_test() {
        let slice = "de-at-u-ca-foobar";
        let mut si = SubtagIterator::new(slice.as_bytes());

        for &expected in &["de", "at", "u", "ca", "foobar"] {
            assert_eq!(si.peek().map(slice_to_str), Some(expected));
            assert_eq!(si.peek().map(slice_to_str), Some(expected));
            assert_eq!(si.next().map(slice_to_str), Some(expected));
        }

        assert_eq!(si.peek(), None);
        assert_eq!(si.next(), None);
    }

    /// Checks the complete sequence of subtags (including termination) for every input,
    /// in particular the empty subtags produced by leading, trailing and doubled separators.
    #[test]
    fn subtag_iterator_test() {
        const CASES: &[(&str, &[&str])] = &[
            ("", &[""]),
            ("-", &["", ""]),
            ("-en", &["", "en"]),
            ("en", &["en"]),
            ("en-", &["en", ""]),
            ("--", &["", "", ""]),
            ("-en-", &["", "en", ""]),
            ("de-at-u-ca-foobar", &["de", "at", "u", "ca", "foobar"]),
        ];

        for &(input, expected) in CASES {
            let mut si = SubtagIterator::new(input.as_bytes());
            let subtags: Vec<&str> = si.by_ref().map(slice_to_str).collect();
            assert_eq!(subtags, expected, "input: {:?}", input);
            // Once exhausted, the iterator must keep returning `None`.
            assert_eq!(si.next(), None, "input: {:?}", input);
        }
    }

    /// `new`, `next_const` and `peek` must be usable during constant evaluation.
    #[test]
    fn subtag_iterator_next_const_test() {
        const PEEKED: Option<&[u8]> = SubtagIterator::new(b"en-US").peek();
        const FIRST: Option<&[u8]> = SubtagIterator::new(b"en-US").next_const().1;
        const SECOND: Option<&[u8]> = SubtagIterator::new(b"en-US")
            .next_const()
            .0
            .next_const()
            .1;

        assert_eq!(PEEKED, Some(&b"en"[..]));
        assert_eq!(FIRST, Some(&b"en"[..]));
        assert_eq!(SECOND, Some(&b"US"[..]));
    }

    /// `skip_before_separator` returns everything in front of the first `-`.
    #[test]
    fn skip_before_separator_test() {
        const CASES: &[(&str, &str)] = &[
            ("", ""),
            ("en", "en"),
            ("en-", "en"),
            ("en--US", "en"),
            ("-US", ""),
            ("US", "US"),
            ("-", ""),
        ];

        for &(input, expected) in CASES {
            let current = skip_before_separator(input.as_bytes());
            assert_eq!(current, expected.as_bytes(), "input: {:?}", input);
        }
    }
}