// icu_locale_core/parser/mod.rs

// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

5pub mod errors;
6mod langid;
7mod locale;
8
9pub use errors::ParseError;
10pub use langid::*;
11
12pub use locale::*;
13
/// Returns the leading run of bytes of `slice` that precedes the first `b'-'` separator.
///
/// If `slice` contains no separator, the whole slice is returned. The result is empty when
/// `slice` is empty or begins with a separator.
///
/// Safety-usable invariant: returns a prefix of `slice`
const fn skip_before_separator(slice: &[u8]) -> &[u8] {
    let mut end = 0;
    // Invariant: end ≤ slice.len() since len is a nonnegative integer and end is 0

    #[allow(clippy::indexing_slicing)] // very protected, should optimize out
    while end < slice.len() && !matches!(slice[end], b'-') {
        // Invariant at beginning of loop: end < slice.len()
        // Advance until we reach end of slice or a separator.
        end += 1;
        // Invariant at end of loop: end ≤ slice.len()
    }

    // Notice: the returned slice is empty when `slice` is empty or starts with a separator,
    // e.g. for what follows the separator in `"en-"` or the first separator in `"en--US"`.
    // SAFETY: end ≤ slice.len() by while loop
    // Safety-usable invariant upheld: returned a prefix of the slice
    unsafe { slice.split_at_unchecked(end).0 }
}

// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// It is quite extraordinary due to focus on performance and Rust limitations for `const`
// functions.
//
// The iterator splits its input on `b'-'` and is eager: the subtag that will be yielded next
// is computed ahead of time and stored in `current`. It is infallible and performs no
// validation itself; slices such as `"-"`, `"-en"` and `"en-"` simply produce empty subtags.
//
// The iterator provides methods available for `const` users - `next_const` and `peek` -
// as well as the standard `Iterator` API - `next`.
//
// All of these return an `Option` of a subtag (`&[u8]`); `None` means the input is exhausted.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    // Input that has not been skipped over yet; while `current` is `Some`, it starts with
    // that subtag.
    remaining: &'a [u8],
    // The subtag the next call will yield, or `None` once iteration has finished.
    // Safety invariant: current is a prefix of remaining
    current: Option<&'a [u8]>,
}

52impl<'a> SubtagIterator<'a> {
53    pub const fn new(rest: &'a [u8]) -> Self {
54        Self {
55            remaining: rest,
56            // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`
57            current: Some(skip_before_separator(rest)),
58        }
59    }
60
61    pub const fn next_const(mut self) -> (Self, Option<&'a [u8]>) {
62        let Some(result) = self.current else {
63            return (self, None);
64        };
65
66        self.current = if result.len() < self.remaining.len() {
67            // If there is more after `result`, by construction `current` starts with a separator
68            // SAFETY: `self.remaining` is strictly longer than `result` due to `result` being a prefix (from the safety invariant)
69            self.remaining = unsafe { self.remaining.split_at_unchecked(result.len() + 1).1 };
70            // Safety invariant upheld: skip_before_separator() returns a prefix of `rest`, and we don't
71            // mutate self.remaining after this
72            Some(skip_before_separator(self.remaining))
73        } else {
74            None
75        };
76        (self, Some(result))
77    }
78
79    pub const fn peek(&self) -> Option<&'a [u8]> {
80        self.current
81    }
82}
83
84impl<'a> Iterator for SubtagIterator<'a> {
85    type Item = &'a [u8];
86
87    fn next(&mut self) -> Option<Self::Item> {
88        let (s, res) = self.next_const();
89        *self = s;
90        res
91    }
92}
93
#[cfg(test)]
mod test {
    use super::*;

    // Test helper: interprets a subtag as UTF-8 so assertions can compare against `&str`.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    // `peek` must not advance the iterator and must agree with the following `next`.
    #[test]
    fn subtag_iterator_peek_test() {
        let slice = "de-at-u-ca-foobar";
        let mut si = SubtagIterator::new(slice.as_bytes());

        assert_eq!(si.peek().map(slice_to_str), Some("de"));
        assert_eq!(si.peek().map(slice_to_str), Some("de"));
        assert_eq!(si.next().map(slice_to_str), Some("de"));

        assert_eq!(si.peek().map(slice_to_str), Some("at"));
        assert_eq!(si.peek().map(slice_to_str), Some("at"));
        assert_eq!(si.next().map(slice_to_str), Some("at"));

        // Drain the rest and check that `peek` reports exhaustion.
        assert_eq!(
            si.by_ref().map(slice_to_str).collect::<Vec<_>>(),
            vec!["u", "ca", "foobar",]
        );
        assert_eq!(si.peek(), None);
        assert_eq!(si.next(), None);
    }

    // Leading, trailing and doubled separators produce empty subtags rather than errors.
    #[test]
    fn subtag_iterator_test() {
        let slice = "";
        let mut si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next(), None);

        let slice = "-";
        let mut si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next(), None);

        let slice = "-en";
        let mut si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some("en"));
        assert_eq!(si.next(), None);

        let slice = "en";
        let si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en",]);

        let slice = "en-";
        let si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.map(slice_to_str).collect::<Vec<_>>(), vec!["en", "",]);

        let slice = "--";
        let mut si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next(), None);

        let slice = "-en-";
        let mut si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some("en"));
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next(), None);

        let slice = "de-at-u-ca-foobar";
        let si = SubtagIterator::new(slice.as_bytes());
        assert_eq!(
            si.map(slice_to_str).collect::<Vec<_>>(),
            vec!["de", "at", "u", "ca", "foobar",]
        );
    }

    // `skip_before_separator` returns everything before the first `-`, possibly nothing.
    #[test]
    fn skip_before_separator_test() {
        let current = skip_before_separator(b"");
        assert_eq!(current, b"");

        let current = skip_before_separator(b"en");
        assert_eq!(current, b"en");

        let current = skip_before_separator(b"en-");
        assert_eq!(current, b"en");

        let current = skip_before_separator(b"en--US");
        assert_eq!(current, b"en");

        let current = skip_before_separator(b"-US");
        assert_eq!(current, b"");

        let current = skip_before_separator(b"US");
        assert_eq!(current, b"US");

        let current = skip_before_separator(b"-");
        assert_eq!(current, b"");
    }
}