icu_collections/
iterator_utils.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use crate::codepointtrie::CodePointMapRange;
6
7/// This is an iterator that coalesces adjacent ranges in an iterator over code
8/// point ranges
9pub(crate) struct RangeListIteratorCoalescer<I, T> {
10    iter: I,
11    peek: Option<CodePointMapRange<T>>,
12}
13
14impl<I, T: Eq> RangeListIteratorCoalescer<I, T>
15where
16    I: Iterator<Item = CodePointMapRange<T>>,
17{
18    pub fn new(iter: I) -> Self {
19        Self { iter, peek: None }
20    }
21}
22
23impl<I, T: Eq> Iterator for RangeListIteratorCoalescer<I, T>
24where
25    I: Iterator<Item = CodePointMapRange<T>>,
26{
27    type Item = CodePointMapRange<T>;
28
29    fn next(&mut self) -> Option<Self::Item> {
30        // Get the initial range we're working with: either a leftover
31        // range from last time, or the next range
32        let mut ret = if let Some(peek) = self.peek.take() {
33            peek
34        } else if let Some(next) = self.iter.next() {
35            next
36        } else {
37            // No ranges, exit early
38            return None;
39        };
40
41        // Keep pulling ranges
42        #[allow(clippy::while_let_on_iterator)]
43        // can't move the iterator, also we want it to be explicit that we're not draining the iterator
44        while let Some(next) = self.iter.next() {
45            if *next.range.start() == ret.range.end() + 1 && next.value == ret.value {
46                // Range has no gap, coalesce
47                ret.range = *ret.range.start()..=*next.range.end();
48            } else {
49                // Range has a gap, return what we have so far, update
50                // peek
51                self.peek = Some(next);
52                return Some(ret);
53            }
54        }
55
56        // Ran out of elements, exit
57        Some(ret)
58    }
59}
60
61#[cfg(test)]
62mod tests {
63    use core::fmt::Debug;
64    use icu::collections::codepointinvlist::CodePointInversionListBuilder;
65    use icu::properties::props::{BinaryProperty, EnumeratedProperty};
66    use icu::properties::{CodePointMapData, CodePointSetData};
67
68    fn test_set<P: BinaryProperty>(name: &str) {
69        let mut builder = CodePointInversionListBuilder::new();
70        let mut builder_complement = CodePointInversionListBuilder::new();
71
72        for range in CodePointSetData::new::<P>().iter_ranges() {
73            builder.add_range32(range)
74        }
75
76        for range in CodePointSetData::new::<P>().iter_ranges_complemented() {
77            builder_complement.add_range32(range)
78        }
79
80        builder.complement();
81        let set1 = builder.build();
82        let set2 = builder_complement.build();
83        assert_eq!(set1, set2, "Set {name} failed to complement correctly");
84    }
85
86    fn test_map<T: EnumeratedProperty + Debug>(value: T, name: &str) {
87        let mut builder = CodePointInversionListBuilder::new();
88        let mut builder_complement = CodePointInversionListBuilder::new();
89
90        for range in CodePointMapData::<T>::new().iter_ranges_for_value(value) {
91            builder.add_range32(range)
92        }
93
94        for range in CodePointMapData::<T>::new().iter_ranges_for_value_complemented(value) {
95            builder_complement.add_range32(range)
96        }
97
98        builder.complement();
99        let set1 = builder.build();
100        let set2 = builder_complement.build();
101        assert_eq!(
102            set1, set2,
103            "Map {name} failed to complement correctly with value {value:?}"
104        );
105    }
106
107    #[test]
108    fn test_complement_sets() {
109        use icu::properties::props::*;
110        // Stress test the RangeListIteratorComplementer logic by ensuring it works for
111        // a whole bunch of binary properties
112        test_set::<AsciiHexDigit>("ASCII_Hex_Digit");
113        test_set::<Alnum>("Alnum");
114        test_set::<Alphabetic>("Alphabetic");
115        test_set::<BidiControl>("Bidi_Control");
116        test_set::<BidiMirrored>("Bidi_Mirrored");
117        test_set::<Blank>("Blank");
118        test_set::<Cased>("Cased");
119        test_set::<CaseIgnorable>("Case_Ignorable");
120        test_set::<FullCompositionExclusion>("Full_Composition_Exclusion");
121        test_set::<ChangesWhenCasefolded>("Changes_When_Casefolded");
122        test_set::<ChangesWhenCasemapped>("Changes_When_Casemapped");
123        test_set::<ChangesWhenNfkcCasefolded>("Changes_When_NFKC_Casefolded");
124        test_set::<ChangesWhenLowercased>("Changes_When_Lowercased");
125        test_set::<ChangesWhenTitlecased>("Changes_When_Titlecased");
126        test_set::<ChangesWhenUppercased>("Changes_When_Uppercased");
127        test_set::<Dash>("Dash");
128        test_set::<Deprecated>("Deprecated");
129        test_set::<DefaultIgnorableCodePoint>("Default_Ignorable_Code_Point");
130        test_set::<Diacritic>("Diacritic");
131        test_set::<EmojiModifierBase>("Emoji_Modifier_Base");
132        test_set::<EmojiComponent>("Emoji_Component");
133        test_set::<EmojiModifier>("Emoji_Modifier");
134        test_set::<Emoji>("Emoji");
135        test_set::<EmojiPresentation>("Emoji_Presentation");
136        test_set::<Extender>("Extender");
137        test_set::<ExtendedPictographic>("Extended_Pictographic");
138        test_set::<Graph>("Graph");
139        test_set::<GraphemeBase>("Grapheme_Base");
140        test_set::<GraphemeExtend>("Grapheme_Extend");
141        test_set::<GraphemeLink>("Grapheme_Link");
142        test_set::<HexDigit>("Hex_Digit");
143        test_set::<Hyphen>("Hyphen");
144        test_set::<IdContinue>("Id_Continue");
145        test_set::<Ideographic>("Ideographic");
146        test_set::<IdStart>("Id_Start");
147        test_set::<IdsBinaryOperator>("Ids_Binary_Operator");
148        test_set::<IdsTrinaryOperator>("Ids_Trinary_Operator");
149        test_set::<JoinControl>("Join_Control");
150        test_set::<LogicalOrderException>("Logical_Order_Exception");
151        test_set::<Lowercase>("Lowercase");
152        test_set::<Math>("Math");
153        test_set::<NoncharacterCodePoint>("Noncharacter_Code_Point");
154        test_set::<NfcInert>("NFC_Inert");
155        test_set::<NfdInert>("NFD_Inert");
156        test_set::<NfkcInert>("NFKC_Inert");
157        test_set::<NfkdInert>("NFKD_Inert");
158        test_set::<PatternSyntax>("Pattern_Syntax");
159        test_set::<PatternWhiteSpace>("Pattern_White_Space");
160        test_set::<PrependedConcatenationMark>("Prepended_Concatenation_Mark");
161        test_set::<Print>("Print");
162        test_set::<QuotationMark>("Quotation_Mark");
163        test_set::<Radical>("Radical");
164        test_set::<RegionalIndicator>("Regional_Indicator");
165        test_set::<SoftDotted>("Soft_Dotted");
166        test_set::<SegmentStarter>("Segment_Starter");
167        test_set::<CaseSensitive>("Case_Sensitive");
168        test_set::<SentenceTerminal>("Sentence_Terminal");
169        test_set::<TerminalPunctuation>("Terminal_Punctuation");
170        test_set::<UnifiedIdeograph>("Unified_Ideograph");
171        test_set::<Uppercase>("Uppercase");
172        test_set::<VariationSelector>("Variation_Selector");
173        test_set::<WhiteSpace>("White_Space");
174        test_set::<Xdigit>("Xdigit");
175        test_set::<XidContinue>("XID_Continue");
176        test_set::<XidStart>("XID_Start");
177    }
178
179    #[test]
180    fn test_complement_maps() {
181        use icu::properties::props::{GeneralCategory, Script};
182        test_map(GeneralCategory::UppercaseLetter, "gc");
183        test_map(GeneralCategory::OtherPunctuation, "gc");
184        test_map(Script::Devanagari, "script");
185        test_map(Script::Latin, "script");
186        test_map(Script::Common, "script");
187    }
188}