zerotrie/builder/konst/
builder.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5use super::super::branch_meta::BranchMeta;
6use super::super::bytestr::ByteStr;
7use super::store::const_for_each;
8use super::store::ConstArrayBuilder;
9use super::store::ConstLengthsStack;
10use super::store::ConstSlice;
11use crate::error::ZeroTrieBuildError;
12use crate::varint;
13
14/// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts.
15pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
16    data: ConstArrayBuilder<N, u8>,
17}
18
19impl<const N: usize> ZeroTrieBuilderConst<N> {
20    /// Non-const function that returns the current trie data as a slice.
21    #[cfg(feature = "litemap")]
22    pub fn as_bytes(&self) -> &[u8] {
23        self.data.as_const_slice().as_slice()
24    }
25
26    /// Returns the trie data, panicking if the buffer is the wrong size.
27    pub const fn build_or_panic(self) -> [u8; N] {
28        self.data.const_build_or_panic()
29    }
30
31    /// Creates a new empty builder.
32    pub const fn new() -> Self {
33        Self {
34            data: ConstArrayBuilder::new_empty([0; N], N),
35        }
36    }
37
38    /// Prepends an ASCII node to the front of the builder. Returns the new builder
39    /// and the delta in length, which is always 1.
40    #[must_use]
41    const fn prepend_ascii(self, ascii: u8) -> (Self, usize) {
42        if ascii >= 128 {
43            panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
44        }
45        let data = self.data.const_push_front_or_panic(ascii);
46        (Self { data }, 1)
47    }
48
49    /// Prepends a value node to the front of the builder. Returns the new builder
50    /// and the delta in length, which depends on the size of the varint.
51    #[must_use]
52    const fn prepend_value(self, value: usize) -> (Self, usize) {
53        let mut data = self.data;
54        let varint_array = varint::write_varint_meta3(value);
55        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
56        data = data.const_bitor_assign(0, 0b10000000);
57        (Self { data }, varint_array.len())
58    }
59
60    /// Prepends a branch node to the front of the builder. Returns the new builder
61    /// and the delta in length, which depends on the size of the varint.
62    #[must_use]
63    const fn prepend_branch(self, value: usize) -> (Self, usize) {
64        let mut data = self.data;
65        let varint_array = varint::write_varint_meta2(value);
66        data = data.const_extend_front_or_panic(varint_array.as_const_slice());
67        data = data.const_bitor_assign(0, 0b11000000);
68        (Self { data }, varint_array.len())
69    }
70
71    /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
72    /// and the delta in length, which is the length of the slice.
73    #[must_use]
74    const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) {
75        let mut data = self.data;
76        let mut i = s.len();
77        while i > 0 {
78            data = data.const_push_front_or_panic(*s.get_or_panic(i - 1));
79            i -= 1;
80        }
81        (Self { data }, s.len())
82    }
83
84    /// Prepends multiple zeros to the front of the builder. Returns the new builder.
85    #[must_use]
86    const fn prepend_n_zeros(self, n: usize) -> Self {
87        let mut data = self.data;
88        let mut i = 0;
89        while i < n {
90            data = data.const_push_front_or_panic(0);
91            i += 1;
92        }
93        Self { data }
94    }
95
96    /// Performs the operation `self[index] |= bits`
97    const fn bitor_assign_at(self, index: usize, bits: u8) -> Self {
98        let mut data = self.data;
99        data = data.const_bitor_assign(index, bits);
100        Self { data }
101    }
102
103    /// Creates a new builder containing the elements in the given slice of key/value pairs.
104    ///
105    /// `K` is the stack size of the lengths stack. If you get an error such as
106    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
107    ///
108    /// # Panics
109    ///
110    /// Panics if the items are not sorted
111    pub const fn from_tuple_slice<'a, const K: usize>(
112        items: &[(&'a ByteStr, usize)],
113    ) -> Result<Self, ZeroTrieBuildError> {
114        let items = ConstSlice::from_slice(items);
115        let mut prev: Option<&'a ByteStr> = None;
116        const_for_each!(items, (ascii_str, _), {
117            match prev {
118                None => (),
119                Some(prev) => {
120                    if !prev.is_less_then(ascii_str) {
121                        panic!("Strings in ByteStr constructor are not sorted");
122                    }
123                }
124            };
125            prev = Some(ascii_str)
126        });
127        Self::from_sorted_const_tuple_slice::<K>(items)
128    }
129
130    /// Creates a new builder containing the elements in the given slice of key/value pairs.
131    ///
132    /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
133    ///
134    /// `K` is the stack size of the lengths stack. If you get an error such as
135    /// "AsciiTrie Builder: Need more stack", try increasing `K`.
136    pub const fn from_sorted_const_tuple_slice<const K: usize>(
137        items: ConstSlice<(&ByteStr, usize)>,
138    ) -> Result<Self, ZeroTrieBuildError> {
139        let mut result = Self::new();
140        let total_size;
141        (result, total_size) = result.create_or_panic::<K>(items);
142        debug_assert!(total_size == result.data.len());
143        Ok(result)
144    }
145
146    /// The actual builder algorithm. For an explanation, see [`crate::builder`].
147    #[must_use]
148    const fn create_or_panic<const K: usize>(
149        mut self,
150        all_items: ConstSlice<(&ByteStr, usize)>,
151    ) -> (Self, usize) {
152        let mut prefix_len = match all_items.last() {
153            Some(x) => x.0.len(),
154            // Empty slice:
155            None => return (Self::new(), 0),
156        };
157        // Initialize the main loop to point at the last string.
158        let mut lengths_stack = ConstLengthsStack::<K>::new();
159        let mut i = all_items.len() - 1;
160        let mut j = all_items.len();
161        let mut current_len = 0;
162        // Start the main loop.
163        loop {
164            let item_i = all_items.get_or_panic(i);
165            let item_j = all_items.get_or_panic(j - 1);
166            debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
167            // Check if we need to add a value node here.
168            if item_i.0.len() == prefix_len {
169                let len;
170                (self, len) = self.prepend_value(item_i.1);
171                current_len += len;
172            }
173            if prefix_len == 0 {
174                // All done! Leave the main loop.
175                break;
176            }
177            // Reduce the prefix length by 1 and recalculate i and j.
178            prefix_len -= 1;
179            let mut new_i = i;
180            let mut new_j = j;
181            let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
182            let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
183            debug_assert!(ascii_i == ascii_j);
184            let key_ascii = ascii_i;
185            loop {
186                if new_i == 0 {
187                    break;
188                }
189                let candidate = all_items.get_or_panic(new_i - 1).0;
190                if candidate.len() < prefix_len {
191                    // Too short
192                    break;
193                }
194                if item_i.0.prefix_eq(candidate, prefix_len) {
195                    new_i -= 1;
196                } else {
197                    break;
198                }
199                if candidate.len() == prefix_len {
200                    // A string that equals the prefix does not take part in the branch node.
201                    break;
202                }
203                let candidate = candidate.byte_at_or_panic(prefix_len);
204                if candidate != ascii_i {
205                    ascii_i = candidate;
206                }
207            }
208            loop {
209                if new_j == all_items.len() {
210                    break;
211                }
212                let candidate = all_items.get_or_panic(new_j).0;
213                if candidate.len() < prefix_len {
214                    // Too short
215                    break;
216                }
217                if item_j.0.prefix_eq(candidate, prefix_len) {
218                    new_j += 1;
219                } else {
220                    break;
221                }
222                if candidate.len() == prefix_len {
223                    panic!("A shorter string should be earlier in the sequence");
224                }
225                let candidate = candidate.byte_at_or_panic(prefix_len);
226                if candidate != ascii_j {
227                    ascii_j = candidate;
228                }
229            }
230            // If there are no different bytes at this prefix level, we can add an ASCII or Span
231            // node and then continue to the next iteration of the main loop.
232            if ascii_i == key_ascii && ascii_j == key_ascii {
233                let len;
234                (self, len) = self.prepend_ascii(ascii_i);
235                current_len += len;
236                debug_assert!(i == new_i || i == new_i + 1);
237                i = new_i;
238                debug_assert!(j == new_j);
239                continue;
240            }
241            // If i and j changed, we are a target of a branch node.
242            if ascii_j == key_ascii {
243                // We are the _last_ target of a branch node.
244                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
245                    ascii: key_ascii,
246                    cumulative_length: current_len,
247                    local_length: current_len,
248                    count: 1,
249                });
250            } else {
251                // We are the _not the last_ target of a branch node.
252                let BranchMeta {
253                    cumulative_length,
254                    count,
255                    ..
256                } = lengths_stack.peek_or_panic();
257                lengths_stack = lengths_stack.push_or_panic(BranchMeta {
258                    ascii: key_ascii,
259                    cumulative_length: cumulative_length + current_len,
260                    local_length: current_len,
261                    count: count + 1,
262                });
263            }
264            if ascii_i != key_ascii {
265                // We are _not the first_ target of a branch node.
266                // Set the cursor to the previous string and continue the loop.
267                j = i;
268                i -= 1;
269                prefix_len = all_items.get_or_panic(i).0.len();
270                current_len = 0;
271                continue;
272            }
273            // Branch (first)
274            let (total_length, total_count) = {
275                let BranchMeta {
276                    cumulative_length,
277                    count,
278                    ..
279                } = lengths_stack.peek_or_panic();
280                (cumulative_length, count)
281            };
282            let branch_metas;
283            (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count);
284            let original_keys = branch_metas.map_to_ascii_bytes();
285            // Write out the offset table
286            current_len = total_length;
287            const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
288            let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
289            if w > 3 {
290                panic!("ZeroTrie capacity exceeded");
291            }
292            let mut k = 0;
293            while k <= w {
294                self = self.prepend_n_zeros(total_count - 1);
295                current_len += total_count - 1;
296                let mut l = 0;
297                let mut length_to_write = 0;
298                while l < total_count {
299                    let BranchMeta { local_length, .. } = *branch_metas
300                        .as_const_slice()
301                        .get_or_panic(total_count - l - 1);
302                    let mut adjusted_length = length_to_write;
303                    let mut m = 0;
304                    while m < k {
305                        adjusted_length >>= 8;
306                        m += 1;
307                    }
308                    if l > 0 {
309                        self = self.bitor_assign_at(l - 1, adjusted_length as u8);
310                    }
311                    l += 1;
312                    length_to_write += local_length;
313                }
314                k += 1;
315            }
316            // Write out the lookup table
317            assert!(0 < total_count && total_count <= 256);
318            let branch_value = (w << 8) + (total_count & 0xff);
319            let slice_len;
320            (self, slice_len) = self.prepend_slice(original_keys.as_const_slice());
321            let branch_len;
322            (self, branch_len) = self.prepend_branch(branch_value);
323            current_len += slice_len + branch_len;
324            i = new_i;
325            j = new_j;
326        }
327        assert!(lengths_stack.is_empty());
328        (self, current_len)
329    }
330}