1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
45use super::super::branch_meta::BranchMeta;
6use super::super::bytestr::ByteStr;
7use super::store::const_for_each;
8use super::store::ConstArrayBuilder;
9use super::store::ConstLengthsStack;
10use super::store::ConstSlice;
11use crate::error::ZeroTrieBuildError;
12use crate::varint;
1314/// A low-level builder for ZeroTrieSimpleAscii. Works in const contexts.
15pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
16 data: ConstArrayBuilder<N, u8>,
17}
1819impl<const N: usize> ZeroTrieBuilderConst<N> {
20/// Non-const function that returns the current trie data as a slice.
21#[cfg(feature = "litemap")]
22pub fn as_bytes(&self) -> &[u8] {
23self.data.as_const_slice().as_slice()
24 }
2526/// Returns the trie data, panicking if the buffer is the wrong size.
27pub const fn build_or_panic(self) -> [u8; N] {
28self.data.const_build_or_panic()
29 }
3031/// Creates a new empty builder.
32pub const fn new() -> Self {
33Self {
34 data: ConstArrayBuilder::new_empty([0; N], N),
35 }
36 }
3738/// Prepends an ASCII node to the front of the builder. Returns the new builder
39 /// and the delta in length, which is always 1.
40#[must_use]
41const fn prepend_ascii(self, ascii: u8) -> (Self, usize) {
42if ascii >= 128 {
43panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
44 }
45let data = self.data.const_push_front_or_panic(ascii);
46 (Self { data }, 1)
47 }
4849/// Prepends a value node to the front of the builder. Returns the new builder
50 /// and the delta in length, which depends on the size of the varint.
51#[must_use]
52const fn prepend_value(self, value: usize) -> (Self, usize) {
53let mut data = self.data;
54let varint_array = varint::write_varint_meta3(value);
55 data = data.const_extend_front_or_panic(varint_array.as_const_slice());
56 data = data.const_bitor_assign(0, 0b10000000);
57 (Self { data }, varint_array.len())
58 }
5960/// Prepends a branch node to the front of the builder. Returns the new builder
61 /// and the delta in length, which depends on the size of the varint.
62#[must_use]
63const fn prepend_branch(self, value: usize) -> (Self, usize) {
64let mut data = self.data;
65let varint_array = varint::write_varint_meta2(value);
66 data = data.const_extend_front_or_panic(varint_array.as_const_slice());
67 data = data.const_bitor_assign(0, 0b11000000);
68 (Self { data }, varint_array.len())
69 }
7071/// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
72 /// and the delta in length, which is the length of the slice.
73#[must_use]
74const fn prepend_slice(self, s: ConstSlice<u8>) -> (Self, usize) {
75let mut data = self.data;
76let mut i = s.len();
77while i > 0 {
78 data = data.const_push_front_or_panic(*s.get_or_panic(i - 1));
79 i -= 1;
80 }
81 (Self { data }, s.len())
82 }
8384/// Prepends multiple zeros to the front of the builder. Returns the new builder.
85#[must_use]
86const fn prepend_n_zeros(self, n: usize) -> Self {
87let mut data = self.data;
88let mut i = 0;
89while i < n {
90 data = data.const_push_front_or_panic(0);
91 i += 1;
92 }
93Self { data }
94 }
9596/// Performs the operation `self[index] |= bits`
97const fn bitor_assign_at(self, index: usize, bits: u8) -> Self {
98let mut data = self.data;
99 data = data.const_bitor_assign(index, bits);
100Self { data }
101 }
102103/// Creates a new builder containing the elements in the given slice of key/value pairs.
104 ///
105 /// `K` is the stack size of the lengths stack. If you get an error such as
106 /// "AsciiTrie Builder: Need more stack", try increasing `K`.
107 ///
108 /// # Panics
109 ///
110 /// Panics if the items are not sorted
111pub const fn from_tuple_slice<'a, const K: usize>(
112 items: &[(&'a ByteStr, usize)],
113 ) -> Result<Self, ZeroTrieBuildError> {
114let items = ConstSlice::from_slice(items);
115let mut prev: Option<&'a ByteStr> = None;
116const_for_each!(items, (ascii_str, _), {
117match prev {
118None => (),
119Some(prev) => {
120if !prev.is_less_then(ascii_str) {
121panic!("Strings in ByteStr constructor are not sorted");
122 }
123 }
124 };
125 prev = Some(ascii_str)
126 });
127Self::from_sorted_const_tuple_slice::<K>(items)
128 }
129130/// Creates a new builder containing the elements in the given slice of key/value pairs.
131 ///
132 /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
133 ///
134 /// `K` is the stack size of the lengths stack. If you get an error such as
135 /// "AsciiTrie Builder: Need more stack", try increasing `K`.
136pub const fn from_sorted_const_tuple_slice<const K: usize>(
137 items: ConstSlice<(&ByteStr, usize)>,
138 ) -> Result<Self, ZeroTrieBuildError> {
139let mut result = Self::new();
140let total_size;
141 (result, total_size) = result.create_or_panic::<K>(items);
142debug_assert!(total_size == result.data.len());
143Ok(result)
144 }
145146/// The actual builder algorithm. For an explanation, see [`crate::builder`].
147#[must_use]
148const fn create_or_panic<const K: usize>(
149mut self,
150 all_items: ConstSlice<(&ByteStr, usize)>,
151 ) -> (Self, usize) {
152let mut prefix_len = match all_items.last() {
153Some(x) => x.0.len(),
154// Empty slice:
155None => return (Self::new(), 0),
156 };
157// Initialize the main loop to point at the last string.
158let mut lengths_stack = ConstLengthsStack::<K>::new();
159let mut i = all_items.len() - 1;
160let mut j = all_items.len();
161let mut current_len = 0;
162// Start the main loop.
163loop {
164let item_i = all_items.get_or_panic(i);
165let item_j = all_items.get_or_panic(j - 1);
166debug_assert!(item_i.0.prefix_eq(item_j.0, prefix_len));
167// Check if we need to add a value node here.
168if item_i.0.len() == prefix_len {
169let len;
170 (self, len) = self.prepend_value(item_i.1);
171 current_len += len;
172 }
173if prefix_len == 0 {
174// All done! Leave the main loop.
175break;
176 }
177// Reduce the prefix length by 1 and recalculate i and j.
178prefix_len -= 1;
179let mut new_i = i;
180let mut new_j = j;
181let mut ascii_i = item_i.0.byte_at_or_panic(prefix_len);
182let mut ascii_j = item_j.0.byte_at_or_panic(prefix_len);
183debug_assert!(ascii_i == ascii_j);
184let key_ascii = ascii_i;
185loop {
186if new_i == 0 {
187break;
188 }
189let candidate = all_items.get_or_panic(new_i - 1).0;
190if candidate.len() < prefix_len {
191// Too short
192break;
193 }
194if item_i.0.prefix_eq(candidate, prefix_len) {
195 new_i -= 1;
196 } else {
197break;
198 }
199if candidate.len() == prefix_len {
200// A string that equals the prefix does not take part in the branch node.
201break;
202 }
203let candidate = candidate.byte_at_or_panic(prefix_len);
204if candidate != ascii_i {
205 ascii_i = candidate;
206 }
207 }
208loop {
209if new_j == all_items.len() {
210break;
211 }
212let candidate = all_items.get_or_panic(new_j).0;
213if candidate.len() < prefix_len {
214// Too short
215break;
216 }
217if item_j.0.prefix_eq(candidate, prefix_len) {
218 new_j += 1;
219 } else {
220break;
221 }
222if candidate.len() == prefix_len {
223panic!("A shorter string should be earlier in the sequence");
224 }
225let candidate = candidate.byte_at_or_panic(prefix_len);
226if candidate != ascii_j {
227 ascii_j = candidate;
228 }
229 }
230// If there are no different bytes at this prefix level, we can add an ASCII or Span
231 // node and then continue to the next iteration of the main loop.
232if ascii_i == key_ascii && ascii_j == key_ascii {
233let len;
234 (self, len) = self.prepend_ascii(ascii_i);
235 current_len += len;
236debug_assert!(i == new_i || i == new_i + 1);
237 i = new_i;
238debug_assert!(j == new_j);
239continue;
240 }
241// If i and j changed, we are a target of a branch node.
242if ascii_j == key_ascii {
243// We are the _last_ target of a branch node.
244lengths_stack = lengths_stack.push_or_panic(BranchMeta {
245 ascii: key_ascii,
246 cumulative_length: current_len,
247 local_length: current_len,
248 count: 1,
249 });
250 } else {
251// We are the _not the last_ target of a branch node.
252let BranchMeta {
253 cumulative_length,
254 count,
255 ..
256 } = lengths_stack.peek_or_panic();
257 lengths_stack = lengths_stack.push_or_panic(BranchMeta {
258 ascii: key_ascii,
259 cumulative_length: cumulative_length + current_len,
260 local_length: current_len,
261 count: count + 1,
262 });
263 }
264if ascii_i != key_ascii {
265// We are _not the first_ target of a branch node.
266 // Set the cursor to the previous string and continue the loop.
267j = i;
268 i -= 1;
269 prefix_len = all_items.get_or_panic(i).0.len();
270 current_len = 0;
271continue;
272 }
273// Branch (first)
274let (total_length, total_count) = {
275let BranchMeta {
276 cumulative_length,
277 count,
278 ..
279 } = lengths_stack.peek_or_panic();
280 (cumulative_length, count)
281 };
282let branch_metas;
283 (lengths_stack, branch_metas) = lengths_stack.pop_many_or_panic(total_count);
284let original_keys = branch_metas.map_to_ascii_bytes();
285// Write out the offset table
286current_len = total_length;
287const USIZE_BITS: usize = core::mem::size_of::<usize>() * 8;
288let w = (USIZE_BITS - (total_length.leading_zeros() as usize) - 1) / 8;
289if w > 3 {
290panic!("ZeroTrie capacity exceeded");
291 }
292let mut k = 0;
293while k <= w {
294self = self.prepend_n_zeros(total_count - 1);
295 current_len += total_count - 1;
296let mut l = 0;
297let mut length_to_write = 0;
298while l < total_count {
299let BranchMeta { local_length, .. } = *branch_metas
300 .as_const_slice()
301 .get_or_panic(total_count - l - 1);
302let mut adjusted_length = length_to_write;
303let mut m = 0;
304while m < k {
305 adjusted_length >>= 8;
306 m += 1;
307 }
308if l > 0 {
309self = self.bitor_assign_at(l - 1, adjusted_length as u8);
310 }
311 l += 1;
312 length_to_write += local_length;
313 }
314 k += 1;
315 }
316// Write out the lookup table
317assert!(0 < total_count && total_count <= 256);
318let branch_value = (w << 8) + (total_count & 0xff);
319let slice_len;
320 (self, slice_len) = self.prepend_slice(original_keys.as_const_slice());
321let branch_len;
322 (self, branch_len) = self.prepend_branch(branch_value);
323 current_len += slice_len + branch_len;
324 i = new_i;
325 j = new_j;
326 }
327assert!(lengths_stack.is_empty());
328 (self, current_len)
329 }
330}