zerotrie/
varint.rs

1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5//! Varint spec for ZeroTrie:
6//!
7//! - Lead byte: top M (2 or 3) bits are metadata; next is varint extender; rest is value
8//! - Trail bytes: top bit is varint extender; rest are low bits of value
9//! - Guaranteed uniqueness of varint by adding "latent value" for each extender byte
10//! - No maximum, but high bits will be dropped if they don't fit in the platform's `usize`
11//!
12//! This is best shown by examples.
13//!
14//! ```txt
15//! xxx0'1010 = 10
16//! xxx0'1111 = 15 (largest single-byte value with M=3)
17//! xxx1'0000 0000'0000 must be 16 (smallest two-byte value with M=3)
18//! xxx1'0000 0000'0001 = 17
19//! xxx1'1111 0111'1111 = 2063 (largest two-byte value with M=3)
20//! xxx1'0000 1000'0000 0000'0000 must be 2064 (smallest three-byte value with M=3)
21//! xxx1'0000 1000'0000 0000'0001 = 2065
22//! ```
23//!
24//! The latent values by number of bytes for M=3 are:
25//!
26//! - 1 byte: 0
27//! - 2 bytes: 16 = 0x10 = 0b10000
28//! - 3 bytes: 2064 = 0x810 = 0b100000010000
29//! - 4 bytes: 264208 = 0x40810 = 0b1000000100000010000
30//! - 5 bytes: 33818640 = 0x2040810 = 0b10000001000000100000010000
31//! - …
32//!
33//! For M=2, the latent values are:
34//!
35//! - 1 byte: 0
36//! - 2 bytes: 32 = 0x20 = 0b100000
37//! - 3 bytes: 4128 = 0x1020 = 0b1000000100000
38//! - 4 bytes: 524320 = 0x81020 = 0b10000001000000100000
39//! - 5 bytes: 67637280 = 0x4081020 = 0b100000010000001000000100000
40//! - …
41
42use crate::builder::konst::ConstArrayBuilder;
43
44#[cfg(feature = "alloc")]
45use crate::builder::nonconst::TrieBuilderStore;
46
47/// Reads a varint with 2 bits of metadata in the lead byte.
48///
49/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
50///
51/// If the varint spills off the end of the slice, a debug assertion will fail,
52/// and the function will return the value up to that point.
53pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
54    let mut value = (start & 0b00011111) as usize;
55    let mut remainder = remainder;
56    if (start & 0b00100000) != 0 {
57        loop {
58            let next;
59            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
60            // Note: value << 7 could drop high bits. The first addition can't overflow.
61            // The second addition could overflow; in such a case we just inform the
62            // developer via the debug assertion.
63            value = (value << 7) + ((*next & 0b01111111) as usize) + 32;
64            if (*next & 0b10000000) == 0 {
65                break;
66            }
67        }
68    }
69    (value, remainder)
70}
71
72/// Reads a varint with 3 bits of metadata in the lead byte.
73///
74/// Returns the varint value and a subslice of `remainder` with the varint bytes removed.
75///
76/// If the varint spills off the end of the slice, a debug assertion will fail,
77/// and the function will return the value up to that point.
78pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) {
79    let mut value = (start & 0b00001111) as usize;
80    let mut remainder = remainder;
81    if (start & 0b00010000) != 0 {
82        loop {
83            let next;
84            (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint");
85            // Note: value << 7 could drop high bits. The first addition can't overflow.
86            // The second addition could overflow; in such a case we just inform the
87            // developer via the debug assertion.
88            value = (value << 7) + ((*next & 0b01111111) as usize) + 16;
89            if (*next & 0b10000000) == 0 {
90                break;
91            }
92        }
93    }
94    (value, remainder)
95}
96
97/// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`].
98///
99/// Returns the varint value.
100#[cfg(feature = "alloc")]
101pub(crate) fn try_read_varint_meta3_from_tstore<S: TrieBuilderStore>(
102    start: u8,
103    remainder: &mut S,
104) -> Option<usize> {
105    let mut value = (start & 0b00001111) as usize;
106    if (start & 0b00010000) != 0 {
107        loop {
108            let next = remainder.atbs_pop_front()?;
109            // Note: value << 7 could drop high bits. The first addition can't overflow.
110            // The second addition could overflow; in such a case we just inform the
111            // developer via the debug assertion.
112            value = (value << 7) + ((next & 0b01111111) as usize) + 16;
113            if (next & 0b10000000) == 0 {
114                break;
115            }
116        }
117    }
118    Some(value)
119}
120
121#[cfg(test)]
122const MAX_VARINT: usize = usize::MAX;
123
124// *Upper Bound:* Each trail byte stores 7 bits of data, plus the latent value.
125// Add an extra 1 since the lead byte holds only 5 bits of data.
126const MAX_VARINT_LENGTH: usize = 1 + core::mem::size_of::<usize>() * 8 / 7;
127
128/// Returns a new [`ConstArrayBuilder`] containing a varint with 2 bits of metadata.
129pub(crate) const fn write_varint_meta2(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
130    let mut result = [0; MAX_VARINT_LENGTH];
131    let mut i = MAX_VARINT_LENGTH - 1;
132    let mut value = value;
133    let mut last = true;
134    loop {
135        if value < 32 {
136            result[i] = value as u8;
137            if !last {
138                result[i] |= 0b00100000;
139            }
140            break;
141        }
142        value -= 32;
143        result[i] = (value as u8) & 0b01111111;
144        if !last {
145            result[i] |= 0b10000000;
146        } else {
147            last = false;
148        }
149        value >>= 7;
150        i -= 1;
151    }
152    // The bytes are from i to the end.
153    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
154}
155
156/// Returns a new [`ConstArrayBuilder`] containing a varint with 3 bits of metadata.
157pub(crate) const fn write_varint_meta3(value: usize) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
158    let mut result = [0; MAX_VARINT_LENGTH];
159    let mut i = MAX_VARINT_LENGTH - 1;
160    let mut value = value;
161    let mut last = true;
162    loop {
163        if value < 16 {
164            result[i] = value as u8;
165            if !last {
166                result[i] |= 0b00010000;
167            }
168            break;
169        }
170        value -= 16;
171        result[i] = (value as u8) & 0b01111111;
172        if !last {
173            result[i] |= 0b10000000;
174        } else {
175            last = false;
176        }
177        value >>= 7;
178        i -= 1;
179    }
180    // The bytes are from i to the end.
181    ConstArrayBuilder::from_manual_slice(result, i, MAX_VARINT_LENGTH)
182}
183
184/// A secondary implementation that separates the latent value while computing the varint.
185#[cfg(test)]
186pub(crate) const fn write_varint_reference(
187    value: usize,
188) -> ConstArrayBuilder<MAX_VARINT_LENGTH, u8> {
189    let mut result = [0; MAX_VARINT_LENGTH];
190    if value < 32 {
191        result[0] = value as u8;
192        return ConstArrayBuilder::from_manual_slice(result, 0, 1);
193    }
194    result[0] = 32;
195    let mut latent = 32;
196    let mut steps = 2;
197    loop {
198        let next_latent = (latent << 7) + 32;
199        if value < next_latent || next_latent == latent {
200            break;
201        }
202        latent = next_latent;
203        steps += 1;
204    }
205    let mut value = value - latent;
206    let mut i = steps;
207    while i > 0 {
208        i -= 1;
209        result[i] |= (value as u8) & 0b01111111;
210        value >>= 7;
211        if i > 0 && i < steps - 1 {
212            result[i] |= 0b10000000;
213        }
214    }
215    // The bytes are from 0 to `steps`.
216    ConstArrayBuilder::from_manual_slice(result, 0, steps)
217}
218
219#[cfg(test)]
220mod tests {
221    use super::*;
222
223    #[derive(Debug)]
224    struct TestCase<'a> {
225        bytes: &'a [u8],
226        remainder: &'a [u8],
227        value: usize,
228    }
229    static CASES: &[TestCase] = &[
230        TestCase {
231            bytes: &[0b00000000],
232            remainder: &[],
233            value: 0,
234        },
235        TestCase {
236            bytes: &[0b00001010],
237            remainder: &[],
238            value: 10,
239        },
240        TestCase {
241            bytes: &[0b00011111],
242            remainder: &[],
243            value: 31,
244        },
245        TestCase {
246            bytes: &[0b00011111, 0b10101010],
247            remainder: &[0b10101010],
248            value: 31,
249        },
250        TestCase {
251            bytes: &[0b00100000, 0b00000000],
252            remainder: &[],
253            value: 32,
254        },
255        TestCase {
256            bytes: &[0b00100000, 0b00000001],
257            remainder: &[],
258            value: 33,
259        },
260        TestCase {
261            bytes: &[0b00100000, 0b00100000],
262            remainder: &[],
263            value: 64,
264        },
265        TestCase {
266            bytes: &[0x20, 0x44],
267            remainder: &[],
268            value: 100,
269        },
270        TestCase {
271            bytes: &[0b00100000, 0b01111111],
272            remainder: &[],
273            value: 159,
274        },
275        TestCase {
276            bytes: &[0b00100001, 0b00000000],
277            remainder: &[],
278            value: 160,
279        },
280        TestCase {
281            bytes: &[0b00100001, 0b00000001],
282            remainder: &[],
283            value: 161,
284        },
285        TestCase {
286            bytes: &[0x23, 0x54],
287            remainder: &[],
288            value: 500,
289        },
290        TestCase {
291            bytes: &[0b00111111, 0b01111111],
292            remainder: &[],
293            value: 4127, // 32 + (1 << 12) - 1
294        },
295        TestCase {
296            bytes: &[0b00100000, 0b10000000, 0b00000000],
297            remainder: &[],
298            value: 4128, // 32 + (1 << 12)
299        },
300        TestCase {
301            bytes: &[0b00100000, 0b10000000, 0b00000001],
302            remainder: &[],
303            value: 4129, // 32 + (1 << 12) + 1
304        },
305        TestCase {
306            bytes: &[0b00100000, 0b10000000, 0b01111111],
307            remainder: &[],
308            value: 4255, // 32 + (1 << 12) + 127
309        },
310        TestCase {
311            bytes: &[0b00100000, 0b10000001, 0b00000000],
312            remainder: &[],
313            value: 4256, // 32 + (1 << 12) + 128
314        },
315        TestCase {
316            bytes: &[0b00100000, 0b10000001, 0b00000001],
317            remainder: &[],
318            value: 4257, // 32 + (1 << 12) + 129
319        },
320        TestCase {
321            bytes: &[0x20, 0x86, 0x68],
322            remainder: &[],
323            value: 5000,
324        },
325        TestCase {
326            bytes: &[0b00100000, 0b11111111, 0b01111111],
327            remainder: &[],
328            value: 20511, // 32 + (1 << 12) + (1 << 14) - 1
329        },
330        TestCase {
331            bytes: &[0b00100001, 0b10000000, 0b00000000],
332            remainder: &[],
333            value: 20512, // 32 + (1 << 12) + (1 << 14)
334        },
335        TestCase {
336            bytes: &[0b00111111, 0b11111111, 0b01111111],
337            remainder: &[],
338            value: 528415, // 32 + (1 << 12) + (1 << 19) - 1
339        },
340        TestCase {
341            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000000],
342            remainder: &[],
343            value: 528416, // 32 + (1 << 12) + (1 << 19)
344        },
345        TestCase {
346            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b00000001],
347            remainder: &[],
348            value: 528417, // 32 + (1 << 12) + (1 << 19) + 1
349        },
350        TestCase {
351            bytes: &[0b00111111, 0b11111111, 0b11111111, 0b01111111],
352            remainder: &[],
353            value: 67637279, // 32 + (1 << 12) + (1 << 19) + (1 << 26) - 1
354        },
355        TestCase {
356            bytes: &[0b00100000, 0b10000000, 0b10000000, 0b10000000, 0b00000000],
357            remainder: &[],
358            value: 67637280, // 32 + (1 << 12) + (1 << 19) + (1 << 26)
359        },
360    ];
361
362    #[test]
363    fn test_read() {
364        for cas in CASES {
365            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
366            assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas);
367        }
368    }
369
370    #[test]
371    fn test_read_write() {
372        for cas in CASES {
373            let reference_bytes = write_varint_reference(cas.value);
374            assert_eq!(
375                reference_bytes.len(),
376                cas.bytes.len() - cas.remainder.len(),
377                "{:?}",
378                cas
379            );
380            assert_eq!(
381                reference_bytes.as_slice(),
382                &cas.bytes[0..reference_bytes.len()],
383                "{:?}",
384                cas
385            );
386            let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]);
387            assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas);
388            let write_bytes = write_varint_meta2(cas.value);
389            assert_eq!(
390                reference_bytes.as_slice(),
391                write_bytes.as_slice(),
392                "{:?}",
393                cas
394            );
395        }
396    }
397
398    #[test]
399    fn test_roundtrip() {
400        let mut i = 0usize;
401        while i < MAX_VARINT {
402            let bytes = write_varint_meta2(i);
403            let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]);
404            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
405            i <<= 1;
406            i += 1;
407        }
408    }
409
410    #[test]
411    fn test_extended_roundtrip() {
412        let mut i = 0usize;
413        while i < MAX_VARINT {
414            let bytes = write_varint_meta3(i);
415            let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]);
416            assert_eq!(i, recovered.0, "{:?}", bytes.as_slice());
417            i <<= 1;
418            i += 1;
419        }
420    }
421
422    #[test]
423    fn test_max() {
424        let reference_bytes = write_varint_reference(MAX_VARINT);
425        let write_bytes = write_varint_meta2(MAX_VARINT);
426        assert_eq!(reference_bytes.len(), MAX_VARINT_LENGTH);
427        assert_eq!(reference_bytes.as_slice(), write_bytes.as_slice());
428        let subarray = write_bytes
429            .as_const_slice()
430            .get_subslice_or_panic(1, write_bytes.len());
431        let (recovered_value, remainder) = read_varint_meta2(
432            *write_bytes.as_const_slice().first().unwrap(),
433            subarray.as_slice(),
434        );
435        assert!(remainder.is_empty());
436        assert_eq!(recovered_value, MAX_VARINT);
437        assert_eq!(
438            write_bytes.as_slice(),
439            &[
440                0b00100001, //
441                0b11011111, //
442                0b11011111, //
443                0b11011111, //
444                0b11011111, //
445                0b11011111, //
446                0b11011111, //
447                0b11011111, //
448                0b11011111, //
449                0b01011111, //
450            ]
451        );
452    }
453
454    #[test]
455    fn text_extended_max() {
456        let write_bytes = write_varint_meta3(MAX_VARINT);
457        assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH);
458        let (lead, trailing) = write_bytes.as_slice().split_first().unwrap();
459        let (recovered_value, remainder) = read_varint_meta3(*lead, trailing);
460        assert!(remainder.is_empty());
461        assert_eq!(recovered_value, MAX_VARINT);
462        assert_eq!(
463            write_bytes.as_slice(),
464            &[
465                0b00010001, //
466                0b11101111, //
467                0b11101111, //
468                0b11101111, //
469                0b11101111, //
470                0b11101111, //
471                0b11101111, //
472                0b11101111, //
473                0b11101111, //
474                0b01101111, //
475            ]
476        );
477    }
478
479    #[test]
480    fn test_latent_values() {
481        // Same values documented in the module docs: M=2
482        let m2 = read_varint_meta2;
483        assert_eq!(m2(0, &[]).0, 0);
484        assert_eq!(m2(0x20, &[0x00]).0, 32);
485        assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128);
486        assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416);
487        assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280);
488
489        // Same values documented in the module docs: M=3
490        let m3 = read_varint_meta3;
491        assert_eq!(m3(0, &[]).0, 0);
492        assert_eq!(m3(0x10, &[0x00]).0, 16);
493        assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064);
494        assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208);
495        assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640);
496    }
497}