deflate/
chained_hash_table.rs

1pub const WINDOW_SIZE: usize = 32768;
2pub const WINDOW_MASK: usize = WINDOW_SIZE - 1;
3#[cfg(test)]
4pub const HASH_BYTES: usize = 3;
5const HASH_SHIFT: u16 = 5;
6const HASH_MASK: u16 = WINDOW_MASK as u16;
7
8/// Helper struct to let us allocate both head and prev in the same block.
9struct Tables {
10    /// Starts of hash chains (in prev)
11    pub head: [u16; WINDOW_SIZE],
12    /// Link to previous occurence of this hash value
13    pub prev: [u16; WINDOW_SIZE],
14}
15
16impl Default for Tables {
17    #[inline]
18    fn default() -> Tables {
19        Tables {
20            head: [0; WINDOW_SIZE],
21            prev: [0; WINDOW_SIZE],
22        }
23    }
24}
25
26impl Tables {
27    #[inline]
28    fn fill_prev(&mut self) {
29        self.prev.copy_from_slice(&self.head);
30    }
31}
32
33/// Create and box the hash chains.
34fn create_tables() -> Box<Tables> {
35    // Using default here is a trick to get around the lack of box syntax on stable Rust.
36    //
37    // Box::new([0u16,n]) ends up creating an temporary array on the stack which is not optimised
38    // but using default, which simply calls `box value` internally allows us to get around this.
39    //
40    // We could use vec instead, but using a boxed array helps the compiler optimise
41    // away bounds checks as `n & WINDOW_MASK < WINDOW_SIZE` will always be true.
42    let mut t: Box<Tables> = Box::default();
43
44    for (n, b) in t.head.iter_mut().enumerate() {
45        *b = n as u16;
46    }
47
48    t.fill_prev();
49
50    t
51}
52
53/// Returns a new hash value based on the previous value and the next byte
54#[inline]
55pub fn update_hash(current_hash: u16, to_insert: u8) -> u16 {
56    update_hash_conf(current_hash, to_insert, HASH_SHIFT, HASH_MASK)
57}
58
59#[inline]
60fn update_hash_conf(current_hash: u16, to_insert: u8, shift: u16, mask: u16) -> u16 {
61    ((current_hash << shift) ^ (u16::from(to_insert))) & mask
62}
63
64#[inline]
65fn reset_array(arr: &mut [u16; WINDOW_SIZE]) {
66    for (n, b) in arr.iter_mut().enumerate() {
67        *b = n as u16;
68    }
69}
70
71pub struct ChainedHashTable {
72    // Current running hash value of the last 3 bytes
73    current_hash: u16,
74    // Hash chains.
75    c: Box<Tables>,
76    // Used for testing
77    // count: DebugCounter,
78}
79
80impl ChainedHashTable {
81    pub fn new() -> ChainedHashTable {
82        ChainedHashTable {
83            current_hash: 0,
84            c: create_tables(),
85            //count: DebugCounter::default(),
86        }
87    }
88
89    #[cfg(test)]
90    pub fn from_starting_values(v1: u8, v2: u8) -> ChainedHashTable {
91        let mut t = ChainedHashTable::new();
92        t.current_hash = update_hash(t.current_hash, v1);
93        t.current_hash = update_hash(t.current_hash, v2);
94        t
95    }
96
97    /// Resets the hash value and hash chains
98    pub fn reset(&mut self) {
99        self.current_hash = 0;
100        reset_array(&mut self.c.head);
101        {
102            let h = self.c.head;
103            let mut c = self.c.prev;
104            c[..].copy_from_slice(&h[..]);
105        }
106        /*if cfg!(debug_assertions) {
107            self.count.reset();
108        }*/
109    }
110
111    pub fn add_initial_hash_values(&mut self, v1: u8, v2: u8) {
112        self.current_hash = update_hash(self.current_hash, v1);
113        self.current_hash = update_hash(self.current_hash, v2);
114    }
115
116    /// Insert a byte into the hash table
117    #[inline]
118    pub fn add_hash_value(&mut self, position: usize, value: u8) {
119        // Check that all bytes are input in order and at the correct positions.
120        // Disabled for now as it breaks when sync flushing.
121        /*debug_assert_eq!(
122            position & WINDOW_MASK,
123            self.count.get() as usize & WINDOW_MASK
124        );*/
125        debug_assert!(
126            position < WINDOW_SIZE * 2,
127            "Position is larger than 2 * window size! {}",
128            position
129        );
130        // Storing the hash in a temporary variable here makes the compiler avoid the
131        // bounds checks in this function.
132        let new_hash = update_hash(self.current_hash, value);
133
134        self.add_with_hash(position, new_hash);
135
136        // Update the stored hash value with the new hash.
137        self.current_hash = new_hash;
138    }
139
140    /// Directly set the current hash value
141    #[inline]
142    pub fn set_hash(&mut self, hash: u16) {
143        self.current_hash = hash;
144    }
145
146    /// Update the tables directly, providing the hash.
147    #[inline]
148    pub fn add_with_hash(&mut self, position: usize, hash: u16) {
149        /*if cfg!(debug_assertions) {
150            self.count.add(1);
151        }*/
152
153        self.c.prev[position & WINDOW_MASK] = self.c.head[hash as usize];
154
155        // Ignoring any bits over 16 here is deliberate, as we only concern ourselves about
156        // where in the buffer (which is 64k bytes) we are referring to.
157        self.c.head[hash as usize] = position as u16;
158    }
159
160    // Get the head of the hash chain for the current hash value
161    #[cfg(test)]
162    #[inline]
163    pub const fn current_head(&self) -> u16 {
164        self.c.head[self.current_hash as usize]
165    }
166
167    #[inline]
168    pub const fn current_hash(&self) -> u16 {
169        self.current_hash
170    }
171
172    #[inline]
173    pub const fn get_prev(&self, bytes: usize) -> u16 {
174        self.c.prev[bytes & WINDOW_MASK]
175    }
176
177    #[cfg(test)]
178    #[inline]
179    pub fn farthest_next(&self, match_pos: usize, match_len: usize) -> usize {
180        let to_check = match_len.saturating_sub(2);
181
182        let mut n = 0;
183        let mut smallest_prev = self.get_prev(match_pos);
184        let mut smallest_pos = 0;
185        while n < to_check {
186            let prev = self.get_prev(match_pos + n);
187            if prev < smallest_prev {
188                smallest_prev = prev;
189                smallest_pos = n;
190            }
191            n += 1;
192        }
193        smallest_pos
194    }
195
196    #[inline]
197    fn slide_value(b: u16, pos: u16, bytes: u16) -> u16 {
198        if b >= bytes {
199            b - bytes
200        } else {
201            pos
202        }
203    }
204
205    #[inline]
206    fn slide_table(table: &mut [u16; WINDOW_SIZE], bytes: u16) {
207        for (n, b) in table.iter_mut().enumerate() {
208            *b = ChainedHashTable::slide_value(*b, n as u16, bytes);
209        }
210    }
211
212    pub fn slide(&mut self, bytes: usize) {
213        /*if cfg!(debug_assertions) && bytes != WINDOW_SIZE {
214            // This should only happen in tests in this file.
215            self.count.reset();
216        }*/
217        ChainedHashTable::slide_table(&mut self.c.head, bytes as u16);
218        ChainedHashTable::slide_table(&mut self.c.prev, bytes as u16);
219    }
220}
221
222#[cfg(test)]
223pub fn filled_hash_table(data: &[u8]) -> ChainedHashTable {
224    assert!(data.len() <= (WINDOW_SIZE * 2) + 2);
225    let mut hash_table = ChainedHashTable::from_starting_values(data[0], data[1]);
226    for (n, b) in data[2..].iter().enumerate() {
227        hash_table.add_hash_value(n, *b);
228    }
229    hash_table
230}
231
232#[cfg(test)]
233mod test {
234    use super::{filled_hash_table, ChainedHashTable};
235
236    #[test]
237    fn chained_hash() {
238        use std::str;
239
240        let test_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
241                           eiusmod tempor. rum. incididunt ut labore et dolore magna aliqua. Ut \
242                           enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi \
243                           ut aliquip ex ea commodo consequat. rum. Duis aute irure dolor in \
244                           reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla \
245                           pariatur. Excepteur sint occaecat cupidatat non proident, sunt in \
246                           culpa qui officia deserunt mollit anim id est laborum.";
247
248        let test_data = test_string.as_bytes();
249
250        let current_bytes = &test_data[test_data.len() - super::HASH_BYTES..test_data.len()];
251
252        let num_iters = test_string
253            .matches(str::from_utf8(current_bytes).unwrap())
254            .count();
255
256        let hash_table = filled_hash_table(test_data);
257
258        // Test that the positions in the chain are valid
259        let mut prev_value = hash_table.get_prev(hash_table.current_head() as usize) as usize;
260        let mut count = 0;
261        let mut current = hash_table.current_head() as usize;
262        while current != prev_value {
263            count += 1;
264            current = prev_value;
265            prev_value = hash_table.get_prev(prev_value) as usize;
266        }
267        // There should be at least as many occurences of the hash of the checked bytes as the
268        // numbers of occurences of the checked bytes themselves. As the hashes are not large enough
269        // to store 8 * 3 = 24 bits, there could be more with different input data.
270        assert!(count >= num_iters);
271    }
272
273    #[test]
274    fn table_unique() {
275        let mut test_data = Vec::new();
276        test_data.extend(0u8..255);
277        test_data.extend(255u8..0);
278        let hash_table = filled_hash_table(&test_data);
279        let prev_pos = hash_table.get_prev(hash_table.current_head() as usize);
280        // Since all sequences in the input are unique, there shouldn't be any previous values.
281        assert_eq!(prev_pos, hash_table.current_hash());
282    }
283
284    #[test]
285    fn table_slide() {
286        use std::fs::File;
287        use std::io::Read;
288
289        let window_size = super::WINDOW_SIZE;
290        let window_size16 = super::WINDOW_SIZE as u16;
291
292        let mut input = Vec::new();
293
294        let mut f = File::open("tests/pg11.txt").unwrap();
295
296        f.read_to_end(&mut input).unwrap();
297
298        let mut hash_table = filled_hash_table(&input[..window_size + 2]);
299
300        for (n, b) in input[2..window_size + 2].iter().enumerate() {
301            hash_table.add_hash_value(n + window_size, *b);
302        }
303
304        hash_table.slide(window_size);
305
306        {
307            let max_head = hash_table.c.head.iter().max().unwrap();
308            // After sliding there should be no hashes referring to values
309            // higher than the window size
310            assert!(*max_head < window_size16);
311            assert!(*max_head > 0);
312            let pos = hash_table.get_prev(hash_table.current_head() as usize);
313            // There should be a previous occurence since we inserted the data 3 times
314            assert!(pos < window_size16);
315            assert!(pos > 0);
316        }
317
318        for (n, b) in input[2..(window_size / 2)].iter().enumerate() {
319            hash_table.add_hash_value(n + window_size, *b);
320        }
321
322        // There should hashes referring to values in the upper part of the input window
323        // at this point
324        let max_prev = hash_table.c.prev.iter().max().unwrap();
325        assert!(*max_prev > window_size16);
326
327        let mut pos = hash_table.current_head();
328        // There should be a previous occurence since we inserted the data 3 times
329        assert!(pos > window_size16);
330        let end_byte = input[(window_size / 2) - 1 - 2];
331        let mut iterations = 0;
332        while pos > window_size16 && iterations < 5000 {
333            assert_eq!(input[pos as usize & window_size - 1], end_byte);
334
335            pos = hash_table.get_prev(pos as usize);
336            iterations += 1;
337        }
338    }
339
340    #[test]
341    /// Ensure that the initial hash values are correct.
342    fn initial_chains() {
343        let t = ChainedHashTable::new();
344        for (n, &b) in t.c.head.iter().enumerate() {
345            assert_eq!(n, b as usize);
346        }
347        for (n, &b) in t.c.prev.iter().enumerate() {
348            assert_eq!(n, b as usize);
349        }
350    }
351}