// File: deflate/lz77.rs

1//! This module contains functionality for doing lz77 compression of data.
2#![macro_use]
3use std::cmp;
4use std::fmt;
5use std::iter::{self, Iterator};
6use std::ops::{Range, RangeFrom};
7use std::slice::Iter;
8
9use crate::chained_hash_table::{update_hash, ChainedHashTable};
10use crate::compress::Flush;
11#[cfg(test)]
12use crate::compression_options::{HIGH_LAZY_IF_LESS_THAN, HIGH_MAX_HASH_CHECKS};
13use crate::input_buffer::InputBuffer;
14#[cfg(test)]
15use crate::lzvalue::{LZType, LZValue};
16use crate::matching::longest_match;
17use crate::output_writer::{BufferStatus, DynamicWriter};
18use crate::rle::process_chunk_greedy_rle;
19
// Local `usize` copies of the huffman table's match-length bounds, so we can
// avoid repeated casts in the index arithmetic below.
const MAX_MATCH: usize = crate::huffman_table::MAX_MATCH as usize;
const MIN_MATCH: usize = crate::huffman_table::MIN_MATCH as usize;

// Sentinel value for `lazy_if_less_than` used by tests to force the lazy
// matcher to run instead of the RLE fast path (see `process_chunk`).
const NO_RLE: u16 = 43212;
24
/// An enum describing whether we use lazy or greedy matching.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum MatchingType {
    /// Use greedy matching: the matching algorithm simply uses a match right away
    /// if found.
    Greedy,
    /// Use lazy matching: after finding a match, the next input byte is checked, to see
    /// if there is a better match starting at that byte.
    ///
    /// As a special case, if max_hash_checks is set to 0, compression using only run-length
    /// (i.e maximum match distance of 1) is performed instead.
    Lazy,
}

impl fmt::Display for MatchingType {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Map each variant to its fixed description, then emit it in one call.
        let description = match self {
            MatchingType::Greedy => "Greedy matching",
            MatchingType::Lazy => "Lazy matching",
        };
        f.write_str(description)
    }
}
47
/// A struct that contains the hash table, and keeps track of where we are in the input data
/// while compressing one window at a time.
pub struct LZ77State {
    /// Struct containing hash chains that will be used to find matches.
    hash_table: ChainedHashTable,
    /// True if this is the first window that is being processed.
    is_first_window: bool,
    /// Set to true when the last block has been processed.
    is_last_block: bool,
    /// How many bytes the last match in the previous window extended into the current one.
    overlap: usize,
    /// How many bytes of input the current block contains.
    current_block_input_bytes: u64,
    /// The maximum number of hash entries to search.
    max_hash_checks: u16,
    /// Only lazy match if we have a match length less than this.
    lazy_if_less_than: u16,
    /// Whether to use greedy or lazy parsing
    matching_type: MatchingType,
    /// Keep track of the previous match and byte in case the buffer is full when lazy matching.
    match_state: ChunkState,
    /// Keep track of how many bytes in the lookahead that was part of a match, but has not been
    /// added to the hash chain yet.
    bytes_to_hash: usize,
    /// Keep track of if sync flush was used. If this is the case, the two first bytes need to be
    /// hashed on the next call (see `lz77_compress_block`).
    was_synced: bool,
}
75
76impl LZ77State {
77    /// Creates a new LZ77 state
78    pub fn new(
79        max_hash_checks: u16,
80        lazy_if_less_than: u16,
81        matching_type: MatchingType,
82    ) -> LZ77State {
83        LZ77State {
84            hash_table: ChainedHashTable::new(),
85            is_first_window: true,
86            is_last_block: false,
87            overlap: 0,
88            current_block_input_bytes: 0,
89            max_hash_checks,
90            lazy_if_less_than,
91            matching_type,
92            match_state: ChunkState::new(),
93            bytes_to_hash: 0,
94            was_synced: false,
95        }
96    }
97
98    /// Resets the state excluding max_hash_checks and lazy_if_less_than
99    pub fn reset(&mut self) {
100        self.hash_table.reset();
101        self.is_first_window = true;
102        self.is_last_block = false;
103        self.overlap = 0;
104        self.current_block_input_bytes = 0;
105        self.match_state = ChunkState::new();
106        self.bytes_to_hash = 0
107    }
108
109    pub fn set_last(&mut self) {
110        self.is_last_block = true;
111    }
112
113    /// Is this the last block we are outputting?
114    pub const fn is_last_block(&self) -> bool {
115        self.is_last_block
116    }
117
118    /// How many bytes of input the current block contains.
119    pub const fn current_block_input_bytes(&self) -> u64 {
120        self.current_block_input_bytes
121    }
122
123    /// Sets the number of input bytes for the current block to 0.
124    pub fn reset_input_bytes(&mut self) {
125        self.current_block_input_bytes = 0;
126    }
127
128    /// Is there a buffered byte that has not been output yet?
129    pub const fn pending_byte(&self) -> bool {
130        self.match_state.add
131    }
132
133    /// Returns 1 if pending_byte is true, 0 otherwise.
134    pub fn pending_byte_as_num(&self) -> usize {
135        // This could be implemented by using `as usize` as the documentation states this would give
136        // the same result, but not sure if that should be relied upon.
137        if self.match_state.add {
138            1
139        } else {
140            0
141        }
142    }
143}
144
/// The deflate sliding-window size; the only window size currently supported
/// (see `lz77_compress_block`).
const DEFAULT_WINDOW_SIZE: usize = 32768;
146
#[derive(Debug)]
/// Status after calling `process_chunk`.
pub enum ProcessStatus {
    /// All the input data was processed.
    Ok,
    /// The output buffer was full, so processing stopped early.
    ///
    /// The argument is the position of the next byte to be checked by `process_chunk`.
    BufferFull(usize),
}
157
#[derive(Debug, Default)]
/// A struct to keep track of status between calls of `process_chunk_lazy`
///
/// This is needed as the output buffer might become full before having output all pending data.
pub struct ChunkState {
    /// Length of the last match that was found, if any.
    current_length: u16,
    /// Distance of the last match that was found, if any.
    current_distance: u16,
    /// The previous byte checked in process_chunk.
    prev_byte: u8,
    /// The current byte being checked in process_chunk.
    cur_byte: u8,
    /// Whether prev_byte still needs to be output.
    add: bool,
}

impl ChunkState {
    /// Creates a fresh state with no pending match or pending byte.
    ///
    /// Equivalent to `ChunkState::default()` (all fields zero/false); kept for
    /// readability at call sites.
    pub fn new() -> ChunkState {
        ChunkState::default()
    }
}
186
/// Convenience constructor for `ProcessStatus::BufferFull`.
///
/// `position` is the next byte `process_chunk` should examine once the
/// output buffer has been flushed.
pub const fn buffer_full(position: usize) -> ProcessStatus {
    ProcessStatus::BufferFull(position)
}
190
191#[allow(clippy::too_many_arguments)]
192fn process_chunk(
193    data: &[u8],
194    iterated_data: &Range<usize>,
195    mut match_state: &mut ChunkState,
196    hash_table: &mut ChainedHashTable,
197    writer: &mut DynamicWriter,
198    max_hash_checks: u16,
199    lazy_if_less_than: usize,
200    matching_type: MatchingType,
201) -> (usize, ProcessStatus) {
202    let avoid_rle = if cfg!(test) {
203        // Avoid RLE if lazy_if_less than is a specific value.
204        // This is used in some tests, ideally we should probably do this in a less clunky way,
205        // but we use a value here that is higher than the maximum sensible one anyhow, and will
206        // be truncated by deflate_state for calls from outside the library.
207        lazy_if_less_than == NO_RLE as usize
208    } else {
209        false
210    };
211    match matching_type {
212        MatchingType::Greedy => {
213            process_chunk_greedy(data, iterated_data, hash_table, writer, max_hash_checks)
214        }
215        MatchingType::Lazy => {
216            if max_hash_checks > 0 || avoid_rle {
217                process_chunk_lazy(
218                    data,
219                    iterated_data,
220                    &mut match_state,
221                    hash_table,
222                    writer,
223                    max_hash_checks,
224                    lazy_if_less_than,
225                )
226            } else {
227                // Use the RLE method if max_hash_checks is set to 0.
228                process_chunk_greedy_rle(data, iterated_data, writer)
229            }
230        }
231    }
232}
233
/// Add the specified number of bytes to the hash table from the iterators
/// adding `start` to the position supplied to the hash table.
///
/// Note that `insert_it` is always advanced by up to `bytes_to_add` positions,
/// even if `hash_it` is exhausted first — the caller relies on the insert
/// iterator being moved past the bytes covered by a match.
fn add_to_hash_table(
    bytes_to_add: usize,
    insert_it: &mut iter::Zip<RangeFrom<usize>, Iter<u8>>,
    hash_it: &mut Iter<u8>,
    hash_table: &mut ChainedHashTable,
) {
    let taker = insert_it.by_ref().take(bytes_to_add);
    let mut hash_taker = hash_it.by_ref().take(bytes_to_add);
    // Update the hash manually here to keep it in a register.
    let mut hash = hash_table.current_hash();
    // Advance the iterators and add the bytes we jump over to the hash table and
    // checksum
    for (ipos, _) in taker {
        // `hash_it` runs two bytes ahead of `insert_it` (see `create_iterators`),
        // so it can run out near the end of the data; we still drain `taker`
        // so the insert iterator ends up in the right place.
        if let Some(&i_hash_byte) = hash_taker.next() {
            hash = update_hash(hash, i_hash_byte);
            hash_table.add_with_hash(ipos, hash);
        }
    }
    // Write the hash back once we are done.
    hash_table.set_hash(hash);
}
257
/// Write the specified literal `$byte` to the writer `$w`, and return
/// `ProcessStatus::BufferFull($pos)` from the enclosing function if the buffer
/// is full after writing.
///
/// `$pos` should indicate the byte to start at in the next call to `process_chunk`.
macro_rules! write_literal {
    ($w:ident, $byte:expr, $pos:expr) => {
        let b_status = $w.write_literal($byte);

        if let BufferStatus::Full = b_status {
            // Early-returns from the function that invoked the macro.
            return (0, buffer_full($pos));
        }
    };
}
271
272/// If the match is only 3 bytes long and the distance is more than 8 * 1024, it's likely to take
273/// up more space than it would save.
274#[inline]
275fn match_too_far(match_len: usize, match_dist: usize) -> bool {
276    const TOO_FAR: usize = 8 * 1024;
277    match_len == MIN_MATCH && match_dist > TOO_FAR
278}
279
/// Create the iterators used when processing through a chunk of data.
///
/// Returns `(end, insert_it, hash_it)` where `end` is the chunk end clamped to
/// `data.len()`, `insert_it` yields `(position, &byte)` pairs for the chunk,
/// and `hash_it` yields the bytes starting two positions past the chunk start
/// (the byte completing the hash at each position).
///
/// NOTE(review): assumes `iterated_data.start <= data.len()`; otherwise
/// `data.len() - start` underflows — confirm callers uphold this.
fn create_iterators<'a>(
    data: &'a [u8],
    iterated_data: &Range<usize>,
) -> (
    usize,
    iter::Zip<RangeFrom<usize>, Iter<'a, u8>>,
    Iter<'a, u8>,
) {
    let end = cmp::min(data.len(), iterated_data.end);
    let start = iterated_data.start;
    let current_chunk = &data[start..end];

    let insert_it = (start..).zip(current_chunk.iter());
    let hash_it = {
        // Hashing looks two bytes ahead; with fewer than three bytes left
        // from `start` there is nothing to hash, so use an empty iterator.
        let hash_start = if data.len() - start > 2 {
            start + 2
        } else {
            data.len()
        };
        data[hash_start..].iter()
    };
    (end, insert_it, hash_it)
}
304
/// Lazy-matching implementation of `process_chunk`.
///
/// Walks `iterated_data`, emitting literals or length/distance pairs to
/// `writer`. After finding a match it checks the next byte for a longer one
/// before committing (unless the match length reached `lazy_if_less_than`).
/// Returns the number of bytes a final match overlapped into the lookahead,
/// and whether the writer's buffer filled up.
fn process_chunk_lazy(
    data: &[u8],
    iterated_data: &Range<usize>,
    state: &mut ChunkState,
    mut hash_table: &mut ChainedHashTable,
    writer: &mut DynamicWriter,
    max_hash_checks: u16,
    lazy_if_less_than: usize,
) -> (usize, ProcessStatus) {
    let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);

    const NO_LENGTH: u16 = 0;

    // The previous match length, if any.
    let mut prev_length = state.current_length;
    // The distance of the previous match if any.
    let mut prev_distance = state.current_distance;

    state.current_length = 0;
    state.current_distance = 0;

    // The number of bytes past end that was added due to finding a match that extends into
    // the lookahead window.
    let mut overlap = 0;

    // Set to true if we found a match that is equal to or longer than `lazy_if_less_than`,
    // indicating that we won't lazy match (check for a better match at the next byte).
    // If we had a good match, carry this over from the previous call.
    let mut ignore_next = prev_length as usize >= lazy_if_less_than;

    // This is to output the correct byte in case there is one pending to be output
    // from the previous call.
    state.prev_byte = state.cur_byte;

    // Iterate through the slice, adding literals or length/distance pairs
    while let Some((position, &b)) = insert_it.next() {
        state.cur_byte = b;
        if let Some(&hash_byte) = hash_it.next() {
            hash_table.add_hash_value(position, hash_byte);

            // Only lazy match if we have a match shorter than a set value
            // TODO: This should be cleaned up a bit
            if !ignore_next {
                let (mut match_len, match_dist) = {
                    // If there already was a decent match at the previous byte
                    // and we are lazy matching, do less match checks in this step.
                    let max_hash_checks = if prev_length >= 32 {
                        max_hash_checks >> 2
                    } else {
                        max_hash_checks
                    };

                    // Check if we can find a better match here than the one we had at
                    // the previous byte.
                    longest_match(
                        data,
                        hash_table,
                        position,
                        prev_length as usize,
                        max_hash_checks,
                    )
                };

                // If the match is only 3 bytes long and very far back, it's probably not worth
                // outputting.
                if match_too_far(match_len, match_dist) {
                    match_len = NO_LENGTH as usize;
                };

                if match_len >= lazy_if_less_than {
                    // We found a decent match, so we won't check for a better one at the next byte.
                    ignore_next = true;
                }
                state.current_length = match_len as u16;
                state.current_distance = match_dist as u16;
            } else {
                // We already had a decent match, so we don't bother checking for another one.
                state.current_length = NO_LENGTH;
                state.current_distance = 0;
                // Make sure we check again next time.
                ignore_next = false;
            };

            if prev_length >= state.current_length && prev_length >= MIN_MATCH as u16 {
                // The previous match was better so we add it.
                // Casting note: length and distance is already bounded by the longest match
                // function. Usize is just used for convenience.
                let b_status =
                    writer.write_length_distance(prev_length as u16, prev_distance as u16);

                // We add the bytes to the hash table and checksum.
                // Since we've already added two of them, we need to add two less than
                // the length.
                let bytes_to_add = prev_length - 2;

                add_to_hash_table(
                    bytes_to_add as usize,
                    &mut insert_it,
                    &mut hash_it,
                    &mut hash_table,
                );

                // If the match is longer than the current window, we have to note how many
                // bytes we overlap, since we don't need to do any matching on these bytes
                // in the next call of this function.
                // We don't have to worry about setting overlap to 0 if this is false, as the
                // function will stop after this condition is true, and overlap is not altered
                // elsewhere.
                if position + prev_length as usize > end {
                    // We need to subtract 1 since the byte at pos is also included.
                    overlap = position + prev_length as usize - end - 1;
                };

                state.add = false;

                // Note that there is no current match.
                state.current_length = 0;
                state.current_distance = 0;

                if let BufferStatus::Full = b_status {
                    // MATCH(lazy)
                    return (overlap, buffer_full(position + prev_length as usize - 1));
                }

                ignore_next = false;
            } else if state.add {
                // We found a better match (or there was no previous match)
                // so output the previous byte.
                // BETTER OR NO MATCH
                write_literal!(writer, state.prev_byte, position + 1);
            } else {
                // No byte was pending; the current byte becomes the pending one.
                state.add = true
            }

            prev_length = state.current_length;
            prev_distance = state.current_distance;
            state.prev_byte = b;
        } else {
            // If there is a match at this point, it will not have been added, so we need to add it.
            if prev_length >= MIN_MATCH as u16 {
                let b_status =
                    writer.write_length_distance(prev_length as u16, prev_distance as u16);

                state.current_length = 0;
                state.current_distance = 0;
                state.add = false;

                debug_assert!((position + prev_length as usize) >= end - 1);
                // Needed to note overlap as we can end up with the last window containing the rest
                // of the match.
                overlap = (position + prev_length as usize)
                    .saturating_sub(end)
                    .saturating_sub(1);

                // TODO: Not sure if we need to signal that the buffer is full here.
                // It's only needed in the case of syncing.
                if let BufferStatus::Full = b_status {
                    // TODO: These bytes should be hashed when doing a sync flush.
                    // This can't be done here as the new input data does not exist yet.
                    return (overlap, buffer_full(end));
                } else {
                    return (overlap, ProcessStatus::Ok);
                }
            };

            if state.add {
                // We may still have a leftover byte at this point, so we add it here if needed.
                state.add = false;

                // ADD
                write_literal!(writer, state.prev_byte, position);
            };

            // We are at the last two bytes we want to add, so there is no point
            // searching for matches here.

            // AFTER ADD
            write_literal!(writer, b, position + 1);
        }
    }
    (overlap, ProcessStatus::Ok)
}
487
/// Greedy-matching implementation of `process_chunk`.
///
/// Walks `iterated_data` and emits a length/distance pair as soon as an
/// acceptable match is found, otherwise a literal. Returns the number of bytes
/// a final match overlapped into the lookahead, and whether the writer's
/// buffer filled up.
fn process_chunk_greedy(
    data: &[u8],
    iterated_data: &Range<usize>,
    mut hash_table: &mut ChainedHashTable,
    writer: &mut DynamicWriter,
    max_hash_checks: u16,
) -> (usize, ProcessStatus) {
    let (end, mut insert_it, mut hash_it) = create_iterators(data, iterated_data);

    const NO_LENGTH: usize = 0;

    // The number of bytes past end that was added due to finding a match that extends into
    // the lookahead window.
    let mut overlap = 0;

    // Iterate through the slice, adding literals or length/distance pairs.
    while let Some((position, &b)) = insert_it.next() {
        if let Some(&hash_byte) = hash_it.next() {
            hash_table.add_hash_value(position, hash_byte);

            // TODO: This should be cleaned up a bit.
            let (match_len, match_dist) =
                { longest_match(data, hash_table, position, NO_LENGTH, max_hash_checks) };

            if match_len >= MIN_MATCH as usize && !match_too_far(match_len, match_dist) {
                // Casting note: length and distance is already bounded by the longest match
                // function. Usize is just used for convenience.
                let b_status = writer.write_length_distance(match_len as u16, match_dist as u16);

                // We add the bytes to the hash table and checksum.
                // Since we've already added one of them, we need to add one less than
                // the length.
                let bytes_to_add = match_len - 1;
                add_to_hash_table(bytes_to_add, &mut insert_it, &mut hash_it, &mut hash_table);

                // If the match is longer than the current window, we have to note how many
                // bytes we overlap, since we don't need to do any matching on these bytes
                // in the next call of this function.
                if position + match_len > end {
                    // NOTE(review): unlike the lazy variant, nothing is subtracted from the
                    // overlap here — confirm the asymmetry with process_chunk_lazy is
                    // intentional.
                    overlap = position + match_len - end;
                };

                if let BufferStatus::Full = b_status {
                    // MATCH
                    return (overlap, buffer_full(position + match_len));
                }
            } else {
                // NO MATCH
                write_literal!(writer, b, position + 1);
            }
        } else {
            // We are at the last two bytes we want to add, so there is no point
            // searching for matches here.
            // END
            write_literal!(writer, b, position + 1);
        }
    }
    (overlap, ProcessStatus::Ok)
}
548
/// Outcome of a call to `lz77_compress_block`.
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum LZ77Status {
    /// Waiting for more input before doing any processing
    NeedInput,
    /// The output buffer is full, so the current block needs to be ended so the
    /// buffer can be flushed.
    EndBlock,
    /// All pending data has been processed.
    Finished,
}
559
#[cfg(test)]
/// Test helper: run `lz77_compress_block` with `Flush::Finish` and discard the
/// final-position component of the result.
pub fn lz77_compress_block_finish(
    data: &[u8],
    state: &mut LZ77State,
    buffer: &mut InputBuffer,
    writer: &mut DynamicWriter,
) -> (usize, LZ77Status) {
    let (consumed, status, _) = lz77_compress_block(data, state, buffer, writer, Flush::Finish);
    (consumed, status)
}
571
/// Compress a slice with lz77 compression.
///
/// This function processes one window at a time, and returns when there is no input left,
/// or it determines it's time to end a block.
///
/// Returns the number of bytes of the input that were consumed, a status describing
/// whether there is no input, it's time to finish, or it's time to end the block, and the position
/// of the first byte in the input buffer that has not been output (but may have been checked for
/// matches).
pub fn lz77_compress_block(
    data: &[u8],
    state: &mut LZ77State,
    buffer: &mut InputBuffer,
    mut writer: &mut DynamicWriter,
    flush: Flush,
) -> (usize, LZ77Status, usize) {
    // Currently we only support the maximum window size
    let window_size = DEFAULT_WINDOW_SIZE;

    // Indicates whether we should try to process all the data including the lookahead, or if we
    // should wait until we have at least one window size of data before doing anything.
    let finish = flush == Flush::Finish || flush == Flush::Sync;
    let sync = flush == Flush::Sync;

    let mut current_position = 0;

    // The current status of the encoding.
    let mut status = LZ77Status::EndBlock;

    // Whether to warm up the hash chain with the two first values.
    let mut add_initial = true;

    // If we have synced, add the two first bytes to the hash as they couldn't be added before.
    if state.was_synced {
        if buffer.current_end() > 2 {
            let pos_add = buffer.current_end() - 2;
            for (n, &b) in data.iter().take(2).enumerate() {
                state.hash_table.add_hash_value(n + pos_add, b);
            }
            add_initial = false;
        }
        state.was_synced = false;
    }

    // Add data to the input buffer and keep a reference to the slice of data not added yet.
    let mut remaining_data = buffer.add_data(data);

    loop {
        // Note if there is a pending byte from the previous call to process_chunk,
        // so we get the block input size right.
        let pending_previous = state.pending_byte_as_num();

        assert!(writer.buffer_length() <= (window_size * 2));
        // Don't do anything until we are either flushing, or we have at least one window of
        // data.
        if buffer.current_end() >= (window_size * 2) + MAX_MATCH || finish {
            if state.is_first_window {
                if buffer.get_buffer().len() >= 2
                    && add_initial
                    && state.current_block_input_bytes == 0
                {
                    let b = buffer.get_buffer();
                    // Warm up the hash with the two first values, so we can find matches at
                    // index 0.
                    state.hash_table.add_initial_hash_values(b[0], b[1]);
                    add_initial = false;
                }
            } else if buffer.current_end() >= window_size + 2 {
                // Catch up on hashing the lookahead bytes that were covered by a
                // match in the previous call (see `bytes_to_hash`).
                for (n, &h) in buffer.get_buffer()[window_size + 2..]
                    .iter()
                    .enumerate()
                    .take(state.bytes_to_hash)
                {
                    state.hash_table.add_hash_value(window_size + n, h);
                }
                state.bytes_to_hash = 0;
            }

            // The first window starts at 0; subsequent windows occupy the second
            // half of the double-window buffer.
            let window_start = if state.is_first_window {
                0
            } else {
                window_size
            };
            let start = state.overlap + window_start;
            let end = cmp::min(window_size + window_start, buffer.current_end());

            let (overlap, p_status) = process_chunk(
                buffer.get_buffer(),
                &(start..end),
                &mut state.match_state,
                &mut state.hash_table,
                &mut writer,
                state.max_hash_checks,
                state.lazy_if_less_than as usize,
                state.matching_type,
            );

            state.bytes_to_hash = overlap;

            if let ProcessStatus::BufferFull(written) = p_status {
                state.current_block_input_bytes +=
                    (written - start + pending_previous - state.pending_byte_as_num()) as u64;

                // If the buffer is full, return and end the block.
                // If overlap is non-zero, the buffer was full after outputting the last byte,
                // otherwise we have to skip to the point in the buffer where we stopped in the
                // next call.
                state.overlap = if overlap > 0 {
                    if !state.is_first_window {
                        // If we are at the end of the window, make sure we slide the buffer and the
                        // hash table.
                        if state.max_hash_checks > 0 {
                            state.hash_table.slide(window_size);
                        }
                        remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
                    } else {
                        state.is_first_window = false;
                    }
                    overlap
                } else {
                    written - window_start
                };

                current_position = written - state.pending_byte_as_num();

                // Status is already EndBlock at this point.
                // status = LZ77Status::EndBlock;
                break;
            }

            state.current_block_input_bytes +=
                (end - start + overlap + pending_previous - state.pending_byte_as_num()) as u64;

            // The buffer is not full, but we still need to note if there is any overlap into the
            // next window.
            state.overlap = overlap;

            if (state.is_first_window || remaining_data.is_none())
                && finish
                && end >= buffer.current_end()
            {
                current_position = if state.is_first_window {
                    end - state.pending_byte_as_num()
                } else {
                    debug_assert!(
                        !state.pending_byte(),
                        "Bug! Ended compression with a pending byte!"
                    );
                    buffer.current_end()
                };

                // We stopped before or at the window size, so we are at the end.
                if !sync {
                    // If we are finishing and not syncing, we simply indicate that we are done.
                    state.set_last();
                    state.is_first_window = false;
                } else {
                    // For sync flushing we need to slide the buffer and the hash chains so that the
                    // next call to this function starts at the right place.

                    // There won't be any overlap, since when syncing, we process to the end of the
                    // pending data.
                    state.overlap = if state.is_first_window {
                        end
                    } else {
                        buffer.current_end() - window_size
                    };
                    state.was_synced = true;
                }
                status = LZ77Status::Finished;
                break;
            } else if state.is_first_window {
                state.is_first_window = false;
            } else {
                // We are not at the end, so slide and continue.
                // We slide the hash table back to make space for new hash values
                // We only need to remember 2^15 bytes back (the maximum distance allowed by the
                // deflate spec).
                if state.max_hash_checks > 0 {
                    state.hash_table.slide(window_size);
                }

                // Also slide the buffer, discarding data we no longer need and adding new data.
                remaining_data = buffer.slide(remaining_data.unwrap_or(&[]));
            }
        } else {
            // The caller has not indicated that they want to finish or flush, and there is less
            // than a window + lookahead of new data, so we wait for more.
            status = LZ77Status::NeedInput;
            break;
        }
    }

    (
        data.len() - remaining_data.unwrap_or(&[]).len(),
        status,
        current_position,
    )
}
771
#[cfg(test)]
/// Test helper: decompress `input` lz77 values with an empty back buffer.
pub fn decompress_lz77(input: &[LZValue]) -> Vec<u8> {
    decompress_lz77_with_backbuffer(input, &[])
}
776
#[cfg(test)]
/// Test helper: decompress `input` lz77 values into a byte vector.
///
/// `back_buffer` supplies data preceding `input`, so matches may refer back
/// into it. Panics if a match refers further back than the back buffer reaches.
pub fn decompress_lz77_with_backbuffer(input: &[LZValue], back_buffer: &[u8]) -> Vec<u8> {
    let mut output = Vec::new();
    for p in input {
        match p.value() {
            LZType::Literal(l) => output.push(l),
            LZType::StoredLengthDistance(l, d) => {
                // We found a match, so we have to get the data that the match refers to.
                let d = d as usize;
                let prev_output_len = output.len();
                // Get match data from the back buffer if the match extends that far.
                let consumed = if d > output.len() {
                    let into_back_buffer = d - output.len();

                    assert!(
                        into_back_buffer <= back_buffer.len(),
                        "ERROR: Attempted to refer to a match in non-existing data!\
                         into_back_buffer: {}, back_buffer len {}, d {}, l {:?}",
                        into_back_buffer,
                        back_buffer.len(),
                        d,
                        l
                    );
                    let start = back_buffer.len() - into_back_buffer;
                    let end = cmp::min(back_buffer.len(), start + l.actual_length() as usize);
                    output.extend_from_slice(&back_buffer[start..end]);

                    // Number of match bytes already copied from the back buffer.
                    end - start
                } else {
                    0
                };

                // Get match data from the currently decompressed data.
                // Copy byte by byte since the match may overlap its own output
                // (distance shorter than length).
                let start = prev_output_len.saturating_sub(d);
                let mut n = 0;
                while n < (l.actual_length() as usize).saturating_sub(consumed) {
                    let b = output[start + n];
                    output.push(b);
                    n += 1;
                }
            }
        }
    }
    output
}
822
/// Helper struct bundling everything needed to drive lz77 compression in tests.
#[cfg(test)]
pub struct TestStruct {
    /// Compressor state (hash table, position tracking, matching options).
    state: LZ77State,
    /// Input buffer holding the current window(s) of input data.
    buffer: InputBuffer,
    /// Writer that the generated lz77 values are output to.
    writer: DynamicWriter,
}
829
#[cfg(test)]
impl TestStruct {
    /// Create a test compressor using the high compression option presets
    /// and lazy matching.
    fn new() -> TestStruct {
        TestStruct::with_config(
            HIGH_MAX_HASH_CHECKS,
            HIGH_LAZY_IF_LESS_THAN,
            MatchingType::Lazy,
        )
    }

    /// Create a test compressor with the given matching parameters and
    /// empty input/output buffers.
    fn with_config(
        max_hash_checks: u16,
        lazy_if_less_than: u16,
        matching_type: MatchingType,
    ) -> TestStruct {
        TestStruct {
            state: LZ77State::new(max_hash_checks, lazy_if_less_than, matching_type),
            buffer: InputBuffer::empty(),
            writer: DynamicWriter::new(),
        }
    }

    /// Run one round of compression on `data`, finishing the stream if
    /// `flush` is true. Returns `(bytes consumed, status, position)`
    /// (see the callers in the tests below).
    fn compress_block(&mut self, data: &[u8], flush: bool) -> (usize, LZ77Status, usize) {
        lz77_compress_block(
            data,
            &mut self.state,
            &mut self.buffer,
            &mut self.writer,
            if flush { Flush::Finish } else { Flush::None },
        )
    }
}
862
/// Compress `data` using the high compression option presets.
///
/// Test-only convenience wrapper around [`lz77_compress_conf`].
#[cfg(test)]
pub fn lz77_compress(data: &[u8]) -> Option<Vec<LZValue>> {
    lz77_compress_conf(
        data,
        HIGH_MAX_HASH_CHECKS,
        HIGH_LAZY_IF_LESS_THAN,
        MatchingType::Lazy,
    )
}
872
/// Compress a slice, not storing frequency information
///
/// This is a convenience function for compression with fixed huffman values
/// Only used in tests for now
#[cfg(test)]
pub fn lz77_compress_conf(
    data: &[u8],
    max_hash_checks: u16,
    lazy_if_less_than: u16,
    matching_type: MatchingType,
) -> Option<Vec<LZValue>> {
    // Keep the compressor state on the heap, as in the original
    // implementation (it owns the hash table).
    let mut test = Box::new(TestStruct::with_config(
        max_hash_checks,
        lazy_if_less_than,
        matching_type,
    ));
    let mut out = Vec::<LZValue>::with_capacity(data.len() / 3);
    let mut remaining = data;

    // Feed the compressor block by block until it reports the last block,
    // draining the writer's buffer after each round.
    while !test.state.is_last_block {
        let (bytes_written, _) = lz77_compress_block_finish(
            remaining,
            &mut test.state,
            &mut test.buffer,
            &mut test.writer,
        );
        remaining = &remaining[bytes_written..];
        out.extend(test.writer.get_buffer());
        test.writer.clear();
    }

    Some(out)
}
910
911#[cfg(test)]
912mod test {
913    use super::*;
914
915    use crate::chained_hash_table::WINDOW_SIZE;
916    use crate::compression_options::DEFAULT_LAZY_IF_LESS_THAN;
917    use crate::lzvalue::{ld, lit, LZType, LZValue};
918    use crate::output_writer::MAX_BUFFER_LENGTH;
919    use crate::test_utils::get_test_data;
920
    /// Helper function to print the output from the lz77 compression function
    ///
    /// Literals are printed as-is, while matches are rendered as
    /// `<L length><D distance>`.
    fn print_output(input: &[LZValue]) {
        let mut output = vec![];
        for l in input {
            match l.value() {
                LZType::Literal(l) => output.push(l),
                LZType::StoredLengthDistance(l, d) => {
                    output.extend(format!("<L {}>", l.actual_length()).into_bytes());
                    output.extend(format!("<D {}>", d).into_bytes())
                }
            }
        }

        println!("\"{}\"", String::from_utf8(output).unwrap());
    }
936
    /// Test that a short string from an example on SO compresses correctly
    ///
    /// The final value should be a match of length 4, distance 5: the second
    /// "late" referring back to the "late" inside "Deflate".
    #[test]
    fn compress_short() {
        let test_bytes = String::from("Deflate late").into_bytes();
        let res = lz77_compress(&test_bytes).unwrap();

        let decompressed = decompress_lz77(&res);

        assert_eq!(test_bytes, decompressed);
        assert_eq!(*res.last().unwrap(), LZValue::length_distance(4, 5));
    }
948
    /// Test that compression is working for a longer file
    #[test]
    fn compress_long() {
        let input = get_test_data();
        let compressed = lz77_compress(&input).unwrap();
        // The lz77-value representation should be shorter than the raw input.
        assert!(compressed.len() < input.len());

        let decompressed = decompress_lz77(&compressed);
        println!("compress_long length: {}", input.len());

        // This is to check where the compression fails, if it were to
        for (n, (&a, &b)) in input.iter().zip(decompressed.iter()).enumerate() {
            if a != b {
                println!("First difference at {}, input: {}, output: {}", n, a, b);
                break;
            }
        }
        assert_eq!(input.len(), decompressed.len());
        assert!(&decompressed == &input);
    }
969
    /// Check that lazy matching is working as intended
    #[test]
    fn lazy() {
        // We want to match on `badger` rather than `nba` as it is longer
        // let data = b" nba nbadg badger nbadger";
        let data = b"nba badger nbadger";
        let compressed = lz77_compress(data).unwrap();
        let test = compressed[compressed.len() - 1];
        // The last value should be a 6-byte match (`badger`), which only
        // happens if the matcher deferred the shorter `nba` match.
        if let LZType::StoredLengthDistance(l, _) = test.value() {
            assert_eq!(l.actual_length(), 6);
        } else {
            print_output(&compressed);
            panic!();
        }
    }
985
986    fn roundtrip(data: &[u8]) {
987        let compressed = super::lz77_compress(&data).unwrap();
988        let decompressed = decompress_lz77(&compressed);
989        assert!(decompressed == data);
990    }
991
992    // Check that data with the exact window size is working properly
993    #[test]
994    #[allow(unused)]
995    fn exact_window_size() {
996        use std::io::Write;
997        let mut data = vec![0; WINDOW_SIZE];
998        roundtrip(&data);
999        {
1000            data.write(&[22; WINDOW_SIZE]);
1001        }
1002        roundtrip(&data);
1003        {
1004            data.write(&[55; WINDOW_SIZE]);
1005        }
1006        roundtrip(&data);
1007    }
1008
1009    /// Test that matches at the window border are working correctly
1010    #[test]
1011    fn border() {
1012        use crate::chained_hash_table::WINDOW_SIZE;
1013        let mut data = vec![35; WINDOW_SIZE];
1014        data.extend(b"Test");
1015        let compressed = super::lz77_compress(&data).unwrap();
1016        assert!(compressed.len() < data.len());
1017        let decompressed = decompress_lz77(&compressed);
1018
1019        assert_eq!(decompressed.len(), data.len());
1020        assert!(decompressed == data);
1021    }
1022
1023    #[test]
1024    fn border_multiple_blocks() {
1025        use crate::chained_hash_table::WINDOW_SIZE;
1026        let mut data = vec![0; (WINDOW_SIZE * 2) + 50];
1027        data.push(1);
1028        let compressed = super::lz77_compress(&data).unwrap();
1029        assert!(compressed.len() < data.len());
1030        let decompressed = decompress_lz77(&compressed);
1031        assert_eq!(decompressed.len(), data.len());
1032        assert!(decompressed == data);
1033    }
1034
    /// Check that compressing a small input to completion reports
    /// `Finished` and leaves the input data in the input buffer.
    #[test]
    fn compress_block_status() {
        use crate::input_buffer::InputBuffer;

        let data = b"Test data data";
        let mut writer = DynamicWriter::new();

        let mut buffer = InputBuffer::empty();
        let mut state = LZ77State::new(4096, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
        let status = lz77_compress_block_finish(data, &mut state, &mut buffer, &mut writer);
        assert_eq!(status.1, LZ77Status::Finished);
        // The input buffer should start with exactly the input bytes.
        assert!(&buffer.get_buffer()[..data.len()] == data);
        assert_eq!(buffer.current_end(), data.len());
    }
1049
    /// Check that the compressor stops after two windows plus lookahead,
    /// reports `EndBlock`, and can continue with the remaining data.
    #[test]
    fn compress_block_multiple_windows() {
        use crate::input_buffer::InputBuffer;
        use crate::output_writer::DynamicWriter;

        let data = get_test_data();
        // The test data must be longer than what fits in the input buffer.
        assert!(data.len() > (WINDOW_SIZE * 2) + super::MAX_MATCH);
        let mut writer = DynamicWriter::new();

        let mut buffer = InputBuffer::empty();
        let mut state = LZ77State::new(0, DEFAULT_LAZY_IF_LESS_THAN, MatchingType::Lazy);
        let (bytes_consumed, status) =
            lz77_compress_block_finish(&data, &mut state, &mut buffer, &mut writer);
        // The input buffer should be filled to exactly two windows plus the
        // maximum match length of lookahead.
        assert_eq!(
            buffer.get_buffer().len(),
            (WINDOW_SIZE * 2) + super::MAX_MATCH
        );
        assert_eq!(status, LZ77Status::EndBlock);
        let buf_len = buffer.get_buffer().len();
        assert!(buffer.get_buffer()[..] == data[..buf_len]);

        writer.clear();
        // Compressing the remaining data should again stop at a block boundary.
        let (_, status) = lz77_compress_block_finish(
            &data[bytes_consumed..],
            &mut state,
            &mut buffer,
            &mut writer,
        );
        assert_eq!(status, LZ77Status::EndBlock);
    }
1080
    /// Check that compressing input split over two calls gives the same
    /// output as compressing it in one call.
    #[test]
    fn multiple_inputs() {
        let data = b"Badger badger bababa test data 25 asfgestghresjkgh";
        let comp1 = lz77_compress(data).unwrap();
        let comp2 = {
            const SPLIT: usize = 25;
            let first_part = &data[..SPLIT];
            let second_part = &data[SPLIT..];
            let mut state = TestStruct::new();
            // The first part is smaller than a window plus lookahead, so the
            // compressor should consume it all and ask for more input.
            let (bytes_written, status, _) = state.compress_block(first_part, false);
            assert_eq!(bytes_written, first_part.len());
            assert_eq!(status, LZ77Status::NeedInput);
            // Flushing with the second part should finish the stream.
            let (bytes_written, status, _) = state.compress_block(second_part, true);
            assert_eq!(bytes_written, second_part.len());
            assert_eq!(status, LZ77Status::Finished);
            Vec::from(state.writer.get_buffer())
        };
        assert!(comp1 == comp2);
    }
1100
    #[test]
    /// Test that the exit from process_chunk when buffer is full is working correctly.
    fn buffer_fill() {
        let data = get_test_data();
        // The comments above these calls refer to the positions with the
        // corresponding comments in process_chunk_{greedy/lazy}.
        // POS BETTER OR NO MATCH
        buffer_test_literals(&data);
        // POS END
        // POS NO MATCH
        buffer_test_last_bytes(MatchingType::Greedy, &data);
        // POS ADD
        // POS AFTER ADD
        buffer_test_last_bytes(MatchingType::Lazy, &data);

        // POS MATCH
        buffer_test_match(MatchingType::Greedy);
        // POS MATCH(lazy)
        buffer_test_match(MatchingType::Lazy);

        // POS END
        buffer_test_add_end(&data);
    }
1124
    /// Test buffer fill when a byte is added due to no match being found.
    fn buffer_test_literals(data: &[u8]) {
        // max_hash_checks = 0 with the NO_RLE sentinel as lazy_if_less_than
        // appears to make the compressor emit only literals — confirm against
        // process_chunk if this test's assumptions change.
        let mut state = TestStruct::with_config(0, NO_RLE, MatchingType::Lazy);
        let (bytes_consumed, status, position) = state.compress_block(&data, false);

        // There should be enough data for the block to have ended.
        assert_eq!(status, LZ77Status::EndBlock);
        assert!(bytes_consumed <= (WINDOW_SIZE * 2) + MAX_MATCH);

        // The buffer should be full.
        assert_eq!(state.writer.get_buffer().len(), MAX_BUFFER_LENGTH);
        assert_eq!(position, state.writer.get_buffer().len());
        // Since all literals have been input, the block should have the exact number of litlens
        // as there were input bytes.
        assert_eq!(
            state.state.current_block_input_bytes() as usize,
            MAX_BUFFER_LENGTH
        );
        state.state.reset_input_bytes();

        let mut out = decompress_lz77(state.writer.get_buffer());

        state.writer.clear();
        // The buffer should now be cleared.
        assert_eq!(state.writer.get_buffer().len(), 0);

        assert!(data[..out.len()] == out[..]);

        let _ = state.compress_block(&data[bytes_consumed..], false);
        // We should have some new data in the buffer at this point.
        assert!(state.writer.get_buffer().len() > 0);
        assert_eq!(
            state.state.current_block_input_bytes() as usize,
            MAX_BUFFER_LENGTH
        );

        out.extend_from_slice(&decompress_lz77(state.writer.get_buffer()));
        assert!(data[..out.len()] == out[..]);
    }
1164
    /// Test buffer fill at the last two bytes that are not hashed.
    fn buffer_test_last_bytes(matching_type: MatchingType, data: &[u8]) {
        const BYTES_USED: usize = MAX_BUFFER_LENGTH;
        // Roundtrip exactly MAX_BUFFER_LENGTH bytes, and one byte more, to
        // exercise the buffer-full handling at the unhashed tail.
        assert!(
            &data[..BYTES_USED]
                == &decompress_lz77(
                    &lz77_compress_conf(&data[..BYTES_USED], 0, NO_RLE, matching_type,).unwrap()
                )[..]
        );
        assert!(
            &data[..BYTES_USED + 1]
                == &decompress_lz77(
                    &lz77_compress_conf(&data[..BYTES_USED + 1], 0, NO_RLE, matching_type,)
                        .unwrap()
                )[..]
        );
    }
1182
    /// Test buffer fill when buffer is full at a match.
    fn buffer_test_match(matching_type: MatchingType) {
        // TODO: Also test this for the second block to make sure
        // buffer is slid.
        let mut state = TestStruct::with_config(1, 0, matching_type);
        // Pre-fill the writer so the buffer becomes full right where the
        // match in the input below would be emitted.
        for _ in 0..MAX_BUFFER_LENGTH - 4 {
            assert!(state.writer.write_literal(0) == BufferStatus::NotFull);
        }
        state.compress_block(&[1, 2, 3, 1, 2, 3, 4], true);
        // The repeated `1, 2, 3` should end up as a length-3, distance-3 match.
        assert!(*state.writer.get_buffer().last().unwrap() == LZValue::length_distance(3, 3));
    }
1194
    /// Test buffer fill for the lazy match algorithm when adding a pending byte at the end.
    fn buffer_test_add_end(_data: &[u8]) {
        // This is disabled while the buffer size has not been stabilized.
        /*
        let mut state = TestStruct::with_config(DEFAULT_MAX_HASH_CHECKS,
                                                DEFAULT_LAZY_IF_LESS_THAN,
                                                MatchingType::Lazy);
        // For the test file, this is how much data needs to be added to get the buffer
        // full at the right spot to test that this buffer full exit is working correctly.
        for i in 0..31743 {
            assert!(state.writer.write_literal(0) == BufferStatus::NotFull, "Buffer pos: {}", i);
        }

        let dec = decompress_lz77(&state.writer.get_buffer()[pos..]);
        assert!(dec.len() > 0);
        assert!(dec[..] == data[..dec.len()]);
         */
    }
1213
    /// Check that decompressing lz77-data that refers to the back-buffer works.
    #[test]
    fn test_decompress_with_backbuffer() {
        // A window-sized back buffer filled with the byte value 5.
        let bb = vec![5; WINDOW_SIZE];
        let lz_compressed = [lit(2), lit(4), ld(4, 20), lit(1), lit(1), ld(5, 10)];
        let dec = decompress_lz77_with_backbuffer(&lz_compressed, &bb);

        // Both matches reach into the back buffer; the second also overlaps
        // the already-decompressed literals.
        // ------------l2 l4  <-ld4,20-> l1 l1  <---ld5,10-->
        assert!(dec == [2, 4, 5, 5, 5, 5, 1, 1, 5, 5, 2, 4, 5]);
    }
1224}
1225
#[cfg(all(test, feature = "benchmarks"))]
mod bench {
    use super::lz77_compress;
    // NOTE(review): these two imports do not use `crate::` paths like the
    // rest of the file (`test_std`, `test_utils`); confirm they still resolve
    // when building with the `benchmarks` feature enabled.
    use test_std::Bencher;
    use test_utils::get_test_data;
    /// Benchmark lz77 compression (without huffman coding) of the test file.
    #[bench]
    fn test_file_zlib_lz77_only(b: &mut Bencher) {
        let test_data = get_test_data();
        b.iter(|| lz77_compress(&test_data));
    }
}