/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.lz77support;

import java.io.IOException;
import java.util.Objects;

import org.apache.commons.lang3.ArrayFill;

/**
 * Helper class for compression algorithms that use the ideas of LZ77.
 *
 * <p>
 * Most LZ77-derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
 * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position". The details of
 * how those blocks and back-references are encoded differ considerably between the algorithms, and some algorithms perform additional steps (Huffman encoding
 * in the case of DEFLATE, for example).
 * </p>
 *
 * <p>
 * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
 * (DEFLATE) and supports an optional "lazy match" optimization that can be enabled via the parameters. The three-byte hash function used in this class is the
 * same as the one used by zlib and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
 * </p>
 *
 * <p>
 * The term LZ77 is used loosely here (as it is in many other places that talk about it :-); LZSS would likely be closer to the truth, but LZ77 has become the
 * synonym for a whole family of algorithms.
 * </p>
 *
 * <p>
 * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
 * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
 * information, the {@link #finish()} method must be used once all data has been fed into the compressor.
 * </p>
 *
 * <p>
 * Several parameters influence the outcome of the "compression"; a usage sketch follows the list:
 * </p>
 * <dl>
 *
 * <dt>{@code windowSize}</dt>
 * <dd>the size of the sliding window; it must be a power of two and determines the maximum offset a back-reference can take. The compressor maintains a buffer
 * of twice {@code windowSize} - real world values are in the area of 32k.</dd>
 *
 * <dt>{@code minBackReferenceLength}</dt>
 * <dd>Minimal length of a back-reference found. A true minimum of 3 is hard-coded inside this implementation, but bigger lengths can be configured.</dd>
 *
 * <dt>{@code maxBackReferenceLength}</dt>
 * <dd>Maximal length of a back-reference found.</dd>
 *
 * <dt>{@code maxOffset}</dt>
 * <dd>Maximal offset of a back-reference.</dd>
 *
 * <dt>{@code maxLiteralLength}</dt>
 * <dd>Maximal length of a literal block.</dd>
 * </dl>
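 *
 * <p>
 * A minimal usage sketch, assuming the {@link Parameters} builder provided by this package; {@code data} stands for the uncompressed input and the callback
 * body is a placeholder for format-specific encoding:
 * </p>
 * <pre>{@code
 * Parameters params = Parameters.builder(32 * 1024).build();
 * LZ77Compressor compressor = new LZ77Compressor(params, block -> {
 *     // encode the block (literal block, back-reference or EOD marker) in the target format
 * });
 * compressor.compress(data);
 * compressor.finish();
 * }</pre>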
 *
 * @see "https://tools.ietf.org/html/rfc1951#section-4"
 * @since 1.14
 * @NotThreadSafe
 */
public class LZ77Compressor {

    /**
     * Represents a block that carries an offset and a length, that is a back-reference or a literal block.
     */
    public abstract static class AbstractReference extends Block {

        private final int offset;
        private final int length;

        /**
         * Constructs a new instance.
         *
         * @param blockType The block type.
         * @param offset the offset of the reference.
         * @param length the length of the reference.
         */
        public AbstractReference(final BlockType blockType, final int offset, final int length) {
            super(blockType);
            this.offset = offset;
            this.length = length;
        }

        /**
         * Gets the length of the reference.
         *
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Gets the offset of the reference.
         *
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        @Override
        public String toString() {
            return super.toString() + " with offset " + offset + " and length " + length;
        }
    }

    /**
     * Represents a back-reference.
     */
    public static final class BackReference extends AbstractReference {

        /**
         * Constructs a new instance.
         *
         * @param offset the offset of the back-reference.
         * @param length the length of the back-reference.
         */
        public BackReference(final int offset, final int length) {
            super(BlockType.BACK_REFERENCE, offset, length);
        }

    }

    /**
     * Base class representing blocks the compressor may emit.
     *
     * <p>
     * This class is not supposed to be subclassed by classes outside of Commons Compress, so it is considered internal and changes that would break subclasses
     * may be introduced in future releases.
     * </p>
     */
    public abstract static class Block {

        /**
         * Enumerates the block types the compressor emits.
         */
        public enum BlockType {

            /**
             * The literal block type.
             */
            LITERAL,

            /**
             * The back-reference block type.
             */
            BACK_REFERENCE,

            /**
             * The end-of-data block type.
             */
            EOD
        }

        private final BlockType type;

        /**
         * Constructs a new typeless instance.
         *
         * @deprecated Use {@link #Block(BlockType)}.
         */
        @Deprecated
        public Block() {
            this.type = null;
        }

        /**
         * Constructs a new instance.
         *
         * @param type the block type, may not be {@code null}.
         */
        protected Block(final BlockType type) {
            this.type = Objects.requireNonNull(type);
        }

        /**
         * Gets the block type.
         *
         * @return the block type.
         */
        public BlockType getType() {
            return type;
        }

        @Override
        public String toString() {
            return getClass().getSimpleName() + " " + getType();
        }
    }

    /**
     * Callback invoked while the compressor processes data.
     *
     * <p>
     * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
     * {@link #compress} or {@link #finish}.
     * </p>
     */
    public interface Callback {

        /**
         * Consumes a block.
         *
         * @param b the block to consume
         * @throws IOException in case of an error
         */
        void accept(Block b) throws IOException;
    }

    /** A simple "we are done" marker. */
    public static final class EOD extends Block {

        /**
         * The singleton instance.
         */
        private static final EOD INSTANCE = new EOD();

        /**
         * Constructs a new instance.
         */
        public EOD() {
            super(BlockType.EOD);
        }

    }

    /**
     * Represents a literal block of data.
     *
     * <p>
     * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data; process it immediately inside of
     * {@link Callback#accept} as it will get overwritten sooner or later.
     * </p>
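     *
     * <p>
     * A sketch of a callback that copies the literal bytes before using them; {@code Arrays} is {@code java.util.Arrays} and the storage step is a
     * placeholder:
     * </p>
     * <pre>{@code
     * Callback cb = block -> {
     *     if (block instanceof LiteralBlock) {
     *         LiteralBlock lb = (LiteralBlock) block;
     *         byte[] copy = Arrays.copyOfRange(lb.getData(), lb.getOffset(), lb.getOffset() + lb.getLength());
     *         // store or encode "copy" here - the original array will be reused by the compressor
     *     }
     * };
     * }</pre>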
     */
    public static final class LiteralBlock extends AbstractReference {

        private final byte[] data;

        /**
         * Constructs a new instance.
         *
         * @param data the literal data.
         * @param offset the offset of the literal block inside {@code data}.
         * @param length the length of the literal block.
         */
        public LiteralBlock(final byte[] data, final int offset, final int length) {
            super(BlockType.LITERAL, offset, length);
            this.data = data;
        }

        /**
         * Gets the literal data.
         *
         * <p>
         * This returns a live view of the actual data in order to avoid copying; modify the array at your own risk.
         * </p>
         *
         * @return the data
         */
        public byte[] getData() {
            return data;
        }

    }

    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    private static final int NO_MATCH = -1;

    // we use a 15 bit hash code as calculated in nextHash
    private static final int HASH_SIZE = 1 << 15;
    private static final int HASH_MASK = HASH_SIZE - 1;

    private static final int H_SHIFT = 5;
    private final Parameters params;
    private final Callback callback;

    // the sliding window, twice as big as the "windowSize" parameter
    private final byte[] window;

    // the head of hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window-location points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;
    // bit mask used when indexing into prev
    private final int wMask;
    private boolean initialized;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead;
    // the hash of the three bytes starting at the current position
    private int insertHash;

    // the position inside the window where the current literal
    // block starts (in case we are inside a literal block).
    private int blockStart;

    // position of the current match
    private int matchStart = NO_MATCH;

    // number of missed insertString calls for the up to three last
    // bytes of the last match that can only be performed once more
    // data has been read
    private int missedInserts;

    /**
     * Initializes a compressor with parameters and a callback.
     *
     * @param params   the parameters
     * @param callback the callback
     * @throws NullPointerException if either parameter is {@code null}
     */
    public LZ77Compressor(final Parameters params, final Callback callback) {
        Objects.requireNonNull(params, "params");
        Objects.requireNonNull(callback, "callback");

        this.params = params;
        this.callback = callback;

        final int wSize = params.getWindowSize();
        window = new byte[wSize * 2];
        wMask = wSize - 1;
        head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
        prev = new int[wSize];
    }

    private void catchUpMissedInserts() {
        while (missedInserts > 0) {
            insertString(currentPosition - missedInserts--);
        }
    }

    private void compress() throws IOException {
        final int minMatch = params.getMinBackReferenceLength();
        final boolean lazy = params.getLazyMatching();
        final int lazyThreshold = params.getLazyMatchingThreshold();

        while (lookahead >= minMatch) {
            catchUpMissedInserts();
            int matchLength = 0;
            final int hashHead = insertString(currentPosition);
            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
                // sets matchStart as a side effect
                matchLength = longestMatch(hashHead);

                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
                    // try to find a longer match using the next position
                    matchLength = longestMatchForNextPosition(matchLength);
                }
            }
            if (matchLength >= minMatch) {
                if (blockStart != currentPosition) {
                    // emit preceding literal block
                    flushLiteralBlock();
                    blockStart = NO_MATCH;
                }
                flushBackReference(matchLength);
                insertStringsInMatch(matchLength);
                lookahead -= matchLength;
                currentPosition += matchLength;
                blockStart = currentPosition;
            } else {
                // no match, append to current or start a new literal
                lookahead--;
                currentPosition++;
                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
                    flushLiteralBlock();
                    blockStart = currentPosition;
                }
            }
        }
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
     *
     * @param data the data to compress - must not be null
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data) throws IOException {
        compress(data, 0, data.length);
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
     *
     * @param data the data to compress - must not be null
     * @param off  the start offset of the data
     * @param len  the number of bytes to compress
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data, int off, int len) throws IOException {
        final int wSize = params.getWindowSize();
        while (len > wSize) { // chop into windowSize sized chunks
            doCompress(data, off, wSize);
            off += wSize;
            len -= wSize;
        }
        if (len > 0) {
            doCompress(data, off, len);
        }
    }

    // performs the actual algorithm with the pre-condition len <= windowSize
    private void doCompress(final byte[] data, final int off, final int len) throws IOException {
        final int spaceLeft = window.length - currentPosition - lookahead;
        if (len > spaceLeft) {
            slide();
        }
        System.arraycopy(data, off, window, currentPosition + lookahead, len);
        lookahead += len;
        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
            initialize();
        }
        if (initialized) {
            compress();
        }
    }

    /**
     * Tells the compressor to process all remaining data and signal end of data to the callback.
     *
     * <p>
     * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
     * </p>
     *
     * @throws IOException if the callback throws an exception
     */
    public void finish() throws IOException {
        if (blockStart != currentPosition || lookahead > 0) {
            currentPosition += lookahead;
            flushLiteralBlock();
        }
        callback.accept(EOD.INSTANCE);
    }

    private void flushBackReference(final int matchLength) throws IOException {
        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
    }

    private void flushLiteralBlock() throws IOException {
        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
    }

    private void initialize() {
        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
            insertHash = nextHash(insertHash, window[i]);
        }
        initialized = true;
    }

    /**
     * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
     *
     * <p>
     * Updates {@code insertHash}, {@code head} and {@code prev} as side effects.
     * </p>
     */
    private int insertString(final int pos) {
        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
        final int hashHead = head[insertHash];
        prev[pos & wMask] = hashHead;
        head[insertHash] = pos;
        return hashHead;
    }

    private void insertStringsInMatch(final int matchLength) {
        // inserts strings contained in current match
        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
        // currentPosition has been inserted already
        for (int i = 1; i <= stop; i++) {
            insertString(currentPosition + i);
        }
        missedInserts = matchLength - stop - 1;
    }

    /**
     * Searches the hash chain for real matches and returns the length of the longest match (a value smaller than {@code minBackReferenceLength} if none was
     * found) that isn't too far away (with respect to {@code maxOffset}).
     *
     * <p>
     * Sets matchStart to the index of the start position of the longest match as a side effect.
     * </p>
     */
    private int longestMatch(int matchHead) {
        final int minLength = params.getMinBackReferenceLength();
        int longestMatchLength = minLength - 1;
        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
        final int maxCandidates = params.getMaxCandidates();
        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
            int currentLength = 0;
            for (int i = 0; i < maxPossibleLength; i++) {
                if (window[matchHead + i] != window[currentPosition + i]) {
                    break;
                }
                currentLength++;
            }
            if (currentLength > longestMatchLength) {
                longestMatchLength = currentLength;
                matchStart = matchHead;
                if (currentLength >= niceBackReferenceLength) {
                    // no need to search any further
                    break;
                }
            }
            matchHead = prev[matchHead & wMask];
        }
        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
    }

    private int longestMatchForNextPosition(final int prevMatchLength) {
        // save a bunch of values to restore them if the next match isn't better than the current one
        final int prevMatchStart = matchStart;
        final int prevInsertHash = insertHash;

        lookahead--;
        currentPosition++;
        final int hashHead = insertString(currentPosition);
        final int prevHashHead = prev[currentPosition & wMask];
        int matchLength = longestMatch(hashHead);

        if (matchLength <= prevMatchLength) {
            // use the first match, as the next one isn't any better
            matchLength = prevMatchLength;
            matchStart = prevMatchStart;

            // restore modified values
            head[insertHash] = prevHashHead;
            insertHash = prevInsertHash;
            currentPosition--;
            lookahead++;
        }
        return matchLength;
    }

    /**
     * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
     * nextHash(H, D).
     *
     * <p>
     * The hash is shifted by five bits on each update so all effects of A have been swapped out after the third update.
     * </p>
     */
    private int nextHash(final int oldHash, final byte nextByte) {
        final int nextVal = nextByte & 0xFF;
        return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
    }

    /**
     * Adds some initial data to fill the window with.
     *
     * <p>
     * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
     * LZ4 frame format using block dependency.
     * </p>
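     *
     * <p>
     * A sketch of typical use, where {@code previousBlockTail} and {@code nextBlock} are hypothetical byte arrays provided by the surrounding format:
     * </p>
     * <pre>{@code
     * compressor.prefill(previousBlockTail); // must happen before any call to compress()
     * compressor.compress(nextBlock);
     * }</pre>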
     *
     * @param data the data to fill the window with.
     * @throws IllegalStateException if the compressor has already started to accept data
     */
    public void prefill(final byte[] data) {
        if (currentPosition != 0 || lookahead != 0) {
            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
        }

        // don't need more than windowSize for back-references
        final int len = Math.min(params.getWindowSize(), data.length);
        System.arraycopy(data, data.length - len, window, 0, len);

        if (len >= NUMBER_OF_BYTES_IN_HASH) {
            initialize();
            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
            for (int i = 0; i < stop; i++) {
                insertString(i);
            }
            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
        } else { // not enough data to hash anything
            missedInserts = len;
        }
        blockStart = currentPosition = len;
    }

    private void slide() throws IOException {
        final int wSize = params.getWindowSize();
        if (blockStart != currentPosition && blockStart < wSize) {
            flushLiteralBlock();
            blockStart = currentPosition;
        }
        System.arraycopy(window, wSize, window, 0, wSize);
        currentPosition -= wSize;
        matchStart -= wSize;
        blockStart -= wSize;
        for (int i = 0; i < HASH_SIZE; i++) {
            final int h = head[i];
            head[i] = h >= wSize ? h - wSize : NO_MATCH;
        }
        for (int i = 0; i < wSize; i++) {
            final int p = prev[i];
            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
        }
    }
}