001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.lz77support;
020
021import java.io.IOException;
022import java.io.InputStream;
023import java.util.Arrays;
024
025import org.apache.commons.compress.compressors.CompressorInputStream;
026import org.apache.commons.compress.utils.ByteUtils;
027import org.apache.commons.compress.utils.IOUtils;
028import org.apache.commons.compress.utils.InputStreamStatistics;
029import org.apache.commons.io.input.BoundedInputStream;
030
031/**
032 * Encapsulates code common to LZ77 decompressors.
033 *
034 * <p>
035 * Assumes the stream consists of blocks of literal data and back-references (called copies) in any order. Of course the first block must be a literal block for
036 * the scheme to work - unless the {@link #prefill prefill} method has been used to provide initial data that is never returned by {@link #read read} but only
037 * used for back-references.
038 * </p>
039 *
040 * <p>
041 * Subclasses must override the three-arg {@link #read read} method as the no-arg version delegates to it and the default implementation delegates to the no-arg
042 * version, leading to infinite mutual recursion and a {@code StackOverflowError} otherwise.
043 * </p>
044 *
045 * <p>
046 * The contract for subclasses' {@code read} implementation is:
047 * </p>
048 * <ul>
049 *
050 * <li>keep track of the current state of the stream. Is it inside a literal block or a back-reference or in-between blocks?</li>
051 *
052 * <li>Use {@link #readOneByte} to access the underlying stream directly.</li>
053 *
054 * <li>If a new literal block starts, use {@link #startLiteral} to tell this class about it and read the literal data using {@link #readLiteral} until it
055 * returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readLiteral} would return {@code 0}.</li>
056 *
057 * <li>If a new back-reference starts, use {@link #startBackReference} to tell this class about it and read the literal data using {@link #readBackReference}
058 * until it returns {@code 0}. {@link #hasMoreDataInBlock} will return {@code false} before the next call to {@link #readBackReference} would return
059 * {@code 0}.</li>
060 *
061 * <li>If the end of the stream has been reached, return {@code -1} as this class' methods will never do so themselves.</li>
062 *
063 * </ul>
064 *
065 * <p>
066 * {@link #readOneByte} and {@link #readLiteral} update the counter for bytes read.
067 * </p>
068 *
069 * @since 1.14
070 */
071public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
072
073    /** Size of the window - must be bigger than the biggest offset expected. */
074    private final int windowSize;
075
076    /**
077     * Buffer to write decompressed bytes to for back-references, will be three times windowSize big.
078     *
079     * <p>
080     * Three times so we can slide the whole buffer a windowSize to the left once we've read twice windowSize and still have enough data inside of it to satisfy
081     * back-references.
082     * </p>
083     */
084    private final byte[] buf;
085
086    /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
087    private int writeIndex;
088
089    /** Index of the next byte to be read. */
090    private int readIndex;
091
092    /** The underlying stream to read compressed data from */
093    private final BoundedInputStream in;
094
095    /** Number of bytes still to be read from the current literal or back-reference. */
096    private long bytesRemaining;
097
098    /** Offset of the current back-reference. */
099    private int backReferenceOffset;
100
101    /** Uncompressed size */
102    private int size;
103
104    // used in no-arg read method
105    private final byte[] oneByte = new byte[1];
106
107    /**
108     * Supplier that delegates to {@link #readOneByte}.
109     */
110    protected final ByteUtils.ByteSupplier supplier = this::readOneByte;
111
112    /**
113     * Creates a new LZ77 input stream.
114     *
115     * @param is         An InputStream to read compressed data from
116     * @param windowSize Size of the window kept for back-references, must be bigger than the biggest offset expected.
117     * @throws IllegalArgumentException if windowSize is not bigger than 0
118     */
119    public AbstractLZ77CompressorInputStream(final InputStream is, final int windowSize) {
120        this.in = BoundedInputStream.builder().setInputStream(is).asSupplier().get();
121        if (windowSize <= 0) {
122            throw new IllegalArgumentException("windowSize must be bigger than 0");
123        }
124        this.windowSize = windowSize;
125        buf = new byte[3 * windowSize];
126        writeIndex = readIndex = 0;
127        bytesRemaining = 0;
128    }
129
130    /** {@inheritDoc} */
131    @Override
132    public int available() {
133        return writeIndex - readIndex;
134    }
135
136    /** {@inheritDoc} */
137    @Override
138    public void close() throws IOException {
139        in.close();
140    }
141
142    /**
143     * @since 1.17
144     */
145    @Override
146    public long getCompressedCount() {
147        return in.getCount();
148    }
149
150    /**
151     * Gets the uncompressed size of the stream
152     *
153     * @return the uncompressed size
154     */
155    public int getSize() {
156        return size;
157    }
158
159    /**
160     * Is there still data remaining inside the current block?
161     *
162     * @return true if there is still data remaining inside the current block.
163     */
164    protected final boolean hasMoreDataInBlock() {
165        return bytesRemaining > 0;
166    }
167
168    /**
169     * Adds some initial data to fill the window with.
170     *
171     * <p>
172     * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
173     * LZ4 frame format using block dependency.
174     * </p>
175     *
176     * @param data the data to fill the window with.
177     * @throws IllegalStateException if the stream has already started to read data
178     */
179    public void prefill(final byte[] data) {
180        if (writeIndex != 0) {
181            throw new IllegalStateException("The stream has already been read from, can't prefill anymore");
182        }
183        // we don't need more data than the big offset could refer to, so cap it
184        final int len = Math.min(windowSize, data.length);
185        // we need the last data as we are dealing with *back*-references
186        System.arraycopy(data, data.length - len, buf, 0, len);
187        writeIndex += len;
188        readIndex += len;
189    }
190
191    /** {@inheritDoc} */
192    @Override
193    public int read() throws IOException {
194        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
195    }
196
197    /**
198     * Reads data from the current back-reference.
199     *
200     * @param b   buffer to write data to
201     * @param off offset to start writing to
202     * @param len maximum amount of data to read
203     * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
204     * @throws NullPointerException      if {@code b} is null
205     * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
206     */
207    protected final int readBackReference(final byte[] b, final int off, final int len) {
208        final int avail = available();
209        if (len > avail) {
210            tryToCopy(len - avail);
211        }
212        return readFromBuffer(b, off, len);
213    }
214
215    private int readFromBuffer(final byte[] b, final int off, final int len) {
216        final int readable = Math.min(len, available());
217        if (readable > 0) {
218            System.arraycopy(buf, readIndex, b, off, readable);
219            readIndex += readable;
220            if (readIndex > 2 * windowSize) {
221                slideBuffer();
222            }
223        }
224        size += readable;
225        return readable;
226    }
227
228    /**
229     * Reads data from the current literal block.
230     *
231     * @param b   buffer to write data to
232     * @param off offset to start writing to
233     * @param len maximum amount of data to read
234     * @return number of bytes read, may be 0. Will never return -1 as EOF-detection is the responsibility of the subclass
235     * @throws IOException               if the underlying stream throws or signals an EOF before the amount of data promised for the block have been read
236     * @throws NullPointerException      if {@code b} is null
237     * @throws IndexOutOfBoundsException if {@code off} is negative, {@code len} is negative, or {@code len} is greater than {@code b.length - off}
238     */
239    protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
240        final int avail = available();
241        if (len > avail) {
242            tryToReadLiteral(len - avail);
243        }
244        return readFromBuffer(b, off, len);
245    }
246
247    /**
248     * Reads a single byte from the real input stream and ensures the data is accounted for.
249     *
250     * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
251     * @throws IOException if the underlying stream throws
252     */
253    protected final int readOneByte() throws IOException {
254        final int b = in.read();
255        if (b != -1) {
256            count(1);
257            return b & 0xFF;
258        }
259        return -1;
260    }
261
262    private void slideBuffer() {
263        System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
264        writeIndex -= windowSize;
265        readIndex -= windowSize;
266    }
267
268    /**
269     * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
270     *
271     * @param offset the offset of the back-reference
272     * @param length the length of the back-reference
273     * @throws IllegalArgumentException if offset not bigger than 0 or bigger than the number of bytes available for back-references or if length is negative
274     */
275    protected final void startBackReference(final int offset, final long length) {
276        if (offset <= 0 || offset > writeIndex) {
277            throw new IllegalArgumentException("offset must be bigger than 0 but not bigger than the number of bytes available for back-references");
278        }
279        if (length < 0) {
280            throw new IllegalArgumentException("length must not be negative");
281        }
282        backReferenceOffset = offset;
283        bytesRemaining = length;
284    }
285
286    /**
287     * Used by subclasses to signal the next block contains the given amount of literal data.
288     *
289     * @param length the length of the block
290     * @throws IllegalArgumentException if length is negative
291     */
292    protected final void startLiteral(final long length) {
293        if (length < 0) {
294            throw new IllegalArgumentException("length must not be negative");
295        }
296        bytesRemaining = length;
297    }
298
299    private void tryToCopy(final int bytesToCopy) {
300        // this will fit into the buffer without sliding and not
301        // require more than is available inside the back-reference
302        final int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), buf.length - writeIndex);
303        if (copy == 0) {
304            // NOP
305        } else if (backReferenceOffset == 1) { // pretty common special case
306            final byte last = buf[writeIndex - 1];
307            Arrays.fill(buf, writeIndex, writeIndex + copy, last);
308            writeIndex += copy;
309        } else if (copy < backReferenceOffset) {
310            System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
311            writeIndex += copy;
312        } else {
313            // back-reference overlaps with the bytes created from it
314            // like go back two bytes and then copy six (by copying
315            // the last two bytes three time).
316            final int fullRots = copy / backReferenceOffset;
317            for (int i = 0; i < fullRots; i++) {
318                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
319                writeIndex += backReferenceOffset;
320            }
321
322            final int pad = copy - backReferenceOffset * fullRots;
323            if (pad > 0) {
324                System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
325                writeIndex += pad;
326            }
327        }
328        bytesRemaining -= copy;
329    }
330
331    private void tryToReadLiteral(final int bytesToRead) throws IOException {
332        // min of "what is still inside the literal", "what does the user want" and "how much can fit into the buffer"
333        final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), buf.length - writeIndex);
334        final int bytesRead = reallyTryToRead > 0 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) : 0 /* happens for bytesRemaining == 0 */;
335        count(bytesRead);
336        if (reallyTryToRead != bytesRead) {
337            throw new IOException("Premature end of stream reading literal");
338        }
339        writeIndex += reallyTryToRead;
340        bytesRemaining -= reallyTryToRead;
341    }
342}