/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.lz77support;

import java.io.IOException;
import java.util.Objects;

import org.apache.commons.lang3.ArrayFill;

/**
 * Helper class for compression algorithms that use the ideas of LZ77.
 *
 * <p>
 * Most LZ77-derived algorithms split input data into blocks of uncompressed data (called literal blocks) and back-references (pairs of offsets and lengths)
 * that state "add {@code length} bytes that are the same as those already written starting {@code offset} bytes before the current position". The details of
 * how those blocks and back-references are encoded are quite different between the algorithms and some algorithms perform additional steps (Huffman encoding
 * in the case of DEFLATE for example).
 * </p>
 *
 * <p>
 * This class attempts to extract the core logic - finding back-references - so it can be re-used. It follows the algorithm explained in section 4 of RFC 1951
 * (DEFLATE), including the optional "lazy match" optimization, which can be enabled via {@link Parameters}. The three-byte hash function used in this class is
 * the same as the one used by zlib and InfoZIP's ZIP implementation of DEFLATE. The whole class is strongly inspired by InfoZIP's implementation.
 * </p>
 *
 * <p>
 * LZ77 is used loosely here (as in many other places that talk about it :-); LZSS would likely be closer to the truth, but LZ77 has become the synonym for a
 * whole family of algorithms.
 * </p>
 *
 * <p>
 * The API consists of a compressor that is fed {@code byte}s and emits {@link Block}s to a registered callback where the blocks represent either
 * {@link LiteralBlock literal blocks}, {@link BackReference back-references} or {@link EOD end of data markers}. In order to ensure the callback receives all
 * information, the {@link #finish} method must be used once all data has been fed into the compressor.
 * </p>
 *
 * <p>
 * Several parameters influence the outcome of the "compression":
 * </p>
 * <dl>
 *
 * <dt>{@code windowSize}</dt>
 * <dd>the size of the sliding window, must be a power of two - this determines the maximum offset a back-reference can take. The compressor maintains a buffer
 * of twice {@code windowSize} - real world values are in the area of 32k.</dd>
 *
 * <dt>{@code minBackReferenceLength}</dt>
 * <dd>Minimal length of a back-reference found.
A true minimum of 3 is hard-coded inside this implementation, but bigger lengths can be configured.</dd>
 *
 * <dt>{@code maxBackReferenceLength}</dt>
 * <dd>Maximal length of a back-reference found.</dd>
 *
 * <dt>{@code maxOffset}</dt>
 * <dd>Maximal offset of a back-reference.</dd>
 *
 * <dt>{@code maxLiteralLength}</dt>
 * <dd>Maximal length of a literal block.</dd>
 * </dl>
 *
 * @see "https://tools.ietf.org/html/rfc1951#section-4"
 * @since 1.14
 * @NotThreadSafe
 */
public class LZ77Compressor {

    /**
     * Base class for blocks that carry an offset and a length, i.e. back-references and literal blocks.
     */
    public abstract static class AbstractReference extends Block {

        private final int offset;
        private final int length;

        /**
         * Constructs a new instance.
         *
         * @param blockType The block type.
         * @param offset the offset of the reference.
         * @param length the length of the reference.
         */
        public AbstractReference(final BlockType blockType, final int offset, final int length) {
            super(blockType);
            this.offset = offset;
            this.length = length;
        }

        /**
         * Gets the length of the reference.
         *
         * @return the length
         */
        public int getLength() {
            return length;
        }

        /**
         * Gets the offset of the reference.
         *
         * @return the offset
         */
        public int getOffset() {
            return offset;
        }

        @Override
        public String toString() {
            return super.toString() + " with offset " + offset + " and length " + length;
        }
    }

    /**
     * Represents a back-reference.
     */
    public static final class BackReference extends AbstractReference {

        /**
         * Constructs a new instance.
         *
         * @param offset the offset of the back-reference.
         * @param length the length of the back-reference.
         */
        public BackReference(final int offset, final int length) {
            super(BlockType.BACK_REFERENCE, offset, length);
        }

    }

    /**
     * Base class representing blocks the compressor may emit.
     *
     * <p>
     * This class is not supposed to be subclassed by classes outside of Commons Compress, so it is considered internal and changes that would break subclasses
     * may be introduced in future releases.
     * </p>
     */
    public abstract static class Block {

        /**
         * Enumerates the block types the compressor emits.
         */
        public enum BlockType {

            /**
             * The literal block type.
             */
            LITERAL,

            /**
             * The back-reference block type.
             */
            BACK_REFERENCE,

            /**
             * The end-of-data block type.
             */
            EOD
        }

        private final BlockType type;

        /**
         * Constructs a new typeless instance.
         *
         * @deprecated Use {@link #Block(BlockType)}.
         */
        @Deprecated
        public Block() {
            this.type = null;
        }

        /**
         * Constructs a new instance.
         *
         * @param type the block type, may not be {@code null}.
         */
        protected Block(final BlockType type) {
            this.type = Objects.requireNonNull(type);
        }

        /**
         * Gets the block type.
         *
         * @return the block type.
         */
        public BlockType getType() {
            return type;
        }

        @Override
        public String toString() {
            return getClass().getSimpleName() + " " + getType();
        }
    }

    /**
     * Callback invoked while the compressor processes data.
     *
     * <p>
     * The callback is invoked on the same thread that receives the bytes to compress and may be invoked multiple times during the execution of
     * {@link #compress} or {@link #finish}.
     * </p>
     */
    public interface Callback {

        /**
         * Consumes a block.
         *
         * @param b the block to consume
         * @throws IOException in case of an error
         */
        void accept(Block b) throws IOException;
    }

    /** A simple "we are done" marker. */
    public static final class EOD extends Block {

        /**
         * The singleton instance.
         */
        private static final EOD INSTANCE = new EOD();

        /**
         * Constructs a new instance.
         */
        public EOD() {
            super(BlockType.EOD);
        }

    }

    /**
     * Represents a literal block of data.
     *
     * <p>
     * For performance reasons this encapsulates the real data, not a copy of it. Don't modify the data; process it inside of {@link Callback#accept}
     * immediately, as it will get overwritten sooner or later.
     * </p>
     */
    public static final class LiteralBlock extends AbstractReference {

        private final byte[] data;

        /**
         * Constructs a new instance.
         *
         * @param data the literal data.
         * @param offset the start offset of the literal block inside {@code data}.
         * @param length the length of the literal block.
         */
        public LiteralBlock(final byte[] data, final int offset, final int length) {
            super(BlockType.LITERAL, offset, length);
            this.data = data;
        }

        /**
         * Gets the literal data.
         *
         * <p>
         * This returns a live view of the actual data in order to avoid copying; modify the array at your own risk.
         * </p>
         *
         * @return the data
         */
        public byte[] getData() {
            return data;
        }

    }

    static final int NUMBER_OF_BYTES_IN_HASH = 3;
    private static final int NO_MATCH = -1;

    // we use a 15 bit hash code as calculated in nextHash
    private static final int HASH_SIZE = 1 << 15;
    private static final int HASH_MASK = HASH_SIZE - 1;

    private static final int H_SHIFT = 5;
    private final Parameters params;
    private final Callback callback;

    // the sliding window, twice as big as the "windowSize" parameter
    private final byte[] window;

    // the head of hash-chain - indexed by hash-code, points to the
    // location inside of window of the latest sequence of bytes with
    // the given hash.
    private final int[] head;
    // for each window-location points to the latest earlier location
    // with the same hash. Only stores values for the latest
    // "windowSize" elements, the index is "window location modulo
    // windowSize".
    private final int[] prev;
    // bit mask used when indexing into prev
    private final int wMask;
    private boolean initialized;
    // the position inside of window that shall be encoded right now
    private int currentPosition;
    // the number of bytes available to compress including the one at
    // currentPosition
    private int lookahead;
    // the hash of the three bytes starting at the current position
    private int insertHash;

    // the position inside the window where the current literal
    // block starts (in case we are inside a literal block).
    private int blockStart;

    // position of the current match
    private int matchStart = NO_MATCH;

    // number of missed insertString calls for the up to three last
    // bytes of the last match that can only be performed once more
    // data has been read
    private int missedInserts;

    /**
     * Initializes a compressor with parameters and a callback.
     *
     * @param params the parameters
     * @param callback the callback
     * @throws NullPointerException if either parameter is {@code null}
     */
    public LZ77Compressor(final Parameters params, final Callback callback) {
        Objects.requireNonNull(params, "params");
        Objects.requireNonNull(callback, "callback");

        this.params = params;
        this.callback = callback;

        final int wSize = params.getWindowSize();
        window = new byte[wSize * 2];
        wMask = wSize - 1;
        head = ArrayFill.fill(new int[HASH_SIZE], NO_MATCH);
        prev = new int[wSize];
    }

    private void catchUpMissedInserts() {
        while (missedInserts > 0) {
            insertString(currentPosition - missedInserts--);
        }
    }

    private void compress() throws IOException {
        final int minMatch = params.getMinBackReferenceLength();
        final boolean lazy = params.getLazyMatching();
        final int lazyThreshold = params.getLazyMatchingThreshold();

        while (lookahead >= minMatch) {
            catchUpMissedInserts();
            int matchLength = 0;
            final int hashHead = insertString(currentPosition);
            if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
                // sets matchStart as a side effect
                matchLength = longestMatch(hashHead);

                if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
                    // try to find a longer match using the next position
                    matchLength = longestMatchForNextPosition(matchLength);
                }
            }
            if (matchLength >= minMatch) {
                if (blockStart != currentPosition) {
                    // emit preceding literal block
                    flushLiteralBlock();
                    blockStart = NO_MATCH;
                }
                flushBackReference(matchLength);
                insertStringsInMatch(matchLength);
                lookahead -= matchLength;
                currentPosition += matchLength;
                blockStart = currentPosition;
            } else {
                // no match, append to current or start a new literal
                lookahead--;
                currentPosition++;
                if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
                    flushLiteralBlock();
                    blockStart = currentPosition;
                }
            }
        }
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
     *
     * @param data the data to compress - must not be null
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data) throws IOException {
        compress(data, 0, data.length);
    }

    /**
     * Feeds bytes into the compressor which in turn may emit zero or more blocks to the callback during the execution of this method.
     *
     * @param data the data to compress - must not be null
     * @param off the start offset of the data
     * @param len the number of bytes to compress
     * @throws IOException if the callback throws an exception
     */
    public void compress(final byte[] data, int off, int len) throws IOException {
        final int wSize = params.getWindowSize();
        while (len > wSize) { // chop into windowSize sized chunks
            doCompress(data, off, wSize);
            off += wSize;
            len -= wSize;
        }
        if (len > 0) {
            doCompress(data, off, len);
        }
    }

    // performs the actual algorithm with the pre-condition len <= windowSize
    private void doCompress(final byte[] data, final int off, final int len) throws IOException {
        final int spaceLeft = window.length - currentPosition - lookahead;
        if (len > spaceLeft) {
            slide();
        }
        System.arraycopy(data, off, window, currentPosition + lookahead, len);
        lookahead += len;
        if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
            initialize();
        }
        if (initialized) {
            compress();
        }
    }

    /**
     * Tells the compressor to process all remaining data and signal end of data to the callback.
     *
     * <p>
     * The compressor will in turn emit at least one block ({@link EOD}) but potentially multiple blocks to the callback during the execution of this method.
     * </p>
     *
     * @throws IOException if the callback throws an exception
     */
    public void finish() throws IOException {
        if (blockStart != currentPosition || lookahead > 0) {
            currentPosition += lookahead;
            flushLiteralBlock();
        }
        callback.accept(EOD.INSTANCE);
    }

    private void flushBackReference(final int matchLength) throws IOException {
        callback.accept(new BackReference(currentPosition - matchStart, matchLength));
    }

    private void flushLiteralBlock() throws IOException {
        callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
    }

    private void initialize() {
        for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
            insertHash = nextHash(insertHash, window[i]);
        }
        initialized = true;
    }

    /**
     * Inserts the current three byte sequence into the dictionary and returns the previous head of the hash-chain.
     *
     * <p>
     * Updates {@code insertHash} and {@code prev} as a side effect.
     * </p>
     */
    private int insertString(final int pos) {
        insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
        final int hashHead = head[insertHash];
        prev[pos & wMask] = hashHead;
        head[insertHash] = pos;
        return hashHead;
    }

    private void insertStringsInMatch(final int matchLength) {
        // inserts strings contained in current match
        // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
        final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
        // currentPosition has been inserted already
        for (int i = 1; i <= stop; i++) {
            insertString(currentPosition + i);
        }
        missedInserts = matchLength - stop - 1;
    }

    /**
     * Searches the hash chain for real matches that aren't too far away (WRT maxOffset) and returns the length of the longest match found (a value smaller
     * than {@code minBackReferenceLength} if none was found).
     *
     * <p>
     * Sets matchStart to the index of the start position of the longest match as a side effect.
     * </p>
     */
    private int longestMatch(int matchHead) {
        final int minLength = params.getMinBackReferenceLength();
        int longestMatchLength = minLength - 1;
        final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
        final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
        final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
        final int maxCandidates = params.getMaxCandidates();
        for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
            int currentLength = 0;
            for (int i = 0; i < maxPossibleLength; i++) {
                if (window[matchHead + i] != window[currentPosition + i]) {
                    break;
                }
                currentLength++;
            }
            if (currentLength > longestMatchLength) {
                longestMatchLength = currentLength;
                matchStart = matchHead;
                if (currentLength >= niceBackReferenceLength) {
                    // no need to search any further
                    break;
                }
            }
            matchHead = prev[matchHead & wMask];
        }
        return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
    }

    private int longestMatchForNextPosition(final int prevMatchLength) {
        // save a bunch of values to restore them if the next match isn't better than the current one
        final int prevMatchStart = matchStart;
        final int prevInsertHash = insertHash;

        lookahead--;
        currentPosition++;
        final int hashHead = insertString(currentPosition);
        final int prevHashHead = prev[currentPosition & wMask];
        int matchLength = longestMatch(hashHead);

        if (matchLength <= prevMatchLength) {
            // use the first match, as the next one isn't any better
            matchLength = prevMatchLength;
            matchStart = prevMatchStart;

            // restore modified values
            head[insertHash] = prevHashHead;
            insertHash = prevInsertHash;
            currentPosition--;
            lookahead++;
        }
        return matchLength;
    }

    /**
     * Assumes we are calculating the hash for three consecutive bytes as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC the new hash for BCD is
     * nextHash(H, D).
     *
     * <p>
     * The hash is shifted by five bits on each update so all effects of A have been shifted out after the third update.
     * </p>
     */
    private int nextHash(final int oldHash, final byte nextByte) {
        final int nextVal = nextByte & 0xFF;
        return (oldHash << H_SHIFT ^ nextVal) & HASH_MASK;
    }

    /**
     * Adds some initial data to fill the window with.
     *
     * <p>
     * This is used if the stream has been cut into blocks and back-references of one block may refer to data of the previous block(s). One such example is the
     * LZ4 frame format using block dependency.
     * </p>
     *
     * @param data the data to fill the window with.
     * @throws IllegalStateException if the compressor has already started to accept data
     */
    public void prefill(final byte[] data) {
        if (currentPosition != 0 || lookahead != 0) {
            throw new IllegalStateException("The compressor has already started to accept data, can't prefill anymore");
        }

        // don't need more than windowSize for back-references
        final int len = Math.min(params.getWindowSize(), data.length);
        System.arraycopy(data, data.length - len, window, 0, len);

        if (len >= NUMBER_OF_BYTES_IN_HASH) {
            initialize();
            final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
            for (int i = 0; i < stop; i++) {
                insertString(i);
            }
            missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
        } else { // not enough data to hash anything
            missedInserts = len;
        }
        blockStart = currentPosition = len;
    }

    private void slide() throws IOException {
        final int wSize = params.getWindowSize();
        if (blockStart != currentPosition && blockStart < wSize) {
            flushLiteralBlock();
            blockStart = currentPosition;
        }
        System.arraycopy(window, wSize, window, 0, wSize);
        currentPosition -= wSize;
        matchStart -= wSize;
        blockStart -= wSize;
        for (int i = 0; i < HASH_SIZE; i++) {
            final int h = head[i];
            head[i] = h >= wSize ? h - wSize : NO_MATCH;
        }
        for (int i = 0; i < wSize; i++) {
            final int p = prev[i];
            prev[i] = p >= wSize ? p - wSize : NO_MATCH;
        }
    }
}
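
/*
 * Illustrative usage sketch, not part of the class above: it shows how the callback-based API described in the
 * class javadoc fits together - build the parameters, register a callback that consumes the emitted blocks,
 * feed data with compress() and flush the trailing literal block plus the EOD marker with finish(). It assumes
 * that the Parameters class of this package offers a builder(int windowSize)/build() pair and that the
 * encodeLiteral/encodeBackReference/finishOutput hooks are supplied by the surrounding encoder - both are
 * assumptions made for the example only.
 *
 *   Parameters params = Parameters.builder(32 * 1024).build(); // 32k sliding window (a power of two)
 *   LZ77Compressor compressor = new LZ77Compressor(params, block -> {
 *       switch (block.getType()) {
 *       case LITERAL:
 *           LZ77Compressor.LiteralBlock literal = (LZ77Compressor.LiteralBlock) block;
 *           // the returned array is a live view of the sliding window - copy it before the compressor overwrites it
 *           byte[] copy = java.util.Arrays.copyOfRange(literal.getData(), literal.getOffset(),
 *               literal.getOffset() + literal.getLength());
 *           encodeLiteral(copy);
 *           break;
 *       case BACK_REFERENCE:
 *           LZ77Compressor.BackReference ref = (LZ77Compressor.BackReference) block;
 *           encodeBackReference(ref.getOffset(), ref.getLength());
 *           break;
 *       case EOD:
 *           finishOutput();
 *           break;
 *       }
 *   });
 *   // compressor.prefill(previousBlock); // optional, only allowed before the first compress() call (block dependency)
 *   compressor.compress(inputBytes);
 *   compressor.finish(); // emits the trailing literal block (if any) and the EOD marker
 */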