cache preset dict in LZ4WithPresetDictDecompressor

kkewwei · kkewwei · commit e5193bcf959c · 2025-03-24T22:25:58.000+08:00
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -200,6 +200,8 @@ Optimizations
 
 * GITHUB#14373: Optimized `ParallelLeafReader` to improve term vector fetching efficiency. (Divyansh Agrawal)
 
+* GITHUB#14397: Cache preset dict for LZ4WithPresetDictDecompressor. (kkewwei)
+
 Bug Fixes
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java b/lucene/core/src/java/org/apache/lucene/codecs/compressing/Decompressor.java
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.BytesRef;
 
 /** A decompressor. */
@@ -42,6 +43,24 @@ protected Decompressor() {}
   public abstract void decompress(
       DataInput in, int originalLength, int offset, int length, BytesRef bytes) throws IOException;
 
+  /**
+   * Variant of {@link #decompress(DataInput, int, int, int, BytesRef)} that takes an {@link
+   * IndexInput}.
+   *
+   * <p>This default implementation simply forwards to the {@link DataInput} overload, so the
+   * arguments have the same meaning as there. Subclasses may override it.
+   */
+  public void decompress(IndexInput in, int originalLength, int offset, int length, BytesRef bytes)
+      throws IOException {
+    decompress(((DataInput) in), originalLength, offset, length, bytes);
+  }
+
   @Override
   public abstract Decompressor clone();
+
+  /**
+   * Hook for implementations that keep state between calls to {@code decompress}; it lets callers
+   * ask for that state to be dropped. The default implementation does nothing.
+   */
+  public void reset() {}
 }
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java
@@ -25,6 +25,7 @@
 import org.apache.lucene.store.ByteBuffersDataOutput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.compress.LZ4;
@@ -64,6 +65,13 @@ private static final class LZ4WithPresetDictDecompressor extends Decompressor {
 
     private int[] compressedLengths;
     private byte[] buffer;
+    // Dictionary cache for decompress(IndexInput, ...); -1 means nothing is cached.
+    // File pointer where the compressed dictionary held at the start of `buffer` begins.
+    private long cachedDictFilePointer = -1;
+    // Decompressed length of the cached dictionary.
+    private int cachedDictLength = -1;
+    // File pointer right after the compressed dictionary.
+    private long dictEndFilePointer = -1;
 
     LZ4WithPresetDictDecompressor() {
       compressedLengths = new int[0];
@@ -144,10 +152,98 @@ public void decompress(DataInput in, int originalLength, int offset, int length,
       assert bytes.isValid();
     }
 
+    /**
+     * Decompresses like the {@link DataInput} overload, but caches the preset dictionary: the
+     * file pointer at which its compressed bytes start is recorded, and a later call that reaches
+     * the same pointer with the same dictionary length seeks past them instead.
+     *
+     * <p>The cache is keyed on the file pointer and dictionary length only, not on the input
+     * itself, so callers are expected to invoke {@link #reset()} when switching to other data.
+     */
+    @Override
+    public void decompress(
+        IndexInput in, int originalLength, int offset, int length, BytesRef bytes)
+        throws IOException {
+      assert offset + length <= originalLength;
+
+      if (length == 0) {
+        bytes.length = 0;
+        return;
+      }
+
+      final int dictLength = in.readVInt();
+      final int blockLength = in.readVInt();
+
+      final int numBlocks = readCompressedLengths(in, originalLength, dictLength, blockLength);
+
+      // ArrayUtil.grow keeps existing content, so a cached dictionary survives a resize.
+      buffer = ArrayUtil.grow(buffer, dictLength + blockLength);
+      final long startPointer = in.getFilePointer();
+      bytes.length = 0;
+      if (cachedDictFilePointer == startPointer && cachedDictLength == dictLength) {
+        // Cache hit: the dictionary is still at the start of buffer, skip its compressed bytes.
+        assert dictEndFilePointer > 0;
+        in.seek(dictEndFilePointer);
+      } else {
+        // Invalidate first: buffer is overwritten below, even if decompression then fails.
+        reset();
+        // Read the dictionary
+        if (LZ4.decompress(in, dictLength, buffer, 0) != dictLength) {
+          throw new CorruptIndexException("Illegal dict length", in);
+        }
+        cachedDictLength = dictLength;
+        dictEndFilePointer = in.getFilePointer();
+        cachedDictFilePointer = startPointer;
+      }
+
+      int offsetInBlock = dictLength;
+      int offsetInBytesRef = offset;
+      if (offset >= dictLength) {
+        offsetInBytesRef -= dictLength;
+
+        // Skip unneeded blocks
+        int numBytesToSkip = 0;
+        for (int i = 0; i < numBlocks && offsetInBlock + blockLength < offset; ++i) {
+          int compressedBlockLength = compressedLengths[i];
+          numBytesToSkip += compressedBlockLength;
+          offsetInBlock += blockLength;
+          offsetInBytesRef -= blockLength;
+        }
+        in.skipBytes(numBytesToSkip);
+      } else {
+        // The dictionary contains some bytes we need, copy its content to the BytesRef
+        bytes.bytes = ArrayUtil.grow(bytes.bytes, dictLength);
+        System.arraycopy(buffer, 0, bytes.bytes, 0, dictLength);
+        bytes.length = dictLength;
+      }
+
+      // Read blocks that intersect with the interval we need
+      while (offsetInBlock < offset + length) {
+        final int bytesToDecompress = Math.min(blockLength, offset + length - offsetInBlock);
+        LZ4.decompress(in, bytesToDecompress, buffer, dictLength);
+        bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + bytesToDecompress);
+        System.arraycopy(buffer, dictLength, bytes.bytes, bytes.length, bytesToDecompress);
+        bytes.length += bytesToDecompress;
+        offsetInBlock += blockLength;
+      }
+
+      bytes.offset = offsetInBytesRef;
+      bytes.length = length;
+      assert bytes.isValid();
+    }
+
     @Override
     public Decompressor clone() {
       return new LZ4WithPresetDictDecompressor();
     }
+
+    /** Forgets the cached dictionary so that the next call decompresses it again. */
+    @Override
+    public void reset() {
+      cachedDictFilePointer = -1;
+      cachedDictLength = -1;
+      dictEndFilePointer = -1;
+    }
   }
 
   private static class LZ4WithPresetDictCompressor extends Compressor {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/compressing/Lucene90CompressingStoredFieldsReader.java
@@ -512,6 +512,7 @@ private void doReset(int docID) throws IOException {
           bytes.offset = bytes.length = 0;
           for (int decompressed = 0; decompressed < totalLength; ) {
             final int toDecompress = Math.min(totalLength - decompressed, chunkSize);
+            decompressor.reset();
             decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare);
             bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length);
             System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length);
@@ -573,6 +574,7 @@ void fillBuffer() throws IOException {
                   throw new EOFException();
                 }
                 final int toDecompress = Math.min(length - decompressed, chunkSize);
+                decompressor.reset();
                 decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
                 decompressed += toDecompress;
               }
@@ -644,6 +646,7 @@ SerializedDocument serializedDocument(int docID) throws IOException {
     if (state.contains(docID) == false) {
       fieldsStream.seek(indexReader.getStartPointer(docID));
       state.reset(docID);
+      decompressor.reset();
     }
     assert state.contains(docID);
     return state.document(docID);
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java b/lucene/core/src/test/org/apache/lucene/codecs/compressing/AbstractTestCompressionMode.java
@@ -17,13 +17,16 @@
 package org.apache.lucene.codecs.compressing;
 
 import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
+import java.io.EOFException;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Random;
+import org.apache.lucene.codecs.lucene90.LZ4WithPresetDictCompressionMode;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.ByteBuffersDataInput;
+import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.lucene.util.ArrayUtil;
@@ -68,23 +71,34 @@ static byte[] compress(Compressor compressor, byte[] decompressed, int off, int
 
   byte[] decompress(byte[] compressed, int originalLength) throws IOException {
     Decompressor decompressor = mode.newDecompressor();
-    return decompress(decompressor, compressed, originalLength);
+    return decompress(mode, decompressor, compressed, originalLength);
   }
 
-  static byte[] decompress(Decompressor decompressor, byte[] compressed, int originalLength)
+  static byte[] decompress(
+      CompressionMode mode, Decompressor decompressor, byte[] compressed, int originalLength)
       throws IOException {
     final BytesRef bytes = new BytesRef();
-    decompressor.decompress(
-        new ByteArrayDataInput(compressed), originalLength, 0, originalLength, bytes);
+    if ((mode instanceof LZ4WithPresetDictCompressionMode == false) || random().nextBoolean()) {
+      decompressor.decompress(
+          new ByteArrayDataInput(compressed), originalLength, 0, originalLength, bytes);
+    } else {
+      decompressor.decompress(
+          new MockIndexInput(compressed), originalLength, 0, originalLength, bytes);
+    }
     return BytesRef.deepCopyOf(bytes).bytes;
   }
 
   byte[] decompress(byte[] compressed, int originalLength, int offset, int length)
       throws IOException {
     Decompressor decompressor = mode.newDecompressor();
     final BytesRef bytes = new BytesRef();
-    decompressor.decompress(
-        new ByteArrayDataInput(compressed), originalLength, offset, length, bytes);
+    if ((mode instanceof LZ4WithPresetDictCompressionMode == false) || random().nextBoolean()) {
+      decompressor.decompress(
+          new ByteArrayDataInput(compressed), originalLength, offset, length, bytes);
+    } else {
+      decompressor.decompress(
+          new MockIndexInput(compressed), originalLength, offset, length, bytes);
+    }
     return BytesRef.deepCopyOf(bytes).bytes;
   }
 
@@ -162,4 +176,77 @@ public void testExtremelyLargeInput() throws IOException {
     }
     test(decompressed);
   }
+
+  /**
+   * Minimal in-memory {@link IndexInput} over a byte array, used to exercise the {@link
+   * IndexInput} overload of {@code Decompressor.decompress}.
+   *
+   * <p>{@code pos} is relative to {@code startOffset}: file pointers of a slice start at 0.
+   */
+  private static class MockIndexInput extends IndexInput {
+    private final byte[] bytes;
+    private final long length;
+    private final int startOffset;
+    private int pos;
+
+    MockIndexInput(byte[] bytes) throws EOFException {
+      this(bytes, 0, bytes.length);
+    }
+
+    MockIndexInput(byte[] bytes, int startOffset, long length) throws EOFException {
+      super("MockIndexInput");
+      if ((length + startOffset) > bytes.length) {
+        throw new EOFException();
+      }
+      this.bytes = bytes;
+      this.startOffset = startOffset;
+      this.length = length;
+      this.pos = 0;
+    }
+
+    @Override
+    public byte readByte() throws IOException {
+      // Compare against the slice length; startOffset only matters when indexing into bytes.
+      if (pos >= length) {
+        throw new EOFException();
+      }
+      return bytes[startOffset + pos++];
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) throws IOException {
+      if ((long) pos + len > length) {
+        throw new EOFException();
+      }
+      System.arraycopy(bytes, startOffset + pos, b, offset, len);
+      pos += len;
+    }
+
+    @Override
+    public void close() throws IOException {}
+
+    @Override
+    public long getFilePointer() {
+      return pos;
+    }
+
+    @Override
+    public void seek(long pos) throws IOException {
+      // The argument is a file pointer, i.e. already relative to startOffset.
+      if (pos > length) {
+        throw new EOFException();
+      }
+      this.pos = (int) pos;
+    }
+
+    @Override
+    public long length() {
+      return length;
+    }
+
+    @Override
+    public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
+      return new MockIndexInput(bytes, startOffset + (int) offset, length);
+    }
+  }
 }