From 3365d8649eb014b83c4456fc0dc50a1e0b72f322 Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Wed, 17 Jul 2024 11:43:01 -0400 Subject: [PATCH 1/3] experiment: allow extended lz4 lookback window --- .../org/apache/lucene/util/compress/LZ4.java | 53 +++++++++++++++++-- .../lucene/util/compress/LZ4TestCase.java | 44 ++++++++++++++- 2 files changed, 91 insertions(+), 6 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java index 67bbdc96ab2b..bdd441125ddf 100644 --- a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java +++ b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java @@ -53,6 +53,27 @@ private LZ4() {} */ public static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference + /** + * Longer lookback window size. Cf. {@link #MAX_DISTANCE}. This allows the context window to be + * 256k instead of the default 64k, and can provide substantial compression ratio and performance + * boost for data where the repetition period is longer. + */ + public static final int EXTENDED_MAX_DISTANCE = (1 << 18) - 1; + + /** + * There are some use cases (e.g., 256k block-level compression applied over index files) where + * the period of pattern repetition is longer. Such cases benefit from a combination of {@link + * HighCompressionHashTable} and a longer lookback window ({@link #EXTENDED_MAX_DISTANCE} instead + * of {@link #MAX_DISTANCE}). The benefits are both in compression (real-world cases with ~3x + * improved compression!), but also in latency/CPU-efficiency, in some cases with >2x faster + * execution. + * + *

<p>We want to support lz4 with {@link #EXTENDED_MAX_DISTANCE} for these special cases, but also + * provide {@link #DEFAULT_EXTENDED_MAX_DISTANCE} to allow tests to be run exercising lz4 with + * {@link #EXTENDED_MAX_DISTANCE}. + */ + public static final boolean DEFAULT_EXTENDED_MAX_DISTANCE = true; + static final int MEMORY_USAGE = 14; static final int MIN_MATCH = 4; // minimum length of a match static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals @@ -88,6 +109,12 @@ private static int commonBytes(byte[] b, int o1, int o2, int limit) { */ public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException { + return decompress(compressed, decompressedLen, dest, dOff, DEFAULT_EXTENDED_MAX_DISTANCE); + } + + public static int decompress( + DataInput compressed, int decompressedLen, byte[] dest, int dOff, boolean ext) + throws IOException { final int destEnd = dOff + decompressedLen; do { @@ -112,7 +139,7 @@ public static int decompress(DataInput compressed, int decompressedLen, byte[] d } // matchs - final int matchDec = compressed.readShort() & 0xFFFF; + final int matchDec = ext ? compressed.readVInt() : (compressed.readShort() & 0xFFFF); assert matchDec > 0; int matchLen = token & 0x0F; @@ -170,7 +197,13 @@ private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, } private static void encodeSequence( - byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) + byte[] bytes, + int anchor, + int matchRef, + int matchOff, + int matchLen, + DataOutput out, + boolean ext) throws IOException { final int literalLen = matchOff - anchor; assert matchLen >= 4; @@ -180,8 +213,12 @@ private static void encodeSequence( // encode match dec final int matchDec = matchOff - matchRef; - assert matchDec > 0 && matchDec < 1 << 16; - out.writeShort((short) matchDec); + assert matchDec > 0 && matchDec < (ext ?
EXTENDED_MAX_DISTANCE : MAX_DISTANCE); + if (ext) { + out.writeVInt(matchDec); + } else { + out.writeShort((short) matchDec); + } // encode match len if (matchLen >= MIN_MATCH + 0x0F) { @@ -524,6 +561,12 @@ public static void compress(byte[] bytes, int off, int len, DataOutput out, Hash public static void compressWithDictionary( byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht) throws IOException { + compressWithDictionary(bytes, dictOff, dictLen, len, out, ht, DEFAULT_EXTENDED_MAX_DISTANCE); + } + + public static void compressWithDictionary( + byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht, boolean ext) + throws IOException { Objects.checkFromIndexSize(dictOff, dictLen, bytes.length); Objects.checkFromIndexSize(dictOff + dictLen, len, bytes.length); if (dictLen > MAX_DISTANCE) { @@ -575,7 +618,7 @@ public static void compressWithDictionary( } } - encodeSequence(bytes, anchor, ref, off, matchLen, out); + encodeSequence(bytes, anchor, ref, off, matchLen, out, ext); off += matchLen; anchor = off; } diff --git a/lucene/core/src/test/org/apache/lucene/util/compress/LZ4TestCase.java b/lucene/core/src/test/org/apache/lucene/util/compress/LZ4TestCase.java index ddb556822d47..c652abbbc334 100644 --- a/lucene/core/src/test/org/apache/lucene/util/compress/LZ4TestCase.java +++ b/lucene/core/src/test/org/apache/lucene/util/compress/LZ4TestCase.java @@ -76,6 +76,41 @@ private void doTest(byte[] data, LZ4.HashTable hashTable) throws IOException { doTest(copy, offset, data.length, hashTable); } + private static int readVInt(byte[] compressed, int off, int[] size) throws IOException { + byte b = compressed[off++]; + if (b >= 0) { + size[0] = 1; + return b; + } + int i = b & 0x7F; + b = compressed[off++]; + i |= (b & 0x7F) << 7; + if (b >= 0) { + size[0] = 2; + return i; + } + b = compressed[off++]; + i |= (b & 0x7F) << 14; + if (b >= 0) { + size[0] = 3; + return i; + } + b = compressed[off++]; + i |= (b & 0x7F) << 21; + 
if (b >= 0) { + size[0] = 4; + return i; + } + b = compressed[off]; + // Warning: the next ands use 0x0F / 0xF0 - beware copy/paste errors: + i |= (b & 0x0F) << 28; + if ((b & 0xF0) == 0) { + size[0] = 5; + return i; + } + throw new IOException("Invalid vInt detected (too many bits)"); + } + private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable) throws IOException { ByteBuffersDataOutput out = new ByteBuffersDataOutput(); @@ -84,6 +119,7 @@ private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable int off = 0; int decompressedOff = 0; + final int[] vintSize = LZ4.DEFAULT_EXTENDED_MAX_DISTANCE ? new int[1] : null; for (; ; ) { final int token = compressed[off++] & 0xFF; int literalLen = token >>> 4; @@ -108,7 +144,13 @@ private void doTest(byte[] data, int offset, int length, LZ4.HashTable hashTable break; } - final int matchDec = (compressed[off++] & 0xFF) | ((compressed[off++] & 0xFF) << 8); + final int matchDec; + if (LZ4.DEFAULT_EXTENDED_MAX_DISTANCE) { + matchDec = readVInt(compressed, off, vintSize); + off += vintSize[0]; + } else { + matchDec = (compressed[off++] & 0xFF) | ((compressed[off++] & 0xFF) << 8); + } // check that match dec is not 0 assertTrue(matchDec + " " + decompressedOff, matchDec > 0 && matchDec <= decompressedOff); From f00d6f1bc35cc4f952d15c1f2098240ca2eb1485 Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Wed, 17 Jul 2024 13:48:42 -0400 Subject: [PATCH 2/3] fix problem preventing actual increase in lookback window --- .../core/src/java/org/apache/lucene/util/compress/LZ4.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java index bdd441125ddf..1190c2ccc6be 100644 --- a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java +++ b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java @@ -569,9 +569,10 @@ public static void 
compressWithDictionary( throws IOException { Objects.checkFromIndexSize(dictOff, dictLen, bytes.length); Objects.checkFromIndexSize(dictOff + dictLen, len, bytes.length); - if (dictLen > MAX_DISTANCE) { + final int maxDistance = ext ? EXTENDED_MAX_DISTANCE : MAX_DISTANCE; + if (dictLen > maxDistance) { throw new IllegalArgumentException( - "dictLen must not be greater than 64kB, but got " + dictLen); + "dictLen must not be greater than " + (ext ? "256k" : "64k") + ", but got " + dictLen); } final int end = dictOff + dictLen + len; @@ -607,7 +608,7 @@ public static void compressWithDictionary( int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit); // try to find a better match - for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, dictOff); + for (int r = ht.previous(ref), min = Math.max(off - maxDistance + 1, dictOff); r >= min; r = ht.previous(r)) { assert readInt(bytes, r) == readInt(bytes, off); From 91f435b2c1a909600e034868168270014ac41c8b Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Wed, 17 Jul 2024 13:56:36 -0400 Subject: [PATCH 3/3] we want to leave the default extendedWindow=false --- lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java index 1190c2ccc6be..f3433ad00642 100644 --- a/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java +++ b/lucene/core/src/java/org/apache/lucene/util/compress/LZ4.java @@ -72,7 +72,7 @@ private LZ4() {} * provide {@link #DEFAULT_EXTENDED_MAX_DISTANCE} to allow tests to be run exercising lz4 with * {@link #EXTENDED_MAX_DISTANCE}. */ - public static final boolean DEFAULT_EXTENDED_MAX_DISTANCE = true; + public static final boolean DEFAULT_EXTENDED_MAX_DISTANCE = false; static final int MEMORY_USAGE = 14; static final int MIN_MATCH = 4; // minimum length of a match