From 156b1f83cd30cb52d460532d67fb8dd81c2f8f55 Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Fri, 11 Oct 2024 14:34:01 -0400 Subject: [PATCH 1/7] compile with support for writing lucene 9.7 indexes --- .../lucene/codecs/lucene90/ForUtil.java | 1054 +++++++++ .../LZ4WithPresetDictCompressionMode.java | 4 +- .../lucene90/Lucene90CompoundFormat.java | 1 - .../lucene90/Lucene90DocValuesConsumer.java | 18 +- .../lucene90/Lucene90DocValuesProducer.java | 35 +- .../lucene90/Lucene90NormsProducer.java | 17 +- .../codecs/lucene90/Lucene90PointsReader.java | 5 +- .../codecs/lucene90/Lucene90PointsWriter.java | 2 +- .../lucene90/Lucene90PostingsFormat.java | 507 ++++ .../lucene90/Lucene90PostingsReader.java | 2068 +++++++++++++++++ .../lucene90/Lucene90PostingsWriter.java | 536 +++++ .../lucene90/Lucene90ScoreSkipReader.java | 159 ++ .../lucene90/Lucene90SegmentInfoFormat.java | 242 ++ .../codecs/lucene90/Lucene90SkipReader.java | 206 ++ .../codecs/lucene90/Lucene90SkipWriter.java | 237 ++ .../lucene90/Lucene90StoredFieldsFormat.java | 24 +- .../lucene/codecs/lucene90/PForUtil.java | 323 +++ ...Lucene90CompressingStoredFieldsReader.java | 6 +- ...Lucene90CompressingStoredFieldsWriter.java | 13 - .../Lucene90CompressingTermVectorsReader.java | 2 +- .../Lucene90CompressingTermVectorsWriter.java | 22 +- .../lucene/codecs/lucene90/gen_ForUtil.py | 442 ++++ .../lucene94/Lucene94FieldInfosFormat.java | 39 +- .../lucene/codecs/lucene95/Lucene95Codec.java | 218 ++ 24 files changed, 6070 insertions(+), 110 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtil.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsWriter.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90ScoreSkipReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipReader.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipWriter.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene90/gen_ForUtil.py create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtil.java new file mode 100644 index 000000000000..a4de8c525ef9 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/ForUtil.java @@ -0,0 +1,1054 @@ +// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; + +// Inspired from https://fulmicoton.com/posts/bitpacking/ +// Encodes multiple integers in a long to get SIMD-like speedups. +// If bitsPerValue <= 8 then we pack 8 ints per long +// else if bitsPerValue <= 16 we pack 4 ints per long +// else we pack 2 ints per long +final class ForUtil { + + static final int BLOCK_SIZE = 128; + private static final int BLOCK_SIZE_LOG2 = 7; + + private static long expandMask32(long mask32) { + return mask32 | (mask32 << 32); + } + + private static long expandMask16(long mask16) { + return expandMask32(mask16 | (mask16 << 16)); + } + + private static long expandMask8(long mask8) { + return expandMask16(mask8 | (mask8 << 8)); + } + + private static long mask32(int bitsPerValue) { + return expandMask32((1L << bitsPerValue) - 1); + } + + private static long mask16(int bitsPerValue) { + return expandMask16((1L << bitsPerValue) - 1); + } + + private static long mask8(int bitsPerValue) { + return expandMask8((1L << bitsPerValue) - 1); + } + + private static void expand8(long[] arr) { + for (int i = 0; i < 16; ++i) { + long l = arr[i]; + arr[i] = (l >>> 56) & 0xFFL; + arr[16 + i] = (l >>> 48) & 0xFFL; + arr[32 + i] = (l >>> 40) & 0xFFL; + arr[48 + i] = (l >>> 32) & 0xFFL; + arr[64 + i] = (l >>> 24) & 0xFFL; + arr[80 + i] = (l >>> 16) & 0xFFL; + arr[96 + i] = (l >>> 8) & 0xFFL; + arr[112 + i] = l & 0xFFL; + } + } + + private static void expand8To32(long[] arr) { + for (int i = 0; i < 16; ++i) { + long l = arr[i]; + arr[i] = (l >>> 24) & 0x000000FF000000FFL; + arr[16 + i] = (l >>> 16) & 0x000000FF000000FFL; + arr[32 + i] = (l >>> 8) & 0x000000FF000000FFL; + arr[48 + i] = l & 0x000000FF000000FFL; + } + } + + private static void collapse8(long[] arr) { + for (int i = 0; i < 16; ++i) { + arr[i] = + (arr[i] << 56) + | (arr[16 + i] << 48) + | (arr[32 + i] << 40) + | (arr[48 + i] << 32) + | (arr[64 + i] << 24) + | (arr[80 + i] << 16) + | (arr[96 + i] << 8) + | arr[112 + i]; + } + } + + private static void expand16(long[] arr) { + for (int i = 0; i < 32; ++i) { + long l = arr[i]; + arr[i] = (l >>> 48) & 0xFFFFL; + arr[32 + i] = (l >>> 32) & 0xFFFFL; + arr[64 + i] = (l >>> 16) & 0xFFFFL; + arr[96 + i] = l & 0xFFFFL; + } + } + + private static void expand16To32(long[] arr) { + for (int i = 0; i < 32; ++i) { + long l = arr[i]; + arr[i] = (l >>> 16) & 0x0000FFFF0000FFFFL; + arr[32 + i] = l & 0x0000FFFF0000FFFFL; + } + } + + private static void collapse16(long[] arr) { + for (int i = 0; i < 32; ++i) { + arr[i] = (arr[i] << 48) | (arr[32 + i] << 32) | (arr[64 + i] << 16) | arr[96 + i]; + } + } + + private static void expand32(long[] arr) { + for (int i = 0; i < 64; ++i) { + long l = arr[i]; + arr[i] = l >>> 32; + arr[64 + i] = l & 0xFFFFFFFFL; + } + } + + private static void collapse32(long[] arr) { + for (int i = 0; i < 64; ++i) { + arr[i] = (arr[i] << 32) | arr[64 + i]; + } + } + + private final long[] tmp = new long[BLOCK_SIZE / 2]; + + /** Encode 128 integers from {@code longs} into {@code out}. */ + void encode(long[] longs, int bitsPerValue, DataOutput out) throws IOException { + final int nextPrimitive; + final int numLongs; + if (bitsPerValue <= 8) { + nextPrimitive = 8; + numLongs = BLOCK_SIZE / 8; + collapse8(longs); + } else if (bitsPerValue <= 16) { + nextPrimitive = 16; + numLongs = BLOCK_SIZE / 4; + collapse16(longs); + } else { + nextPrimitive = 32; + numLongs = BLOCK_SIZE / 2; + collapse32(longs); + } + + final int numLongsPerShift = bitsPerValue * 2; + int idx = 0; + int shift = nextPrimitive - bitsPerValue; + for (int i = 0; i < numLongsPerShift; ++i) { + tmp[i] = longs[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + for (int i = 0; i < numLongsPerShift; ++i) { + tmp[i] |= longs[idx++] << shift; + } + } + + final int remainingBitsPerLong = shift + bitsPerValue; + final long maskRemainingBitsPerLong; + if (nextPrimitive == 8) { + maskRemainingBitsPerLong = MASKS8[remainingBitsPerLong]; + } else if (nextPrimitive == 16) { + maskRemainingBitsPerLong = MASKS16[remainingBitsPerLong]; + } else { + maskRemainingBitsPerLong = MASKS32[remainingBitsPerLong]; + } + + int tmpIdx = 0; + int remainingBitsPerValue = bitsPerValue; + while (idx < numLongs) { + if (remainingBitsPerValue >= remainingBitsPerLong) { + remainingBitsPerValue -= remainingBitsPerLong; + tmp[tmpIdx++] |= (longs[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerLong; + if (remainingBitsPerValue == 0) { + idx++; + remainingBitsPerValue = bitsPerValue; + } + } else { + final long mask1, mask2; + if (nextPrimitive == 8) { + mask1 = MASKS8[remainingBitsPerValue]; + mask2 = MASKS8[remainingBitsPerLong - remainingBitsPerValue]; + } else if (nextPrimitive == 16) { + mask1 = MASKS16[remainingBitsPerValue]; + mask2 = MASKS16[remainingBitsPerLong - remainingBitsPerValue]; + } else { + mask1 = MASKS32[remainingBitsPerValue]; + mask2 = MASKS32[remainingBitsPerLong - remainingBitsPerValue]; + } + tmp[tmpIdx] |= (longs[idx++] & mask1) << (remainingBitsPerLong - remainingBitsPerValue); + remainingBitsPerValue = bitsPerValue - remainingBitsPerLong + remainingBitsPerValue; + tmp[tmpIdx++] |= (longs[idx] >>> remainingBitsPerValue) & mask2; + } + } + + for (int i = 0; i < numLongsPerShift; ++i) { + out.writeLong(tmp[i]); + } + } + + /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */ + int numBytes(int bitsPerValue) { + return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); + } + + private static void decodeSlow(int bitsPerValue, DataInput in, long[] tmp, long[] longs) + throws IOException { + final int numLongs = bitsPerValue << 1; + in.readLongs(tmp, 0, numLongs); + final long mask = MASKS32[bitsPerValue]; + int longsIdx = 0; + int shift = 32 - bitsPerValue; + for (; shift >= 0; shift -= bitsPerValue) { + shiftLongs(tmp, numLongs, longs, longsIdx, shift, mask); + longsIdx += numLongs; + } + final int remainingBitsPerLong = shift + bitsPerValue; + final long mask32RemainingBitsPerLong = MASKS32[remainingBitsPerLong]; + int tmpIdx = 0; + int remainingBits = remainingBitsPerLong; + for (; longsIdx < BLOCK_SIZE / 2; ++longsIdx) { + int b = bitsPerValue - remainingBits; + long l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; + while (b >= remainingBitsPerLong) { + b -= remainingBitsPerLong; + l |= (tmp[tmpIdx++] & mask32RemainingBitsPerLong) << b; + } + if (b > 0) { + l |= (tmp[tmpIdx] >>> (remainingBitsPerLong - b)) & MASKS32[b]; + remainingBits = remainingBitsPerLong - b; + } else { + remainingBits = remainingBitsPerLong; + } + longs[longsIdx] = l; + } + } + + /** + * The pattern that this shiftLongs method applies is recognized by the C2 compiler, which + * generates SIMD instructions for it in order to shift multiple longs at once. + */ + private static void shiftLongs(long[] a, int count, long[] b, int bi, int shift, long mask) { + for (int i = 0; i < count; ++i) { + b[bi + i] = (a[i] >>> shift) & mask; + } + } + + private static final long[] MASKS8 = new long[8]; + private static final long[] MASKS16 = new long[16]; + private static final long[] MASKS32 = new long[32]; + + static { + for (int i = 0; i < 8; ++i) { + MASKS8[i] = mask8(i); + } + for (int i = 0; i < 16; ++i) { + MASKS16[i] = mask16(i); + } + for (int i = 0; i < 32; ++i) { + MASKS32[i] = mask32(i); + } + } + // mark values in array as final longs to avoid the cost of reading array, arrays should only be + // used when the idx is a variable + private static final long MASK8_1 = MASKS8[1]; + private static final long MASK8_2 = MASKS8[2]; + private static final long MASK8_3 = MASKS8[3]; + private static final long MASK8_4 = MASKS8[4]; + private static final long MASK8_5 = MASKS8[5]; + private static final long MASK8_6 = MASKS8[6]; + private static final long MASK8_7 = MASKS8[7]; + private static final long MASK16_1 = MASKS16[1]; + private static final long MASK16_2 = MASKS16[2]; + private static final long MASK16_3 = MASKS16[3]; + private static final long MASK16_4 = MASKS16[4]; + private static final long MASK16_5 = MASKS16[5]; + private static final long MASK16_6 = MASKS16[6]; + private static final long MASK16_7 = MASKS16[7]; + private static final long MASK16_9 = MASKS16[9]; + private static final long MASK16_10 = MASKS16[10]; + private static final long MASK16_11 = MASKS16[11]; + private static final long MASK16_12 = MASKS16[12]; + private static final long MASK16_13 = MASKS16[13]; + private static final long MASK16_14 = MASKS16[14]; + private static final long MASK16_15 = MASKS16[15]; + private static final long MASK32_1 = MASKS32[1]; + private static final long MASK32_2 = MASKS32[2]; + private static final long MASK32_3 = MASKS32[3]; + private static final long MASK32_4 = MASKS32[4]; + private static final long MASK32_5 = MASKS32[5]; + private static final long MASK32_6 = MASKS32[6]; + private static final long MASK32_7 = MASKS32[7]; + private static final long MASK32_8 = MASKS32[8]; + private static final long MASK32_9 = MASKS32[9]; + private static final long MASK32_10 = MASKS32[10]; + private static final long MASK32_11 = MASKS32[11]; + private static final long MASK32_12 = MASKS32[12]; + private static final long MASK32_13 = MASKS32[13]; + private static final long MASK32_14 = MASKS32[14]; + private static final long MASK32_15 = MASKS32[15]; + private static final long MASK32_17 = MASKS32[17]; + private static final long MASK32_18 = MASKS32[18]; + private static final long MASK32_19 = MASKS32[19]; + private static final long MASK32_20 = MASKS32[20]; + private static final long MASK32_21 = MASKS32[21]; + private static final long MASK32_22 = MASKS32[22]; + private static final long MASK32_23 = MASKS32[23]; + private static final long MASK32_24 = MASKS32[24]; + + /** Decode 128 integers into {@code longs}. */ + void decode(int bitsPerValue, DataInput in, long[] longs) throws IOException { + switch (bitsPerValue) { + case 1: + decode1(in, tmp, longs); + expand8(longs); + break; + case 2: + decode2(in, tmp, longs); + expand8(longs); + break; + case 3: + decode3(in, tmp, longs); + expand8(longs); + break; + case 4: + decode4(in, tmp, longs); + expand8(longs); + break; + case 5: + decode5(in, tmp, longs); + expand8(longs); + break; + case 6: + decode6(in, tmp, longs); + expand8(longs); + break; + case 7: + decode7(in, tmp, longs); + expand8(longs); + break; + case 8: + decode8(in, tmp, longs); + expand8(longs); + break; + case 9: + decode9(in, tmp, longs); + expand16(longs); + break; + case 10: + decode10(in, tmp, longs); + expand16(longs); + break; + case 11: + decode11(in, tmp, longs); + expand16(longs); + break; + case 12: + decode12(in, tmp, longs); + expand16(longs); + break; + case 13: + decode13(in, tmp, longs); + expand16(longs); + break; + case 14: + decode14(in, tmp, longs); + expand16(longs); + break; + case 15: + decode15(in, tmp, longs); + expand16(longs); + break; + case 16: + decode16(in, tmp, longs); + expand16(longs); + break; + case 17: + decode17(in, tmp, longs); + expand32(longs); + break; + case 18: + decode18(in, tmp, longs); + expand32(longs); + break; + case 19: + decode19(in, tmp, longs); + expand32(longs); + break; + case 20: + decode20(in, tmp, longs); + expand32(longs); + break; + case 21: + decode21(in, tmp, longs); + expand32(longs); + break; + case 22: + decode22(in, tmp, longs); + expand32(longs); + break; + case 23: + decode23(in, tmp, longs); + expand32(longs); + break; + case 24: + decode24(in, tmp, longs); + expand32(longs); + break; + default: + decodeSlow(bitsPerValue, in, tmp, longs); + expand32(longs); + break; + } + } + + /** + * Decodes 128 integers into 64 {@code longs} such that each long contains two values, each + * represented with 32 bits. Values [0..63] are encoded in the high-order bits of {@code longs} + * [0..63], and values [64..127] are encoded in the low-order bits of {@code longs} [0..63]. This + * representation may allow subsequent operations to be performed on two values at a time. + */ + void decodeTo32(int bitsPerValue, DataInput in, long[] longs) throws IOException { + switch (bitsPerValue) { + case 1: + decode1(in, tmp, longs); + expand8To32(longs); + break; + case 2: + decode2(in, tmp, longs); + expand8To32(longs); + break; + case 3: + decode3(in, tmp, longs); + expand8To32(longs); + break; + case 4: + decode4(in, tmp, longs); + expand8To32(longs); + break; + case 5: + decode5(in, tmp, longs); + expand8To32(longs); + break; + case 6: + decode6(in, tmp, longs); + expand8To32(longs); + break; + case 7: + decode7(in, tmp, longs); + expand8To32(longs); + break; + case 8: + decode8(in, tmp, longs); + expand8To32(longs); + break; + case 9: + decode9(in, tmp, longs); + expand16To32(longs); + break; + case 10: + decode10(in, tmp, longs); + expand16To32(longs); + break; + case 11: + decode11(in, tmp, longs); + expand16To32(longs); + break; + case 12: + decode12(in, tmp, longs); + expand16To32(longs); + break; + case 13: + decode13(in, tmp, longs); + expand16To32(longs); + break; + case 14: + decode14(in, tmp, longs); + expand16To32(longs); + break; + case 15: + decode15(in, tmp, longs); + expand16To32(longs); + break; + case 16: + decode16(in, tmp, longs); + expand16To32(longs); + break; + case 17: + decode17(in, tmp, longs); + break; + case 18: + decode18(in, tmp, longs); + break; + case 19: + decode19(in, tmp, longs); + break; + case 20: + decode20(in, tmp, longs); + break; + case 21: + decode21(in, tmp, longs); + break; + case 22: + decode22(in, tmp, longs); + break; + case 23: + decode23(in, tmp, longs); + break; + case 24: + decode24(in, tmp, longs); + break; + default: + decodeSlow(bitsPerValue, in, tmp, longs); + break; + } + } + + private static void decode1(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 2); + shiftLongs(tmp, 2, longs, 0, 7, MASK8_1); + shiftLongs(tmp, 2, longs, 2, 6, MASK8_1); + shiftLongs(tmp, 2, longs, 4, 5, MASK8_1); + shiftLongs(tmp, 2, longs, 6, 4, MASK8_1); + shiftLongs(tmp, 2, longs, 8, 3, MASK8_1); + shiftLongs(tmp, 2, longs, 10, 2, MASK8_1); + shiftLongs(tmp, 2, longs, 12, 1, MASK8_1); + shiftLongs(tmp, 2, longs, 14, 0, MASK8_1); + } + + private static void decode2(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 4); + shiftLongs(tmp, 4, longs, 0, 6, MASK8_2); + shiftLongs(tmp, 4, longs, 4, 4, MASK8_2); + shiftLongs(tmp, 4, longs, 8, 2, MASK8_2); + shiftLongs(tmp, 4, longs, 12, 0, MASK8_2); + } + + private static void decode3(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 6); + shiftLongs(tmp, 6, longs, 0, 5, MASK8_3); + shiftLongs(tmp, 6, longs, 6, 2, MASK8_3); + for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 2; ++iter, tmpIdx += 3, longsIdx += 2) { + long l0 = (tmp[tmpIdx + 0] & MASK8_2) << 1; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2; + l1 |= (tmp[tmpIdx + 2] & MASK8_2) << 0; + longs[longsIdx + 1] = l1; + } + } + + private static void decode4(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 8); + shiftLongs(tmp, 8, longs, 0, 4, MASK8_4); + shiftLongs(tmp, 8, longs, 8, 0, MASK8_4); + } + + private static void decode5(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 10); + shiftLongs(tmp, 10, longs, 0, 3, MASK8_5); + for (int iter = 0, tmpIdx = 0, longsIdx = 10; iter < 2; ++iter, tmpIdx += 5, longsIdx += 3) { + long l0 = (tmp[tmpIdx + 0] & MASK8_3) << 2; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4; + l1 |= (tmp[tmpIdx + 2] & MASK8_3) << 1; + l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3; + l2 |= (tmp[tmpIdx + 4] & MASK8_3) << 0; + longs[longsIdx + 2] = l2; + } + } + + private static void decode6(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 12); + shiftLongs(tmp, 12, longs, 0, 2, MASK8_6); + shiftLongs(tmp, 12, tmp, 0, 0, MASK8_2); + for (int iter = 0, tmpIdx = 0, longsIdx = 12; iter < 4; ++iter, tmpIdx += 3, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 4; + l0 |= tmp[tmpIdx + 1] << 2; + l0 |= tmp[tmpIdx + 2] << 0; + longs[longsIdx + 0] = l0; + } + } + + private static void decode7(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 14); + shiftLongs(tmp, 14, longs, 0, 1, MASK8_7); + shiftLongs(tmp, 14, tmp, 0, 0, MASK8_1); + for (int iter = 0, tmpIdx = 0, longsIdx = 14; iter < 2; ++iter, tmpIdx += 7, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 6; + l0 |= tmp[tmpIdx + 1] << 5; + l0 |= tmp[tmpIdx + 2] << 4; + l0 |= tmp[tmpIdx + 3] << 3; + l0 |= tmp[tmpIdx + 4] << 2; + l0 |= tmp[tmpIdx + 5] << 1; + l0 |= tmp[tmpIdx + 6] << 0; + longs[longsIdx + 0] = l0; + } + } + + private static void decode8(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(longs, 0, 16); + } + + private static void decode9(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 18); + shiftLongs(tmp, 18, longs, 0, 7, MASK16_9); + for (int iter = 0, tmpIdx = 0, longsIdx = 18; iter < 2; ++iter, tmpIdx += 9, longsIdx += 7) { + long l0 = (tmp[tmpIdx + 0] & MASK16_7) << 2; + l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4; + l1 |= (tmp[tmpIdx + 2] >>> 3) & MASK16_4; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 2] & MASK16_3) << 6; + l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8; + l3 |= (tmp[tmpIdx + 4] & MASK16_7) << 1; + l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3; + l4 |= (tmp[tmpIdx + 6] >>> 4) & MASK16_3; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 6] & MASK16_4) << 5; + l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5; + longs[longsIdx + 5] = l5; + long l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7; + l6 |= (tmp[tmpIdx + 8] & MASK16_7) << 0; + longs[longsIdx + 6] = l6; + } + } + + private static void decode10(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 20); + shiftLongs(tmp, 20, longs, 0, 6, MASK16_10); + for (int iter = 0, tmpIdx = 0, longsIdx = 20; iter < 4; ++iter, tmpIdx += 5, longsIdx += 3) { + long l0 = (tmp[tmpIdx + 0] & MASK16_6) << 4; + l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8; + l1 |= (tmp[tmpIdx + 2] & MASK16_6) << 2; + l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6; + l2 |= (tmp[tmpIdx + 4] & MASK16_6) << 0; + longs[longsIdx + 2] = l2; + } + } + + private static void decode11(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 22); + shiftLongs(tmp, 22, longs, 0, 5, MASK16_11); + for (int iter = 0, tmpIdx = 0, longsIdx = 22; iter < 2; ++iter, tmpIdx += 11, longsIdx += 5) { + long l0 = (tmp[tmpIdx + 0] & MASK16_5) << 6; + l0 |= (tmp[tmpIdx + 1] & MASK16_5) << 1; + l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7; + l1 |= (tmp[tmpIdx + 3] & MASK16_5) << 2; + l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8; + l2 |= (tmp[tmpIdx + 5] & MASK16_5) << 3; + l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9; + l3 |= (tmp[tmpIdx + 7] & MASK16_5) << 4; + l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10; + l4 |= (tmp[tmpIdx + 9] & MASK16_5) << 5; + l4 |= (tmp[tmpIdx + 10] & MASK16_5) << 0; + longs[longsIdx + 4] = l4; + } + } + + private static void decode12(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 24); + shiftLongs(tmp, 24, longs, 0, 4, MASK16_12); + shiftLongs(tmp, 24, tmp, 0, 0, MASK16_4); + for (int iter = 0, tmpIdx = 0, longsIdx = 24; iter < 8; ++iter, tmpIdx += 3, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 8; + l0 |= tmp[tmpIdx + 1] << 4; + l0 |= tmp[tmpIdx + 2] << 0; + longs[longsIdx + 0] = l0; + } + } + + private static void decode13(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 26); + shiftLongs(tmp, 26, longs, 0, 3, MASK16_13); + for (int iter = 0, tmpIdx = 0, longsIdx = 26; iter < 2; ++iter, tmpIdx += 13, longsIdx += 3) { + long l0 = (tmp[tmpIdx + 0] & MASK16_3) << 10; + l0 |= (tmp[tmpIdx + 1] & MASK16_3) << 7; + l0 |= (tmp[tmpIdx + 2] & MASK16_3) << 4; + l0 |= (tmp[tmpIdx + 3] & MASK16_3) << 1; + l0 |= (tmp[tmpIdx + 4] >>> 2) & MASK16_1; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11; + l1 |= (tmp[tmpIdx + 5] & MASK16_3) << 8; + l1 |= (tmp[tmpIdx + 6] & MASK16_3) << 5; + l1 |= (tmp[tmpIdx + 7] & MASK16_3) << 2; + l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12; + l2 |= (tmp[tmpIdx + 9] & MASK16_3) << 9; + l2 |= (tmp[tmpIdx + 10] & MASK16_3) << 6; + l2 |= (tmp[tmpIdx + 11] & MASK16_3) << 3; + l2 |= (tmp[tmpIdx + 12] & MASK16_3) << 0; + longs[longsIdx + 2] = l2; + } + } + + private static void decode14(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 28); + shiftLongs(tmp, 28, longs, 0, 2, MASK16_14); + shiftLongs(tmp, 28, tmp, 0, 0, MASK16_2); + for (int iter = 0, tmpIdx = 0, longsIdx = 28; iter < 4; ++iter, tmpIdx += 7, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 12; + l0 |= tmp[tmpIdx + 1] << 10; + l0 |= tmp[tmpIdx + 2] << 8; + l0 |= tmp[tmpIdx + 3] << 6; + l0 |= tmp[tmpIdx + 4] << 4; + l0 |= tmp[tmpIdx + 5] << 2; + l0 |= tmp[tmpIdx + 6] << 0; + longs[longsIdx + 0] = l0; + } + } + + private static void decode15(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 30); + shiftLongs(tmp, 30, longs, 0, 1, MASK16_15); + shiftLongs(tmp, 30, tmp, 0, 0, MASK16_1); + for (int iter = 0, tmpIdx = 0, longsIdx = 30; iter < 2; ++iter, tmpIdx += 15, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 14; + l0 |= tmp[tmpIdx + 1] << 13; + l0 |= tmp[tmpIdx + 2] << 12; + l0 |= tmp[tmpIdx + 3] << 11; + l0 |= tmp[tmpIdx + 4] << 10; + l0 |= tmp[tmpIdx + 5] << 9; + l0 |= tmp[tmpIdx + 6] << 8; + l0 |= tmp[tmpIdx + 7] << 7; + l0 |= tmp[tmpIdx + 8] << 6; + l0 |= tmp[tmpIdx + 9] << 5; + l0 |= tmp[tmpIdx + 10] << 4; + l0 |= tmp[tmpIdx + 11] << 3; + l0 |= tmp[tmpIdx + 12] << 2; + l0 |= tmp[tmpIdx + 13] << 1; + l0 |= tmp[tmpIdx + 14] << 0; + longs[longsIdx + 0] = l0; + } + } + + private static void decode16(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(longs, 0, 32); + } + + private static void decode17(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 34); + shiftLongs(tmp, 34, longs, 0, 15, MASK32_17); + for (int iter = 0, tmpIdx = 0, longsIdx = 34; iter < 2; ++iter, tmpIdx += 17, longsIdx += 15) { + long l0 = (tmp[tmpIdx + 0] & MASK32_15) << 2; + l0 |= (tmp[tmpIdx + 1] >>> 13) & MASK32_2; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK32_13) << 4; + l1 |= (tmp[tmpIdx + 2] >>> 11) & MASK32_4; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 2] & MASK32_11) << 6; + l2 |= (tmp[tmpIdx + 3] >>> 9) & MASK32_6; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 3] & MASK32_9) << 8; + l3 |= (tmp[tmpIdx + 4] >>> 7) & MASK32_8; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 4] & MASK32_7) << 10; + l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_10; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 5] & MASK32_5) << 12; + l5 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_12; + longs[longsIdx + 5] = l5; + long l6 = (tmp[tmpIdx + 6] & MASK32_3) << 14; + l6 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_14; + longs[longsIdx + 6] = l6; + long l7 = (tmp[tmpIdx + 7] & MASK32_1) << 16; + l7 |= (tmp[tmpIdx + 8] & MASK32_15) << 1; + l7 |= (tmp[tmpIdx + 9] >>> 14) & MASK32_1; + longs[longsIdx + 7] = l7; + long l8 = (tmp[tmpIdx + 9] & MASK32_14) << 3; + l8 |= (tmp[tmpIdx + 10] >>> 12) & MASK32_3; + longs[longsIdx + 8] = l8; + long l9 = (tmp[tmpIdx + 10] & MASK32_12) << 5; + l9 |= (tmp[tmpIdx + 11] >>> 10) & MASK32_5; + longs[longsIdx + 9] = l9; + long l10 = (tmp[tmpIdx + 11] & MASK32_10) << 7; + l10 |= (tmp[tmpIdx + 12] >>> 8) & MASK32_7; + longs[longsIdx + 10] = l10; + long l11 = (tmp[tmpIdx + 12] & MASK32_8) << 9; + l11 |= (tmp[tmpIdx + 13] >>> 6) & MASK32_9; + longs[longsIdx + 11] = l11; + long l12 = (tmp[tmpIdx + 13] & MASK32_6) << 11; + l12 |= (tmp[tmpIdx + 14] >>> 4) & MASK32_11; + longs[longsIdx + 12] = l12; + long l13 = (tmp[tmpIdx + 14] & MASK32_4) << 13; + l13 |= (tmp[tmpIdx + 15] >>> 2) & MASK32_13; + longs[longsIdx + 13] = l13; + long l14 = (tmp[tmpIdx + 15] & MASK32_2) << 15; + l14 |= (tmp[tmpIdx + 16] & MASK32_15) << 0; + longs[longsIdx + 14] = l14; + } + } + + private static void decode18(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 36); + shiftLongs(tmp, 36, longs, 0, 14, MASK32_18); + for (int iter = 0, tmpIdx = 0, longsIdx = 36; iter < 4; ++iter, tmpIdx += 9, longsIdx += 7) { + long l0 = (tmp[tmpIdx + 0] & MASK32_14) << 4; + l0 |= (tmp[tmpIdx + 1] >>> 10) & MASK32_4; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK32_10) << 8; + l1 |= (tmp[tmpIdx + 2] >>> 6) & MASK32_8; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 2] & MASK32_6) << 12; + l2 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_12; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 3] & MASK32_2) << 16; + l3 |= (tmp[tmpIdx + 4] & MASK32_14) << 2; + l3 |= (tmp[tmpIdx + 5] >>> 12) & MASK32_2; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 5] & MASK32_12) << 6; + l4 |= (tmp[tmpIdx + 6] >>> 8) & MASK32_6; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 6] & MASK32_8) << 10; + l5 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_10; + longs[longsIdx + 5] = l5; + long l6 = (tmp[tmpIdx + 7] & MASK32_4) << 14; + l6 |= (tmp[tmpIdx + 8] & MASK32_14) << 0; + longs[longsIdx + 6] = l6; + } + } + + private static void decode19(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 38); + shiftLongs(tmp, 38, longs, 0, 13, MASK32_19); + for (int iter = 0, tmpIdx = 0, longsIdx = 38; iter < 2; ++iter, tmpIdx += 19, longsIdx += 13) { + long l0 = (tmp[tmpIdx + 0] & MASK32_13) << 6; + l0 |= (tmp[tmpIdx + 1] >>> 7) & MASK32_6; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK32_7) << 12; + l1 |= (tmp[tmpIdx + 2] >>> 1) & MASK32_12; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 2] & MASK32_1) << 18; + l2 |= (tmp[tmpIdx + 3] & MASK32_13) << 5; + l2 |= (tmp[tmpIdx + 4] >>> 8) & MASK32_5; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 4] & MASK32_8) << 11; + l3 |= (tmp[tmpIdx + 5] >>> 2) & MASK32_11; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 5] & MASK32_2) << 17; + l4 |= (tmp[tmpIdx + 6] & MASK32_13) << 4; + l4 |= (tmp[tmpIdx + 7] >>> 9) & MASK32_4; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 7] & MASK32_9) << 10; + l5 |= (tmp[tmpIdx + 8] >>> 3) & MASK32_10; + longs[longsIdx + 5] = l5; + long l6 = (tmp[tmpIdx + 8] & MASK32_3) << 16; + l6 |= (tmp[tmpIdx + 9] & MASK32_13) << 3; + l6 |= (tmp[tmpIdx + 10] >>> 10) & MASK32_3; + longs[longsIdx + 6] = l6; + long l7 = (tmp[tmpIdx + 10] & MASK32_10) << 9; + l7 |= (tmp[tmpIdx + 11] >>> 4) & MASK32_9; + longs[longsIdx + 7] = l7; + long l8 = (tmp[tmpIdx + 11] & MASK32_4) << 15; + l8 |= (tmp[tmpIdx + 12] & MASK32_13) << 2; + l8 |= (tmp[tmpIdx + 13] >>> 11) & MASK32_2; + longs[longsIdx + 8] = l8; + long l9 = (tmp[tmpIdx + 13] & MASK32_11) << 8; + l9 |= (tmp[tmpIdx + 14] >>> 5) & MASK32_8; + longs[longsIdx + 9] = l9; + long l10 = (tmp[tmpIdx + 14] & MASK32_5) << 14; + l10 |= (tmp[tmpIdx + 15] & MASK32_13) << 1; + l10 |= (tmp[tmpIdx + 16] >>> 12) & MASK32_1; + longs[longsIdx + 10] = l10; + long l11 = (tmp[tmpIdx + 16] & MASK32_12) << 7; + l11 |= (tmp[tmpIdx + 17] >>> 6) & MASK32_7; + longs[longsIdx + 11] = l11; + long l12 = (tmp[tmpIdx + 17] & MASK32_6) << 13; + l12 |= (tmp[tmpIdx + 18] & MASK32_13) << 0; + longs[longsIdx + 12] = l12; + } + } + + private static void decode20(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 40); + shiftLongs(tmp, 40, longs, 0, 12, MASK32_20); + for (int iter = 0, tmpIdx = 0, longsIdx = 40; iter < 8; ++iter, tmpIdx += 5, longsIdx += 3) { + long l0 = (tmp[tmpIdx + 0] & MASK32_12) << 8; + l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_8; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK32_4) << 16; + l1 |= (tmp[tmpIdx + 2] & MASK32_12) << 4; + l1 |= (tmp[tmpIdx + 3] >>> 8) & MASK32_4; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 3] & MASK32_8) << 12; + l2 |= (tmp[tmpIdx + 4] & MASK32_12) << 0; + longs[longsIdx + 2] = l2; + } + } + + private static void decode21(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 42); + shiftLongs(tmp, 42, longs, 0, 11, MASK32_21); + for (int iter = 0, tmpIdx = 0, longsIdx = 42; iter < 2; ++iter, tmpIdx += 21, longsIdx += 11) { + long l0 = (tmp[tmpIdx + 0] & MASK32_11) << 10; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK32_10; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 1] & MASK32_1) << 20; + l1 |= (tmp[tmpIdx + 2] & MASK32_11) << 9; + l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_9; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 3] & MASK32_2) << 19; + l2 |= (tmp[tmpIdx + 4] & MASK32_11) << 8; + l2 |= (tmp[tmpIdx + 5] >>> 3) & MASK32_8; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 5] & MASK32_3) << 18; + l3 |= (tmp[tmpIdx + 6] & MASK32_11) << 7; + l3 |= (tmp[tmpIdx + 7] >>> 4) & MASK32_7; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 7] & MASK32_4) << 17; + l4 |= (tmp[tmpIdx + 8] & MASK32_11) << 6; + l4 |= (tmp[tmpIdx + 9] >>> 5) & MASK32_6; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 9] & MASK32_5) << 16; + l5 |= (tmp[tmpIdx + 10] & MASK32_11) << 5; + l5 |= (tmp[tmpIdx + 11] >>> 6) & MASK32_5; + longs[longsIdx + 5] = l5; + long l6 = (tmp[tmpIdx + 11] & MASK32_6) << 15; + l6 |= (tmp[tmpIdx + 12] & MASK32_11) << 4; + l6 |= (tmp[tmpIdx + 13] >>> 7) & MASK32_4; + longs[longsIdx + 6] = l6; + long l7 = (tmp[tmpIdx + 13] & MASK32_7) << 14; + l7 |= (tmp[tmpIdx + 14] & MASK32_11) << 3; + l7 |= (tmp[tmpIdx + 15] >>> 8) & MASK32_3; + longs[longsIdx + 7] = l7; + long l8 = (tmp[tmpIdx + 15] & MASK32_8) << 13; + l8 |= (tmp[tmpIdx + 16] & MASK32_11) << 2; + l8 |= (tmp[tmpIdx + 17] >>> 9) & MASK32_2; + longs[longsIdx + 8] = l8; + long l9 = (tmp[tmpIdx + 17] & MASK32_9) << 12; + l9 |= (tmp[tmpIdx + 18] & MASK32_11) << 1; + l9 |= (tmp[tmpIdx + 19] >>> 10) & MASK32_1; + longs[longsIdx + 9] = l9; + long l10 = (tmp[tmpIdx + 19] & MASK32_10) << 11; + l10 |= (tmp[tmpIdx + 20] & MASK32_11) << 0; + longs[longsIdx + 10] = l10; + } + } + + private static void decode22(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 44); + shiftLongs(tmp, 44, longs, 0, 10, MASK32_22); + for (int iter = 0, tmpIdx = 0, longsIdx = 44; iter < 4; ++iter, tmpIdx += 11, longsIdx += 5) { + long l0 = (tmp[tmpIdx + 0] & MASK32_10) << 12; + l0 |= (tmp[tmpIdx + 1] & MASK32_10) << 2; + l0 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 2] & MASK32_8) << 14; + l1 |= (tmp[tmpIdx + 3] & MASK32_10) << 4; + l1 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 4] & MASK32_6) << 16; + l2 |= (tmp[tmpIdx + 5] & MASK32_10) << 6; + l2 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 6] & MASK32_4) << 18; + l3 |= (tmp[tmpIdx + 7] & MASK32_10) << 8; + l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 8] & MASK32_2) << 20; + l4 |= (tmp[tmpIdx + 9] & MASK32_10) << 10; + l4 |= (tmp[tmpIdx + 10] & MASK32_10) << 0; + longs[longsIdx + 4] = l4; + } + } + + private static void decode23(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 46); + shiftLongs(tmp, 46, longs, 0, 9, MASK32_23); + for (int iter = 0, tmpIdx = 0, longsIdx = 46; iter < 2; ++iter, tmpIdx += 23, longsIdx += 9) { + long l0 = (tmp[tmpIdx + 0] & MASK32_9) << 14; + l0 |= (tmp[tmpIdx + 1] & MASK32_9) << 5; + l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK32_5; + longs[longsIdx + 0] = l0; + long l1 = (tmp[tmpIdx + 2] & MASK32_4) << 19; + l1 |= (tmp[tmpIdx + 3] & MASK32_9) << 10; + l1 |= (tmp[tmpIdx + 4] & MASK32_9) << 1; + l1 |= (tmp[tmpIdx + 5] >>> 8) & MASK32_1; + longs[longsIdx + 1] = l1; + long l2 = (tmp[tmpIdx + 5] & MASK32_8) << 15; + l2 |= (tmp[tmpIdx + 6] & MASK32_9) << 6; + l2 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_6; + longs[longsIdx + 2] = l2; + long l3 = (tmp[tmpIdx + 7] & MASK32_3) << 20; + l3 |= (tmp[tmpIdx + 8] & MASK32_9) << 11; + l3 |= (tmp[tmpIdx + 9] & MASK32_9) << 2; + l3 |= (tmp[tmpIdx + 10] >>> 7) & MASK32_2; + longs[longsIdx + 3] = l3; + long l4 = (tmp[tmpIdx + 10] & MASK32_7) << 16; + l4 |= (tmp[tmpIdx + 11] & MASK32_9) << 7; + l4 |= (tmp[tmpIdx + 12] >>> 2) & MASK32_7; + longs[longsIdx + 4] = l4; + long l5 = (tmp[tmpIdx + 12] & MASK32_2) << 21; + l5 |= (tmp[tmpIdx + 13] & MASK32_9) << 12; + l5 |= (tmp[tmpIdx + 14] & MASK32_9) << 3; + l5 |= (tmp[tmpIdx + 15] >>> 6) & MASK32_3; + longs[longsIdx + 5] = l5; + long l6 = (tmp[tmpIdx + 15] & MASK32_6) << 17; + l6 |= (tmp[tmpIdx + 16] & MASK32_9) << 8; + l6 |= (tmp[tmpIdx + 17] >>> 1) & MASK32_8; + longs[longsIdx + 6] = l6; + long l7 = (tmp[tmpIdx + 17] & MASK32_1) << 22; + l7 |= (tmp[tmpIdx + 18] & MASK32_9) << 13; + l7 |= (tmp[tmpIdx + 19] & MASK32_9) << 4; + l7 |= (tmp[tmpIdx + 20] >>> 5) & MASK32_4; + longs[longsIdx + 7] = l7; + long l8 = (tmp[tmpIdx + 20] & MASK32_5) << 18; + l8 |= (tmp[tmpIdx + 21] & MASK32_9) << 9; + l8 |= (tmp[tmpIdx + 22] & MASK32_9) << 0; + longs[longsIdx + 8] = l8; + } + } + + private static void decode24(DataInput in, long[] tmp, long[] longs) throws IOException { + in.readLongs(tmp, 0, 48); + shiftLongs(tmp, 48, longs, 0, 8, MASK32_24); + shiftLongs(tmp, 48, tmp, 0, 0, MASK32_8); + for (int iter = 0, tmpIdx = 0, longsIdx = 48; iter < 16; ++iter, tmpIdx += 3, longsIdx += 1) { + long l0 = tmp[tmpIdx + 0] << 16; + l0 |= tmp[tmpIdx + 1] << 8; + l0 |= tmp[tmpIdx + 2] << 0; + longs[longsIdx + 0] = l0; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java index b3acc0ef1699..6f28ca3d0a60 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java @@ -128,12 +128,10 @@ public void decompress(DataInput in, int originalLength, int offset, int length, } // Read blocks that intersect with the interval we need - if (offsetInBlock < offset + length) { - bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + offset + length - offsetInBlock); - } while (offsetInBlock < offset + length) { final int bytesToDecompress = Math.min(blockLength, offset + length - offsetInBlock); LZ4.decompress(in, bytesToDecompress, buffer, dictLength); + bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + bytesToDecompress); System.arraycopy(buffer, dictLength, bytes.bytes, bytes.length, bytesToDecompress); bytes.length += bytesToDecompress; offsetInBlock += blockLength; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java index ade1699fe753..72d83853879b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java @@ -69,7 +69,6 @@ public final class Lucene90CompoundFormat extends CompoundFormat { /** Extension of compound file */ static final String DATA_EXTENSION = "cfs"; - /** Extension of compound file entries */ static final String ENTRIES_EXTENSION = "cfe"; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java index d64dce985c01..c3d170447ae6 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java @@ -22,6 +22,10 @@ import java.io.IOException; import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; @@ -36,8 +40,6 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.internal.hppc.LongHashSet; -import org.apache.lucene.internal.hppc.LongIntHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.SortedSetSelector; import org.apache.lucene.store.ByteArrayDataOutput; @@ -197,7 +199,7 @@ private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, bo MinMaxTracker minMax = new MinMaxTracker(); MinMaxTracker blockMinMax = new MinMaxTracker(); long gcd = 0; - LongHashSet uniqueValues = ords ? null : new LongHashSet(); + Set uniqueValues = ords ? null : new HashSet<>(); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { for (int i = 0, count = values.docValueCount(); i < count; ++i) { long v = values.nextValue(); @@ -271,7 +273,7 @@ private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, bo meta.writeLong(numValues); final int numBitsPerValue; boolean doBlocks = false; - LongIntHashMap encode = null; + Map encode = null; if (min >= max) { // meta[-1]: All values are 0 numBitsPerValue = 0; meta.writeInt(-1); // tablesize @@ -281,13 +283,13 @@ private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, bo && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) { numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1); - final long[] sortedUniqueValues = uniqueValues.toArray(); + final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]); Arrays.sort(sortedUniqueValues); meta.writeInt(sortedUniqueValues.length); // tablesize - for (long v : sortedUniqueValues) { + for (Long v : sortedUniqueValues) { meta.writeLong(v); // table[] entry } - encode = new LongIntHashMap(); + encode = new HashMap<>(); for (int i = 0; i < sortedUniqueValues.length; ++i) { encode.put(sortedUniqueValues[i], i); } @@ -337,7 +339,7 @@ private void writeValuesSingleBlock( int numBitsPerValue, long min, long gcd, - LongIntHashMap encode) + Map encode) throws IOException { DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index d0544f3db246..3718d0ae401f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -1154,7 +1154,6 @@ private long seekTermsIndex(BytesRef text) throws IOException { assert hi < 0 || getTermFromIndex(hi).compareTo(text) <= 0; assert hi == ((entry.termsDictSize - 1) >> entry.termsDictIndexShift) || getTermFromIndex(hi + 1).compareTo(text) > 0; - assert hi < 0 ^ entry.termsDictSize > 0; // return -1 iff empty term dict return hi; } @@ -1171,9 +1170,7 @@ private BytesRef getFirstTermFromBlock(long block) throws IOException { private long seekBlock(BytesRef text) throws IOException { long index = seekTermsIndex(text); if (index == -1L) { - // empty terms dict - this.ord = 0; - return -2L; + return -1L; } long ordLo = index << entry.termsDictIndexShift; @@ -1197,30 +1194,26 @@ private long seekBlock(BytesRef text) throws IOException { assert blockHi == ((entry.termsDictSize - 1) >>> TERMS_DICT_BLOCK_LZ4_SHIFT) || getFirstTermFromBlock(blockHi + 1).compareTo(text) > 0; - // read the block only if term dict is not empty - assert entry.termsDictSize > 0; - // reset ord and bytes to the ceiling block even if - // text is before the first term (blockHi == -1) - final long block = Math.max(blockHi, 0); - final long blockAddress = blockAddresses.get(block); - this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT; - bytes.seek(blockAddress); - decompressBlock(); - return blockHi; } @Override public SeekStatus seekCeil(BytesRef text) throws IOException { final long block = seekBlock(text); - if (block == -2) { - // empty terms dict - assert entry.termsDictSize == 0; - return SeekStatus.END; - } else if (block == -1) { - // before the first term - return SeekStatus.NOT_FOUND; + if (block == -1) { + // before the first term, or empty terms dict + if (entry.termsDictSize == 0) { + ord = 0; + return SeekStatus.END; + } else { + seekExact(0L); + return SeekStatus.NOT_FOUND; + } } + final long blockAddress = blockAddresses.get(block); + this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT; + bytes.seek(blockAddress); + decompressBlock(); while (true) { int cmp = term.compareTo(text); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java index dcfa45f5b60b..d145bfdb0ebb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java @@ -20,6 +20,8 @@ import static org.apache.lucene.codecs.lucene90.Lucene90NormsFormat.VERSION_START; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.CorruptIndexException; @@ -29,7 +31,6 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -39,13 +40,13 @@ /** Reader for {@link Lucene90NormsFormat} */ final class Lucene90NormsProducer extends NormsProducer implements Cloneable { // metadata maps (just file pointers and minimal stuff) - private final IntObjectHashMap norms = new IntObjectHashMap<>(); + private final Map norms = new HashMap<>(); private final int maxDoc; private IndexInput data; private boolean merging; - private IntObjectHashMap disiInputs; - private IntObjectHashMap disiJumpTables; - private IntObjectHashMap dataInputs; + private Map disiInputs; + private Map disiJumpTables; + private Map dataInputs; Lucene90NormsProducer( SegmentReadState state, @@ -121,9 +122,9 @@ public NormsProducer getMergeInstance() { throw new RuntimeException(e); } clone.data = data.clone(); - clone.disiInputs = new IntObjectHashMap<>(); - clone.disiJumpTables = new IntObjectHashMap<>(); - clone.dataInputs = new IntObjectHashMap<>(); + clone.disiInputs = new HashMap<>(); + clone.disiJumpTables = new HashMap<>(); + clone.dataInputs = new HashMap<>(); clone.merging = true; return clone; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java index 3e072cb8770d..1656dd0a1bae 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java @@ -17,6 +17,8 @@ package org.apache.lucene.codecs.lucene90; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.index.CorruptIndexException; @@ -24,7 +26,6 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -35,7 +36,7 @@ public class Lucene90PointsReader extends PointsReader { final IndexInput indexIn, dataIn; final SegmentReadState readState; - final IntObjectHashMap readers = new IntObjectHashMap<>(); + final Map readers = new HashMap<>(); /** Sole constructor */ public Lucene90PointsReader(SegmentReadState readState) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java index e50d6a0fdb5a..d310b8ace197 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java @@ -106,7 +106,7 @@ public Lucene90PointsWriter( } /** - * Uses the defaults values for {@code maxPointsInLeafNode} (512) and {@code maxMBSortInHeap} + * Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} * (16.0) */ public Lucene90PointsWriter(SegmentWriteState writeState) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsFormat.java new file mode 100644 index 000000000000..77dc6626cc3f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsFormat.java @@ -0,0 +1,507 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.MultiLevelSkipListWriter; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; +import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Lucene 5.0 postings format, which encodes postings in packed integer blocks for fast decode. + * + *

Basic idea: + * + *

    + *
  • Packed Blocks and VInt Blocks: + *

    In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed + * format}): the block size (i.e. number of integers inside block) is fixed (currently 128). + * Additionally blocks that are all the same value are encoded in an optimized way. + *

    In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block + * size is variable. + *

  • Block structure: + *

    When the postings are long enough, Lucene90PostingsFormat will try to encode most + * integer data as a packed block. + *

    Take a term with 259 documents as an example, the first 256 document ids are encoded as + * two packed blocks, while the remaining 3 are encoded as one VInt block. + *

    Different kinds of data are always encoded separately into different packed blocks, but + * may possibly be interleaved into the same VInt block. + *

    This strategy is applied to pairs: <document number, frequency>, <position, + * payload length>, <position, offset start, offset length>, and <position, + * payload length, offsetstart, offset length>. + *

  • Skipdata settings: + *

    The structure of skip table is quite similar to previous version of Lucene. Skip + * interval is the same as block size, and each skip entry points to the beginning of each + * block. However, for the first block, skip data is omitted. + *

  • Positions, Payloads, and Offsets: + *

    A position is an integer indicating where the term occurs within one document. A payload + * is a blob of metadata associated with current position. An offset is a pair of integers + * indicating the tokenized start/end offsets for given term in current position: it is + * essentially a specialized payload. + *

    When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets + * (assuming a null payload contributes one count). As mentioned in block structure, it is + * possible to encode these three either combined or separately. + *

    In all cases, payloads and offsets are stored together. When encoded as a packed block, + * position data is separated out as .pos, while payloads and offsets are encoded in .pay + * (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all + * these three are stored interleaved into the .pos (so is payload metadata). + *

    With this strategy, the majority of payload and offset data will be outside .pos file. + * So for queries that require only position data, running on a full index with payloads and + * offsets, this reduces disk pre-fetches. + *

+ * + *

Files and detailed format: + * + *

+ * + * + * + *
+ *
Term Dictionary + *

The .tim file contains the list of terms in each field along with per-term statistics + * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the + * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on + * the format. + *

NOTE: The term dictionary can plug into different postings implementations: the postings + * writer/reader are actually responsible for encoding and decoding the PostingsHeader and + * TermMetadata sections described here: + *

    + *
  • PostingsHeader --> Header, PackedBlockSize + *
  • TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, + * PayFPDelta?, SkipFPDelta? + *
  • Header, --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
  • PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt} + *
  • DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link + * DataOutput#writeVLong VLong} + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ *

Notes: + *

    + *
  • Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version + * information for the postings. + *
  • PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width + * is determined by the largest integer. Smaller block size result in smaller variance + * among width of integers hence smaller indexes. Larger block size result in more + * efficient bulk i/o hence better acceleration. This value should always be a multiple + * of 64, currently fixed as 128 as a tradeoff. It is also the skip interval used to + * accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}. + *
  • DocFPDelta determines the position of this term's TermFreqs within the .doc file. In + * particular, it is the difference of file offset between this term's data and previous + * term's data (or zero, for the first term in the block).On disk it is stored as the + * difference from previous value in sequence. + *
  • PosFPDelta determines the position of this term's TermPositions within the .pos file. + * While PayFPDelta determines the position of this term's <TermPayloads, + * TermOffsets?> within the .pay file. Similar to DocFPDelta, it is the difference + * between two file positions (or neglected, for fields that omit payloads and offsets). + *
  • PosVIntBlockFPDelta determines the position of this term's last TermPosition in last + * pos packed block within the .pos file. It is synonym for PayVIntBlockFPDelta or + * OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to + * load following payloads and offsets from .pos instead of .pay. Every time a new block + * of positions are to be loaded, the PostingsReader will use this value to check + * whether current block is packed format or VInt. When packed format, payloads and + * offsets are fetched from .pay, otherwise from .pos. (this value is neglected when + * total number of positions i.e. totalTermFreq is less or equal to PackedBlockSize). + *
  • SkipFPDelta determines the position of this term's SkipData within the .doc file. In + * particular, it is the length of the TermFreq data. SkipDelta is only stored if + * DocFreq is not smaller than SkipMinimum (i.e. 128 in Lucene90PostingsFormat). + *
  • SingletonDocID is an optimization when a term only appears in one document. In this + * case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a + * VIntBlock at that location, the single document ID is written to the term dictionary. + *
+ *
+ * + * + * + *
+ *
Term Index + *

The .tip file contains an index into the term dictionary, so that it can be accessed + * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + *

+ * + * + * + *
+ *
Frequencies and Skip Data + *

The .doc file contains the lists of documents which contain each term, along with the + * frequency of the term in that document (except when frequencies are omitted: {@link + * IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block, + * when the length of document list is larger than packed block size. + *

    + *
  • docFile(.doc) --> Header, <TermFreqs, SkipData?>TermCount, Footer + *
  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
  • TermFreqs --> <PackedBlock> PackedDocBlockNum, VIntBlock? + *
  • PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock? + *
  • VIntBlock --> <DocDelta[, + * Freq?]>DocFreq-PackedBlockSize*PackedDocBlockNum + *
  • SkipData --> <<SkipLevelLength, SkipLevel> NumSkipLevels-1, + * SkipLevel>, SkipDatum? + *
  • SkipLevel --> <SkipDatum> TrimmedDocFreq/(PackedBlockSize^(Level + + * 1)) + *
  • SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?, + * PayFPSkip?>?, ImpactLength, <CompetitiveFreqDelta, CompetitiveNormDelta?> + * ImpactCount, SkipChildLevelPointer? + *
  • PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts} + *
  • DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto, + * PayFPSkip, ImpactLength, CompetitiveFreqDelta --> {@link DataOutput#writeVInt + * VInt} + *
  • CompetitiveNormDelta --> {@link DataOutput#writeZLong ZLong} + *
  • SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong} + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ *

Notes: + *

    + *
  • PackedDocDeltaBlock is theoretically generated from two steps: + *
      + *
    1. Calculate the difference between each document number and previous one, and get + * a d-gaps list (for the first document, use absolute value); + *
    2. For those d-gaps from first one to + * PackedDocBlockNum*PackedBlockSizeth, separately encode as packed + * blocks. + *
    + * If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step. + *
  • VIntBlock stores remaining d-gaps (along with frequencies when possible) with a + * format that encodes DocDelta and Freq: + *

    DocDelta: if frequencies are indexed, this determines both the document number and + * the frequency. In particular, DocDelta/2 is the difference between this document + * number and the previous document number (or zero when this is the first document in a + * TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the + * frequency is read as another VInt. If frequencies are omitted, DocDelta contains the + * gap (not multiplied by 2) between document numbers and no frequency information is + * stored. + *

    For example, the TermFreqs for a term which occurs once in document seven and + * three times in document eleven, with frequencies indexed, would be the following + * sequence of VInts: + *

    15, 8, 3 + *

    If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence + * of VInts instead: + *

    7,4 + *

  • PackedDocBlockNum is the number of packed blocks for current term's docids or + * frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) + *
  • TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq. We use this + * trick since the definition of skip entry is a little different from base interface. + * In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for + * skipIntervalth, 2*skipIntervalth ... posting in the list. + * However, in Lucene90PostingsFormat, the skip data is saved for + * skipInterval+1th, 2*skipInterval+1th ... posting + * (skipInterval==PackedBlockSize in this case). When DocFreq is multiple of + * PackedBlockSize, MultiLevelSkipListWriter will expect one more skip data than + * Lucene90SkipWriter. + *
  • SkipDatum is the metadata of one skip entry. For the first block (no matter packed or + * VInt), it is omitted. + *
  • DocSkip records the document number of every PackedBlockSizeth document + * number in the postings (i.e. last document number in each packed block). On disk it + * is stored as the difference from previous value in the sequence. + *
  • DocFPSkip records the file offsets of each block (excluding )posting at + * PackedBlockSize+1th, 2*PackedBlockSize+1th ... , in DocFile. + * The file offsets are relative to the start of current term's TermFreqs. On disk it is + * also stored as the difference from previous SkipDatum in the sequence. + *
  • Since positions and payloads are also block encoded, the skip should skip to related + * block first, then fetch the values according to in-block offset. PosFPSkip and + * PayFPSkip record the file offsets of related block in .pos and .pay, respectively. + * While PosBlockOffset indicates which value to fetch inside the related block + * (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). Same as + * DocFPSkip, the file offsets are relative to the start of current term's TermFreqs, + * and stored as a difference sequence. + *
  • PayByteUpto indicates the start offset of the current payload. It is equivalent to + * the sum of the payload lengths in the current block up to PosBlockOffset + *
  • ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta + * pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score + * calculation for uncompetitive documents; See {@link + * org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details. + *
+ *
+ * + * + * + *
+ *
Positions + *

The .pos file contains the lists of positions that each term occurs at within documents. + * It also sometimes stores part of payloads and offsets for speedup. + *

    + *
  • PosFile(.pos) --> Header, <TermPositions> TermCount, Footer + *
  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
  • TermPositions --> <PackedPosDeltaBlock> PackedPosBlockNum, + * VIntBlock? + *
  • VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?, + * OffsetLength?>PosVIntCount + *
  • PackedPosDeltaBlock --> {@link PackedInts PackedInts} + *
  • PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt} + *
  • PayloadData --> {@link DataOutput#writeByte byte}PayLength + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ *

Notes: + *

    + *
  • TermPositions are order by term (terms are implicit, from the term dictionary), and + * position values for each term document pair are incremental, and ordered by document + * number. + *
  • PackedPosBlockNum is the number of packed blocks for current term's positions, + * payloads or offsets. In particular, PackedPosBlockNum = + * floor(totalTermFreq/PackedBlockSize) + *
  • PosVIntCount is the number of positions encoded as VInt format. In particular, + * PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize + *
  • The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock + * in chapter Frequencies and Skip Data. + *
  • PositionDelta is, if payloads are disabled for the term's field, the difference + * between the position of the current occurrence in the document and the previous + * occurrence (or zero, if this is the first occurrence in this document). If payloads + * are enabled for the term's field, then PositionDelta/2 is the difference between the + * current and the previous position. If payloads are enabled and PositionDelta is odd, + * then PayloadLength is stored, indicating the length of the payload at the current + * term position. + *
  • For example, the TermPositions for a term which occurs as the fourth term in one + * document, and as the fifth and ninth term in a subsequent document, would be the + * following sequence of VInts (payloads disabled): + *

    4, 5, 4 + *

  • PayloadData is metadata associated with the current term position. If PayloadLength + * is stored at the current position, then it indicates the length of this payload. If + * PayloadLength is not stored, then this payload has the same length as the payload at + * the previous position. + *
  • OffsetDelta/2 is the difference between this position's startOffset from the previous + * occurrence (or zero, if this is the first occurrence in this document). If + * OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous + * occurrence and an OffsetLength follows. Offset data is only written for {@link + * IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. + *
+ *
+ * + * + * + *
+ *
Payloads and Offsets + *

The .pay file will store payloads and offsets associated with certain term-document + * positions. Some payloads and offsets will be separated out into .pos file, for performance + * reasons. + *

    + *
  • PayFile(.pay): --> Header, <TermPayloads?, TermOffsets?> + * TermCount, Footer + *
  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
  • TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> + * PackedPayBlockNum + *
  • TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> + * PackedPayBlockNum + *
  • PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> + * {@link PackedInts PackedInts} + *
  • SumPayLength --> {@link DataOutput#writeVInt VInt} + *
  • PayData --> {@link DataOutput#writeByte byte}SumPayLength + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ *

Notes: + *

    + *
  • The order of TermPayloads/TermOffsets will be the same as TermPositions, note that + * part of payload/offsets are stored in .pos. + *
  • The procedure how PackedPayLengthBlock and PackedOffsetLengthBlock are generated is + * the same as PackedFreqBlock in chapter Frequencies and Skip + * Data. While PackedStartDeltaBlock follows a same procedure as + * PackedDocDeltaBlock. + *
  • PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also + * synonym for PackedOffsetBlockNum. + *
  • SumPayLength is the total length of payloads written within one block, should be the + * sum of PayLengths in one packed block. + *
  • PayLength in PackedPayLengthBlock is the length of each payload associated with the + * current position. + *
+ *
+ * + * @lucene.experimental + */ +public final class Lucene90PostingsFormat extends PostingsFormat { + + /** + * Filename extension for document number, frequencies, and skip data. See chapter: Frequencies and Skip Data + */ + public static final String DOC_EXTENSION = "doc"; + + /** Filename extension for positions. See chapter: Positions */ + public static final String POS_EXTENSION = "pos"; + + /** + * Filename extension for payloads and offsets. See chapter: Payloads and + * Offsets + */ + public static final String PAY_EXTENSION = "pay"; + + /** Size of blocks. */ + public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE; + + /** + * Expert: The maximum number of skip levels. Smaller values result in slightly smaller indexes, + * but slower skipping in big posting lists. + */ + static final int MAX_SKIP_LEVELS = 10; + + static final String TERMS_CODEC = "Lucene90PostingsWriterTerms"; + static final String DOC_CODEC = "Lucene90PostingsWriterDoc"; + static final String POS_CODEC = "Lucene90PostingsWriterPos"; + static final String PAY_CODEC = "Lucene90PostingsWriterPay"; + + // Increment version to change it + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + private final int minTermBlockSize; + private final int maxTermBlockSize; + + /** Creates {@code Lucene90PostingsFormat} with default settings. */ + public Lucene90PostingsFormat() { + this( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + } + + /** + * Creates {@code Lucene90PostingsFormat} with custom values for {@code minBlockSize} and {@code + * maxBlockSize} passed to block terms dictionary. + * + * @see + * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) + */ + public Lucene90PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + super("Lucene90"); + Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); + this.minTermBlockSize = minTermBlockSize; + this.maxTermBlockSize = maxTermBlockSize; + } + + @Override + public String toString() { + return getName(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene90PostingsWriter(state); + boolean success = false; + try { + FieldsConsumer ret = + new Lucene90BlockTreeTermsWriter( + state, postingsWriter, minTermBlockSize, maxTermBlockSize); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene90PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new Lucene90BlockTreeTermsReader(postingsReader, state); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } + + /** + * Holds all state required for {@link Lucene90PostingsReader} to produce a {@link + * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict. + * + * @lucene.internal + */ + public static final class IntBlockTermState extends BlockTermState { + /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */ + public long docStartFP; + /** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */ + public long posStartFP; + /** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */ + public long payStartFP; + /** + * file offset for the start of the skip list, relative to docStartFP, if there are more than + * {@link ForUtil#BLOCK_SIZE} docs; otherwise -1 + */ + public long skipOffset; + /** + * file offset for the last position in the last block, if there are more than {@link + * ForUtil#BLOCK_SIZE} positions; otherwise -1 + */ + public long lastPosBlockOffset; + /** + * docid when there is a single pulsed posting, otherwise -1. freq is always implicitly + * totalTermFreq in this case. + */ + public int singletonDocID; + + /** Sole constructor. */ + public IntBlockTermState() { + skipOffset = -1; + lastPosBlockOffset = -1; + singletonDocID = -1; + } + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + skipOffset = other.skipOffset; + singletonDocID = other.singletonDocID; + } + + @Override + public String toString() { + return super.toString() + + " docStartFP=" + + docStartFP + + " posStartFP=" + + posStartFP + + " payStartFP=" + + payStartFP + + " lastPosBlockOffset=" + + lastPosBlockOffset + + " singletonDocID=" + + singletonDocID; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsReader.java new file mode 100644 index 000000000000..e2223598033f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsReader.java @@ -0,0 +1,2068 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import static org.apache.lucene.codecs.lucene90.ForUtil.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.MAX_SKIP_LEVELS; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.VERSION_START; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SlowImpactsEnum; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +/** + * Concrete class that reads docId(maybe frq,pos,offset,payloads) list with postings format. + * + * @lucene.experimental + */ +public final class Lucene90PostingsReader extends PostingsReaderBase { + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + private final int version; + + /** Sole constructor. */ + public Lucene90PostingsReader(SegmentReadState state) throws IOException { + boolean success = false; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + + // NOTE: these data files are too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + + String docName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene90PostingsFormat.DOC_EXTENSION); + try { + docIn = state.directory.openInput(docName, state.context); + version = + CodecUtil.checkIndexHeader( + docIn, + DOC_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + CodecUtil.retrieveChecksum(docIn); + + if (state.fieldInfos.hasProx()) { + String proxName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene90PostingsFormat.POS_EXTENSION); + posIn = state.directory.openInput(proxName, state.context); + CodecUtil.checkIndexHeader( + posIn, POS_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(posIn); + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene90PostingsFormat.PAY_EXTENSION); + payIn = state.directory.openInput(payName, state.context); + CodecUtil.checkIndexHeader( + payIn, PAY_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(payIn); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + } + + @Override + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { + // Make sure we are talking to the matching postings writer + CodecUtil.checkIndexHeader( + termsIn, + TERMS_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != BLOCK_SIZE) { + throw new IllegalStateException( + "index-time BLOCK_SIZE (" + + indexBlockSize + + ") != read-time BLOCK_SIZE (" + + BLOCK_SIZE + + ")"); + } + } + + /** Read values that have been written using variable-length encoding instead of bit-packing. */ + static void readVIntBlock( + IndexInput docIn, long[] docBuffer, long[] freqBuffer, int num, boolean indexHasFreq) + throws IOException { + if (indexHasFreq) { + for (int i = 0; i < num; i++) { + final int code = docIn.readVInt(); + docBuffer[i] = code >>> 1; + if ((code & 1) != 0) { + freqBuffer[i] = 1; + } else { + freqBuffer[i] = docIn.readVInt(); + } + } + } else { + for (int i = 0; i < num; i++) { + docBuffer[i] = docIn.readVInt(); + } + } + } + + static void prefixSum(long[] buffer, int count, long base) { + buffer[0] += base; + for (int i = 1; i < count; ++i) { + buffer[i] += buffer[i - 1]; + } + } + + static int findFirstGreater(long[] buffer, int target, int from) { + for (int i = from; i < BLOCK_SIZE; ++i) { + if (buffer[i] >= target) { + return i; + } + } + return BLOCK_SIZE; + } + + @Override + public BlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void close() throws IOException { + IOUtils.close(docIn, posIn, payIn); + } + + @Override + public void decodeTerm( + DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + final boolean fieldHasPositions = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean fieldHasOffsets = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + final boolean fieldHasPayloads = fieldInfo.hasPayloads(); + + if (absolute) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } + + final long l = in.readVLong(); + if ((l & 0x01) == 0) { + termState.docStartFP += l >>> 1; + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + } else { + termState.singletonDocID = -1; + } + } else { + assert absolute == false; + assert termState.singletonDocID != -1; + termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1); + } + + if (fieldHasPositions) { + termState.posStartFP += in.readVLong(); + if (fieldHasOffsets || fieldHasPayloads) { + termState.payStartFP += in.readVLong(); + } + if (termState.totalTermFreq > BLOCK_SIZE) { + termState.lastPosBlockOffset = in.readVLong(); + } else { + termState.lastPosBlockOffset = -1; + } + } + + if (termState.docFreq > BLOCK_SIZE) { + termState.skipOffset = in.readVLong(); + } else { + termState.skipOffset = -1; + } + } + + @Override + public PostingsEnum postings( + FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) + throws IOException { + + boolean indexHasPositions = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + + if (indexHasPositions == false + || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { + BlockDocsEnum docsEnum; + if (reuse instanceof BlockDocsEnum) { + docsEnum = (BlockDocsEnum) reuse; + if (!docsEnum.canReuse(docIn, fieldInfo)) { + docsEnum = new BlockDocsEnum(fieldInfo); + } + } else { + docsEnum = new BlockDocsEnum(fieldInfo); + } + return docsEnum.reset((IntBlockTermState) termState, flags); + } else { + EverythingEnum everythingEnum; + if (reuse instanceof EverythingEnum) { + everythingEnum = (EverythingEnum) reuse; + if (!everythingEnum.canReuse(docIn, fieldInfo)) { + everythingEnum = new EverythingEnum(fieldInfo); + } + } else { + everythingEnum = new EverythingEnum(fieldInfo); + } + return everythingEnum.reset((IntBlockTermState) termState, flags); + } + } + + @Override + public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) + throws IOException { + if (state.docFreq <= BLOCK_SIZE) { + // no skip data + return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); + } + + final boolean indexHasPositions = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean indexHasOffsets = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + final boolean indexHasPayloads = fieldInfo.hasPayloads(); + + if (indexHasPositions == false + || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { + return new BlockImpactsDocsEnum(fieldInfo, (IntBlockTermState) state); + } + + if (indexHasPositions + && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) + && (indexHasOffsets == false + || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) + && (indexHasPayloads == false + || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) { + return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state); + } + + return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags); + } + + final class BlockDocsEnum extends PostingsEnum { + + final PForUtil pforUtil = new PForUtil(new ForUtil()); + + private final long[] docBuffer = new long[BLOCK_SIZE + 1]; + private final long[] freqBuffer = new long[BLOCK_SIZE]; + + private int docBufferUpto; + + private Lucene90SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + private int blockUpto; // number of docs in or before the current block + private int doc; // doc we last read + private long accum; // accumulator for doc deltas + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + // docID for next skip point, we won't use skipper if + // target docID is not larger than this + private int nextSkipDoc; + + private boolean needsFreq; // true if the caller actually needs frequencies + // as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block + // always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer + // (needsFreq=false) + private boolean isFreqsRead; + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + public BlockDocsEnum(FieldInfo fieldInfo) throws IOException { + this.startDocIn = Lucene90PostingsReader.this.docIn; + this.docIn = null; + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = + fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn + && indexHasFreq + == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0) + && indexHasPos + == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) + >= 0) + && indexHasPayloads == fieldInfo.hasPayloads(); + } + + public PostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + docFreq = termState.docFreq; + totalTermFreq = indexHasFreq ? termState.totalTermFreq : docFreq; + docTermStartFP = termState.docStartFP; + skipOffset = termState.skipOffset; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + + doc = -1; + this.needsFreq = PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + this.isFreqsRead = true; + if (indexHasFreq == false || needsFreq == false) { + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + freqBuffer[i] = 1; + } + } + accum = 0; + blockUpto = 0; + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + if (isFreqsRead == false) { + pforUtil.decode(docIn, freqBuffer); // read freqBuffer for this block + isFreqsRead = true; + } + return (int) freqBuffer[docBufferUpto - 1]; + } + + @Override + public int nextPosition() throws IOException { + return -1; + } + + @Override + public int startOffset() throws IOException { + return -1; + } + + @Override + public int endOffset() throws IOException { + return -1; + } + + @Override + public BytesRef getPayload() throws IOException { + return null; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + // Check if we skipped reading the previous block of freqBuffer, and if yes, position docIn + // after it + if (isFreqsRead == false) { + pforUtil.skip(docIn); + isFreqsRead = true; + } + + final int left = docFreq - blockUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer); + + if (indexHasFreq) { + if (needsFreq) { + isFreqsRead = false; + } else { + pforUtil.skip(docIn); // skip over freqBuffer if we don't need them at all + } + } + blockUpto += BLOCK_SIZE; + } else if (docFreq == 1) { + docBuffer[0] = singletonDocID; + freqBuffer[0] = totalTermFreq; + docBuffer[1] = NO_MORE_DOCS; + blockUpto++; + } else { + // Read vInts: + readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq); + prefixSum(docBuffer, left, accum); + docBuffer[left] = NO_MORE_DOCS; + blockUpto += left; + } + accum = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + } + + @Override + public int nextDoc() throws IOException { + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); // we don't need to load freqBuffer for now (will be loaded later if + // necessary) + } + + doc = (int) docBuffer[docBufferUpto]; + docBufferUpto++; + return doc; + } + + @Override + public int advance(int target) throws IOException { + // current skip docID < docIDs generated from current buffer <= next skip docID + // we don't need to skip if target is buffered already + if (docFreq > BLOCK_SIZE && target > nextSkipDoc) { + + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = + new Lucene90SkipReader( + docIn.clone(), MAX_SKIP_LEVELS, indexHasPos, indexHasOffsets, indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init(docTermStartFP + skipOffset, docTermStartFP, 0, 0, docFreq); + skipped = true; + } + + // always plus one to fix the result, since skip position in Lucene90SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto >= blockUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + blockUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); // actually, this is just lastSkipEntry + docIn.seek(skipper.getDocPointer()); // now point to the block we want to search + // even if freqBuffer were not read from the previous block, we will mark them as read, + // as we don't need to skip the previous block freqBuffer in refillDocs, + // as we have already positioned docIn where in needs to be. + isFreqsRead = true; + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan... this is an inlined/pared down version + // of nextDoc(): + long doc; + while (true) { + doc = docBuffer[docBufferUpto]; + + if (doc >= target) { + break; + } + ++docBufferUpto; + } + + docBufferUpto++; + return this.doc = (int) doc; + } + + @Override + public long cost() { + return docFreq; + } + } + + // Also handles payloads + offsets + final class EverythingEnum extends PostingsEnum { + + final PForUtil pforUtil = new PForUtil(new ForUtil()); + + private final long[] docBuffer = new long[BLOCK_SIZE + 1]; + private final long[] freqBuffer = new long[BLOCK_SIZE + 1]; + private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; + + private final long[] payloadLengthBuffer; + private final long[] offsetStartDeltaBuffer; + private final long[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int docBufferUpto; + private int posBufferUpto; + + private Lucene90SkipReader skipper; + private boolean skipped; + + final IndexInput startDocIn; + + IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int blockUpto; // number of docs in or before the current block + private int doc; // doc we last read + private long accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // Where this term's skip data starts (after + // docTermStartFP) in the .doc file (or -1 if there is + // no skip data for this term): + private long skipOffset; + + private int nextSkipDoc; + + private boolean needsOffsets; // true if we actually need offsets + private boolean needsPayloads; // true if we actually need payloads + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + public EverythingEnum(FieldInfo fieldInfo) throws IOException { + indexHasOffsets = + fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + + this.startDocIn = Lucene90PostingsReader.this.docIn; + this.docIn = null; + this.posIn = Lucene90PostingsReader.this.posIn.clone(); + if (indexHasOffsets || indexHasPayloads) { + this.payIn = Lucene90PostingsReader.this.payIn.clone(); + } else { + this.payIn = null; + } + if (indexHasOffsets) { + offsetStartDeltaBuffer = new long[BLOCK_SIZE]; + offsetLengthBuffer = new long[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new long[BLOCK_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + } + + public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) { + return docIn == startDocIn + && indexHasOffsets + == (fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0) + && indexHasPayloads == fieldInfo.hasPayloads(); + } + + public EverythingEnum reset(IntBlockTermState termState, int flags) throws IOException { + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + skipOffset = termState.skipOffset; + totalTermFreq = termState.totalTermFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = startDocIn.clone(); + } + docIn.seek(docTermStartFP); + } + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + this.needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + this.needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + + doc = -1; + accum = 0; + blockUpto = 0; + if (docFreq > BLOCK_SIZE) { + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + } else { + nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping + } + docBufferUpto = BLOCK_SIZE; + skipped = false; + return this; + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - blockUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer); + pforUtil.decode(docIn, freqBuffer); + blockUpto += BLOCK_SIZE; + } else if (docFreq == 1) { + docBuffer[0] = singletonDocID; + freqBuffer[0] = totalTermFreq; + docBuffer[1] = NO_MORE_DOCS; + blockUpto++; + } else { + readVIntBlock(docIn, docBuffer, freqBuffer, left, true); + prefixSum(docBuffer, left, accum); + docBuffer[left] = NO_MORE_DOCS; + blockUpto += left; + } + accum = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + payloadByteUpto = 0; + } else { + pforUtil.decode(posIn, posDeltaBuffer); + + if (indexHasPayloads) { + if (needsPayloads) { + pforUtil.decode(payIn, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + pforUtil.skip(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets) { + if (needsOffsets) { + pforUtil.decode(payIn, offsetStartDeltaBuffer); + pforUtil.decode(payIn, offsetLengthBuffer); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + pforUtil.skip(payIn); // skip over starts + pforUtil.skip(payIn); // skip over lengths + } + } + } + } + + @Override + public int nextDoc() throws IOException { + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + doc = (int) docBuffer[docBufferUpto]; + freq = (int) freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + + position = 0; + lastStartOffset = 0; + return doc; + } + + @Override + public int advance(int target) throws IOException { + if (target > nextSkipDoc) { + if (skipper == null) { + // Lazy init: first time this enum has ever been used for skipping + skipper = + new Lucene90SkipReader( + docIn.clone(), MAX_SKIP_LEVELS, true, indexHasOffsets, indexHasPayloads); + } + + if (!skipped) { + assert skipOffset != -1; + // This is the first time this enum has skipped + // since reset() was called; load the skip data: + skipper.init( + docTermStartFP + skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq); + skipped = true; + } + + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > blockUpto - BLOCK_SIZE + docBufferUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + blockUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + docIn.seek(skipper.getDocPointer()); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = skipper.getPosBufferUpto(); + lastStartOffset = 0; // new document + payloadByteUpto = skipper.getPayloadByteUpto(); + } + nextSkipDoc = skipper.getNextSkipDoc(); + } + if (docBufferUpto == BLOCK_SIZE) { + refillDocs(); + } + + // Now scan: + long doc; + while (true) { + doc = docBuffer[docBufferUpto]; + freq = (int) freqBuffer[docBufferUpto]; + posPendingCount += freq; + docBufferUpto++; + + if (doc >= target) { + break; + } + } + + position = 0; + lastStartOffset = 0; + return this.doc = (int) doc; + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while (posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + pforUtil.skip(posIn); + + if (indexHasPayloads) { + // Skip payloadLength block: + pforUtil.skip(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + pforUtil.skip(payIn); + pforUtil.skip(payIn); + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + payloadByteUpto = 0; + posBufferUpto = 0; + while (posBufferUpto < toSkip) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } + + position = 0; + lastStartOffset = 0; + } + + @Override + public int nextPosition() throws IOException { + assert posPendingCount > 0; + + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + if (payPendingFP != -1 && payIn != null) { + payIn.seek(payPendingFP); + payPendingFP = -1; + } + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (indexHasPayloads) { + payloadLength = (int) payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (indexHasOffsets) { + startOffset = lastStartOffset + (int) offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + (int) offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long cost() { + return docFreq; + } + } + + final class BlockImpactsDocsEnum extends ImpactsEnum { + + final PForUtil pforUtil = new PForUtil(new ForUtil()); + + private final long[] docBuffer = new long[BLOCK_SIZE + 1]; + private final long[] freqBuffer = new long[BLOCK_SIZE]; + + private int docBufferUpto; + + private final Lucene90ScoreSkipReader skipper; + + final IndexInput docIn; + + final boolean indexHasFreqs; + + private int docFreq; // number of docs in this posting list + private int blockUpto; // number of documents in or before the current block + private int doc; // doc we last read + private long accum; // accumulator for doc deltas + + private int nextSkipDoc = -1; + + private long seekTo = -1; + + // as we read freqBuffer lazily, isFreqsRead shows if freqBuffer are read for the current block + // always true when we don't have freqBuffer (indexHasFreq=false) or don't need freqBuffer + // (needsFreq=false) + private boolean isFreqsRead; + + public BlockImpactsDocsEnum(FieldInfo fieldInfo, IntBlockTermState termState) + throws IOException { + indexHasFreqs = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + final boolean indexHasPositions = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + final boolean indexHasOffsets = + fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + final boolean indexHasPayloads = fieldInfo.hasPayloads(); + + this.docIn = Lucene90PostingsReader.this.docIn.clone(); + + docFreq = termState.docFreq; + docIn.seek(termState.docStartFP); + + doc = -1; + accum = 0; + blockUpto = 0; + docBufferUpto = BLOCK_SIZE; + + skipper = + new Lucene90ScoreSkipReader( + docIn.clone(), MAX_SKIP_LEVELS, indexHasPositions, indexHasOffsets, indexHasPayloads); + skipper.init( + termState.docStartFP + termState.skipOffset, + termState.docStartFP, + termState.posStartFP, + termState.payStartFP, + docFreq); + + // We set the last element of docBuffer to NO_MORE_DOCS, it helps save conditionals in + // advance() + docBuffer[BLOCK_SIZE] = NO_MORE_DOCS; + this.isFreqsRead = true; + if (indexHasFreqs == false) { + Arrays.fill(freqBuffer, 1L); + } + } + + @Override + public int freq() throws IOException { + if (isFreqsRead == false) { + pforUtil.decode(docIn, freqBuffer); // read freqBuffer for this block + isFreqsRead = true; + } + return (int) freqBuffer[docBufferUpto - 1]; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + // Check if we skipped reading the previous block of freqBuffer, and if yes, position docIn + // after it + if (isFreqsRead == false) { + pforUtil.skip(docIn); + isFreqsRead = true; + } + + final int left = docFreq - blockUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer); + if (indexHasFreqs) { + pforUtil.decode(docIn, freqBuffer); + } + blockUpto += BLOCK_SIZE; + } else { + readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreqs); + prefixSum(docBuffer, left, accum); + docBuffer[left] = NO_MORE_DOCS; + blockUpto += left; + } + accum = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + assert docBuffer[BLOCK_SIZE] == NO_MORE_DOCS; + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > nextSkipDoc) { + // always plus one to fix the result, since skip position in Lucene90SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto >= blockUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + blockUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + seekTo = skipper.getDocPointer(); // delay the seek + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + assert nextSkipDoc >= target; + } + + @Override + public Impacts getImpacts() throws IOException { + advanceShallow(doc); + return skipper.getImpacts(); + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target > nextSkipDoc) { + advanceShallow(target); + } + if (docBufferUpto == BLOCK_SIZE) { + if (seekTo >= 0) { + docIn.seek(seekTo); + isFreqsRead = true; // reset isFreqsRead + seekTo = -1; + } + refillDocs(); + } + + int next = findFirstGreater(docBuffer, target, docBufferUpto); + this.doc = (int) docBuffer[next]; + docBufferUpto = next + 1; + return doc; + } + + @Override + public int nextPosition() throws IOException { + return -1; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + @Override + public long cost() { + return docFreq; + } + } + + final class BlockImpactsPostingsEnum extends ImpactsEnum { + + final PForUtil pforUtil = new PForUtil(new ForUtil()); + + private final long[] docBuffer = new long[BLOCK_SIZE]; + private final long[] freqBuffer = new long[BLOCK_SIZE]; + private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; + + private int docBufferUpto; + private int posBufferUpto; + + private final Lucene90ScoreSkipReader skipper; + + final IndexInput docIn; + final IndexInput posIn; + + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int doc; // doc we last read + private long accum; // accumulator for doc deltas + private int freq; // freq we last read + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + private int nextSkipDoc = -1; + + private long seekTo = -1; + + public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) + throws IOException { + indexHasOffsets = + fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + + this.docIn = Lucene90PostingsReader.this.docIn.clone(); + + this.posIn = Lucene90PostingsReader.this.posIn.clone(); + + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + totalTermFreq = termState.totalTermFreq; + docIn.seek(docTermStartFP); + posPendingFP = posTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + docBufferUpto = BLOCK_SIZE; + + skipper = + new Lucene90ScoreSkipReader( + docIn.clone(), MAX_SKIP_LEVELS, true, indexHasOffsets, indexHasPayloads); + skipper.init( + docTermStartFP + termState.skipOffset, + docTermStartFP, + posTermStartFP, + payTermStartFP, + docFreq); + } + + @Override + public int freq() throws IOException { + return freq; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + final int left = docFreq - docUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer); + pforUtil.decode(docIn, freqBuffer); + } else { + readVIntBlock(docIn, docBuffer, freqBuffer, left, true); + prefixSum(docBuffer, left, accum); + docBuffer[left] = NO_MORE_DOCS; + } + accum = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + posIn.seek(posIn.getFilePointer() + payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + if (indexHasOffsets) { + if ((posIn.readVInt() & 1) != 0) { + // offset length changed + posIn.readVInt(); + } + } + } + } else { + pforUtil.decode(posIn, posDeltaBuffer); + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > nextSkipDoc) { + // always plus one to fix the result, since skip position in Lucene90SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + posPendingFP = skipper.getPosPointer(); + posPendingCount = skipper.getPosBufferUpto(); + seekTo = skipper.getDocPointer(); // delay the seek + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + assert nextSkipDoc >= target; + } + + @Override + public Impacts getImpacts() throws IOException { + advanceShallow(doc); + return skipper.getImpacts(); + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target > nextSkipDoc) { + advanceShallow(target); + } + if (docBufferUpto == BLOCK_SIZE) { + if (seekTo >= 0) { + docIn.seek(seekTo); + seekTo = -1; + } + refillDocs(); + } + + int next = findFirstGreater(docBuffer, target, docBufferUpto); + if (next == BLOCK_SIZE) { + return doc = NO_MORE_DOCS; + } + this.doc = (int) docBuffer[next]; + this.freq = (int) freqBuffer[next]; + for (int i = docBufferUpto; i <= next; ++i) { + posPendingCount += freqBuffer[i]; + } + docUpto += next - docBufferUpto + 1; + docBufferUpto = next + 1; + position = 0; + return doc; + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + posBufferUpto += toSkip; + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + pforUtil.skip(posIn); + toSkip -= BLOCK_SIZE; + } + refillPositions(); + posBufferUpto = toSkip; + } + + position = 0; + } + + @Override + public int nextPosition() throws IOException { + assert posPendingCount > 0; + + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freq) { + skipPositions(); + posPendingCount = freq; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto++]; + + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return -1; + } + + @Override + public int endOffset() { + return -1; + } + + @Override + public BytesRef getPayload() { + return null; + } + + @Override + public long cost() { + return docFreq; + } + } + + final class BlockImpactsEverythingEnum extends ImpactsEnum { + + final PForUtil pforUtil = new PForUtil(new ForUtil()); + + private final long[] docBuffer = new long[BLOCK_SIZE]; + private final long[] freqBuffer = new long[BLOCK_SIZE]; + private final long[] posDeltaBuffer = new long[BLOCK_SIZE]; + + private final long[] payloadLengthBuffer; + private final long[] offsetStartDeltaBuffer; + private final long[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset = -1; + private int endOffset = -1; + + private int docBufferUpto; + private int posBufferUpto; + + private final Lucene90ScoreSkipReader skipper; + + final IndexInput docIn; + final IndexInput posIn; + final IndexInput payIn; + final BytesRef payload; + + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // number of positions in this posting list + private int docUpto; // how many docs we've read + private int posDocUpTo; // for how many docs we've read positions, offsets, and payloads + private int doc; // doc we last read + private long accum; // accumulator for doc deltas + private int position; // current position + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // Lazy pos seek: if != -1 then we must seek to this FP + // before reading positions: + private long posPendingFP; + + // Lazy pay seek: if != -1 then we must seek to this FP + // before reading payloads/offsets: + private long payPendingFP; + + // Where this term's postings start in the .doc file: + private long docTermStartFP; + + // Where this term's postings start in the .pos file: + private long posTermStartFP; + + // Where this term's payloads/offsets start in the .pay + // file: + private long payTermStartFP; + + // File pointer where the last (vInt encoded) pos delta + // block is. We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + private int nextSkipDoc = -1; + + private final boolean needsPositions; + private final boolean needsOffsets; // true if we actually need offsets + private final boolean needsPayloads; // true if we actually need payloads + + private boolean + isFreqsRead; // shows if freqBuffer for the current doc block are read into freqBuffer + + private long seekTo = -1; + + public BlockImpactsEverythingEnum(FieldInfo fieldInfo, IntBlockTermState termState, int flags) + throws IOException { + indexHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = + fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = + fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + + needsPositions = PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS); + needsOffsets = PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + needsPayloads = PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + + this.docIn = Lucene90PostingsReader.this.docIn.clone(); + + if (indexHasPos && needsPositions) { + this.posIn = Lucene90PostingsReader.this.posIn.clone(); + } else { + this.posIn = null; + } + + if ((indexHasOffsets && needsOffsets) || (indexHasPayloads && needsPayloads)) { + this.payIn = Lucene90PostingsReader.this.payIn.clone(); + } else { + this.payIn = null; + } + + if (indexHasOffsets) { + offsetStartDeltaBuffer = new long[BLOCK_SIZE]; + offsetLengthBuffer = new long[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new long[BLOCK_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + + docFreq = termState.docFreq; + docTermStartFP = termState.docStartFP; + posTermStartFP = termState.posStartFP; + payTermStartFP = termState.payStartFP; + totalTermFreq = termState.totalTermFreq; + docIn.seek(docTermStartFP); + posPendingFP = posTermStartFP; + payPendingFP = payTermStartFP; + posPendingCount = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + doc = -1; + accum = 0; + docUpto = 0; + posDocUpTo = 0; + isFreqsRead = true; + docBufferUpto = BLOCK_SIZE; + + skipper = + new Lucene90ScoreSkipReader( + docIn.clone(), MAX_SKIP_LEVELS, indexHasPos, indexHasOffsets, indexHasPayloads); + skipper.init( + docTermStartFP + termState.skipOffset, + docTermStartFP, + posTermStartFP, + payTermStartFP, + docFreq); + + if (indexHasFreq == false) { + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + freqBuffer[i] = 1; + } + } + } + + @Override + public int freq() throws IOException { + if (indexHasFreq && (isFreqsRead == false)) { + pforUtil.decode(docIn, freqBuffer); // read freqBuffer for this block + isFreqsRead = true; + } + return (int) freqBuffer[docBufferUpto - 1]; + } + + @Override + public int docID() { + return doc; + } + + private void refillDocs() throws IOException { + if (indexHasFreq) { + if (isFreqsRead == false) { // previous freq block was not read + // check if we need to load the previous freq block to catch up on positions or we can + // skip it + if (indexHasPos && needsPositions && (posDocUpTo < docUpto)) { + pforUtil.decode(docIn, freqBuffer); // load the previous freq block + } else { + pforUtil.skip(docIn); // skip it + } + isFreqsRead = true; + } + if (indexHasPos && needsPositions) { + while (posDocUpTo + < docUpto) { // catch on positions, bring posPendingCount upto the current doc + posPendingCount += freqBuffer[docBufferUpto - (docUpto - posDocUpTo)]; + posDocUpTo++; + } + } + } + + final int left = docFreq - docUpto; + assert left >= 0; + + if (left >= BLOCK_SIZE) { + pforUtil.decodeAndPrefixSum(docIn, accum, docBuffer); + if (indexHasFreq) { + isFreqsRead = + false; // freq block will be loaded lazily when necessary, we don't load it here + } + } else { + readVIntBlock(docIn, docBuffer, freqBuffer, left, indexHasFreq); + prefixSum(docBuffer, left, accum); + docBuffer[left] = NO_MORE_DOCS; + } + accum = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + payloadByteUpto = 0; + } else { + pforUtil.decode(posIn, posDeltaBuffer); + + if (indexHasPayloads && payIn != null) { + if (needsPayloads) { + pforUtil.decode(payIn, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + pforUtil.skip(payIn); // skip over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets && payIn != null) { + if (needsOffsets) { + pforUtil.decode(payIn, offsetStartDeltaBuffer); + pforUtil.decode(payIn, offsetLengthBuffer); + } else { + // this works, because when writing a vint block we always force the first length to be + // written + pforUtil.skip(payIn); // skip over starts + pforUtil.skip(payIn); // skip over lengths + } + } + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > nextSkipDoc) { + // always plus one to fix the result, since skip position in Lucene90SkipReader + // is a little different from MultiLevelSkipListReader + final int newDocUpto = skipper.skipTo(target) + 1; + + if (newDocUpto > docUpto) { + // Skipper moved + assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto; + docUpto = newDocUpto; + posDocUpTo = docUpto; + + // Force to read next block + docBufferUpto = BLOCK_SIZE; + accum = skipper.getDoc(); + posPendingFP = skipper.getPosPointer(); + payPendingFP = skipper.getPayPointer(); + posPendingCount = skipper.getPosBufferUpto(); + lastStartOffset = 0; // new document + payloadByteUpto = skipper.getPayloadByteUpto(); // actually, this is just lastSkipEntry + seekTo = skipper.getDocPointer(); // delay the seek + } + // next time we call advance, this is used to + // foresee whether skipper is necessary. + nextSkipDoc = skipper.getNextSkipDoc(); + } + assert nextSkipDoc >= target; + } + + @Override + public Impacts getImpacts() throws IOException { + advanceShallow(doc); + return skipper.getImpacts(); + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + if (target > nextSkipDoc) { + advanceShallow(target); + } + if (docBufferUpto == BLOCK_SIZE) { + if (seekTo >= 0) { + docIn.seek(seekTo); + seekTo = -1; + isFreqsRead = true; // reset isFreqsRead + } + refillDocs(); + } + + // Now scan: + long doc; + while (true) { + doc = docBuffer[docBufferUpto]; + docBufferUpto++; + docUpto++; + + if (doc >= target) { + break; + } + + if (docBufferUpto == BLOCK_SIZE) { + return this.doc = NO_MORE_DOCS; + } + } + position = 0; + lastStartOffset = 0; + + return this.doc = (int) doc; + } + + // TODO: in theory we could avoid loading frq block + // when not needed, ie, use skip data to load how far to + // seek the pos pointer ... instead of having to load frq + // blocks only to sum up how many positions to skip + private void skipPositions() throws IOException { + // Skip positions now: + int toSkip = posPendingCount - (int) freqBuffer[docBufferUpto - 1]; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + while (posBufferUpto < end) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + pforUtil.skip(posIn); + + if (indexHasPayloads && payIn != null) { + // Skip payloadLength block: + pforUtil.skip(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets && payIn != null) { + pforUtil.skip(payIn); + pforUtil.skip(payIn); + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + payloadByteUpto = 0; + posBufferUpto = 0; + while (posBufferUpto < toSkip) { + if (indexHasPayloads) { + payloadByteUpto += payloadLengthBuffer[posBufferUpto]; + } + posBufferUpto++; + } + } + + position = 0; + lastStartOffset = 0; + } + + @Override + public int nextPosition() throws IOException { + if (indexHasPos == false || needsPositions == false) { + return -1; + } + + if (isFreqsRead == false) { + pforUtil.decode(docIn, freqBuffer); // read freqBuffer for this docs block + isFreqsRead = true; + } + while (posDocUpTo < docUpto) { // bring posPendingCount upto the current doc + posPendingCount += freqBuffer[docBufferUpto - (docUpto - posDocUpTo)]; + posDocUpTo++; + } + + assert posPendingCount > 0; + + if (posPendingFP != -1) { + posIn.seek(posPendingFP); + posPendingFP = -1; + + if (payPendingFP != -1 && payIn != null) { + payIn.seek(payPendingFP); + payPendingFP = -1; + } + + // Force buffer refill: + posBufferUpto = BLOCK_SIZE; + } + + if (posPendingCount > freqBuffer[docBufferUpto - 1]) { + skipPositions(); + posPendingCount = (int) freqBuffer[docBufferUpto - 1]; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (indexHasPayloads) { + payloadLength = (int) payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (indexHasOffsets && needsOffsets) { + startOffset = lastStartOffset + (int) offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + (int) offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + return startOffset; + } + + @Override + public int endOffset() { + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long cost() { + return docFreq; + } + } + + @Override + public void checkIntegrity() throws IOException { + if (docIn != null) { + CodecUtil.checksumEntireFile(docIn); + } + if (posIn != null) { + CodecUtil.checksumEntireFile(posIn); + } + if (payIn != null) { + CodecUtil.checksumEntireFile(payIn); + } + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(positions=" + + (posIn != null) + + ",payloads=" + + (payIn != null) + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsWriter.java new file mode 100644 index 000000000000..f640d8d2c573 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PostingsWriter.java @@ -0,0 +1,536 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import static org.apache.lucene.codecs.lucene90.ForUtil.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.MAX_SKIP_LEVELS; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.VERSION_CURRENT; + +import java.io.IOException; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PushPostingsWriterBase; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +/** + * Concrete class that writes docId(maybe frq,pos,offset,payloads) list with postings format. + * + *

Postings list for each term will be stored separately. + * + * @see Lucene90SkipWriter for details about skipping setting and postings layout. + * @lucene.experimental + */ +public final class Lucene90PostingsWriter extends PushPostingsWriterBase { + + IndexOutput docOut; + IndexOutput posOut; + IndexOutput payOut; + + static final IntBlockTermState emptyState = new IntBlockTermState(); + IntBlockTermState lastState; + + // Holds starting file pointers for current term: + private long docStartFP; + private long posStartFP; + private long payStartFP; + + final long[] docDeltaBuffer; + final long[] freqBuffer; + private int docBufferUpto; + + final long[] posDeltaBuffer; + final long[] payloadLengthBuffer; + final long[] offsetStartDeltaBuffer; + final long[] offsetLengthBuffer; + private int posBufferUpto; + + private byte[] payloadBytes; + private int payloadByteUpto; + + private int lastBlockDocID; + private long lastBlockPosFP; + private long lastBlockPayFP; + private int lastBlockPosBufferUpto; + private int lastBlockPayloadByteUpto; + + private int lastDocID; + private int lastPosition; + private int lastStartOffset; + private int docCount; + + private final PForUtil pforUtil; + private final Lucene90SkipWriter skipWriter; + + private boolean fieldHasNorms; + private NumericDocValues norms; + private final CompetitiveImpactAccumulator competitiveFreqNormAccumulator = + new CompetitiveImpactAccumulator(); + + /** Creates a postings writer */ + public Lucene90PostingsWriter(SegmentWriteState state) throws IOException { + + String docFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene90PostingsFormat.DOC_EXTENSION); + docOut = state.directory.createOutput(docFileName, state.context); + IndexOutput posOut = null; + IndexOutput payOut = null; + boolean success = false; + try { + CodecUtil.writeIndexHeader( + docOut, DOC_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + pforUtil = new PForUtil(new ForUtil()); + if (state.fieldInfos.hasProx()) { + posDeltaBuffer = new long[BLOCK_SIZE]; + String posFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene90PostingsFormat.POS_EXTENSION); + posOut = state.directory.createOutput(posFileName, state.context); + CodecUtil.writeIndexHeader( + posOut, POS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + + if (state.fieldInfos.hasPayloads()) { + payloadBytes = new byte[128]; + payloadLengthBuffer = new long[BLOCK_SIZE]; + } else { + payloadBytes = null; + payloadLengthBuffer = null; + } + + if (state.fieldInfos.hasOffsets()) { + offsetStartDeltaBuffer = new long[BLOCK_SIZE]; + offsetLengthBuffer = new long[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + } + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene90PostingsFormat.PAY_EXTENSION); + payOut = state.directory.createOutput(payFileName, state.context); + CodecUtil.writeIndexHeader( + payOut, PAY_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + } + } else { + posDeltaBuffer = null; + payloadLengthBuffer = null; + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + payloadBytes = null; + } + this.payOut = payOut; + this.posOut = posOut; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docOut, posOut, payOut); + } + } + + docDeltaBuffer = new long[BLOCK_SIZE]; + freqBuffer = new long[BLOCK_SIZE]; + + // TODO: should we try skipping every 2/4 blocks...? + skipWriter = + new Lucene90SkipWriter( + MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut); + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + CodecUtil.writeIndexHeader( + termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + termsOut.writeVInt(BLOCK_SIZE); + } + + @Override + public void setField(FieldInfo fieldInfo) { + super.setField(fieldInfo); + skipWriter.setField(writePositions, writeOffsets, writePayloads); + lastState = emptyState; + fieldHasNorms = fieldInfo.hasNorms(); + } + + @Override + public void startTerm(NumericDocValues norms) { + docStartFP = docOut.getFilePointer(); + if (writePositions) { + posStartFP = posOut.getFilePointer(); + if (writePayloads || writeOffsets) { + payStartFP = payOut.getFilePointer(); + } + } + lastDocID = 0; + lastBlockDocID = -1; + skipWriter.resetSkip(); + this.norms = norms; + competitiveFreqNormAccumulator.clear(); + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + // Have collected a block of docs, and get a new doc. + // Should write skip data as well as postings list for + // current block. + if (lastBlockDocID != -1 && docBufferUpto == 0) { + skipWriter.bufferSkip( + lastBlockDocID, + competitiveFreqNormAccumulator, + docCount, + lastBlockPosFP, + lastBlockPayFP, + lastBlockPosBufferUpto, + lastBlockPayloadByteUpto); + competitiveFreqNormAccumulator.clear(); + } + + final int docDelta = docID - lastDocID; + + if (docID < 0 || (docCount > 0 && docDelta <= 0)) { + throw new CorruptIndexException( + "docs out of order (" + docID + " <= " + lastDocID + " )", docOut); + } + + docDeltaBuffer[docBufferUpto] = docDelta; + if (writeFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + docBufferUpto++; + docCount++; + + if (docBufferUpto == BLOCK_SIZE) { + pforUtil.encode(docDeltaBuffer, docOut); + if (writeFreqs) { + pforUtil.encode(freqBuffer, docOut); + } + // NOTE: don't set docBufferUpto back to 0 here; + // finishDoc will do so (because it needs to see that + // the block was filled so it can save skip data) + } + + lastDocID = docID; + lastPosition = 0; + lastStartOffset = 0; + + long norm; + if (fieldHasNorms) { + boolean found = norms.advanceExact(docID); + if (found == false) { + // This can happen if indexing hits a problem after adding a doc to the + // postings but before buffering the norm. Such documents are written + // deleted and will go away on the first merge. + norm = 1L; + } else { + norm = norms.longValue(); + assert norm != 0 : docID; + } + } else { + norm = 1L; + } + + competitiveFreqNormAccumulator.add(writeFreqs ? termDocFreq : 1, norm); + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) + throws IOException { + if (position > IndexWriter.MAX_POSITION) { + throw new CorruptIndexException( + "position=" + + position + + " is too large (> IndexWriter.MAX_POSITION=" + + IndexWriter.MAX_POSITION + + ")", + docOut); + } + if (position < 0) { + throw new CorruptIndexException("position=" + position + " is < 0", docOut); + } + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (writePayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy( + payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (writeOffsets) { + assert startOffset >= lastStartOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastStartOffset = startOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == BLOCK_SIZE) { + pforUtil.encode(posDeltaBuffer, posOut); + + if (writePayloads) { + pforUtil.encode(payloadLengthBuffer, payOut); + payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (writeOffsets) { + pforUtil.encode(offsetStartDeltaBuffer, payOut); + pforUtil.encode(offsetLengthBuffer, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() throws IOException { + // Since we don't know df for current term, we had to buffer + // those skip data for each block, and when a new doc comes, + // write them to skip file. + if (docBufferUpto == BLOCK_SIZE) { + lastBlockDocID = lastDocID; + if (posOut != null) { + if (payOut != null) { + lastBlockPayFP = payOut.getFilePointer(); + } + lastBlockPosFP = posOut.getFilePointer(); + lastBlockPosBufferUpto = posBufferUpto; + lastBlockPayloadByteUpto = payloadByteUpto; + } + docBufferUpto = 0; + } + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(BlockTermState _state) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + assert state.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? + assert state.docFreq == docCount : state.docFreq + " vs " + docCount; + + // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to + // it. + final int singletonDocID; + if (state.docFreq == 1) { + // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq + singletonDocID = (int) docDeltaBuffer[0]; + } else { + singletonDocID = -1; + // vInt encode the remaining doc deltas and freqs: + for (int i = 0; i < docBufferUpto; i++) { + final int docDelta = (int) docDeltaBuffer[i]; + final int freq = (int) freqBuffer[i]; + if (!writeFreqs) { + docOut.writeVInt(docDelta); + } else if (freq == 1) { + docOut.writeVInt((docDelta << 1) | 1); + } else { + docOut.writeVInt(docDelta << 1); + docOut.writeVInt(freq); + } + } + } + + final long lastPosBlockOffset; + + if (writePositions) { + // totalTermFreq is just total number of positions(or payloads, or offsets) + // associated with current term. + assert state.totalTermFreq != -1; + if (state.totalTermFreq > BLOCK_SIZE) { + // record file offset for last pos in last block + lastPosBlockOffset = posOut.getFilePointer() - posStartFP; + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + // TODO: should we send offsets/payloads to + // .pay...? seems wasteful (have to store extra + // vLong for low (< BLOCK_SIZE) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; // force first payload length to be written + int lastOffsetLength = -1; // force first offset length to be written + int payloadBytesReadUpto = 0; + for (int i = 0; i < posBufferUpto; i++) { + final int posDelta = (int) posDeltaBuffer[i]; + if (writePayloads) { + final int payloadLength = (int) payloadLengthBuffer[i]; + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + posOut.writeVInt((posDelta << 1) | 1); + posOut.writeVInt(payloadLength); + } else { + posOut.writeVInt(posDelta << 1); + } + + if (payloadLength != 0) { + posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength); + payloadBytesReadUpto += payloadLength; + } + } else { + posOut.writeVInt(posDelta); + } + + if (writeOffsets) { + int delta = (int) offsetStartDeltaBuffer[i]; + int length = (int) offsetLengthBuffer[i]; + if (length == lastOffsetLength) { + posOut.writeVInt(delta << 1); + } else { + posOut.writeVInt(delta << 1 | 1); + posOut.writeVInt(length); + lastOffsetLength = length; + } + } + } + + if (writePayloads) { + assert payloadBytesReadUpto == payloadByteUpto; + payloadByteUpto = 0; + } + } + } else { + lastPosBlockOffset = -1; + } + + long skipOffset; + if (docCount > BLOCK_SIZE) { + skipOffset = skipWriter.writeSkip(docOut) - docStartFP; + } else { + skipOffset = -1; + } + + state.docStartFP = docStartFP; + state.posStartFP = posStartFP; + state.payStartFP = payStartFP; + state.singletonDocID = singletonDocID; + state.skipOffset = skipOffset; + state.lastPosBlockOffset = lastPosBlockOffset; + docBufferUpto = 0; + posBufferUpto = 0; + lastDocID = 0; + docCount = 0; + } + + @Override + public void encodeTerm( + DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) + throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + if (absolute) { + lastState = emptyState; + assert lastState.docStartFP == 0; + } + + if (lastState.singletonDocID != -1 + && state.singletonDocID != -1 + && state.docStartFP == lastState.docStartFP) { + // With runs of rare values such as ID fields, the increment of pointers in the docs file is + // often 0. + // Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we + // encode the delta + // between consecutive doc IDs to save space. + final long delta = (long) state.singletonDocID - lastState.singletonDocID; + out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01); + } else { + out.writeVLong((state.docStartFP - lastState.docStartFP) << 1); + if (state.singletonDocID != -1) { + out.writeVInt(state.singletonDocID); + } + } + + if (writePositions) { + out.writeVLong(state.posStartFP - lastState.posStartFP); + if (writePayloads || writeOffsets) { + out.writeVLong(state.payStartFP - lastState.payStartFP); + } + } + if (writePositions) { + if (state.lastPosBlockOffset != -1) { + out.writeVLong(state.lastPosBlockOffset); + } + } + if (state.skipOffset != -1) { + out.writeVLong(state.skipOffset); + } + lastState = state; + } + + @Override + public void close() throws IOException { + // TODO: add a finish() at least to PushBase? DV too...? + boolean success = false; + try { + if (docOut != null) { + CodecUtil.writeFooter(docOut); + } + if (posOut != null) { + CodecUtil.writeFooter(posOut); + } + if (payOut != null) { + CodecUtil.writeFooter(payOut); + } + success = true; + } finally { + if (success) { + IOUtils.close(docOut, posOut, payOut); + } else { + IOUtils.closeWhileHandlingException(docOut, posOut, payOut); + } + docOut = posOut = payOut = null; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90ScoreSkipReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90ScoreSkipReader.java new file mode 100644 index 000000000000..44789a983344 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90ScoreSkipReader.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.List; +import java.util.RandomAccess; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; + +final class Lucene90ScoreSkipReader extends Lucene90SkipReader { + + private final byte[][] impactData; + private final int[] impactDataLength; + private final ByteArrayDataInput badi = new ByteArrayDataInput(); + private final Impacts impacts; + private int numLevels = 1; + private final MutableImpactList[] perLevelImpacts; + + public Lucene90ScoreSkipReader( + IndexInput skipStream, + int maxSkipLevels, + boolean hasPos, + boolean hasOffsets, + boolean hasPayloads) { + super(skipStream, maxSkipLevels, hasPos, hasOffsets, hasPayloads); + this.impactData = new byte[maxSkipLevels][]; + Arrays.fill(impactData, new byte[0]); + this.impactDataLength = new int[maxSkipLevels]; + this.perLevelImpacts = new MutableImpactList[maxSkipLevels]; + for (int i = 0; i < perLevelImpacts.length; ++i) { + perLevelImpacts[i] = new MutableImpactList(); + } + impacts = + new Impacts() { + + @Override + public int numLevels() { + return numLevels; + } + + @Override + public int getDocIdUpTo(int level) { + return skipDoc[level]; + } + + @Override + public List getImpacts(int level) { + assert level < numLevels; + if (impactDataLength[level] > 0) { + badi.reset(impactData[level], 0, impactDataLength[level]); + perLevelImpacts[level] = readImpacts(badi, perLevelImpacts[level]); + impactDataLength[level] = 0; + } + return perLevelImpacts[level]; + } + }; + } + + @Override + public int skipTo(int target) throws IOException { + int result = super.skipTo(target); + if (numberOfSkipLevels > 0) { + numLevels = numberOfSkipLevels; + } else { + // End of postings don't have skip data anymore, so we fill with dummy data + // like SlowImpactsEnum. + numLevels = 1; + perLevelImpacts[0].length = 1; + perLevelImpacts[0].impacts[0].freq = Integer.MAX_VALUE; + perLevelImpacts[0].impacts[0].norm = 1L; + impactDataLength[0] = 0; + } + return result; + } + + Impacts getImpacts() { + return impacts; + } + + @Override + protected void readImpacts(int level, IndexInput skipStream) throws IOException { + int length = skipStream.readVInt(); + if (impactData[level].length < length) { + impactData[level] = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; + } + skipStream.readBytes(impactData[level], 0, length); + impactDataLength[level] = length; + } + + static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList reuse) { + int maxNumImpacts = in.length(); // at most one impact per byte + if (reuse.impacts.length < maxNumImpacts) { + int oldLength = reuse.impacts.length; + reuse.impacts = ArrayUtil.grow(reuse.impacts, maxNumImpacts); + for (int i = oldLength; i < reuse.impacts.length; ++i) { + reuse.impacts[i] = new Impact(Integer.MAX_VALUE, 1L); + } + } + + int freq = 0; + long norm = 0; + int length = 0; + while (in.getPosition() < in.length()) { + int freqDelta = in.readVInt(); + if ((freqDelta & 0x01) != 0) { + freq += 1 + (freqDelta >>> 1); + try { + norm += 1 + in.readZLong(); + } catch (IOException e) { + throw new RuntimeException(e); // cannot happen on a BADI + } + } else { + freq += 1 + (freqDelta >>> 1); + norm++; + } + Impact impact = reuse.impacts[length]; + impact.freq = freq; + impact.norm = norm; + length++; + } + reuse.length = length; + return reuse; + } + + static class MutableImpactList extends AbstractList implements RandomAccess { + int length = 1; + Impact[] impacts = new Impact[] {new Impact(Integer.MAX_VALUE, 1L)}; + + @Override + public Impact get(int index) { + return impacts[index]; + } + + @Override + public int size() { + return length; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java new file mode 100644 index 000000000000..fc102e0a608f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SegmentInfoFormat.java @@ -0,0 +1,242 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexSorter; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.index.SortFieldProvider; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Version; + +/** + * Lucene 9.0 Segment info format. + * + *

Files: + * + *

    + *
  • .si: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files, + * Attributes, IndexSort, Footer + *
+ * + * Data types: + * + *
    + *
  • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
  • SegSize --> {@link DataOutput#writeInt Int32} + *
  • SegVersion --> {@link DataOutput#writeString String} + *
  • SegMinVersion --> {@link DataOutput#writeString String} + *
  • Files --> {@link DataOutput#writeSetOfStrings Set<String>} + *
  • Diagnostics,Attributes --> {@link DataOutput#writeMapOfStrings Map<String,String>} + *
  • IsCompoundFile --> {@link DataOutput#writeByte Int8} + *
  • IndexSort --> {@link DataOutput#writeVInt Int32} count, followed by {@code count} + * SortField + *
  • SortField --> {@link DataOutput#writeString String} sort class, followed by a per-sort + * bytestream (see {@link SortFieldProvider#readSortField(DataInput)}) + *
  • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
+ * + * Field Descriptions: + * + *
    + *
  • SegVersion is the code version that created the segment. + *
  • SegMinVersion is the minimum code version that contributed documents to the segment. + *
  • SegSize is the number of documents contained in the segment index. + *
  • IsCompoundFile records whether the segment is written as a compound file or not. If this is + * -1, the segment is not a compound file. If it is 1, the segment is a compound file. + *
  • The Diagnostics Map is privately written by {@link IndexWriter}, as a debugging aid, for + * each segment it creates. It includes metadata like the current Lucene version, OS, Java + * version, why the segment was created (merge, flush, addIndexes), etc. + *
  • Files is a list of files referred to by this segment. + *
+ * + * @see SegmentInfos + * @lucene.experimental + */ +public class Lucene90SegmentInfoFormat extends SegmentInfoFormat { + + /** File extension used to store {@link SegmentInfo}. */ + public static final String SI_EXTENSION = "si"; + + static final String CODEC_NAME = "Lucene90SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor. */ + public Lucene90SegmentInfoFormat() {} + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) + throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName, context)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + CodecUtil.checkIndexHeader( + input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, ""); + si = parseSegmentInfo(dir, input, segment, segmentID); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return si; + } + } + + private SegmentInfo parseSegmentInfo( + Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException { + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + byte hasMinVersion = input.readByte(); + final Version minVersion; + switch (hasMinVersion) { + case 0: + minVersion = null; + break; + case 1: + minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + break; + default: + throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input); + } + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; + + final Map diagnostics = input.readMapOfStrings(); + final Set files = input.readSetOfStrings(); + final Map attributes = input.readMapOfStrings(); + + int numSortFields = input.readVInt(); + Sort indexSort; + if (numSortFields > 0) { + SortField[] sortFields = new SortField[numSortFields]; + for (int i = 0; i < numSortFields; i++) { + String name = input.readString(); + sortFields[i] = SortFieldProvider.forName(name).readSortField(input); + } + indexSort = new Sort(sortFields); + } else if (numSortFields < 0) { + throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); + } else { + indexSort = null; + } + + SegmentInfo si = + new SegmentInfo( + dir, + version, + minVersion, + segment, + docCount, + isCompoundFile, + false, + null, + diagnostics, + segmentID, + attributes, + indexSort); + si.setFiles(files); + return si; + } + + @Override + public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { + final String fileName = IndexFileNames.segmentFileName(si.name, "", SI_EXTENSION); + + try (IndexOutput output = dir.createOutput(fileName, ioContext)) { + // Only add the file once we've successfully created it, else IFD assert can trip: + si.addFile(fileName); + CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, si.getId(), ""); + + writeSegmentInfo(output, si); + + CodecUtil.writeFooter(output); + } + } + + private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOException { + Version version = si.getVersion(); + if (version.major < 7) { + throw new IllegalArgumentException( + "invalid major version: should be >= 7 but got: " + version.major + " segment=" + si); + } + // Write the Lucene version that created this segment, since 3.1 + output.writeInt(version.major); + output.writeInt(version.minor); + output.writeInt(version.bugfix); + + // Write the min Lucene version that contributed docs to the segment, since 7.0 + if (si.getMinVersion() != null) { + output.writeByte((byte) 1); + Version minVersion = si.getMinVersion(); + output.writeInt(minVersion.major); + output.writeInt(minVersion.minor); + output.writeInt(minVersion.bugfix); + } else { + output.writeByte((byte) 0); + } + + assert version.prerelease == 0; + output.writeInt(si.maxDoc()); + + output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO)); + output.writeMapOfStrings(si.getDiagnostics()); + Set files = si.files(); + for (String file : files) { + if (!IndexFileNames.parseSegmentName(file).equals(si.name)) { + throw new IllegalArgumentException( + "invalid files: expected segment=" + si.name + ", got=" + files); + } + } + output.writeSetOfStrings(files); + output.writeMapOfStrings(si.getAttributes()); + + Sort indexSort = si.getIndexSort(); + int numSortFields = indexSort == null ? 0 : indexSort.getSort().length; + output.writeVInt(numSortFields); + for (int i = 0; i < numSortFields; ++i) { + SortField sortField = indexSort.getSort()[i]; + IndexSorter sorter = sortField.getIndexSorter(); + if (sorter == null) { + throw new IllegalArgumentException("cannot serialize SortField " + sortField); + } + output.writeString(sorter.getProviderName()); + SortFieldProvider.write(sortField, output); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipReader.java new file mode 100644 index 000000000000..da31bd75a80a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipReader.java @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.MultiLevelSkipListReader; +import org.apache.lucene.store.IndexInput; + +/** + * Implements the skip list reader for block postings format that stores positions and payloads. + * + *

Although this skipper uses MultiLevelSkipListReader as an interface, its definition of skip + * position will be a little different. + * + *

For example, when skipInterval = blockSize = 3, df = 2*skipInterval = 6, + * + *

+ * 0 1 2 3 4 5
+ * d d d d d d    (posting list)
+ *     ^     ^    (skip point in MultiLeveSkipWriter)
+ *       ^        (skip point in Lucene90SkipWriter)
+ * 
+ * + *

In this case, MultiLevelSkipListReader will use the last document as a skip point, while + * Lucene90SkipReader should assume no skip point will comes. + * + *

If we use the interface directly in Lucene90SkipReader, it may silly try to read another skip + * data after the only skip point is loaded. + * + *

To illustrate this, we can call skipTo(d[5]), since skip point d[3] has smaller docId, and + * numSkipped+blockSize== df, the MultiLevelSkipListReader will assume the skip list isn't exhausted + * yet, and try to load a non-existed skip point + * + *

Therefore, we'll trim df before passing it to the interface. see trim(int) + */ +class Lucene90SkipReader extends MultiLevelSkipListReader { + private long[] docPointer; + private long[] posPointer; + private long[] payPointer; + private int[] posBufferUpto; + private int[] payloadByteUpto; + + private long lastPosPointer; + private long lastPayPointer; + private int lastPayloadByteUpto; + private long lastDocPointer; + private int lastPosBufferUpto; + + public Lucene90SkipReader( + IndexInput skipStream, + int maxSkipLevels, + boolean hasPos, + boolean hasOffsets, + boolean hasPayloads) { + super(skipStream, maxSkipLevels, ForUtil.BLOCK_SIZE, 8); + docPointer = new long[maxSkipLevels]; + if (hasPos) { + posPointer = new long[maxSkipLevels]; + posBufferUpto = new int[maxSkipLevels]; + if (hasPayloads) { + payloadByteUpto = new int[maxSkipLevels]; + } else { + payloadByteUpto = null; + } + if (hasOffsets || hasPayloads) { + payPointer = new long[maxSkipLevels]; + } else { + payPointer = null; + } + } else { + posPointer = null; + } + } + + /** + * Trim original docFreq to tell skipReader read proper number of skip points. + * + *

Since our definition in Lucene90Skip* is a little different from MultiLevelSkip* This + * trimmed docFreq will prevent skipReader from: 1. silly reading a non-existed skip point after + * the last block boundary 2. moving into the vInt block + */ + protected int trim(int df) { + return df % ForUtil.BLOCK_SIZE == 0 ? df - 1 : df; + } + + public void init( + long skipPointer, long docBasePointer, long posBasePointer, long payBasePointer, int df) + throws IOException { + super.init(skipPointer, trim(df)); + lastDocPointer = docBasePointer; + lastPosPointer = posBasePointer; + lastPayPointer = payBasePointer; + + Arrays.fill(docPointer, docBasePointer); + if (posPointer != null) { + Arrays.fill(posPointer, posBasePointer); + if (payPointer != null) { + Arrays.fill(payPointer, payBasePointer); + } + } else { + assert posBasePointer == 0; + } + } + + /** + * Returns the doc pointer of the doc to which the last call of {@link + * MultiLevelSkipListReader#skipTo(int)} has skipped. + */ + public long getDocPointer() { + return lastDocPointer; + } + + public long getPosPointer() { + return lastPosPointer; + } + + public int getPosBufferUpto() { + return lastPosBufferUpto; + } + + public long getPayPointer() { + return lastPayPointer; + } + + public int getPayloadByteUpto() { + return lastPayloadByteUpto; + } + + public int getNextSkipDoc() { + return skipDoc[0]; + } + + @Override + protected void seekChild(int level) throws IOException { + super.seekChild(level); + docPointer[level] = lastDocPointer; + if (posPointer != null) { + posPointer[level] = lastPosPointer; + posBufferUpto[level] = lastPosBufferUpto; + if (payloadByteUpto != null) { + payloadByteUpto[level] = lastPayloadByteUpto; + } + if (payPointer != null) { + payPointer[level] = lastPayPointer; + } + } + } + + @Override + protected void setLastSkipData(int level) { + super.setLastSkipData(level); + lastDocPointer = docPointer[level]; + + if (posPointer != null) { + lastPosPointer = posPointer[level]; + lastPosBufferUpto = posBufferUpto[level]; + if (payPointer != null) { + lastPayPointer = payPointer[level]; + } + if (payloadByteUpto != null) { + lastPayloadByteUpto = payloadByteUpto[level]; + } + } + } + + @Override + protected int readSkipData(int level, IndexInput skipStream) throws IOException { + int delta = skipStream.readVInt(); + docPointer[level] += skipStream.readVLong(); + + if (posPointer != null) { + posPointer[level] += skipStream.readVLong(); + posBufferUpto[level] = skipStream.readVInt(); + + if (payloadByteUpto != null) { + payloadByteUpto[level] = skipStream.readVInt(); + } + + if (payPointer != null) { + payPointer[level] += skipStream.readVLong(); + } + } + readImpacts(level, skipStream); + return delta; + } + + // The default impl skips impacts + protected void readImpacts(int level, IndexInput skipStream) throws IOException { + skipStream.skipBytes(skipStream.readVInt()); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipWriter.java new file mode 100644 index 000000000000..48deafa42251 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90SkipWriter.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.MultiLevelSkipListWriter; +import org.apache.lucene.index.Impact; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; + +/** + * Write skip lists with multiple levels, and support skip within block ints. + * + *

Assume that docFreq = 28, skipInterval = blockSize = 12 + * + *

+ *  |       block#0       | |      block#1        | |vInts|
+ *  d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
+ *                          ^                       ^       (level 0 skip point)
+ * 
+ * + *

Note that skipWriter will ignore first document in block#0, since it is useless as a skip + * point. Also, we'll never skip into the vInts block, only record skip data at the start its start + * point(if it exist). + * + *

For each skip point, we will record: 1. docID in former position, i.e. for position 12, record + * docID[11], etc. 2. its related file points(position, payload), 3. related numbers or + * uptos(position, payload). 4. start offset. + */ +final class Lucene90SkipWriter extends MultiLevelSkipListWriter { + private int[] lastSkipDoc; + private long[] lastSkipDocPointer; + private long[] lastSkipPosPointer; + private long[] lastSkipPayPointer; + private int[] lastPayloadByteUpto; + + private final IndexOutput docOut; + private final IndexOutput posOut; + private final IndexOutput payOut; + + private int curDoc; + private long curDocPointer; + private long curPosPointer; + private long curPayPointer; + private int curPosBufferUpto; + private int curPayloadByteUpto; + private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms; + private boolean fieldHasPositions; + private boolean fieldHasOffsets; + private boolean fieldHasPayloads; + + public Lucene90SkipWriter( + int maxSkipLevels, + int blockSize, + int docCount, + IndexOutput docOut, + IndexOutput posOut, + IndexOutput payOut) { + super(blockSize, 8, maxSkipLevels, docCount); + this.docOut = docOut; + this.posOut = posOut; + this.payOut = payOut; + + lastSkipDoc = new int[maxSkipLevels]; + lastSkipDocPointer = new long[maxSkipLevels]; + if (posOut != null) { + lastSkipPosPointer = new long[maxSkipLevels]; + if (payOut != null) { + lastSkipPayPointer = new long[maxSkipLevels]; + } + lastPayloadByteUpto = new int[maxSkipLevels]; + } + curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels]; + for (int i = 0; i < maxSkipLevels; ++i) { + curCompetitiveFreqNorms[i] = new CompetitiveImpactAccumulator(); + } + } + + public void setField( + boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) { + this.fieldHasPositions = fieldHasPositions; + this.fieldHasOffsets = fieldHasOffsets; + this.fieldHasPayloads = fieldHasPayloads; + } + + // tricky: we only skip data for blocks (terms with more than 128 docs), but re-init'ing the + // skipper + // is pretty slow for rare terms in large segments as we have to fill O(log #docs in segment) of + // junk. + // this is the vast majority of terms (worst case: ID field or similar). so in resetSkip() we + // save + // away the previous pointers, and lazy-init only if we need to buffer skip data for the term. + private boolean initialized; + long lastDocFP; + long lastPosFP; + long lastPayFP; + + @Override + public void resetSkip() { + lastDocFP = docOut.getFilePointer(); + if (fieldHasPositions) { + lastPosFP = posOut.getFilePointer(); + if (fieldHasOffsets || fieldHasPayloads) { + lastPayFP = payOut.getFilePointer(); + } + } + if (initialized) { + for (CompetitiveImpactAccumulator acc : curCompetitiveFreqNorms) { + acc.clear(); + } + } + initialized = false; + } + + private void initSkip() { + if (!initialized) { + super.resetSkip(); + Arrays.fill(lastSkipDoc, 0); + Arrays.fill(lastSkipDocPointer, lastDocFP); + if (fieldHasPositions) { + Arrays.fill(lastSkipPosPointer, lastPosFP); + if (fieldHasPayloads) { + Arrays.fill(lastPayloadByteUpto, 0); + } + if (fieldHasOffsets || fieldHasPayloads) { + Arrays.fill(lastSkipPayPointer, lastPayFP); + } + } + // sets of competitive freq,norm pairs should be empty at this point + assert Arrays.stream(curCompetitiveFreqNorms) + .map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs) + .mapToInt(Collection::size) + .sum() + == 0; + initialized = true; + } + } + + /** Sets the values for the current skip data. */ + public void bufferSkip( + int doc, + CompetitiveImpactAccumulator competitiveFreqNorms, + int numDocs, + long posFP, + long payFP, + int posBufferUpto, + int payloadByteUpto) + throws IOException { + initSkip(); + this.curDoc = doc; + this.curDocPointer = docOut.getFilePointer(); + this.curPosPointer = posFP; + this.curPayPointer = payFP; + this.curPosBufferUpto = posBufferUpto; + this.curPayloadByteUpto = payloadByteUpto; + this.curCompetitiveFreqNorms[0].addAll(competitiveFreqNorms); + bufferSkip(numDocs); + } + + private final ByteBuffersDataOutput freqNormOut = ByteBuffersDataOutput.newResettableInstance(); + + @Override + protected void writeSkipData(int level, DataOutput skipBuffer) throws IOException { + + int delta = curDoc - lastSkipDoc[level]; + + skipBuffer.writeVInt(delta); + lastSkipDoc[level] = curDoc; + + skipBuffer.writeVLong(curDocPointer - lastSkipDocPointer[level]); + lastSkipDocPointer[level] = curDocPointer; + + if (fieldHasPositions) { + + skipBuffer.writeVLong(curPosPointer - lastSkipPosPointer[level]); + lastSkipPosPointer[level] = curPosPointer; + skipBuffer.writeVInt(curPosBufferUpto); + + if (fieldHasPayloads) { + skipBuffer.writeVInt(curPayloadByteUpto); + } + + if (fieldHasOffsets || fieldHasPayloads) { + skipBuffer.writeVLong(curPayPointer - lastSkipPayPointer[level]); + lastSkipPayPointer[level] = curPayPointer; + } + } + + CompetitiveImpactAccumulator competitiveFreqNorms = curCompetitiveFreqNorms[level]; + assert competitiveFreqNorms.getCompetitiveFreqNormPairs().size() > 0; + if (level + 1 < numberOfSkipLevels) { + curCompetitiveFreqNorms[level + 1].addAll(competitiveFreqNorms); + } + writeImpacts(competitiveFreqNorms, freqNormOut); + skipBuffer.writeVInt(Math.toIntExact(freqNormOut.size())); + freqNormOut.copyTo(skipBuffer); + freqNormOut.reset(); + competitiveFreqNorms.clear(); + } + + static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) throws IOException { + Collection impacts = acc.getCompetitiveFreqNormPairs(); + Impact previous = new Impact(0, 0); + for (Impact impact : impacts) { + assert impact.freq > previous.freq; + assert Long.compareUnsigned(impact.norm, previous.norm) > 0; + int freqDelta = impact.freq - previous.freq - 1; + long normDelta = impact.norm - previous.norm - 1; + if (normDelta == 0) { + // most of time, norm only increases by 1, so we can fold everything in a single byte + out.writeVInt(freqDelta << 1); + } else { + out.writeVInt((freqDelta << 1) | 1); + out.writeZLong(normDelta); + } + previous = impact; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java index 5646724546a4..edb886b31c17 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java @@ -37,21 +37,21 @@ * *

This {@link StoredFieldsFormat} compresses blocks of documents in order to improve the * compression ratio compared to document-level compression. It uses the LZ4 compression algorithm by default in 8KB blocks and - * shared dictionaries, which is fast to compress and very fast to decompress data. Although the - * default compression method that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on - * speed than on compression ratio, it should provide interesting compression ratios for redundant - * inputs (such as log files, HTML or plain text). For higher compression, you can choose ({@link + * href="http://code.google.com/p/lz4/">LZ4 compression algorithm by default in 16KB blocks, + * which is fast to compress and very fast to decompress data. Although the default compression + * method that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on + * compression ratio, it should provide interesting compression ratios for redundant inputs (such as + * log files, HTML or plain text). For higher compression, you can choose ({@link * Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses the DEFLATE algorithm with 48KB blocks and shared + * href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE algorithm with 48kB blocks and shared * dictionaries for a better ratio at the expense of slower performance. These two options can be * configured like this: * *

  *   // the default: for high performance
- *   indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_SPEED));
+ *   indexWriterConfig.setCodec(new Lucene87Codec(Mode.BEST_SPEED));
  *   // instead for higher performance (but slower):
- *   // indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_COMPRESSION));
+ *   // indexWriterConfig.setCodec(new Lucene87Codec(Mode.BEST_COMPRESSION));
  * 
* *

File formats @@ -61,9 +61,9 @@ *

    *
  1. *

    A fields data file (extension .fdt). This file stores a compact - * representation of documents in compressed blocks of 8KB or more. When writing a segment, + * representation of documents in compressed blocks of 16KB or more. When writing a segment, * documents are appended to an in-memory byte[] buffer. When its size reaches - * 80KB or more, some metadata about the documents is flushed to disk, immediately followed by + * 16KB or more, some metadata about the documents is flushed to disk, immediately followed by * a compressed representation of the buffer using the LZ4 compression @@ -71,10 +71,10 @@ *

    Notes *

    * * @@ -134,14 +132,13 @@ public FieldInfos read( Throwable priorE = null; FieldInfo[] infos = null; try { - int format = - CodecUtil.checkIndexHeader( - input, - Lucene94FieldInfosFormat.CODEC_NAME, - Lucene94FieldInfosFormat.FORMAT_START, - Lucene94FieldInfosFormat.FORMAT_CURRENT, - segmentInfo.getId(), - segmentSuffix); + CodecUtil.checkIndexHeader( + input, + Lucene94FieldInfosFormat.CODEC_NAME, + Lucene94FieldInfosFormat.FORMAT_START, + Lucene94FieldInfosFormat.FORMAT_CURRENT, + segmentInfo.getId(), + segmentSuffix); final int size = input.readVInt(); // read in the size infos = new FieldInfo[size]; @@ -161,18 +158,6 @@ public FieldInfos read( boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; - boolean isParentField = - format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; - - if ((bits & 0xE0) != 0) { - throw new CorruptIndexException( - "unused bits are set \"" + Integer.toBinaryString(bits) + "\"", input); - } - if (format < FORMAT_PARENT_FIELD && (bits & 0xF0) != 0) { - throw new CorruptIndexException( - "parent field bit is set but shouldn't \"" + Integer.toBinaryString(bits) + "\"", - input); - } final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -217,7 +202,7 @@ public FieldInfos read( vectorEncoding, vectorDistFunc, isSoftDeletesField, - isParentField); + false); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException( @@ -393,7 +378,6 @@ public void write( if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; - if (fi.isParentField()) bits |= PARENT_FIELD_FIELD; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -409,7 +393,7 @@ public void write( } output.writeVInt(fi.getVectorDimension()); output.writeByte((byte) fi.getVectorEncoding().ordinal()); - output.writeByte(distFuncToOrd(fi.getVectorSimilarityFunction())); + output.writeByte((byte) fi.getVectorSimilarityFunction().ordinal()); } CodecUtil.writeFooter(output); } @@ -421,14 +405,11 @@ public void write( // Codec header static final String CODEC_NAME = "Lucene94FieldInfos"; static final int FORMAT_START = 0; - // this doesn't actually change the file format but uses up one more bit an existing bit pattern - static final int FORMAT_PARENT_FIELD = 1; - static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD; + static final int FORMAT_CURRENT = FORMAT_START; // Field flags static final byte STORE_TERMVECTOR = 0x1; static final byte OMIT_NORMS = 0x2; static final byte STORE_PAYLOADS = 0x4; static final byte SOFT_DELETES_FIELD = 0x8; - static final byte PARENT_FIELD_FIELD = 0x10; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java new file mode 100644 index 000000000000..140d698a6f79 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95Codec.java @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene95; + +import java.util.Objects; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90SegmentInfoFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 9.5 index format + * + *

    If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene95 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene95Codec extends Codec { + + /** Configuration option for the codec. */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene90SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene95Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene95Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene95Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene95Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. + */ + public Lucene95Codec(Mode mode) { + super("Lucene95"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene90PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); // FS: not used + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

    The default implementation always returns "Lucene90". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation, + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + * + *

    The default implementation always returns "Lucene90". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

    The default implementation always returns "Lucene95". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} From 5df7295958f7fd5eb0f91f98a6e67d9b72d6dde2 Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Fri, 11 Oct 2024 14:46:46 -0400 Subject: [PATCH 2/7] revert changes to Lucene90DocValues (???) --- .../lucene90/Lucene90DocValuesConsumer.java | 18 +++++----- .../lucene90/Lucene90DocValuesProducer.java | 35 +++++++++++-------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java index c3d170447ae6..d64dce985c01 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java @@ -22,10 +22,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesProducer; @@ -40,6 +36,8 @@ import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.internal.hppc.LongHashSet; +import org.apache.lucene.internal.hppc.LongIntHashMap; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.SortedSetSelector; import org.apache.lucene.store.ByteArrayDataOutput; @@ -199,7 +197,7 @@ private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, bo MinMaxTracker minMax = new MinMaxTracker(); MinMaxTracker blockMinMax = new MinMaxTracker(); long gcd = 0; - Set uniqueValues = ords ? null : new HashSet<>(); + LongHashSet uniqueValues = ords ? null : new LongHashSet(); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { for (int i = 0, count = values.docValueCount(); i < count; ++i) { long v = values.nextValue(); @@ -273,7 +271,7 @@ private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, bo meta.writeLong(numValues); final int numBitsPerValue; boolean doBlocks = false; - Map encode = null; + LongIntHashMap encode = null; if (min >= max) { // meta[-1]: All values are 0 numBitsPerValue = 0; meta.writeInt(-1); // tablesize @@ -283,13 +281,13 @@ private long[] writeValues(FieldInfo field, DocValuesProducer valuesProducer, bo && DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1) < DirectWriter.unsignedBitsRequired((max - min) / gcd)) { numBitsPerValue = DirectWriter.unsignedBitsRequired(uniqueValues.size() - 1); - final Long[] sortedUniqueValues = uniqueValues.toArray(new Long[0]); + final long[] sortedUniqueValues = uniqueValues.toArray(); Arrays.sort(sortedUniqueValues); meta.writeInt(sortedUniqueValues.length); // tablesize - for (Long v : sortedUniqueValues) { + for (long v : sortedUniqueValues) { meta.writeLong(v); // table[] entry } - encode = new HashMap<>(); + encode = new LongIntHashMap(); for (int i = 0; i < sortedUniqueValues.length; ++i) { encode.put(sortedUniqueValues[i], i); } @@ -339,7 +337,7 @@ private void writeValuesSingleBlock( int numBitsPerValue, long min, long gcd, - Map encode) + LongIntHashMap encode) throws IOException { DirectWriter writer = DirectWriter.getInstance(data, numValues, numBitsPerValue); for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java index 3718d0ae401f..d0544f3db246 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java @@ -1154,6 +1154,7 @@ private long seekTermsIndex(BytesRef text) throws IOException { assert hi < 0 || getTermFromIndex(hi).compareTo(text) <= 0; assert hi == ((entry.termsDictSize - 1) >> entry.termsDictIndexShift) || getTermFromIndex(hi + 1).compareTo(text) > 0; + assert hi < 0 ^ entry.termsDictSize > 0; // return -1 iff empty term dict return hi; } @@ -1170,7 +1171,9 @@ private BytesRef getFirstTermFromBlock(long block) throws IOException { private long seekBlock(BytesRef text) throws IOException { long index = seekTermsIndex(text); if (index == -1L) { - return -1L; + // empty terms dict + this.ord = 0; + return -2L; } long ordLo = index << entry.termsDictIndexShift; @@ -1194,26 +1197,30 @@ private long seekBlock(BytesRef text) throws IOException { assert blockHi == ((entry.termsDictSize - 1) >>> TERMS_DICT_BLOCK_LZ4_SHIFT) || getFirstTermFromBlock(blockHi + 1).compareTo(text) > 0; + // read the block only if term dict is not empty + assert entry.termsDictSize > 0; + // reset ord and bytes to the ceiling block even if + // text is before the first term (blockHi == -1) + final long block = Math.max(blockHi, 0); + final long blockAddress = blockAddresses.get(block); + this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT; + bytes.seek(blockAddress); + decompressBlock(); + return blockHi; } @Override public SeekStatus seekCeil(BytesRef text) throws IOException { final long block = seekBlock(text); - if (block == -1) { - // before the first term, or empty terms dict - if (entry.termsDictSize == 0) { - ord = 0; - return SeekStatus.END; - } else { - seekExact(0L); - return SeekStatus.NOT_FOUND; - } + if (block == -2) { + // empty terms dict + assert entry.termsDictSize == 0; + return SeekStatus.END; + } else if (block == -1) { + // before the first term + return SeekStatus.NOT_FOUND; } - final long blockAddress = blockAddresses.get(block); - this.ord = block << TERMS_DICT_BLOCK_LZ4_SHIFT; - bytes.seek(blockAddress); - decompressBlock(); while (true) { int cmp = term.compareTo(text); From 8c918ff3c9c358a4ea02af47d8f05149643cccce Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Fri, 11 Oct 2024 14:53:45 -0400 Subject: [PATCH 3/7] revert changes to Lucene94FieldInfosFormat (???) --- .../lucene94/Lucene94FieldInfosFormat.java | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java index 6d669c46e20a..796abd35252d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene94/Lucene94FieldInfosFormat.java @@ -112,6 +112,8 @@ *

  2. 0: EUCLIDEAN distance. ({@link VectorSimilarityFunction#EUCLIDEAN}) *
  3. 1: DOT_PRODUCT similarity. ({@link VectorSimilarityFunction#DOT_PRODUCT}) *
  4. 2: COSINE similarity. ({@link VectorSimilarityFunction#COSINE}) + *
  5. 3: MAXIMUM_INNER_PRODUCT similarity. ({@link + * VectorSimilarityFunction#MAXIMUM_INNER_PRODUCT}) * * * @@ -132,13 +134,14 @@ public FieldInfos read( Throwable priorE = null; FieldInfo[] infos = null; try { - CodecUtil.checkIndexHeader( - input, - Lucene94FieldInfosFormat.CODEC_NAME, - Lucene94FieldInfosFormat.FORMAT_START, - Lucene94FieldInfosFormat.FORMAT_CURRENT, - segmentInfo.getId(), - segmentSuffix); + int format = + CodecUtil.checkIndexHeader( + input, + Lucene94FieldInfosFormat.CODEC_NAME, + Lucene94FieldInfosFormat.FORMAT_START, + Lucene94FieldInfosFormat.FORMAT_CURRENT, + segmentInfo.getId(), + segmentSuffix); final int size = input.readVInt(); // read in the size infos = new FieldInfo[size]; @@ -158,6 +161,18 @@ public FieldInfos read( boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; + boolean isParentField = + format >= FORMAT_PARENT_FIELD ? (bits & PARENT_FIELD_FIELD) != 0 : false; + + if ((bits & 0xE0) != 0) { + throw new CorruptIndexException( + "unused bits are set \"" + Integer.toBinaryString(bits) + "\"", input); + } + if (format < FORMAT_PARENT_FIELD && (bits & 0xF0) != 0) { + throw new CorruptIndexException( + "parent field bit is set but shouldn't \"" + Integer.toBinaryString(bits) + "\"", + input); + } final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); @@ -202,7 +217,7 @@ public FieldInfos read( vectorEncoding, vectorDistFunc, isSoftDeletesField, - false); + isParentField); infos[i].checkConsistency(); } catch (IllegalStateException e) { throw new CorruptIndexException( @@ -378,6 +393,7 @@ public void write( if (fi.omitsNorms()) bits |= OMIT_NORMS; if (fi.hasPayloads()) bits |= STORE_PAYLOADS; if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; + if (fi.isParentField()) bits |= PARENT_FIELD_FIELD; output.writeByte(bits); output.writeByte(indexOptionsByte(fi.getIndexOptions())); @@ -393,7 +409,7 @@ public void write( } output.writeVInt(fi.getVectorDimension()); output.writeByte((byte) fi.getVectorEncoding().ordinal()); - output.writeByte((byte) fi.getVectorSimilarityFunction().ordinal()); + output.writeByte(distFuncToOrd(fi.getVectorSimilarityFunction())); } CodecUtil.writeFooter(output); } @@ -405,11 +421,14 @@ public void write( // Codec header static final String CODEC_NAME = "Lucene94FieldInfos"; static final int FORMAT_START = 0; - static final int FORMAT_CURRENT = FORMAT_START; + // this doesn't actually change the file format but uses up one more bit an existing bit pattern + static final int FORMAT_PARENT_FIELD = 1; + static final int FORMAT_CURRENT = FORMAT_PARENT_FIELD; // Field flags static final byte STORE_TERMVECTOR = 0x1; static final byte OMIT_NORMS = 0x2; static final byte STORE_PAYLOADS = 0x4; static final byte SOFT_DELETES_FIELD = 0x8; + static final byte PARENT_FIELD_FIELD = 0x10; } From a9288d7b3926cfdd01113778fc4f27927ec4d8ab Mon Sep 17 00:00:00 2001 From: Michael Gibney Date: Fri, 11 Oct 2024 16:01:37 -0400 Subject: [PATCH 4/7] remove other likely-irrelevant reverts --- .../LZ4WithPresetDictCompressionMode.java | 4 +++- .../lucene90/Lucene90CompoundFormat.java | 1 + .../lucene90/Lucene90NormsProducer.java | 17 +++++++------ .../codecs/lucene90/Lucene90PointsReader.java | 5 ++-- .../codecs/lucene90/Lucene90PointsWriter.java | 2 +- .../lucene90/Lucene90StoredFieldsFormat.java | 24 +++++++++---------- ...Lucene90CompressingStoredFieldsReader.java | 6 ++--- ...Lucene90CompressingStoredFieldsWriter.java | 13 ++++++++++ .../Lucene90CompressingTermVectorsReader.java | 2 +- .../Lucene90CompressingTermVectorsWriter.java | 22 +++++++---------- 10 files changed, 52 insertions(+), 44 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java index 6f28ca3d0a60..b3acc0ef1699 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/LZ4WithPresetDictCompressionMode.java @@ -128,10 +128,12 @@ public void decompress(DataInput in, int originalLength, int offset, int length, } // Read blocks that intersect with the interval we need + if (offsetInBlock < offset + length) { + bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + offset + length - offsetInBlock); + } while (offsetInBlock < offset + length) { final int bytesToDecompress = Math.min(blockLength, offset + length - offsetInBlock); LZ4.decompress(in, bytesToDecompress, buffer, dictLength); - bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + bytesToDecompress); System.arraycopy(buffer, dictLength, bytes.bytes, bytes.length, bytesToDecompress); bytes.length += bytesToDecompress; offsetInBlock += blockLength; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java index 72d83853879b..ade1699fe753 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90CompoundFormat.java @@ -69,6 +69,7 @@ public final class Lucene90CompoundFormat extends CompoundFormat { /** Extension of compound file */ static final String DATA_EXTENSION = "cfs"; + /** Extension of compound file entries */ static final String ENTRIES_EXTENSION = "cfe"; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java index d145bfdb0ebb..dcfa45f5b60b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90NormsProducer.java @@ -20,8 +20,6 @@ import static org.apache.lucene.codecs.lucene90.Lucene90NormsFormat.VERSION_START; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.index.CorruptIndexException; @@ -31,6 +29,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -40,13 +39,13 @@ /** Reader for {@link Lucene90NormsFormat} */ final class Lucene90NormsProducer extends NormsProducer implements Cloneable { // metadata maps (just file pointers and minimal stuff) - private final Map norms = new HashMap<>(); + private final IntObjectHashMap norms = new IntObjectHashMap<>(); private final int maxDoc; private IndexInput data; private boolean merging; - private Map disiInputs; - private Map disiJumpTables; - private Map dataInputs; + private IntObjectHashMap disiInputs; + private IntObjectHashMap disiJumpTables; + private IntObjectHashMap dataInputs; Lucene90NormsProducer( SegmentReadState state, @@ -122,9 +121,9 @@ public NormsProducer getMergeInstance() { throw new RuntimeException(e); } clone.data = data.clone(); - clone.disiInputs = new HashMap<>(); - clone.disiJumpTables = new HashMap<>(); - clone.dataInputs = new HashMap<>(); + clone.disiInputs = new IntObjectHashMap<>(); + clone.disiJumpTables = new IntObjectHashMap<>(); + clone.dataInputs = new IntObjectHashMap<>(); clone.merging = true; return clone; } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java index 1656dd0a1bae..3e072cb8770d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsReader.java @@ -17,8 +17,6 @@ package org.apache.lucene.codecs.lucene90; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.index.CorruptIndexException; @@ -26,6 +24,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.PointValues; import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.internal.hppc.IntObjectHashMap; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; @@ -36,7 +35,7 @@ public class Lucene90PointsReader extends PointsReader { final IndexInput indexIn, dataIn; final SegmentReadState readState; - final Map readers = new HashMap<>(); + final IntObjectHashMap readers = new IntObjectHashMap<>(); /** Sole constructor */ public Lucene90PointsReader(SegmentReadState readState) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java index d310b8ace197..e50d6a0fdb5a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90PointsWriter.java @@ -106,7 +106,7 @@ public Lucene90PointsWriter( } /** - * Uses the defaults values for {@code maxPointsInLeafNode} (1024) and {@code maxMBSortInHeap} + * Uses the defaults values for {@code maxPointsInLeafNode} (512) and {@code maxMBSortInHeap} * (16.0) */ public Lucene90PointsWriter(SegmentWriteState writeState) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java index edb886b31c17..5646724546a4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90StoredFieldsFormat.java @@ -37,21 +37,21 @@ * *

    This {@link StoredFieldsFormat} compresses blocks of documents in order to improve the * compression ratio compared to document-level compression. It uses the LZ4 compression algorithm by default in 16KB blocks, - * which is fast to compress and very fast to decompress data. Although the default compression - * method that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on speed than on - * compression ratio, it should provide interesting compression ratios for redundant inputs (such as - * log files, HTML or plain text). For higher compression, you can choose ({@link + * href="http://code.google.com/p/lz4/">LZ4 compression algorithm by default in 8KB blocks and + * shared dictionaries, which is fast to compress and very fast to decompress data. Although the + * default compression method that is used ({@link Mode#BEST_SPEED BEST_SPEED}) focuses more on + * speed than on compression ratio, it should provide interesting compression ratios for redundant + * inputs (such as log files, HTML or plain text). For higher compression, you can choose ({@link * Mode#BEST_COMPRESSION BEST_COMPRESSION}), which uses the DEFLATE algorithm with 48kB blocks and shared + * href="http://en.wikipedia.org/wiki/DEFLATE">DEFLATE algorithm with 48KB blocks and shared * dictionaries for a better ratio at the expense of slower performance. These two options can be * configured like this: * *

      *   // the default: for high performance
    - *   indexWriterConfig.setCodec(new Lucene87Codec(Mode.BEST_SPEED));
    + *   indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_SPEED));
      *   // instead for higher performance (but slower):
    - *   // indexWriterConfig.setCodec(new Lucene87Codec(Mode.BEST_COMPRESSION));
    + *   // indexWriterConfig.setCodec(new Lucene99Codec(Mode.BEST_COMPRESSION));
      * 
    * *

    File formats @@ -61,9 +61,9 @@ *

      *
    1. *

      A fields data file (extension .fdt). This file stores a compact - * representation of documents in compressed blocks of 16KB or more. When writing a segment, + * representation of documents in compressed blocks of 8KB or more. When writing a segment, * documents are appended to an in-memory byte[] buffer. When its size reaches - * 16KB or more, some metadata about the documents is flushed to disk, immediately followed by + * 80KB or more, some metadata about the documents is flushed to disk, immediately followed by * a compressed representation of the buffer using the LZ4 compression @@ -71,10 +71,10 @@ *

      Notes *