Copilot commented on code in PR #18092: URL: https://github.com/apache/pinot/pull/18092#discussion_r3032516175
########## pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/ChunkTransformTest.java: ########## @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.compression; + +import java.nio.ByteBuffer; +import java.util.Random; +import org.apache.pinot.segment.spi.compression.ChunkTransform; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +/** + * Tests for {@link DeltaTransform} and {@link DoubleDeltaTransform}. + */ +public class ChunkTransformTest { + + @Test + public void testDeltaEncodeDecodeInts() { + int[] values = {100, 105, 108, 120, 125, 130}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaEncodeDecodeLongs() { + long[] values = {1000000L, 1000050L, 1000100L, 1000200L, 1000250L}; + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaEmptyInts() { + assertRoundTripInts(DeltaTransform.INSTANCE, new int[0]); + } + + @Test + public void testDeltaSingleInt() { + assertRoundTripInts(DeltaTransform.INSTANCE, new int[]{42}); + } + + @Test + public void testDeltaSingleLong() { + assertRoundTripLongs(DeltaTransform.INSTANCE, new long[]{42L}); + } + + @Test + public void testDeltaNegativeValues() { + int[] values = {-100, -50, 0, 50, 100, -200}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaOverflowInt() { + int[] values = {Integer.MAX_VALUE, Integer.MIN_VALUE, 0, Integer.MAX_VALUE}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaOverflowLong() { + long[] values = {Long.MAX_VALUE, Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaMonotonicInts() { + int[] values = new int[1000]; + for (int i = 0; i < values.length; i++) { + values[i] = i; + } + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaConstantInts() { + int[] values = new int[100]; + java.util.Arrays.fill(values, 42); + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaRandomInts() { + Random random = new Random(12345); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaRandomLongs() { + Random random = new Random(12345); + long[] values = new long[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + // --- Double-delta tests --- + + @Test + public void testDoubleDeltaEncodeDecodeInts() { + int[] values = {100, 110, 120, 130, 140, 150}; + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaEncodeDecodeLongs() { + long[] values = {1000L, 1100L, 1200L, 1300L, 1400L}; + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaEmptyInts() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[0]); + } + + @Test + public void testDoubleDeltaSingleInt() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[]{42}); + } + + @Test + public void testDoubleDeltaTwoInts() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[]{10, 20}); + } + + @Test + public void testDoubleDeltaTwoLongs() { + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, new long[]{10L, 20L}); + } + + @Test + public void testDoubleDeltaOverflowInt() { + int[] values = {Integer.MAX_VALUE, Integer.MIN_VALUE, 0, Integer.MAX_VALUE}; + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaOverflowLong() { + long[] values = {Long.MAX_VALUE, Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaConstantStep() { + // Perfect constant step: double-deltas should all be 0 + int[] values = new int[100]; + for (int i = 0; i < values.length; i++) { + values[i] = 1000 + i * 60; + } + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaRandomInts() { + Random random = new Random(67890); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaRandomLongs() { + Random random = new Random(67890); + long[] values = new long[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + // --- Helpers --- + + private void assertRoundTripInts(ChunkTransform transform, int[] original) { + if (original.length == 0) { + return; + } + ByteBuffer buffer = ByteBuffer.allocate(original.length * Integer.BYTES); + for (int v : original) { + buffer.putInt(v); + } + buffer.flip(); + + int numBytes = original.length * Integer.BYTES; + transform.encode(buffer, numBytes, Integer.BYTES); + transform.decode(buffer, numBytes, Integer.BYTES); + + buffer.position(0); + for (int i = 0; i < original.length; i++) { + assertEquals(buffer.getInt(), original[i], "Mismatch at index " + i); + } + } + + private void assertRoundTripLongs(ChunkTransform transform, long[] original) { + if (original.length == 0) { + return; + } Review Comment: Same issue as the int helper: returning early on `original.length == 0` means the empty-long test cases never call `encode/decode`. Consider running encode/decode with `numBytes=0` on an empty buffer (or otherwise asserting no-throw) to make the empty-long tests meaningful. ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/compression/DoubleDeltaTransform.java: ########## @@ -0,0 +1,185 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.compression; + +import java.nio.ByteBuffer; +import org.apache.pinot.segment.spi.compression.ChunkTransform; + + +/** + * Double-delta (delta-of-delta) transform for numeric values. Stores the first value as-is, + * the first delta as-is, then each subsequent value as the difference between consecutive deltas. + * Operates in-place on the ByteBuffer. + * + * <p>Effective for data with constant or near-constant step sizes (e.g., fixed-interval timestamps).</p> + */ +public class DoubleDeltaTransform implements ChunkTransform { + + public static final DoubleDeltaTransform INSTANCE = new DoubleDeltaTransform(); + + private DoubleDeltaTransform() { + } + + @Override + public void encode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + encodeInts(buffer, numBytes); + } else { + encodeLongs(buffer, numBytes); + } + } + + @Override + public void decode(ByteBuffer buffer, int numBytes, int valueSizeInBytes) { + if (valueSizeInBytes == Integer.BYTES) { + decodeInts(buffer, numBytes); + } else { + decodeLongs(buffer, numBytes); + } + } + + private void encodeInts(ByteBuffer buffer, int numBytes) { + int numValues = numBytes / Integer.BYTES; + if (numValues <= 2) { + // For 0, 1, or 2 values, just apply regular delta (first value stays, second becomes delta) + if (numValues == 2) { + int pos = buffer.position(); + int v0 = buffer.getInt(pos); + int v1 = buffer.getInt(pos + Integer.BYTES); + buffer.putInt(pos + Integer.BYTES, v1 - v0); + } + return; + } + int pos = buffer.position(); + + // Encode in reverse to avoid overwriting values we still need. + // First pass: compute all original values we need (they're in the buffer). + // We encode from the end to index 2, then handle index 1. Review Comment: The inline comment says “Encode in reverse … from the end to index 2”, but the implementation encodes in forward order (`for (int i = 2; i < numValues; i++)`). Please update the comment to match the actual algorithm to avoid confusion during future maintenance. ```suggestion // Encode in forward order, keeping the previous original value and delta in local variables // so we can safely overwrite each position in-place as we advance from index 2 onward. ``` ########## pinot-segment-local/src/test/java/org/apache/pinot/segment/local/io/compression/ChunkTransformTest.java: ########## @@ -0,0 +1,232 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.local.io.compression; + +import java.nio.ByteBuffer; +import java.util.Random; +import org.apache.pinot.segment.spi.compression.ChunkTransform; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; + + +/** + * Tests for {@link DeltaTransform} and {@link DoubleDeltaTransform}. + */ +public class ChunkTransformTest { + + @Test + public void testDeltaEncodeDecodeInts() { + int[] values = {100, 105, 108, 120, 125, 130}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaEncodeDecodeLongs() { + long[] values = {1000000L, 1000050L, 1000100L, 1000200L, 1000250L}; + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaEmptyInts() { + assertRoundTripInts(DeltaTransform.INSTANCE, new int[0]); + } + + @Test + public void testDeltaSingleInt() { + assertRoundTripInts(DeltaTransform.INSTANCE, new int[]{42}); + } + + @Test + public void testDeltaSingleLong() { + assertRoundTripLongs(DeltaTransform.INSTANCE, new long[]{42L}); + } + + @Test + public void testDeltaNegativeValues() { + int[] values = {-100, -50, 0, 50, 100, -200}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaOverflowInt() { + int[] values = {Integer.MAX_VALUE, Integer.MIN_VALUE, 0, Integer.MAX_VALUE}; + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaOverflowLong() { + long[] values = {Long.MAX_VALUE, Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaMonotonicInts() { + int[] values = new int[1000]; + for (int i = 0; i < values.length; i++) { + values[i] = i; + } + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaConstantInts() { + int[] values = new int[100]; + java.util.Arrays.fill(values, 42); + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaRandomInts() { + Random random = new Random(12345); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertRoundTripInts(DeltaTransform.INSTANCE, values); + } + + @Test + public void testDeltaRandomLongs() { + Random random = new Random(12345); + long[] values = new long[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertRoundTripLongs(DeltaTransform.INSTANCE, values); + } + + // --- Double-delta tests --- + + @Test + public void testDoubleDeltaEncodeDecodeInts() { + int[] values = {100, 110, 120, 130, 140, 150}; + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaEncodeDecodeLongs() { + long[] values = {1000L, 1100L, 1200L, 1300L, 1400L}; + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaEmptyInts() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[0]); + } + + @Test + public void testDoubleDeltaSingleInt() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[]{42}); + } + + @Test + public void testDoubleDeltaTwoInts() { + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, new int[]{10, 20}); + } + + @Test + public void testDoubleDeltaTwoLongs() { + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, new long[]{10L, 20L}); + } + + @Test + public void testDoubleDeltaOverflowInt() { + int[] values = {Integer.MAX_VALUE, Integer.MIN_VALUE, 0, Integer.MAX_VALUE}; + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaOverflowLong() { + long[] values = {Long.MAX_VALUE, Long.MIN_VALUE, 0L, Long.MAX_VALUE}; + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaConstantStep() { + // Perfect constant step: double-deltas should all be 0 + int[] values = new int[100]; + for (int i = 0; i < values.length; i++) { + values[i] = 1000 + i * 60; + } + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaRandomInts() { + Random random = new Random(67890); + int[] values = new int[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextInt(); + } + assertRoundTripInts(DoubleDeltaTransform.INSTANCE, values); + } + + @Test + public void testDoubleDeltaRandomLongs() { + Random random = new Random(67890); + long[] values = new long[500]; + for (int i = 0; i < values.length; i++) { + values[i] = random.nextLong(); + } + assertRoundTripLongs(DoubleDeltaTransform.INSTANCE, values); + } + + // --- Helpers --- + + private void assertRoundTripInts(ChunkTransform transform, int[] original) { + if (original.length == 0) { + return; + } Review Comment: The empty-array test cases are currently no-ops because this helper returns early when `original.length == 0`, so `encode/decode` never run. Consider exercising the transform on an empty buffer (numBytes=0) and asserting it doesn’t throw, so `testDeltaEmptyInts`/`testDoubleDeltaEmptyInts` actually validate behavior. ########## pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/forward/ForwardIndexType.java: ########## @@ -113,6 +113,8 @@ private void validateForwardIndexEnabled(ForwardIndexConfig forwardIndexConfig, if (dictionaryConfig.isEnabled()) { Preconditions.checkState(compressionCodec == null || compressionCodec.isApplicableToDictEncodedIndex(), "Compression codec: %s is not applicable to dictionary encoded column: %s", compressionCodec, column); + Preconditions.checkState(forwardIndexConfig.getTransformCodec() == null, + "Transform codec is not supported for dictionary-encoded column: %s", column); Review Comment: In dictionary-encoded columns, validation currently requires `forwardIndexConfig.getTransformCodec() == null`, which will also reject an explicitly configured `transformCodec: NONE`. Since NONE is semantically equivalent to “no transform”, this should allow either null or NONE (e.g., check `transformCodec == null || transformCodec == TransformCodec.NONE`). ########## pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/compression/TransformCodec.java: ########## @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.compression; + +/** + * Enum representing reversible numeric transforms that can be applied to chunk data before compression. + * Unlike {@link ChunkCompressionType}, which defines byte-level compression algorithms, TransformCodec + * operates on typed numeric values (INT/LONG) to reduce entropy before the compression stage. + * + * <p>The transform is applied per-chunk: each chunk is independently transformed and compressed. + * On read, the reverse transform is applied after decompression.</p> + * + * <p>Supported in V1 for single-value INT and LONG RAW forward index columns only.</p> + */ +public enum TransformCodec { + + /** No transform applied. */ + NONE(0), + + /** + * Delta encoding: stores the first value as-is, then each subsequent value as the difference + * from the previous value. Effective for monotonically increasing data (e.g., timestamps). + */ + DELTA(1), + + /** + * Double-delta encoding: stores the first value as-is, the first delta as-is, then each + * subsequent value as the difference of consecutive deltas. Effective for data with constant + * or near-constant step sizes (e.g., fixed-interval timestamps). + */ + DOUBLE_DELTA(2); + + private static final TransformCodec[] VALUES = values(); + + private final int _value; + + TransformCodec(int value) { + _value = value; + } + + public int getValue() { + return _value; + } + + public static TransformCodec valueOf(int ordinal) { + if (ordinal < 0 || ordinal >= VALUES.length) { + throw new IllegalArgumentException("Invalid TransformCodec ordinal: " + ordinal); + } + return VALUES[ordinal]; + } Review Comment: `TransformCodec.valueOf(int)` currently indexes into `values()` using the provided int, but the header stores `TransformCodec.getValue()`. This is only correct as long as `_value` always matches the enum ordinal; reordering or introducing non-sequential values would silently corrupt decoding. Consider decoding by the explicit `_value` instead of the ordinal (e.g., precomputed lookup by value). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
