This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 7c9f3ae4f3 Support reading var-length type (STRING/BYTES/BIG_DECIMAL)
as murmur3 32/64/128 bits hash (#15213)
7c9f3ae4f3 is described below
commit 7c9f3ae4f3bfd06b005f50f600935ebad9f0a44d
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Sat Mar 29 18:21:34 2025 -0600
Support reading var-length type (STRING/BYTES/BIG_DECIMAL) as murmur3
32/64/128 bits hash (#15213)
---
.../org/apache/pinot/core/common/BlockValSet.java | 12 ++
.../apache/pinot/core/common/DataBlockCache.java | 21 +++
.../org/apache/pinot/core/common/DataFetcher.java | 54 ++++++++
.../operator/docvalsets/ProjectionBlockValSet.java | 24 ++++
.../operator/docvalsets/TransformBlockValSet.java | 34 +++++
.../local/io/util/FixedByteValueReaderWriter.java | 20 +--
.../pinot/segment/local/io/util/ValueReader.java | 43 +++++-
.../local/io/util/VarLengthValueReader.java | 6 +-
.../index/readers/BaseImmutableDictionary.java | 12 ++
.../index/readers/BigDecimalDictionary.java | 24 ++++
.../segment/index/readers/BytesDictionary.java | 24 ++++
.../segment/index/readers/StringDictionary.java | 24 ++++
.../pinot/segment/spi/index/reader/Dictionary.java | 31 +++++
.../spi/index/reader/ForwardIndexReader.java | 21 ++-
.../pinot/spi/utils/hash/MurmurHashFunctions.java | 148 ++++++++-------------
15 files changed, 374 insertions(+), 124 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
b/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
index c8aac0bdfb..495f517dfa 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/common/BlockValSet.java
@@ -113,6 +113,18 @@ public interface BlockValSet {
*/
byte[][] getBytesValuesSV();
+ default int[] get32BitsMurmur3HashValuesSV() {
+ throw new UnsupportedOperationException();
+ }
+
+ default long[] get64BitsMurmur3HashValuesSV() {
+ throw new UnsupportedOperationException();
+ }
+
+ default long[][] get128BitsMurmur3HashValuesSV() {
+ throw new UnsupportedOperationException();
+ }
+
/**
* MULTI-VALUED COLUMN APIs
*/
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
b/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
index d5ba2e1202..18ddda44b0 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/common/DataBlockCache.java
@@ -242,6 +242,27 @@ public class DataBlockCache implements AutoCloseable {
return bytesValues;
}
+ public int[] get32BitsMurmur3HashValuesForSVColumn(String column) {
+ // TODO: This is not cached
+ int[] hashValues = new int[_length];
+ _dataFetcher.fetch32BitsMurmur3HashValues(column, _docIds, _length,
hashValues);
+ return hashValues;
+ }
+
+ public long[] get64BitsMurmur3HashValuesForSVColumn(String column) {
+ // TODO: This is not cached
+ long[] hashValues = new long[_length];
+ _dataFetcher.fetch64BitsMurmur3HashValues(column, _docIds, _length,
hashValues);
+ return hashValues;
+ }
+
+ public long[][] get128BitsMurmur3HashValuesForSVColumn(String column) {
+ // TODO: This is not cached
+ long[][] hashValues = new long[_length][];
+ _dataFetcher.fetch128BitsMurmur3HashValues(column, _docIds, _length,
hashValues);
+ return hashValues;
+ }
+
/**
* MULTI-VALUED COLUMN API
*/
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
b/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
index 681b1f58db..384014c808 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/common/DataFetcher.java
@@ -185,6 +185,18 @@ public class DataFetcher implements AutoCloseable {
_columnValueReaderMap.get(column).readMapValues(inDocIds, length,
outValues);
}
+ public void fetch32BitsMurmur3HashValues(String column, int[] inDocIds, int
length, int[] outValues) {
+ _columnValueReaderMap.get(column).read32BitsMurmur3HashValues(inDocIds,
length, outValues);
+ }
+
+ public void fetch64BitsMurmur3HashValues(String column, int[] inDocIds, int
length, long[] outValues) {
+ _columnValueReaderMap.get(column).read64BitsMurmur3HashValues(inDocIds,
length, outValues);
+ }
+
+ public void fetch128BitsMurmur3HashValues(String column, int[] inDocIds, int
length, long[][] outValues) {
+ _columnValueReaderMap.get(column).read128BitsMurmur3HashValues(inDocIds,
length, outValues);
+ }
+
/**
* MULTI-VALUED COLUMN API
*/
@@ -422,6 +434,48 @@ public class DataFetcher implements AutoCloseable {
}
}
+ void read32BitsMurmur3HashValues(int[] docIds, int length, int[]
valueBuffer) {
+ Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+ ForwardIndexReaderContext readerContext = getReaderContext();
+ if (_dictionary != null) {
+ int[] dictIdBuffer = THREAD_LOCAL_DICT_IDS.get();
+ _reader.readDictIds(docIds, length, dictIdBuffer, readerContext);
+ _dictionary.read32BitsMurmur3HashValues(dictIdBuffer, length,
valueBuffer);
+ } else {
+ for (int i = 0; i < length; i++) {
+ valueBuffer[i] = _reader.get32BitsMurmur3Hash(docIds[i],
readerContext);
+ }
+ }
+ }
+
+ void read64BitsMurmur3HashValues(int[] docIds, int length, long[]
valueBuffer) {
+ Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+ ForwardIndexReaderContext readerContext = getReaderContext();
+ if (_dictionary != null) {
+ int[] dictIdBuffer = THREAD_LOCAL_DICT_IDS.get();
+ _reader.readDictIds(docIds, length, dictIdBuffer, readerContext);
+ _dictionary.read64BitsMurmur3HashValues(dictIdBuffer, length,
valueBuffer);
+ } else {
+ for (int i = 0; i < length; i++) {
+ valueBuffer[i] = _reader.get64BitsMurmur3Hash(docIds[i],
readerContext);
+ }
+ }
+ }
+
+ void read128BitsMurmur3HashValues(int[] docIds, int length, long[][]
valueBuffer) {
+ Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
+ ForwardIndexReaderContext readerContext = getReaderContext();
+ if (_dictionary != null) {
+ int[] dictIdBuffer = THREAD_LOCAL_DICT_IDS.get();
+ _reader.readDictIds(docIds, length, dictIdBuffer, readerContext);
+ _dictionary.read128BitsMurmur3HashValues(dictIdBuffer, length,
valueBuffer);
+ } else {
+ for (int i = 0; i < length; i++) {
+ valueBuffer[i] = _reader.get128BitsMurmur3Hash(docIds[i],
readerContext);
+ }
+ }
+ }
+
void readDictIdsMV(int[] docIds, int length, int[][] dictIdsBuffer) {
Tracing.activeRecording().setInputDataType(_storedType, _singleValue);
ForwardIndexReaderContext readerContext = getReaderContext();
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
index 76cc124e1f..0dfd8b52d6 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/ProjectionBlockValSet.java
@@ -162,6 +162,30 @@ public class ProjectionBlockValSet implements BlockValSet {
}
}
+ @Override
+ public int[] get32BitsMurmur3HashValuesSV() {
+ try (InvocationScope scope =
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
+ recordReadValues(scope, DataType.BYTES, true);
+ return _dataBlockCache.get32BitsMurmur3HashValuesForSVColumn(_column);
+ }
+ }
+
+ @Override
+ public long[] get64BitsMurmur3HashValuesSV() {
+ try (InvocationScope scope =
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
+ recordReadValues(scope, DataType.BYTES, true);
+ return _dataBlockCache.get64BitsMurmur3HashValuesForSVColumn(_column);
+ }
+ }
+
+ @Override
+ public long[][] get128BitsMurmur3HashValuesSV() {
+ try (InvocationScope scope =
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
+ recordReadValues(scope, DataType.BYTES, true);
+ return _dataBlockCache.get128BitsMurmur3HashValuesForSVColumn(_column);
+ }
+ }
+
@Override
public int[][] getDictionaryIdsMV() {
try (InvocationScope scope =
Tracing.getTracer().createScope(ProjectionBlockValSet.class)) {
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
index 6ae584a486..823dd05cbe 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/docvalsets/TransformBlockValSet.java
@@ -30,6 +30,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.trace.InvocationRecording;
import org.apache.pinot.spi.trace.InvocationScope;
import org.apache.pinot.spi.trace.Tracing;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
import org.roaringbitmap.RoaringBitmap;
@@ -142,6 +143,39 @@ public class TransformBlockValSet implements BlockValSet {
}
}
+ @Override
+ public int[] get32BitsMurmur3HashValuesSV() {
+ byte[][] bytes = getBytesValuesSV();
+ int length = bytes.length;
+ int[] hashValues = new int[length];
+ for (int i = 0; i < length; i++) {
+ hashValues[i] = MurmurHashFunctions.murmurHash3X64Bit32(bytes[i], 0);
+ }
+ return hashValues;
+ }
+
+ @Override
+ public long[] get64BitsMurmur3HashValuesSV() {
+ byte[][] bytes = getBytesValuesSV();
+ int length = bytes.length;
+ long[] hashValues = new long[length];
+ for (int i = 0; i < length; i++) {
+ hashValues[i] = MurmurHashFunctions.murmurHash3X64Bit64(bytes[i], 0);
+ }
+ return hashValues;
+ }
+
+ @Override
+ public long[][] get128BitsMurmur3HashValuesSV() {
+ byte[][] bytes = getBytesValuesSV();
+ int length = bytes.length;
+ long[][] hashValues = new long[length][];
+ for (int i = 0; i < length; i++) {
+ hashValues[i] =
MurmurHashFunctions.murmurHash3X64Bit128AsLongs(bytes[i], 0);
+ }
+ return hashValues;
+ }
+
@Override
public int[][] getDictionaryIdsMV() {
try (InvocationScope scope =
Tracing.getTracer().createScope(TransformBlockValSet.class)) {
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
index 73ca8c09f4..2e85a81c1a 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/FixedByteValueReaderWriter.java
@@ -53,10 +53,8 @@ public final class FixedByteValueReaderWriter implements
ValueReader {
return _dataBuffer.getDouble((long) index * Double.BYTES);
}
- /**
- * Reads the unpadded bytes into the given buffer and returns the length.
- */
- private int readUnpaddedBytes(int index, int numBytesPerValue, byte[]
buffer) {
+ @Override
+ public int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer)
{
// Based on the ZeroInWord algorithm:
http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
assert buffer.length >= numBytesPerValue;
long startOffset = (long) index * numBytesPerValue;
@@ -85,20 +83,6 @@ public final class FixedByteValueReaderWriter implements
ValueReader {
return i;
}
- @Override
- public byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[]
buffer) {
- int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
- byte[] bytes = new byte[length];
- System.arraycopy(buffer, 0, bytes, 0, length);
- return bytes;
- }
-
- @Override
- public String getUnpaddedString(int index, int numBytesPerValue, byte[]
buffer) {
- int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
- return new String(buffer, 0, length, UTF_8);
- }
-
@Override
public String getPaddedString(int index, int numBytesPerValue, byte[]
buffer) {
assert buffer.length >= numBytesPerValue;
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
index 4eaf17e37e..9aa9382e31 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
@@ -21,6 +21,7 @@ package org.apache.pinot.segment.local.io.util;
import java.io.Closeable;
import java.math.BigDecimal;
import org.apache.pinot.spi.utils.BigDecimalUtils;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
/**
@@ -40,16 +41,30 @@ public interface ValueReader extends Closeable {
return BigDecimalUtils.deserialize(getBytes(index, numBytesPerValue));
}
+ /**
+ * Reads the unpadded bytes into the given buffer and returns the length.
+ * NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
+ */
+ int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer);
+
/**
* Returns un-padded bytes for string.
* NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
*/
- byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer);
+ default byte[] getUnpaddedBytes(int index, int numBytesPerValue, byte[]
buffer) {
+ int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+ byte[] value = new byte[length];
+ System.arraycopy(buffer, 0, value, 0, length);
+ return value;
+ }
/**
* NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
*/
- String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer);
+ default String getUnpaddedString(int index, int numBytesPerValue, byte[]
buffer) {
+ int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+ return new String(buffer, 0, length);
+ }
/**
* NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
@@ -61,6 +76,30 @@ public interface ValueReader extends Closeable {
*/
byte[] getBytes(int index, int numBytesPerValue);
+ /**
+ * NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
+ */
+ default int get32BitsMurmur3Hash(int index, int numBytesPerValue, byte[]
buffer) {
+ int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+ return MurmurHashFunctions.murmurHash3X64Bit32(buffer, length, 0);
+ }
+
+ /**
+ * NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
+ */
+ default long get64BitsMurmur3Hash(int index, int numBytesPerValue, byte[]
buffer) {
+ int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+ return MurmurHashFunctions.murmurHash3X64Bit64(buffer, length, 0);
+ }
+
+ /**
+ * NOTE: The passed in reusable buffer should have capacity of at least
{@code numBytesPerValue}.
+ */
+ default long[] get128BitsMurmur3Hash(int index, int numBytesPerValue, byte[]
buffer) {
+ int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
+ return MurmurHashFunctions.murmurHash3X64Bit128AsLongs(buffer, length, 0);
+ }
+
/**
* Returns the comparison result of the UTF-8 decoded values.
*/
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
index d9384bf16f..502d38cfaa 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/VarLengthValueReader.java
@@ -22,8 +22,6 @@ import java.util.List;
import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* The value reader for var-length values (STRING and BYTES). See {@link
VarLengthValueWriter} for the file layout.
@@ -89,7 +87,7 @@ public class VarLengthValueReader implements ValueReader {
}
@Override
- public String getUnpaddedString(int index, int numBytesPerValue, byte[]
buffer) {
+ public int readUnpaddedBytes(int index, int numBytesPerValue, byte[] buffer)
{
assert buffer.length >= numBytesPerValue;
// Read the offset of the byte array first and then read the actual byte
array.
@@ -100,7 +98,7 @@ public class VarLengthValueReader implements ValueReader {
assert numBytesPerValue >= length;
_dataBuffer.copyTo(startOffset, buffer, 0, length);
- return new String(buffer, 0, length, UTF_8);
+ return length;
}
public void recordOffsetRanges(int index, long baseOffset,
List<ForwardIndexReader.ByteRange> rangeList) {
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
index d075306000..4ba8895bea 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BaseImmutableDictionary.java
@@ -278,6 +278,18 @@ public abstract class BaseImmutableDictionary implements
Dictionary {
return _valueReader.getBytes(dictId, _numBytesPerValue);
}
+ public int get32BitsMurmur3Hash(int dictId, byte[] buffer) {
+ return _valueReader.get32BitsMurmur3Hash(dictId, _numBytesPerValue,
buffer);
+ }
+
+ public long get64BitsMurmur3Hash(int dictId, byte[] buffer) {
+ return _valueReader.get64BitsMurmur3Hash(dictId, _numBytesPerValue,
buffer);
+ }
+
+ public long[] get128BitsMurmur3HashValue(int dictId, byte[] buffer) {
+ return _valueReader.get128BitsMurmur3Hash(dictId, _numBytesPerValue,
buffer);
+ }
+
protected byte[] getBuffer() {
return new byte[_numBytesPerValue];
}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
index a6cfa817c6..00efcc3b61 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BigDecimalDictionary.java
@@ -97,4 +97,28 @@ public class BigDecimalDictionary extends
BaseImmutableDictionary {
public byte[] getBytesValue(int dictId) {
return getBytes(dictId);
}
+
+ @Override
+ public void read32BitsMurmur3HashValues(int[] dictIds, int length, int[]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get32BitsMurmur3Hash(dictIds[i], buffer);
+ }
+ }
+
+ @Override
+ public void read64BitsMurmur3HashValues(int[] dictIds, int length, long[]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get64BitsMurmur3Hash(dictIds[i], buffer);
+ }
+ }
+
+ @Override
+ public void read128BitsMurmur3HashValues(int[] dictIds, int length, long[][]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get128BitsMurmur3HashValue(dictIds[i], buffer);
+ }
+ }
}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
index 3528f1e74d..db3401d3af 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/BytesDictionary.java
@@ -104,4 +104,28 @@ public class BytesDictionary extends
BaseImmutableDictionary {
public byte[] getBytesValue(int dictId) {
return getBytes(dictId);
}
+
+ @Override
+ public void read32BitsMurmur3HashValues(int[] dictIds, int length, int[]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get32BitsMurmur3Hash(dictIds[i], buffer);
+ }
+ }
+
+ @Override
+ public void read64BitsMurmur3HashValues(int[] dictIds, int length, long[]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get64BitsMurmur3Hash(dictIds[i], buffer);
+ }
+ }
+
+ @Override
+ public void read128BitsMurmur3HashValues(int[] dictIds, int length, long[][]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get128BitsMurmur3HashValue(dictIds[i], buffer);
+ }
+ }
}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
index 28f510eed8..60f604660f 100644
---
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/readers/StringDictionary.java
@@ -169,4 +169,28 @@ public class StringDictionary extends
BaseImmutableDictionary {
outValues[i] = getUnpaddedBytes(dictIds[i], buffer);
}
}
+
+ @Override
+ public void read32BitsMurmur3HashValues(int[] dictIds, int length, int[]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get32BitsMurmur3Hash(dictIds[i], buffer);
+ }
+ }
+
+ @Override
+ public void read64BitsMurmur3HashValues(int[] dictIds, int length, long[]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get64BitsMurmur3Hash(dictIds[i], buffer);
+ }
+ }
+
+ @Override
+ public void read128BitsMurmur3HashValues(int[] dictIds, int length, long[][]
outValues) {
+ byte[] buffer = getBuffer();
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get128BitsMurmur3HashValue(dictIds[i], buffer);
+ }
+ }
}
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
index 2edcf51720..0f61d3947e 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/Dictionary.java
@@ -27,6 +27,7 @@ import org.apache.pinot.segment.spi.index.IndexReader;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.utils.ByteArray;
import org.apache.pinot.spi.utils.MapUtils;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
/**
@@ -202,6 +203,18 @@ public interface Dictionary extends IndexReader {
return new ByteArray(getBytesValue(dictId));
}
+ default int get32BitsMurmur3HashValue(int dictId) {
+ return MurmurHashFunctions.murmurHash3X64Bit32(getBytesValue(dictId), 0);
+ }
+
+ default long get64BitsMurmur3HashValue(int dictId) {
+ return MurmurHashFunctions.murmurHash3X64Bit64(getBytesValue(dictId), 0);
+ }
+
+ default long[] get128BitsMurmur3HashValue(int dictId) {
+ return
MurmurHashFunctions.murmurHash3X64Bit128AsLongs(getBytesValue(dictId), 0);
+ }
+
// Batch read APIs
default void readIntValues(int[] dictIds, int length, int[] outValues) {
@@ -276,6 +289,24 @@ public interface Dictionary extends IndexReader {
}
}
+ default void read32BitsMurmur3HashValues(int[] dictIds, int length, int[]
outValues) {
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get32BitsMurmur3HashValue(dictIds[i]);
+ }
+ }
+
+ default void read64BitsMurmur3HashValues(int[] dictIds, int length, long[]
outValues) {
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get64BitsMurmur3HashValue(dictIds[i]);
+ }
+ }
+
+ default void read128BitsMurmur3HashValues(int[] dictIds, int length,
long[][] outValues) {
+ for (int i = 0; i < length; i++) {
+ outValues[i] = get128BitsMurmur3HashValue(dictIds[i]);
+ }
+ }
+
default void getDictIds(List<String> values, IntSet dictIds) {
for (String value : values) {
int dictId = indexOf(value);
diff --git
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
index b3d9f842d7..dc0d37b986 100644
---
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
+++
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/index/reader/ForwardIndexReader.java
@@ -30,6 +30,7 @@ import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.utils.BigDecimalUtils;
import org.apache.pinot.spi.utils.BytesUtils;
import org.apache.pinot.spi.utils.MapUtils;
+import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
/**
@@ -512,6 +513,18 @@ public interface ForwardIndexReader<T extends
ForwardIndexReaderContext> extends
+ "ForwardIndexReader is being created to read this column.");
}
+ default int get32BitsMurmur3Hash(int docId, T context) {
+ return MurmurHashFunctions.murmurHash3X64Bit32(getBytes(docId, context),
0);
+ }
+
+ default long get64BitsMurmur3Hash(int docId, T context) {
+ return MurmurHashFunctions.murmurHash3X64Bit64(getBytes(docId, context),
0);
+ }
+
+ default long[] get128BitsMurmur3Hash(int docId, T context) {
+ return MurmurHashFunctions.murmurHash3X64Bit128AsLongs(getBytes(docId,
context), 0);
+ }
+
/**
* MULTI-VALUE COLUMN RAW INDEX APIs
*/
@@ -995,10 +1008,10 @@ public interface ForwardIndexReader<T extends
ForwardIndexReaderContext> extends
/**
* Returns whether the forward index supports recording the byte ranges
accessed while reading a given docId.
- * For readers that do support this info, caller should check if the buffer
is a {@link isFixedOffsetMappingType()}.
- * If yes, the byte range mapping for a docId can be calculated using the
{@link getRawDataStartOffset()} and the
- * {@link getDocLength()} functions.
- * if not, caller should use the {@link recordDocIdByteRanges()} function to
get the list of byte ranges accessed
+ * For readers that do support this info, caller should check if the buffer
is a {@link #isFixedOffsetMappingType}.
+ * If yes, the byte range mapping for a docId can be calculated using the
{@link #getRawDataStartOffset} and the
+ * {@link #getDocLength} functions.
+ * if not, caller should use the {@link #recordDocIdByteRanges} function to
get the list of byte ranges accessed
* for a docId.
*/
default boolean isBufferByteRangeInfoSupported() {
diff --git
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
index 276eb0f46e..0c43ed4b0d 100644
---
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
+++
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/hash/MurmurHashFunctions.java
@@ -279,94 +279,60 @@ public class MurmurHashFunctions {
}
/**
- * Hash a value using the x64 128 bit variant of MurmurHash3
- *
- * @param key value to hash
- * @param seed random value
- * @return 128 bit hashed key, in an array containing two longs
+ * Hash a value using the x64 128 bit variant of MurmurHash3.
*/
- public static byte[] murmurHash3X64Bit128(final byte[] key, final int seed) {
- State state = new State();
-
- state._h1 = 0x9368e53c2f6af274L ^ seed;
- state._h2 = 0x586dcd208f7cd3fdL ^ seed;
-
- state._c1 = 0x87c37b91114253d5L;
- state._c2 = 0x4cf5ad432745937fL;
-
- for (int i = 0; i < key.length / 16; i++) {
- state._k1 = getblock(key, i * 2 * 8);
- state._k2 = getblock(key, (i * 2 + 1) * 8);
-
- bmix(state);
- }
+ public static byte[] murmurHash3X64Bit128(byte[] key, int length, int seed) {
+ State state = murmurHash3X64(key, length, seed);
+ ByteBuffer buffer = ByteBuffer.allocate(16);
+ buffer.putLong(state._h1);
+ buffer.putLong(state._h2);
+ return buffer.array();
+ }
- state._k1 = 0;
- state._k2 = 0;
+ public static byte[] murmurHash3X64Bit128(byte[] key, int seed) {
+ return murmurHash3X64Bit128(key, key.length, seed);
+ }
- int tail = (key.length >>> 4) << 4;
- // CHECKSTYLE:OFF: checkstyle:coding
- switch (key.length & 15) {
- case 15:
- state._k2 ^= (long) key[tail + 14] << 48;
- case 14:
- state._k2 ^= (long) key[tail + 13] << 40;
- case 13:
- state._k2 ^= (long) key[tail + 12] << 32;
- case 12:
- state._k2 ^= (long) key[tail + 11] << 24;
- case 11:
- state._k2 ^= (long) key[tail + 10] << 16;
- case 10:
- state._k2 ^= (long) key[tail + 9] << 8;
- case 9:
- state._k2 ^= key[tail + 8];
- case 8:
- state._k1 ^= (long) key[tail + 7] << 56;
- case 7:
- state._k1 ^= (long) key[tail + 6] << 48;
- case 6:
- state._k1 ^= (long) key[tail + 5] << 40;
- case 5:
- state._k1 ^= (long) key[tail + 4] << 32;
- case 4:
- state._k1 ^= (long) key[tail + 3] << 24;
- case 3:
- state._k1 ^= (long) key[tail + 2] << 16;
- case 2:
- state._k1 ^= (long) key[tail + 1] << 8;
- case 1:
- state._k1 ^= key[tail + 0];
- bmix(state);
- }
- // CHECKSTYLE:ON: checkstyle:coding
+ /**
+ * Hash a value using the x64 128 bit variant of MurmurHash3.
+ */
+ public static long[] murmurHash3X64Bit128AsLongs(byte[] key, int length, int
seed) {
+ State state = murmurHash3X64(key, length, seed);
+ return new long[]{state._h1, state._h2};
+ }
- state._h2 ^= key.length;
+ public static long[] murmurHash3X64Bit128AsLongs(byte[] key, int seed) {
+ return murmurHash3X64Bit128AsLongs(key, key.length, seed);
+ }
- state._h1 += state._h2;
- state._h2 += state._h1;
+ /**
+ * Hash a value using the x64 64 bit variant of murmurHash3.
+ */
+ public static long murmurHash3X64Bit64(byte[] key, int length, int seed) {
+ return murmurHash3X64(key, length, seed)._h1;
+ }
- state._h1 = fmix(state._h1);
- state._h2 = fmix(state._h2);
+ public static long murmurHash3X64Bit64(byte[] key, int seed) {
+ return murmurHash3X64Bit64(key, key.length, seed);
+ }
- state._h1 += state._h2;
- state._h2 += state._h1;
+ /**
+ * Hash a value using the x64 32 bit variant of murmurHash3.
+ */
+ public static int murmurHash3X64Bit32(byte[] key, int length, int seed) {
+ return (int) (murmurHash3X64(key, length, seed)._h1 >>> 32);
+ }
- ByteBuffer buffer = ByteBuffer.allocate(16);
- buffer.putLong(state._h1);
- buffer.putLong(state._h2);
- return buffer.array();
+ public static int murmurHash3X64Bit32(byte[] key, int seed) {
+ return murmurHash3X64Bit32(key, key.length, seed);
}
/**
- * Hash a value using the x64 64 bit variant of murmurHash3
- *
- * @param key value to hash
- * @param seed random value
- * @return 64 bit hashed key
+ * Taken and modified from
+ * <a
href="https://github.com/infinispan/infinispan/blob/main/commons/all/src/main/java/org/infinispan/commons/hash/
+ * MurmurHash3.java">Infinispan code base</a>.
*/
- public static long murmurHash3X64Bit64(final byte[] key, final int seed) {
- // Exactly the same as murmurHash3X64Bit128, except it only returns
state.h1
+ private static State murmurHash3X64(byte[] key, int length, int seed) {
State state = new State();
state._h1 = 0x9368e53c2f6af274L ^ seed;
@@ -375,9 +341,10 @@ public class MurmurHashFunctions {
state._c1 = 0x87c37b91114253d5L;
state._c2 = 0x4cf5ad432745937fL;
- for (int i = 0; i < key.length / 16; i++) {
- state._k1 = getblock(key, i * 2 * 8);
- state._k2 = getblock(key, (i * 2 + 1) * 8);
+ int end = length - 15;
+ for (int i = 0; i < end; i += 16) {
+ state._k1 = getblock(key, i);
+ state._k2 = getblock(key, i + 8);
bmix(state);
}
@@ -385,9 +352,9 @@ public class MurmurHashFunctions {
state._k1 = 0;
state._k2 = 0;
- int tail = (key.length >>> 4) << 4;
+ int tail = length & 0xFFFFFFF0;
// CHECKSTYLE:OFF: checkstyle:coding
- switch (key.length & 15) {
+ switch (length & 15) {
case 15:
state._k2 ^= (long) key[tail + 14] << 48;
case 14:
@@ -422,7 +389,7 @@ public class MurmurHashFunctions {
}
// CHECKSTYLE:ON: checkstyle:coding
- state._h2 ^= key.length;
+ state._h2 ^= length;
state._h1 += state._h2;
state._h2 += state._h1;
@@ -433,18 +400,7 @@ public class MurmurHashFunctions {
state._h1 += state._h2;
state._h2 += state._h1;
- return state._h1;
- }
-
- /**
- * Hash a value using the x64 32 bit variant of murmurHash3
- *
- * @param key value to hash
- * @param seed random value
- * @return 32 bit hashed key
- */
- public static int murmurHash3X64Bit32(final byte[] key, final int seed) {
- return (int) (murmurHash3X64Bit64(key, seed) >>> 32);
+ return state;
}
private static void addByte(State state, byte b, int len) {
@@ -462,8 +418,8 @@ public class MurmurHashFunctions {
}
}
- static long getblock(byte[] key, int i) {
- return ((key[i + 0] & 0x00000000000000FFL)) | ((key[i + 1] &
0x00000000000000FFL) << 8) | (
+ private static long getblock(byte[] key, int i) {
+ return (key[i] & 0x00000000000000FFL) | ((key[i + 1] &
0x00000000000000FFL) << 8) | (
(key[i + 2] & 0x00000000000000FFL) << 16) | ((key[i + 3] &
0x00000000000000FFL) << 24) | (
(key[i + 4] & 0x00000000000000FFL) << 32) | ((key[i + 5] &
0x00000000000000FFL) << 40) | (
(key[i + 6] & 0x00000000000000FFL) << 48) | ((key[i + 7] &
0x00000000000000FFL) << 56);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]