Jackie-Jiang commented on code in PR #18869:
URL: https://github.com/apache/pinot/pull/18869#discussion_r3508169361
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
Review Comment:
(convention) Please use markdown style for new changes. Same for other places
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
+ * Utilities for Pinot's logical UUID type.
+ *
+ * <p>UUID values are externally represented as canonical lowercase RFC 4122
strings and internally represented as
+ * fixed-width 16-byte values.
+ */
public class UuidUtils {
+ public static final int UUID_NUM_BYTES = 16;
+ private static final byte[] NULL_UUID_BYTES = new byte[UUID_NUM_BYTES];
+
+ // Gregorian-to-Unix offset in 100-nanosecond units. Matches RFC 4122 / RFC
9562.
+ // This is the count of 100-ns intervals between 1582-10-15T00:00:00Z and
1970-01-01T00:00:00Z.
+ private static final long GREGORIAN_TO_UNIX_OFFSET_100NS =
0x01b21dd213814000L;
+
private UuidUtils() {
Review Comment:
(nit) For util class, we usually put this next to class declaration for
clarity
##########
pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java:
##########
@@ -413,11 +415,15 @@ public void setAllowTrailingZeros(boolean
allowTrailingZeros) {
}
public Object getDefaultNullValue() {
+ if (_dataType == DataType.UUID && _defaultNullValue instanceof byte[]) {
+ byte[] uuidDefaultNullValue = (byte[]) _defaultNullValue;
+ return Arrays.copyOf(uuidDefaultNullValue, uuidDefaultNullValue.length);
+ }
return _defaultNullValue;
}
public String getDefaultNullValueString() {
- return getStringValue(_defaultNullValue);
+ return _dataType != null ? _dataType.toString(_defaultNullValue) :
getStringValue(_defaultNullValue);
Review Comment:
Why do we need this check?
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
+ * Utilities for Pinot's logical UUID type.
+ *
+ * <p>UUID values are externally represented as canonical lowercase RFC 4122
strings and internally represented as
+ * fixed-width 16-byte values.
+ */
public class UuidUtils {
+ public static final int UUID_NUM_BYTES = 16;
+ private static final byte[] NULL_UUID_BYTES = new byte[UUID_NUM_BYTES];
+
+ // Gregorian-to-Unix offset in 100-nanosecond units. Matches RFC 4122 / RFC
9562.
+ // This is the count of 100-ns intervals between 1582-10-15T00:00:00Z and
1970-01-01T00:00:00Z.
+ private static final long GREGORIAN_TO_UNIX_OFFSET_100NS =
0x01b21dd213814000L;
+
private UuidUtils() {
}
- /// Converts `uuid` to its 16-byte big-endian RFC 4122 form.
+ public static byte[] nullUuidBytes() {
+ return Arrays.copyOf(NULL_UUID_BYTES, UUID_NUM_BYTES);
+ }
+
+ public static byte[] toBytes(long mostSignificantBits, long
leastSignificantBits) {
+ byte[] uuidBytes = new byte[UUID_NUM_BYTES];
+ writeLong(uuidBytes, 0, mostSignificantBits);
+ writeLong(uuidBytes, Long.BYTES, leastSignificantBits);
+ return uuidBytes;
+ }
+
public static byte[] toBytes(UUID uuid) {
- return ByteBuffer.allocate(16)
- .putLong(uuid.getMostSignificantBits())
- .putLong(uuid.getLeastSignificantBits())
- .array();
+ validateNotNull(uuid, "UUID bytes");
+ return toBytes(uuid.getMostSignificantBits(),
uuid.getLeastSignificantBits());
+ }
+
+ public static byte[] toBytes(String uuidString) {
+ UUID uuid;
+ try {
+ uuid = UUID.fromString(uuidString);
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Invalid UUID value: '" + uuidString
+ "'", e);
+ }
+ String canonical = uuid.toString();
+ if (!canonical.equalsIgnoreCase(uuidString)) {
+ throw new IllegalArgumentException(
+ "Invalid UUID value: '" + uuidString + "'. Expected RFC 4122 format:
" + canonical);
+ }
+ return toBytes(uuid);
+ }
+
+ public static byte[] toBytes(byte[] uuidBytes) {
+ validateNotNull(uuidBytes, "UUID bytes");
+ validateLength(uuidBytes);
+ return Arrays.copyOf(uuidBytes, uuidBytes.length);
Review Comment:
Why do we need this? Copying a buffer doesn't seem correct
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
+ * Utilities for Pinot's logical UUID type.
+ *
+ * <p>UUID values are externally represented as canonical lowercase RFC 4122
strings and internally represented as
+ * fixed-width 16-byte values.
+ */
public class UuidUtils {
+ public static final int UUID_NUM_BYTES = 16;
+ private static final byte[] NULL_UUID_BYTES = new byte[UUID_NUM_BYTES];
+
+ // Gregorian-to-Unix offset in 100-nanosecond units. Matches RFC 4122 / RFC
9562.
+ // This is the count of 100-ns intervals between 1582-10-15T00:00:00Z and
1970-01-01T00:00:00Z.
+ private static final long GREGORIAN_TO_UNIX_OFFSET_100NS =
0x01b21dd213814000L;
+
private UuidUtils() {
}
- /// Converts `uuid` to its 16-byte big-endian RFC 4122 form.
+ public static byte[] nullUuidBytes() {
+ return Arrays.copyOf(NULL_UUID_BYTES, UUID_NUM_BYTES);
+ }
+
+ public static byte[] toBytes(long mostSignificantBits, long
leastSignificantBits) {
+ byte[] uuidBytes = new byte[UUID_NUM_BYTES];
+ writeLong(uuidBytes, 0, mostSignificantBits);
+ writeLong(uuidBytes, Long.BYTES, leastSignificantBits);
+ return uuidBytes;
+ }
+
public static byte[] toBytes(UUID uuid) {
- return ByteBuffer.allocate(16)
- .putLong(uuid.getMostSignificantBits())
- .putLong(uuid.getLeastSignificantBits())
- .array();
+ validateNotNull(uuid, "UUID bytes");
Review Comment:
Given all methods here can be invoked on a per value basis, suggest removing
all `null` checks. Caller should ensure not passing null values
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
+ * Utilities for Pinot's logical UUID type.
+ *
+ * <p>UUID values are externally represented as canonical lowercase RFC 4122
strings and internally represented as
+ * fixed-width 16-byte values.
+ */
public class UuidUtils {
+ public static final int UUID_NUM_BYTES = 16;
+ private static final byte[] NULL_UUID_BYTES = new byte[UUID_NUM_BYTES];
+
+ // Gregorian-to-Unix offset in 100-nanosecond units. Matches RFC 4122 / RFC
9562.
+ // This is the count of 100-ns intervals between 1582-10-15T00:00:00Z and
1970-01-01T00:00:00Z.
+ private static final long GREGORIAN_TO_UNIX_OFFSET_100NS =
0x01b21dd213814000L;
+
private UuidUtils() {
}
- /// Converts `uuid` to its 16-byte big-endian RFC 4122 form.
+ public static byte[] nullUuidBytes() {
+ return Arrays.copyOf(NULL_UUID_BYTES, UUID_NUM_BYTES);
+ }
+
+ public static byte[] toBytes(long mostSignificantBits, long
leastSignificantBits) {
+ byte[] uuidBytes = new byte[UUID_NUM_BYTES];
+ writeLong(uuidBytes, 0, mostSignificantBits);
+ writeLong(uuidBytes, Long.BYTES, leastSignificantBits);
+ return uuidBytes;
+ }
+
public static byte[] toBytes(UUID uuid) {
- return ByteBuffer.allocate(16)
- .putLong(uuid.getMostSignificantBits())
- .putLong(uuid.getLeastSignificantBits())
- .array();
+ validateNotNull(uuid, "UUID bytes");
+ return toBytes(uuid.getMostSignificantBits(),
uuid.getLeastSignificantBits());
+ }
+
+ public static byte[] toBytes(String uuidString) {
+ UUID uuid;
+ try {
+ uuid = UUID.fromString(uuidString);
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Invalid UUID value: '" + uuidString
+ "'", e);
+ }
+ String canonical = uuid.toString();
+ if (!canonical.equalsIgnoreCase(uuidString)) {
+ throw new IllegalArgumentException(
+ "Invalid UUID value: '" + uuidString + "'. Expected RFC 4122 format:
" + canonical);
+ }
Review Comment:
This check could be expensive.
Consider adding an unsafe version to be used when we already know the input
string is valid
##########
pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java:
##########
@@ -413,11 +415,15 @@ public void setAllowTrailingZeros(boolean
allowTrailingZeros) {
}
public Object getDefaultNullValue() {
+ if (_dataType == DataType.UUID && _defaultNullValue instanceof byte[]) {
+ byte[] uuidDefaultNullValue = (byte[]) _defaultNullValue;
+ return Arrays.copyOf(uuidDefaultNullValue, uuidDefaultNullValue.length);
+ }
Review Comment:
No need to copy here. We never change default value
##########
pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java:
##########
@@ -750,7 +774,10 @@ public enum DataType {
MAP(false, false),
OPEN_STRUCT(false, false),
LIST(false, false),
- UNKNOWN(false, true);
+ UNKNOWN(false, true),
+ // UUID must remain at the end to avoid shifting ordinals of existing
types.
+ // AnyValueAggregationFunction serializes DataType.ordinal() into
intermediate results.
Review Comment:
This is quite bad design.. We should consider making it self-contained
within `AnyValueAggregationFunction` as a separate PR.
We can still put `UUID` after `BYTES` given all the supported types in
`AnyValueAggregationFunction` are before `BYTES`
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
+ * Utilities for Pinot's logical UUID type.
+ *
+ * <p>UUID values are externally represented as canonical lowercase RFC 4122
strings and internally represented as
+ * fixed-width 16-byte values.
+ */
public class UuidUtils {
+ public static final int UUID_NUM_BYTES = 16;
+ private static final byte[] NULL_UUID_BYTES = new byte[UUID_NUM_BYTES];
+
+ // Gregorian-to-Unix offset in 100-nanosecond units. Matches RFC 4122 / RFC
9562.
+ // This is the count of 100-ns intervals between 1582-10-15T00:00:00Z and
1970-01-01T00:00:00Z.
+ private static final long GREGORIAN_TO_UNIX_OFFSET_100NS =
0x01b21dd213814000L;
+
private UuidUtils() {
}
- /// Converts `uuid` to its 16-byte big-endian RFC 4122 form.
+ public static byte[] nullUuidBytes() {
+ return Arrays.copyOf(NULL_UUID_BYTES, UUID_NUM_BYTES);
+ }
+
+ public static byte[] toBytes(long mostSignificantBits, long
leastSignificantBits) {
+ byte[] uuidBytes = new byte[UUID_NUM_BYTES];
+ writeLong(uuidBytes, 0, mostSignificantBits);
+ writeLong(uuidBytes, Long.BYTES, leastSignificantBits);
+ return uuidBytes;
+ }
+
public static byte[] toBytes(UUID uuid) {
- return ByteBuffer.allocate(16)
- .putLong(uuid.getMostSignificantBits())
- .putLong(uuid.getLeastSignificantBits())
- .array();
+ validateNotNull(uuid, "UUID bytes");
+ return toBytes(uuid.getMostSignificantBits(),
uuid.getLeastSignificantBits());
+ }
+
+ public static byte[] toBytes(String uuidString) {
+ UUID uuid;
+ try {
+ uuid = UUID.fromString(uuidString);
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Invalid UUID value: '" + uuidString
+ "'", e);
+ }
+ String canonical = uuid.toString();
+ if (!canonical.equalsIgnoreCase(uuidString)) {
+ throw new IllegalArgumentException(
+ "Invalid UUID value: '" + uuidString + "'. Expected RFC 4122 format:
" + canonical);
+ }
+ return toBytes(uuid);
+ }
+
+ public static byte[] toBytes(byte[] uuidBytes) {
+ validateNotNull(uuidBytes, "UUID bytes");
+ validateLength(uuidBytes);
Review Comment:
Same for length validation. If the input is known to be valid, we should
skip it. This can be hotspot
##########
pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java:
##########
@@ -459,13 +465,24 @@ private void setDefaultNullValue(@Nullable JsonNode
defaultNullValue) {
@JsonIgnore
public void setDefaultNullValue(@Nullable Object defaultNullValue) {
if (defaultNullValue != null) {
- _stringDefaultNullValue = getStringValue(defaultNullValue);
+ if (defaultNullValue instanceof String) {
+ _stringDefaultNullValue = (String) defaultNullValue;
+ } else {
+ _stringDefaultNullValue = getStringDefaultNullValue(defaultNullValue);
+ }
}
if (_dataType != null) {
_defaultNullValue = getDefaultNullValue(getFieldType(), _dataType,
_stringDefaultNullValue);
}
}
+ private String getStringDefaultNullValue(Object defaultNullValue) {
+ if (_dataType == DataType.UUID) {
Review Comment:
Why specializing UUID?
##########
pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java:
##########
@@ -459,13 +465,24 @@ private void setDefaultNullValue(@Nullable JsonNode
defaultNullValue) {
@JsonIgnore
public void setDefaultNullValue(@Nullable Object defaultNullValue) {
if (defaultNullValue != null) {
- _stringDefaultNullValue = getStringValue(defaultNullValue);
+ if (defaultNullValue instanceof String) {
Review Comment:
Is this necessary?
##########
pinot-spi/src/main/java/org/apache/pinot/spi/data/FieldSpec.java:
##########
@@ -844,6 +878,8 @@ public Object convert(String value) {
case STRING:
case JSON:
return value;
+ case UUID:
Review Comment:
Keep `UUID` after `BYTES` - its stored type. Same for other places
##########
pinot-spi/src/main/java/org/apache/pinot/spi/utils/UuidUtils.java:
##########
@@ -18,32 +18,507 @@
*/
package org.apache.pinot.spi.utils;
-import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
-/// Utilities for the 16-byte big-endian binary form of [UUID] (RFC 4122).
Bytes 0..7 are the
-/// most-significant 64 bits, bytes 8..15 are the least-significant 64 bits.
+/**
+ * Utilities for Pinot's logical UUID type.
+ *
+ * <p>UUID values are externally represented as canonical lowercase RFC 4122
strings and internally represented as
+ * fixed-width 16-byte values.
+ */
public class UuidUtils {
+ public static final int UUID_NUM_BYTES = 16;
+ private static final byte[] NULL_UUID_BYTES = new byte[UUID_NUM_BYTES];
+
+ // Gregorian-to-Unix offset in 100-nanosecond units. Matches RFC 4122 / RFC
9562.
+ // This is the count of 100-ns intervals between 1582-10-15T00:00:00Z and
1970-01-01T00:00:00Z.
+ private static final long GREGORIAN_TO_UNIX_OFFSET_100NS =
0x01b21dd213814000L;
+
private UuidUtils() {
}
- /// Converts `uuid` to its 16-byte big-endian RFC 4122 form.
+ public static byte[] nullUuidBytes() {
+ return Arrays.copyOf(NULL_UUID_BYTES, UUID_NUM_BYTES);
+ }
+
+ public static byte[] toBytes(long mostSignificantBits, long
leastSignificantBits) {
+ byte[] uuidBytes = new byte[UUID_NUM_BYTES];
+ writeLong(uuidBytes, 0, mostSignificantBits);
+ writeLong(uuidBytes, Long.BYTES, leastSignificantBits);
+ return uuidBytes;
+ }
+
public static byte[] toBytes(UUID uuid) {
- return ByteBuffer.allocate(16)
- .putLong(uuid.getMostSignificantBits())
- .putLong(uuid.getLeastSignificantBits())
- .array();
+ validateNotNull(uuid, "UUID bytes");
+ return toBytes(uuid.getMostSignificantBits(),
uuid.getLeastSignificantBits());
+ }
+
+ public static byte[] toBytes(String uuidString) {
+ UUID uuid;
+ try {
+ uuid = UUID.fromString(uuidString);
+ } catch (Exception e) {
+ throw new IllegalArgumentException("Invalid UUID value: '" + uuidString
+ "'", e);
+ }
+ String canonical = uuid.toString();
+ if (!canonical.equalsIgnoreCase(uuidString)) {
+ throw new IllegalArgumentException(
+ "Invalid UUID value: '" + uuidString + "'. Expected RFC 4122 format:
" + canonical);
+ }
+ return toBytes(uuid);
+ }
+
+ public static byte[] toBytes(byte[] uuidBytes) {
+ validateNotNull(uuidBytes, "UUID bytes");
+ validateLength(uuidBytes);
+ return Arrays.copyOf(uuidBytes, uuidBytes.length);
+ }
+
+ public static byte[] toBytes(ByteArray uuidBytes) {
+ validateNotNull(uuidBytes, "UUID bytes");
+ return toBytes(uuidBytes.getBytes());
+ }
+
+ public static byte[] toBytes(Object value) {
+ validateNotNull(value, "UUID bytes");
+ if (value instanceof UUID) {
+ return toBytes((UUID) value);
+ }
+ if (value instanceof byte[]) {
+ return toBytes((byte[]) value);
+ }
+ if (value instanceof ByteArray) {
+ return toBytes((ByteArray) value);
+ }
+ if (value instanceof CharSequence) {
+ return toBytes(value.toString());
+ }
+ throw new IllegalArgumentException(
+ "Cannot convert value: '" + value + "' to UUID bytes, unsupported
type: " + value.getClass());
+ }
+
+ public static UUID toUUID(byte[] uuidBytes) {
+ validateNotNull(uuidBytes, "UUID");
+ validateLength(uuidBytes);
+ return toUUID(getMostSignificantBitsInternal(uuidBytes),
getLeastSignificantBitsInternal(uuidBytes));
+ }
+
+ public static UUID toUUID(ByteArray uuidBytes) {
+ validateNotNull(uuidBytes, "UUID");
+ return toUUID(uuidBytes.getBytes());
+ }
+
+ public static UUID toUUID(String uuidString) {
+ return toUUID(toBytes(uuidString));
+ }
+
+ public static UUID toUUID(long mostSignificantBits, long
leastSignificantBits) {
+ return new UUID(mostSignificantBits, leastSignificantBits);
+ }
+
+ public static UUID toUUID(Object value) {
+ validateNotNull(value, "UUID");
+ if (value instanceof UUID) {
+ return (UUID) value;
+ }
+ if (value instanceof byte[]) {
+ return toUUID((byte[]) value);
+ }
+ if (value instanceof ByteArray) {
+ return toUUID((ByteArray) value);
+ }
+ if (value instanceof CharSequence) {
+ return toUUID(value.toString());
+ }
+ throw new IllegalArgumentException(
+ "Cannot convert value: '" + value + "' to UUID, unsupported type: " +
value.getClass());
+ }
+
+ public static String toString(byte[] uuidBytes) {
+ return toUUID(uuidBytes).toString();
+ }
+
+ public static String toString(ByteArray uuidBytes) {
+ return toString(uuidBytes.getBytes());
+ }
+
+ public static String toString(UUID uuid) {
+ return uuid.toString();
+ }
+
+ public static String toString(long mostSignificantBits, long
leastSignificantBits) {
+ return toUUID(mostSignificantBits, leastSignificantBits).toString();
+ }
+
+ /**
+ * Immutable UUID key optimized for Pinot execution hot paths.
+ *
+ * <p>The key stores UUID values as two primitive longs to avoid repeated
{@code ByteArray} allocation and
+ * byte-by-byte equality/hashing while preserving Pinot's canonical 16-byte
representation at the edges.
+ */
+ public static final class UuidKey implements Comparable<UuidKey> {
Review Comment:
Move this to a dedicate class
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]