nastra commented on code in PR #14500:
URL: https://github.com/apache/iceberg/pull/14500#discussion_r2740423040
##########
api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java:
##########
@@ -684,4 +686,154 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats",
INT_MAX_VALUE))
new StrictMetricsEvaluator(SCHEMA,
notNull("struct.nested_col_with_stats")).eval(FILE);
assertThat(shouldRead).as("notNull nested column should not
match").isFalse();
}
+
+ // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison
only
+
+ private static final UUID UUID_MIN =
UUID.fromString("00000000-0000-0000-0000-000000000001");
+ private static final UUID UUID_MAX =
UUID.fromString("80000000-0000-0000-0000-000000000001");
Review Comment:
we typically define all of the fields at the top of the file before defining
any methods. can you please make sure to move all fields further to the top?
##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java:
##########
@@ -1350,4 +1351,105 @@ private void assertEquals(UnboundTransform<?, ?>
expected, UnboundTransform<?, ?
.hasToString(expected.transform().toString());
assertEquals(expected.ref(), actual.ref());
}
+
+ // Tests for UUID bounds predicate detection and transformation
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidLt() {
+ Expression original = Expressions.lessThan("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID lt predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidEq() {
+ Expression original = Expressions.equal("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID eq predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidIn() {
+ Expression original = Expressions.in("uuid_col", UUID.randomUUID(),
UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID in predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoDetectionForNonUuid() {
+ Expression original = Expressions.equal("id", 42L);
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should not detect non-UUID predicate")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoDetectionForIsNull() {
+ Expression original = Expressions.isNull("uuid_col");
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should not detect UUID isNull predicate (doesn't compare against
bounds)")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsInCompoundExpression() {
+ Expression original =
+ Expressions.and(
+ Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col",
UUID.randomUUID()));
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID predicate in compound expression")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsLiteral() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original = Expressions.equal("uuid_col", testUuid);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull();
+ assertThat(result).isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate<?> pred = (UnboundPredicate<?>) result;
+ assertThat(pred.literal().value()).isEqualTo(testUuid);
+ // The literal should now use the signed comparator
+ assertThat(pred.literal()).isInstanceOf(Literals.UUIDLiteral.class);
Review Comment:
please make sure to add such a check to the other test cases in this class
as well
##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java:
##########
@@ -1350,4 +1351,105 @@ private void assertEquals(UnboundTransform<?, ?>
expected, UnboundTransform<?, ?
.hasToString(expected.transform().toString());
assertEquals(expected.ref(), actual.ref());
}
+
+ // Tests for UUID bounds predicate detection and transformation
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidLt() {
+ Expression original = Expressions.lessThan("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID lt predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidEq() {
+ Expression original = Expressions.equal("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID eq predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidIn() {
+ Expression original = Expressions.in("uuid_col", UUID.randomUUID(),
UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID in predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoDetectionForNonUuid() {
+ Expression original = Expressions.equal("id", 42L);
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should not detect non-UUID predicate")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoDetectionForIsNull() {
+ Expression original = Expressions.isNull("uuid_col");
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should not detect UUID isNull predicate (doesn't compare against
bounds)")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsInCompoundExpression() {
+ Expression original =
+ Expressions.and(
+ Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col",
UUID.randomUUID()));
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID predicate in compound expression")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsLiteral() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original = Expressions.equal("uuid_col", testUuid);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull();
+ assertThat(result).isInstanceOf(UnboundPredicate.class);
Review Comment:
nit: you can combine calling `.isNotNull().isInstanceOf(...)` everywhere
##########
api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java:
##########
@@ -64,7 +68,15 @@ public static ManifestEvaluator forPartitionFilter(
}
private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter,
boolean caseSensitive) {
- this.expr = Binder.bind(spec.partitionType(), rewriteNot(partitionFilter),
caseSensitive);
+ Types.StructType partitionType = spec.partitionType();
+ Expression rewritten = rewriteNot(partitionFilter);
+ this.expr = Binder.bind(partitionType, rewritten, caseSensitive);
+
+ // Create the signed UUID expression if and only if there are UUID
predicates
+ // that compare against bounds.
+ Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten);
Review Comment:
what if we would just return the original expression in
`toSignedUUIDLiteral` instead of null? I think that way we would be really just
"rewriting" the expression and then we wouldn't need to deal with null handling
on the calling sites
##########
api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java:
##########
@@ -684,4 +686,154 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats",
INT_MAX_VALUE))
new StrictMetricsEvaluator(SCHEMA,
notNull("struct.nested_col_with_stats")).eval(FILE);
assertThat(shouldRead).as("notNull nested column should not
match").isFalse();
}
+
+ // Tests for UUID with StrictMetricsEvaluator using RFC-compliant comparison
only
+
+ private static final UUID UUID_MIN =
UUID.fromString("00000000-0000-0000-0000-000000000001");
+ private static final UUID UUID_MAX =
UUID.fromString("80000000-0000-0000-0000-000000000001");
+
+ private static final DataFile UUID_FILE =
+ new TestDataFile(
+ "uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(18, 50L),
+ ImmutableMap.of(18, 0L),
+ null,
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MIN)),
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), UUID_MAX)));
+
+ @Test
+ public void testStrictUuidGt() {
+ // Query: uuid > 0x00... (all UUIDs in file should be > this)
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, greaterThan("uuid",
belowMin)).eval(UUID_FILE);
+ // With bounds [UUID_MIN, UUID_MAX], all values should be > belowMin
+ assertThat(allMatch).as("All UUIDs should be greater than
belowMin").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidLt() {
+ // Query: uuid < 0xFF... (all UUIDs in file should be < this)
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, lessThan("uuid",
aboveMax)).eval(UUID_FILE);
+ // With bounds [UUID_MIN, UUID_MAX], all values should be < aboveMax
+ assertThat(allMatch).as("All UUIDs should be less than aboveMax").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidEqNeverMatchesRange() {
+ // Strict eq should never match when there's a range of values
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ boolean allMatch = new StrictMetricsEvaluator(SCHEMA, equal("uuid",
middle)).eval(UUID_FILE);
+ assertThat(allMatch).as("Strict eq should not match range").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidInNeverMatchesRange() {
+ // Strict IN should never match when there's a range of values (lower !=
upper)
+ UUID middle1 = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ UUID middle2 = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid", middle1,
middle2)).eval(UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match range").isFalse();
+ }
+
+ // File with single UUID value (lower == upper)
+ private static final UUID SINGLE_UUID =
UUID.fromString("40000000-0000-0000-0000-000000000001");
+ private static final DataFile SINGLE_UUID_FILE =
+ new TestDataFile(
+ "single_uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(18, 50L),
+ ImmutableMap.of(18, 0L),
+ null,
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(), SINGLE_UUID)),
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(),
SINGLE_UUID)));
+
+ @Test
+ public void testStrictUuidInMatchesSingleValue() {
+ // Strict IN should match when lower == upper and the value is in the set
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid",
SINGLE_UUID)).eval(SINGLE_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should match single value in
set").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidInDoesNotMatchWhenValueNotInSet() {
+ // Strict IN should not match when lower == upper but the value is not in
the set
+ UUID otherUuid = UUID.fromString("50000000-0000-0000-0000-000000000001");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, in("uuid",
otherUuid)).eval(SINGLE_UUID_FILE);
+ assertThat(allMatch).as("Strict IN should not match when value not in
set").isFalse();
+ }
+
+ @Test
+ public void testStrictUuidNotInMatchesWhenAllValuesOutsideBounds() {
+ // Strict NOT IN should match when all values in the set are outside the
file's bounds
+ UUID belowMin = UUID.fromString("00000000-0000-0000-0000-000000000000");
+ UUID aboveMax = UUID.fromString("ffffffff-ffff-ffff-ffff-ffffffffffff");
+ boolean allMatch =
+ new StrictMetricsEvaluator(SCHEMA, notIn("uuid", belowMin,
aboveMax)).eval(UUID_FILE);
+ // All values in the set are outside [UUID_MIN, UUID_MAX], so all rows
match NOT IN
+ assertThat(allMatch).as("Strict NOT IN should match when all values
outside bounds").isTrue();
+ }
+
+ @Test
+ public void testStrictUuidNotInDoesNotMatchWhenValueInBounds() {
+ // Strict NOT IN should not match when a value in the set is within bounds
+ UUID middle = UUID.fromString("40000000-0000-0000-0000-000000000001");
+ boolean allMatch = new StrictMetricsEvaluator(SCHEMA, notIn("uuid",
middle)).eval(UUID_FILE);
+ // middle is within [UUID_MIN, UUID_MAX], so some rows might match the
value
+ assertThat(allMatch).as("Strict NOT IN should not match when value in
bounds").isFalse();
+ }
+
+ // Tests for file with inverted UUID bounds (as would be written by legacy
signed comparator)
+ // In RFC unsigned order: 0x40... < 0x80..., so lower > upper when
interpreted with RFC
+ private static final UUID LEGACY_UUID_LOWER =
+ UUID.fromString("80000000-0000-0000-0000-000000000001");
+ private static final UUID LEGACY_UUID_UPPER =
+ UUID.fromString("40000000-0000-0000-0000-000000000001");
+
+ private static final DataFile LEGACY_UUID_FILE =
+ new TestDataFile(
+ "legacy_uuid_file.avro",
+ Row.of(),
+ 50,
+ ImmutableMap.of(18, 50L),
+ ImmutableMap.of(18, 0L),
+ null,
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(),
LEGACY_UUID_LOWER)),
+ ImmutableMap.of(18, toByteBuffer(Types.UUIDType.get(),
LEGACY_UUID_UPPER)));
+
+ @Test
+ public void testStrictUuidInWithInvertedBounds() {
Review Comment:
you might want to add `legacy` to the naming of these tests to make it
clearer
##########
data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java:
##########
@@ -1138,6 +1138,150 @@ public void testUUID() {
.isTrue();
}
+ // UUIDs for testing dual-comparator behavior with high-bit values
+ // These UUIDs span the signed/unsigned comparison boundary
+ private static final UUID UUID_LOW =
UUID.fromString("00000000-0000-0000-0000-000000000001");
Review Comment:
same as in the other tests, please move such fields further to the top
##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java:
##########
@@ -1350,4 +1351,105 @@ private void assertEquals(UnboundTransform<?, ?>
expected, UnboundTransform<?, ?
.hasToString(expected.transform().toString());
assertEquals(expected.ref(), actual.ref());
}
+
+ // Tests for UUID bounds predicate detection and transformation
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidLt() {
+ Expression original = Expressions.lessThan("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID lt predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidEq() {
+ Expression original = Expressions.equal("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID eq predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidIn() {
+ Expression original = Expressions.in("uuid_col", UUID.randomUUID(),
UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID in predicate")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoDetectionForNonUuid() {
+ Expression original = Expressions.equal("id", 42L);
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should not detect non-UUID predicate")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralNoDetectionForIsNull() {
+ Expression original = Expressions.isNull("uuid_col");
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should not detect UUID isNull predicate (doesn't compare against
bounds)")
+ .isNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsInCompoundExpression() {
+ Expression original =
+ Expressions.and(
+ Expressions.equal("id", 42L), Expressions.greaterThan("uuid_col",
UUID.randomUUID()));
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID predicate in compound expression")
+ .isNotNull();
+ }
+
+ @Test
+ public void testToSignedUUIDLiteralTransformsLiteral() {
+ UUID testUuid = UUID.randomUUID();
+ Expression original = Expressions.equal("uuid_col", testUuid);
+
+ Expression result = ExpressionUtil.toSignedUUIDLiteral(original);
+ assertThat(result).isNotNull();
+ assertThat(result).isInstanceOf(UnboundPredicate.class);
+ UnboundPredicate<?> pred = (UnboundPredicate<?>) result;
+ assertThat(pred.literal().value()).isEqualTo(testUuid);
+ // The literal should now use the signed comparator
+ assertThat(pred.literal()).isInstanceOf(Literals.UUIDLiteral.class);
Review Comment:
we should make sure here that the signed comparator is actually being used
via
```
assertThat(pred.literal())
.isInstanceOf(Literals.UUIDLiteral.class)
.asInstanceOf(InstanceOfAssertFactories.type(Literals.UUIDLiteral.class))
.extracting("useSignedComparator")
.isEqualTo(true);
```
##########
api/src/main/java/org/apache/iceberg/expressions/ManifestEvaluator.java:
##########
@@ -64,7 +68,15 @@ public static ManifestEvaluator forPartitionFilter(
}
private ManifestEvaluator(PartitionSpec spec, Expression partitionFilter,
boolean caseSensitive) {
- this.expr = Binder.bind(spec.partitionType(), rewriteNot(partitionFilter),
caseSensitive);
+ Types.StructType partitionType = spec.partitionType();
+ Expression rewritten = rewriteNot(partitionFilter);
+ this.expr = Binder.bind(partitionType, rewritten, caseSensitive);
+
+ // Create the signed UUID expression if and only if there are UUID
predicates
+ // that compare against bounds.
+ Expression transformed = ExpressionUtil.toSignedUUIDLiteral(rewritten);
Review Comment:
or I guess we actually need the info that we rewrote a UUID here in the
evaluator?
##########
api/src/test/java/org/apache/iceberg/expressions/TestExpressionUtil.java:
##########
@@ -1350,4 +1351,105 @@ private void assertEquals(UnboundTransform<?, ?>
expected, UnboundTransform<?, ?
.hasToString(expected.transform().toString());
assertEquals(expected.ref(), actual.ref());
}
+
+ // Tests for UUID bounds predicate detection and transformation
+
+ @Test
+ public void testToSignedUUIDLiteralDetectsUuidLt() {
+ Expression original = Expressions.lessThan("uuid_col", UUID.randomUUID());
+
+ assertThat(ExpressionUtil.toSignedUUIDLiteral(original))
+ .as("Should detect UUID lt predicate")
+ .isNotNull();
Review Comment:
I feel like these can be just combined with the other tests where we end up
checking that the literal actually uses the correct comparator. We should still
make sure that we cover all eq/gt/lt/... cases though
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]