rdblue commented on a change in pull request #2062: URL: https://github.com/apache/iceberg/pull/2062#discussion_r567177688
########## File path: parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java ########## @@ -472,6 +472,50 @@ public Boolean or(Boolean leftResult, Boolean rightResult) { return ROWS_MIGHT_MATCH; } + @Override + @SuppressWarnings("unchecked") + public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) { + int id = ref.fieldId(); + + Long valueCount = valueCounts.get(id); + if (valueCount == null) { + // the column is not present and is all nulls + return ROWS_CANNOT_MATCH; + } + + Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id); + if (colStats != null && !colStats.isEmpty()) { + if (hasNonNullButNoMinMax(colStats, valueCount)) { + return ROWS_MIGHT_MATCH; + } + + if (!colStats.hasNonNullValue()) { + return ROWS_CANNOT_MATCH; + } + + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + + Comparator<ByteBuffer> comparator = Comparators.unsignedBytes(); + + Binary lower = colStats.genericGetMin(); + // truncate lower bound so that its length in bytes is not greater than the length of prefix + int lowerLength = Math.min(prefixAsBytes.remaining(), lower.length()); + int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes); + if (lowerCmp == 0) { + Binary upper = colStats.genericGetMax(); + // truncate upper bound so that its length in bytes is not greater than the length of prefix + int upperLength = Math.min(prefixAsBytes.remaining(), upper.length()); + ByteBuffer upperByteBuffer = upper.toByteBuffer(); + int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upperByteBuffer, upperLength), prefixAsBytes); + if (upperCmp == 0) { + return ROWS_CANNOT_MATCH; + } + } + } Review comment: I think the logic here should also be updated to match what I suggested above for the lower = upper = prefix case. 
########## File path: parquet/src/main/java/org/apache/iceberg/parquet/ParquetMetricsRowGroupFilter.java ########## @@ -472,6 +472,50 @@ public Boolean or(Boolean leftResult, Boolean rightResult) { return ROWS_MIGHT_MATCH; } + @Override + @SuppressWarnings("unchecked") + public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) { + int id = ref.fieldId(); + + Long valueCount = valueCounts.get(id); + if (valueCount == null) { + // the column is not present and is all nulls + return ROWS_CANNOT_MATCH; + } + + Statistics<Binary> colStats = (Statistics<Binary>) stats.get(id); + if (colStats != null && !colStats.isEmpty()) { + if (hasNonNullButNoMinMax(colStats, valueCount)) { + return ROWS_MIGHT_MATCH; + } + + if (!colStats.hasNonNullValue()) { + return ROWS_CANNOT_MATCH; + } + + ByteBuffer prefixAsBytes = lit.toByteBuffer(); + + Comparator<ByteBuffer> comparator = Comparators.unsignedBytes(); + + Binary lower = colStats.genericGetMin(); + // truncate lower bound so that its length in bytes is not greater than the length of prefix + int lowerLength = Math.min(prefixAsBytes.remaining(), lower.length()); + int lowerCmp = comparator.compare(BinaryUtil.truncateBinary(lower.toByteBuffer(), lowerLength), prefixAsBytes); + if (lowerCmp == 0) { + Binary upper = colStats.genericGetMax(); + // truncate upper bound so that its length in bytes is not greater than the length of prefix + int upperLength = Math.min(prefixAsBytes.remaining(), upper.length()); + ByteBuffer upperByteBuffer = upper.toByteBuffer(); + int upperCmp = comparator.compare(BinaryUtil.truncateBinary(upperByteBuffer, upperLength), prefixAsBytes); + if (upperCmp == 0) { + return ROWS_CANNOT_MATCH; + } + } + } Review comment: I think the logic here should also be updated to match what I suggested above. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org