This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new ade1bb8d87 [core] fix non null predicate for blob & vector (#8177)
ade1bb8d87 is described below
commit ade1bb8d87d6a4d8de09e2705b5fa4df1c62c28c
Author: Stefanietry <[email protected]>
AuthorDate: Tue Jun 9 13:50:32 2026 +0800
[core] fix non null predicate for blob & vector (#8177)
---
.../operation/DataEvolutionFileStoreScan.java | 15 ++++++
.../operation/DataEvolutionFileStoreScanTest.java | 61 +++++++++++++++++++++-
.../apache/paimon/spark/SparkMultimodalITCase.java | 5 ++
3 files changed, 80 insertions(+), 1 deletion(-)
diff --git
a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
index 38ab9b0db0..b5ea80b060 100644
---
a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
+++
b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
@@ -265,6 +265,16 @@ public class DataEvolutionFileStoreScan extends
AppendOnlyFileStoreScan {
TableSchema schema,
Function<Long, TableSchema> scanTableSchema,
List<ManifestEntry> metas) {
+ Set<Integer> excludedFileFieldIds =
+ metas.stream()
+ .filter(
+ entry ->
+ isBlobFile(entry.file().fileName())
+ ||
isVectorStoreFile(entry.file().fileName()))
+ .flatMap(
+ entry ->
+ computeFileFieldIds(scanTableSchema,
entry.file()).stream())
+ .collect(Collectors.toSet());
// exclude blob and vector-store files, useless for predicate eval
metas =
metas.stream()
@@ -339,6 +349,11 @@ public class DataEvolutionFileStoreScan extends
AppendOnlyFileStoreScan {
}
long groupRowCount = metas.get(0).file().rowCount();
+ for (int j = 0; j < fieldsCount; j++) {
+ if (rowOffsets[j] == -1 &&
excludedFileFieldIds.contains(allFields[j])) {
+ rowOffsets[j] = -2;
+ }
+ }
DataEvolutionRow finalMin = new DataEvolutionRow(metas.size(),
rowOffsets, fieldOffsets);
DataEvolutionRow finalMax = new DataEvolutionRow(metas.size(),
rowOffsets, fieldOffsets);
// For null-count specifically, a field absent from every file in the
group means every
diff --git
a/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
b/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
index 803e4ac255..ee10f5b6d5 100644
---
a/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
+++
b/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
@@ -30,6 +30,8 @@ import org.apache.paimon.manifest.FileKind;
import org.apache.paimon.manifest.FileSource;
import org.apache.paimon.manifest.ManifestEntry;
import org.apache.paimon.operation.DataEvolutionFileStoreScan.EvolutionStats;
+import org.apache.paimon.predicate.Predicate;
+import org.apache.paimon.predicate.PredicateBuilder;
import org.apache.paimon.reader.DataEvolutionArray;
import org.apache.paimon.reader.DataEvolutionRow;
import org.apache.paimon.schema.Schema;
@@ -310,6 +312,53 @@ public class DataEvolutionFileStoreScanTest {
assertThat(maxRow.getString(0).toString()).isEqualTo("yam");
}
+ @Test
+ public void testEvolutionStatsKeepDedicatedVectorFieldAsUnknown() {
+ Schema schema = createSchema("f0", "f1", "f2");
+ TableSchema tableSchema = TableSchema.create(0L, schema);
+ schemas.put(0L, tableSchema);
+
+ ManifestEntry dataEntry =
+ createManifestEntryWithDifferentColsAndFileName(
+ "data-file.parquet",
+ 0L,
+ new String[] {"f0", "f1"},
+ new String[] {"f0", "f1"},
+ createSimpleStats(
+ GenericRow.of(1, BinaryString.fromString("a")),
+ GenericRow.of(3, BinaryString.fromString("c")),
+ createBinaryArray(new int[] {0, 0}),
+ new int[] {0, 1}));
+
+ ManifestEntry vectorEntry =
+ createManifestEntryWithDifferentColsAndFileName(
+ "data-file.vector.avro",
+ 0L,
+ new String[] {"f2"},
+ new String[] {"f2"},
+ createSimpleStats(
+ GenericRow.of(10),
+ GenericRow.of(30),
+ createBinaryArray(new int[] {0}),
+ new int[] {2}));
+
+ EvolutionStats result =
+ DataEvolutionFileStoreScan.evolutionStats(
+ tableSchema, scanTableSchema, Arrays.asList(dataEntry,
vectorEntry));
+
+ DataEvolutionArray nullCounts = (DataEvolutionArray)
result.nullCounts();
+ assertThat(nullCounts.isNullAt(2)).isTrue();
+
+ Predicate predicate = new
PredicateBuilder(tableSchema.logicalRowType()).isNotNull(2);
+ assertThat(
+ predicate.test(
+ result.rowCount(),
+ result.minValues(),
+ result.maxValues(),
+ result.nullCounts()))
+ .isTrue();
+ }
+
@Test
public void testIntersectsRowRanges() {
List<Range> rowRanges =
@@ -366,9 +415,19 @@ public class DataEvolutionFileStoreScanTest {
private ManifestEntry createManifestEntryWithDifferentCols(
Long schemaId, String[] writeCols, String[] valueStatsCols,
SimpleStats stats) {
+ return createManifestEntryWithDifferentColsAndFileName(
+ "test-file.parquet", schemaId, writeCols, valueStatsCols,
stats);
+ }
+
+ private ManifestEntry createManifestEntryWithDifferentColsAndFileName(
+ String fileName,
+ Long schemaId,
+ String[] writeCols,
+ String[] valueStatsCols,
+ SimpleStats stats) {
DataFileMeta fileMeta =
DataFileMeta.create(
- "test-file.parquet",
+ fileName,
100L,
100L,
createBinaryRow(1),
diff --git
a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
index 435aa6ce85..4bd45f0079 100644
---
a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
+++
b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
@@ -129,6 +129,11 @@ public class SparkMultimodalITCase {
spark.sql("select gid, sid, embs from my_db1.vector_test where
date = '20260420';")
.collectAsList();
assertThat(rows).hasSize(8);
+ rows =
+ spark.sql(
+ "select gid, sid, embs from my_db1.vector_test
where date = '20260420' and embs is not null;")
+ .collectAsList();
+ assertThat(rows).hasSize(8);
rows =
spark.sql(
"select gid, sid, embs from
vector_search('my_db1.vector_test', 'embs', array(1.0f, 2.0f, 3.0f, 4.0f), 5)
where date = '20260420'")