(paimon) branch master updated: [core] fix non null predicate for blob & vector (#8177)

lzljs3620320 Mon, 08 Jun 2026 22:50:44 -0700

This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git



The following commit(s) were added to refs/heads/master by this push:
     new ade1bb8d87 [core] fix non null predicate for blob & vector (#8177)
ade1bb8d87 is described below

commit ade1bb8d87d6a4d8de09e2705b5fa4df1c62c28c
Author: Stefanietry <[email protected]>
AuthorDate: Tue Jun 9 13:50:32 2026 +0800

    [core] fix non null predicate for blob & vector (#8177)
---
 .../operation/DataEvolutionFileStoreScan.java      | 15 ++++++
 .../operation/DataEvolutionFileStoreScanTest.java  | 61 +++++++++++++++++++++-
 .../apache/paimon/spark/SparkMultimodalITCase.java |  5 ++
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
index 38ab9b0db0..b5ea80b060 100644
--- a/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
+++ b/paimon-core/src/main/java/org/apache/paimon/operation/DataEvolutionFileStoreScan.java
@@ -265,6 +265,16 @@ public class DataEvolutionFileStoreScan extends AppendOnlyFileStoreScan {
             TableSchema schema,
             Function<Long, TableSchema> scanTableSchema,
             List<ManifestEntry> metas) {
+        Set<Integer> excludedFileFieldIds =
+                metas.stream()
+                        .filter(
+                                entry ->
+                                        isBlobFile(entry.file().fileName())
+                                                || isVectorStoreFile(entry.file().fileName()))
+                        .flatMap(
+                                entry ->
+                                        computeFileFieldIds(scanTableSchema, entry.file()).stream())
+                        .collect(Collectors.toSet());
         // exclude blob and vector-store files, useless for predicate eval
         metas =
                 metas.stream()
@@ -339,6 +349,11 @@ public class DataEvolutionFileStoreScan extends AppendOnlyFileStoreScan {
         }
 
         long groupRowCount = metas.get(0).file().rowCount();
+        for (int j = 0; j < fieldsCount; j++) {
+            if (rowOffsets[j] == -1 && excludedFileFieldIds.contains(allFields[j])) {
+                rowOffsets[j] = -2;
+            }
+        }
         DataEvolutionRow finalMin = new DataEvolutionRow(metas.size(), rowOffsets, fieldOffsets);
         DataEvolutionRow finalMax = new DataEvolutionRow(metas.size(), rowOffsets, fieldOffsets);
         // For null-count specifically, a field absent from every file in the group means every
diff --git a/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java b/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
index 803e4ac255..ee10f5b6d5 100644
--- a/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
+++ b/paimon-core/src/test/java/org/apache/paimon/operation/DataEvolutionFileStoreScanTest.java
@@ -30,6 +30,8 @@ import org.apache.paimon.manifest.FileKind;
 import org.apache.paimon.manifest.FileSource;
 import org.apache.paimon.manifest.ManifestEntry;
 import org.apache.paimon.operation.DataEvolutionFileStoreScan.EvolutionStats;
+import org.apache.paimon.predicate.Predicate;
+import org.apache.paimon.predicate.PredicateBuilder;
 import org.apache.paimon.reader.DataEvolutionArray;
 import org.apache.paimon.reader.DataEvolutionRow;
 import org.apache.paimon.schema.Schema;
@@ -310,6 +312,53 @@ public class DataEvolutionFileStoreScanTest {
         assertThat(maxRow.getString(0).toString()).isEqualTo("yam");
     }
 
+    @Test
+    public void testEvolutionStatsKeepDedicatedVectorFieldAsUnknown() {
+        Schema schema = createSchema("f0", "f1", "f2");
+        TableSchema tableSchema = TableSchema.create(0L, schema);
+        schemas.put(0L, tableSchema);
+
+        ManifestEntry dataEntry =
+                createManifestEntryWithDifferentColsAndFileName(
+                        "data-file.parquet",
+                        0L,
+                        new String[] {"f0", "f1"},
+                        new String[] {"f0", "f1"},
+                        createSimpleStats(
+                                GenericRow.of(1, BinaryString.fromString("a")),
+                                GenericRow.of(3, BinaryString.fromString("c")),
+                                createBinaryArray(new int[] {0, 0}),
+                                new int[] {0, 1}));
+
+        ManifestEntry vectorEntry =
+                createManifestEntryWithDifferentColsAndFileName(
+                        "data-file.vector.avro",
+                        0L,
+                        new String[] {"f2"},
+                        new String[] {"f2"},
+                        createSimpleStats(
+                                GenericRow.of(10),
+                                GenericRow.of(30),
+                                createBinaryArray(new int[] {0}),
+                                new int[] {2}));
+
+        EvolutionStats result =
+                DataEvolutionFileStoreScan.evolutionStats(
+                        tableSchema, scanTableSchema, Arrays.asList(dataEntry, vectorEntry));
+
+        DataEvolutionArray nullCounts = (DataEvolutionArray) result.nullCounts();
+        assertThat(nullCounts.isNullAt(2)).isTrue();
+
+        Predicate predicate = new PredicateBuilder(tableSchema.logicalRowType()).isNotNull(2);
+        assertThat(
+                        predicate.test(
+                                result.rowCount(),
+                                result.minValues(),
+                                result.maxValues(),
+                                result.nullCounts()))
+                .isTrue();
+    }
+
     @Test
     public void testIntersectsRowRanges() {
         List<Range> rowRanges =
@@ -366,9 +415,19 @@ public class DataEvolutionFileStoreScanTest {
 
     private ManifestEntry createManifestEntryWithDifferentCols(
             Long schemaId, String[] writeCols, String[] valueStatsCols, SimpleStats stats) {
+        return createManifestEntryWithDifferentColsAndFileName(
+                "test-file.parquet", schemaId, writeCols, valueStatsCols, stats);
+    }
+
+    private ManifestEntry createManifestEntryWithDifferentColsAndFileName(
+            String fileName,
+            Long schemaId,
+            String[] writeCols,
+            String[] valueStatsCols,
+            SimpleStats stats) {
         DataFileMeta fileMeta =
                 DataFileMeta.create(
-                        "test-file.parquet",
+                        fileName,
                         100L,
                         100L,
                         createBinaryRow(1),
diff --git a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
index 435aa6ce85..4bd45f0079 100644
--- a/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
+++ b/paimon-spark/paimon-spark-ut/src/test/java/org/apache/paimon/spark/SparkMultimodalITCase.java
@@ -129,6 +129,11 @@ public class SparkMultimodalITCase {
                 spark.sql("select gid, sid, embs from my_db1.vector_test where date = '20260420';")
                         .collectAsList();
         assertThat(rows).hasSize(8);
+        rows =
+                spark.sql(
+                                "select gid, sid, embs from my_db1.vector_test where date = '20260420' and embs is not null;")
+                        .collectAsList();
+        assertThat(rows).hasSize(8);
         rows =
                 spark.sql(
                                 "select gid, sid,  embs from vector_search('my_db1.vector_test', 'embs', array(1.0f, 2.0f, 3.0f, 4.0f), 5)  where date = '20260420'")

(paimon) branch master updated: [core] fix non null predicate for blob & vector (#8177)

Reply via email to