hudi-agent commented on code in PR #18535:
URL: https://github.com/apache/hudi/pull/18535#discussion_r3241073820
##########
hudi-trino-plugin/src/main/java/io/trino/plugin/hudi/stats/TableMetadataReader.java:
##########
@@ -55,60 +48,29 @@ public class TableMetadataReader
 * @return a map from column name to their corresponding {@link HoodieColumnRangeMetadata}
 * @throws HoodieMetadataException if an error occurs while fetching the column statistics
 */
-    Map<String, HoodieColumnRangeMetadata> getColumnStats(List<Pair<String, String>> partitionNameFileNameList, List<String> columnNames)
+    public Map<String, HoodieColumnRangeMetadata> getColumnsRange(List<Pair<String, String>> partitionNameFileNameList, List<String> columnNames)
             throws HoodieMetadataException
     {
-        return computeFileToColumnStatsMap(computeColumnStatsLookupKeys(partitionNameFileNameList, columnNames));
-    }
-
-
-    /**
-     * @param partitionNameFileNameList a list of partition and file name pairs for which column stats need to be retrieved
-     * @param columnNames list of column names for which stats are needed
-     * @return a list of column stats keys to look up in the metadata table col_stats partition.
-     */
-    private List<String> computeColumnStatsLookupKeys(
-            final List<Pair<String, String>> partitionNameFileNameList,
-            final List<String> columnNames)
-    {
-        return columnNames.stream()
-                .flatMap(columnName -> partitionNameFileNameList.stream()
-                        .map(partitionNameFileNamePair -> HoodieMetadataPayload.getColumnStatsIndexKey(
-                                new PartitionIndexID(HoodieTableMetadataUtil.getColumnStatsIndexPartitionIdentifier(partitionNameFileNamePair.getLeft())),
-                                new FileIndexID(partitionNameFileNamePair.getRight()),
-                                new ColumnIndexID(columnName))))
-                .toList();
-    }
-
-
-    /**
-     * @param columnStatsLookupKeys a map from column stats key to partition and file name pair
-     * @return a map from column name to merged HoodieMetadataColumnStats
-     */
-    private Map<String, HoodieColumnRangeMetadata> computeFileToColumnStatsMap(List<String> columnStatsLookupKeys)
-    {
-        HoodieTimer timer = HoodieTimer.start();
-        Map<String, HoodieRecord<HoodieMetadataPayload>> hoodieRecords =
-                getRecordsByKeys(columnStatsLookupKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath());
-        metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer()));
-        return hoodieRecords.values().stream()
-                .collect(Collectors.groupingBy(
-                        r -> r.getData().getColumnStatMetadata().get().getColumnName(),
-                        Collectors.mapping(r -> r.getData().getColumnStatMetadata().get(), Collectors.toList())))
+        Map<Pair<String, String>, List<HoodieMetadataColumnStats>> columnStatsMap = getColumnStats(partitionNameFileNameList, columnNames);
+        return columnStatsMap.values().stream().flatMap(Collection::stream).collect(Collectors.groupingBy(
+                HoodieMetadataColumnStats::getColumnName,
+                Collectors.mapping(colStats -> colStats, Collectors.toList())))
Review Comment:
🤖 nit: `Collectors.mapping(colStats -> colStats, Collectors.toList())` is
equivalent to just `Collectors.toList()` as a downstream collector for
`groupingBy` — could you simplify it?
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
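The equivalence behind this nit can be checked with a small standalone example (plain strings stand in for `HoodieMetadataColumnStats` purely for illustration):

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupingByDownstream {
    public static void main(String[] args) {
        List<String> names = List.of("a", "b", "a", "c");

        // Downstream collector with a redundant identity mapping step.
        Map<String, List<String>> withMapping = names.stream()
                .collect(Collectors.groupingBy(
                        n -> n,
                        Collectors.mapping(n -> n, Collectors.toList())));

        // Same result with the mapping step dropped.
        Map<String, List<String>> plain = names.stream()
                .collect(Collectors.groupingBy(n -> n, Collectors.toList()));

        System.out.println(withMapping.equals(plain)); // prints "true"
    }
}
```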
##########
hudi-trino-plugin/src/test/java/io/trino/plugin/hudi/util/FileOperationAssertions.java:
##########
@@ -0,0 +1,143 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.plugin.hudi.util;
+
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
+import io.airlift.log.Logger;
+import io.trino.plugin.hudi.util.FileOperationUtils.FileOperation;
+import io.trino.testing.QueryRunner;
+import org.intellij.lang.annotations.Language;
+
+import java.util.Comparator;
+
+import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getCacheOperationSpans;
+import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getFileLocation;
+import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.isTrinoSchemaOrPermissions;
+import static io.trino.testing.MultisetAssertions.assertMultisetsEqual;
+import static java.util.stream.Collectors.toCollection;
+
+public final class FileOperationAssertions
+{
+    private static final Logger log = Logger.get(FileOperationAssertions.class);
+
+ private FileOperationAssertions() {}
+
+ /**
+ * Asserts that file system accesses match expected operations.
+ * This version uses manual filtering for Input/InputFile operations.
+ * Logs detailed comparison at WARN level for debugging test failures.
+ */
+ public static void assertFileSystemAccesses(
+ QueryRunner queryRunner,
+ @Language("SQL") String query,
+ Multiset<FileOperation> expectedCacheAccesses)
+ throws InterruptedException
+ {
+ queryRunner.executeWithPlan(queryRunner.getDefaultSession(), query);
+ // Allow time for table stats computation to finish before validation.
+ Thread.sleep(1000L);
+        Multiset<FileOperation> actualCacheAccesses = getFileOperations(queryRunner);
+        printFileAccessDebugInfo(queryRunner, actualCacheAccesses, expectedCacheAccesses);
+ assertMultisetsEqual(actualCacheAccesses, expectedCacheAccesses);
+ }
Review Comment:
🤖 nit: `printFileAccessDebugInfo` is called unconditionally, so every test
run emits several WARN-level blocks even when the assertion passes — have you
considered only calling it (or logging at DEBUG) when the multisets actually
differ?
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
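One possible shape for the suggested change, as a sketch only — the real code uses Guava `Multiset` and Airlift's `Logger`, which are replaced here with plain count maps and `System.err` so the example is self-contained:

```java
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public class ConditionalDebugLogging {
    // Count occurrences of each operation, mimicking a multiset.
    static Map<String, Long> counts(List<String> ops) {
        return ops.stream()
                .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
    }

    static void assertFileOperations(List<String> actual, List<String> expected) {
        Map<String, Long> actualCounts = counts(actual);
        Map<String, Long> expectedCounts = counts(expected);
        // Emit the verbose diagnostic block only when the assertion is about to fail,
        // keeping passing test runs quiet.
        if (!actualCounts.equals(expectedCounts)) {
            System.err.println("expected: " + expectedCounts);
            System.err.println("actual:   " + actualCounts);
            throw new AssertionError("file operations differ");
        }
    }

    public static void main(String[] args) {
        // Same operations in a different order: passes silently, no debug output.
        assertFileOperations(
                List.of("InputFile.length", "Input.readFully"),
                List.of("Input.readFully", "InputFile.length"));
        System.out.println("ok");
    }
}
```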
##########
hudi-trino-plugin/src/main/java/io/trino/plugin/hudi/HudiConfig.java:
##########
@@ -55,7 +55,7 @@ public class HudiConfig
private boolean queryPartitionFilterRequired;
private boolean ignoreAbsentPartitions;
private Duration dynamicFilteringWaitTimeout = new Duration(1, SECONDS);
- private boolean resolveColumnNameCasingEnabled = true;
+ private boolean resolveColumnNameCasingEnabled;
Review Comment:
🤖 Was this default flip from `true` to `false` intentional? It's a silent
behavior change — `resolveColumnNameCasingEnabled` gates schema reconciliation
(`HudiMetadata` eagerly loading the latest Hudi schema) and case-insensitive
predicate-pushdown key mapping (`IndexSupportFactory.transformTupleDomain`). It
also breaks `TestHudiConfig.testDefaults` which still asserts
`setResolveColumnNameCasingEnabled(true)`. If the flip is intentional, the test
needs updating; if not, the `= true` initializer should be restored.
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
##########
hudi-trino-plugin/src/main/java/io/trino/plugin/hudi/HudiUtil.java:
##########
@@ -397,4 +412,62 @@ public static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, Stri
throw new TrinoException(HUDI_FILESYSTEM_ERROR, e);
}
}
+
+    public static List<HiveColumnHandle> getOrderingColumnHandles(Table table, TypeManager typeManager, Lazy<HoodieTableMetaClient> lazyMetaClient, HiveTimestampPrecision timestampPrecision)
Review Comment:
🤖 nit: `Objects.isNull(recordMergeMode)` is a bit unusual here —
`recordMergeMode == null` reads more naturally in a plain null-guard like this.
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
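For reference, the two forms are interchangeable; `Objects.isNull` exists chiefly as a method-reference predicate (a trivial standalone check):

```java
import java.util.Objects;

public class NullGuardStyle {
    public static void main(String[] args) {
        Object recordMergeMode = null;
        // Objects.isNull is intended mainly for use as a predicate,
        // e.g. stream.filter(Objects::isNull); in a direct guard,
        // a plain null comparison reads more naturally.
        System.out.println(Objects.isNull(recordMergeMode) == (recordMergeMode == null)); // prints "true"
    }
}
```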
##########
hudi-trino-plugin/src/main/java/io/trino/plugin/hudi/reader/HudiTrinoReaderContext.java:
##########
@@ -130,98 +146,14 @@ public IndexedRecord next()
}
@Override
- public IndexedRecord convertAvroRecord(IndexedRecord record)
+    protected Option<HoodieRecordMerger> getRecordMerger(RecordMergeMode mergeMode, String mergeStrategyId, String mergeImplClasses)
{
- return record;
+ return Option.of(new HoodiePreCombineAvroRecordMerger());
Review Comment:
🤖 Is `HoodiePreCombineAvroRecordMerger` the right choice here? Its javadoc
says it's for "deduplication among incoming records" (write-time precombine),
whereas `HoodieAvroRecordMerger` is documented as the one for "base record from
disk to merge with incoming record" — which is exactly what
`FileGroupRecordBuffer` does at read time. The PreCombine variant calls
`payload.preCombine(...)` instead of `combineAndGetUpdateValue(...)`, which can
diverge on delete handling (`_hoodie_is_deleted`) and on custom payloads (e.g.
partial-update or `OverwriteNonDefaults*`). Spark/Flink contexts pick
`DefaultSparkRecordMerger`/`EventTimeFlinkRecordMerger` based on `mergeMode` —
could the Trino path do the same, or at least use the read-time merger? @yihua
could you confirm the intent here?
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
##########
hudi-integ-test/src/test/java/org/apache/hudi/integ2/testcontainers/command/CommandResult.java:
##########
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.integ2.testcontainers.command;
+
+import lombok.AllArgsConstructor;
+import org.testcontainers.containers.Container;
+
+import java.util.regex.Pattern;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+
+/**
+ * A dedicated class to hold the result of a command execution and provide
+ * fluent assertion methods for cleaner tests.
+ */
+@AllArgsConstructor
+public class CommandResult {
+
+ // Spark 4.0 (Scala 2.13) spark-shell runs under a dumb terminal when stdin
+ // is piped from a file, which flushes the next `scala> ` prompt onto the
+ // same line as preceding async println output (e.g. `scala> MARKER`).
+ // Stripping one-or-more leading `scala>\s+` prefixes normalizes those lines
+ // back to the bare sentinel while leaving Spark 3.5 output (sentinel already
+ // on its own line) unchanged.
+    private static final Pattern REPL_PROMPT_PREFIX = Pattern.compile("^(scala>\\s+)+");
+
+ private final String stdout;
+ private final String stderr;
+ private final int exitCode;
+
+ public CommandResult(Container.ExecResult execResult) {
+ this.stdout = execResult.getStdout();
+ this.stderr = execResult.getStderr();
+ this.exitCode = execResult.getExitCode();
+ }
+
+ /**
+ * Asserts that the command's exit code is 0 (success).
+ *
+ * @return The same {@link CommandResult} instance for chaining assertions.
+ * @throws AssertionError if the exit code is not 0.
+ */
+ public CommandResult expectToSucceed() {
+ assertEquals(0, exitCode,
+            String.format("Command failed with exit code %d. Stderr: %s", exitCode, stderr));
+ return this;
+ }
+
+ /**
+ * Asserts that the command's exit code is not 0 (failure).
+     * Any non-zero exit code satisfies this assertion.
+ *
+ * @return The same {@link CommandResult} instance for chaining assertions.
+ * @throws AssertionError if the exit code is 0.
+ */
+ public CommandResult expectToFail() {
+ assertNotEquals(0, exitCode,
+            String.format("Command succeeded with exit code %d. Stderr: %s", exitCode, stderr));
+ return this;
+ }
+
+ /**
+ * Asserts that the command's exit code is zero.
+ *
+ * @return The same {@link CommandResult} instance for chaining assertions.
+ */
+ public CommandResult assertExitCodeIs(int expectedCode) {
Review Comment:
🤖 nit: the Javadoc just above this method says "Asserts that the command's
exit code is zero" — could you update it to something like "Asserts that the
command's exit code equals `expectedCode`"? As written it reads like a
duplicate of `expectToSucceed()`, which makes it unclear when you'd reach for
one vs. the other.
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
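For context on the `REPL_PROMPT_PREFIX` pattern quoted earlier in this file, a small standalone check of the stripping behavior its comment describes (the `MARKER` sentinel value is illustrative, not taken from the PR):

```java
import java.util.regex.Pattern;

public class ReplPromptStripping {
    // Same pattern as in CommandResult: one-or-more leading "scala> " prompts.
    private static final Pattern REPL_PROMPT_PREFIX = Pattern.compile("^(scala>\\s+)+");

    static String normalize(String line) {
        // Drop prompts flushed onto the same line as async println output.
        return REPL_PROMPT_PREFIX.matcher(line).replaceFirst("");
    }

    public static void main(String[] args) {
        System.out.println(normalize("scala> scala> MARKER")); // prints "MARKER"
        System.out.println(normalize("MARKER"));               // already bare, unchanged
    }
}
```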
##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkLanceWriter.java:
##########
@@ -157,7 +166,10 @@ private HoodieSparkLanceWriter(StoragePath file,
long flushByteWatermark) {
         super(file, DEFAULT_BATCH_SIZE, allocatorSize, flushByteWatermark,
                 bloomFilterOpt.map(HoodieBloomFilterRowWriteSupport::new));
- this.sparkSchema = enrichSparkSchemaForLance(sparkSchema);
+ this.inputSparkSchema = sparkSchema;
+ Pair<StructType, int[]> variantPlan = enrichForLanceVariant(sparkSchema);
Review Comment:
🤖 nit: `variantPlan` reads like a Spark logical/physical query plan in this
codebase — something like `variantEnrichment` or `schemaWithVariantOrdinals`
might better convey that this is just a `(StructType, int[])` pair holding the
rewritten schema and the ordinal positions.
<sub><i>- AI-generated; verify before applying. React 👍/👎 to flag
quality.</i></sub>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]