hudi-agent commented on code in PR #18837: URL: https://github.com/apache/hudi/pull/18837#discussion_r3320882817
########## hudi-trino-plugin/src/test/java/io/trino/plugin/hudi/util/FileOperationAssertions.java: ########## @@ -0,0 +1,166 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.trino.plugin.hudi.util; + +import com.google.common.collect.HashMultiset; +import com.google.common.collect.Multiset; +import io.airlift.log.Logger; +import io.trino.plugin.hudi.util.FileOperationUtils.FileOperation; +import io.trino.testing.QueryRunner; +import org.intellij.lang.annotations.Language; + +import java.util.Comparator; +import java.util.function.Supplier; + +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getCacheOperationSpans; +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.getFileLocation; +import static io.trino.filesystem.tracing.CacheFileSystemTraceUtils.isTrinoSchemaOrPermissions; +import static io.trino.testing.MultisetAssertions.assertMultisetsEqual; +import static java.util.stream.Collectors.toCollection; + +public final class FileOperationAssertions +{ + private static final Logger log = Logger.get(FileOperationAssertions.class); + + private FileOperationAssertions() {} + + /** + * Asserts that file system accesses match expected operations. + * This version uses manual filtering for Input/InputFile operations. + * Logs detailed comparison at WARN level for debugging test failures. + */ + public static void assertFileSystemAccesses( + QueryRunner queryRunner, + @Language("SQL") String query, + Multiset<FileOperation> expectedCacheAccesses) + throws InterruptedException + { + queryRunner.executeWithPlan(queryRunner.getDefaultSession(), query); + // Async table-stats computation can outlive the synchronous query and emit spans into + // the exporter after execute returns. A fixed Thread.sleep races with this: when stats + // from query N is still running while query N+1's measurement happens, spans leak + // across the boundary and counts get scrambled. Poll until the span set is stable for + // two consecutive reads. + Multiset<FileOperation> actualCacheAccesses = waitForStableSpans(() -> getFileOperations(queryRunner)); + printFileAccessDebugInfo(queryRunner, actualCacheAccesses, expectedCacheAccesses); + assertMultisetsEqual(actualCacheAccesses, expectedCacheAccesses); Review Comment: 🤖 nit: `printFileAccessDebugInfo` is called unconditionally before every assertion, so it emits WARN-level logs even when tests pass. Could you move the call inside a `try { assertMultisetsEqual(...); } catch (AssertionError e) { printFileAccessDebugInfo(...); throw e; }` block (or drop to DEBUG level) so CI output isn't flooded on green runs? <sub><i>- AI-generated; verify before applying. React 👍/👎 to flag quality.</i></sub> ########## hudi-trino-plugin/src/main/java/io/trino/plugin/hudi/HudiUtil.java: ########## @@ -397,4 +413,62 @@ public static Schema getLatestTableSchema(HoodieTableMetaClient metaClient, Stri throw new TrinoException(HUDI_FILESYSTEM_ERROR, e); } } + + public static List<HiveColumnHandle> getOrderingColumnHandles(Table table, TypeManager typeManager, Lazy<HoodieTableMetaClient> lazyMetaClient, HiveTimestampPrecision timestampPrecision) + { + RecordMergeMode recordMergeMode = lazyMetaClient.get().getTableConfig().getRecordMergeMode(); + if (Objects.isNull(recordMergeMode) || recordMergeMode.equals(RecordMergeMode.COMMIT_TIME_ORDERING)) { + // if commit time ordering is enabled, return empty list + return Collections.emptyList(); + } + + ImmutableList.Builder<HiveColumnHandle> columns = ImmutableList.builder(); + List<String> orderingColumnNames = lazyMetaClient.get().getTableConfig().getOrderingFields(); + + int hiveColumnIndex = 0; Review Comment: 🤖 nit: `Objects.isNull(recordMergeMode)` reads a bit awkwardly for a plain null check on a local variable — `recordMergeMode == null` is the more idiomatic form here. <sub><i>- AI-generated; verify before applying. React 👍/👎 to flag quality.</i></sub> ########## hudi-trino-plugin/src/main/java/io/trino/plugin/hudi/stats/TableMetadataReader.java: ########## @@ -55,60 +48,29 @@ public class TableMetadataReader * @return a map from column name to their corresponding {@link HoodieColumnRangeMetadata} * @throws HoodieMetadataException if an error occurs while fetching the column statistics */ - Map<String, HoodieColumnRangeMetadata> getColumnStats(List<Pair<String, String>> partitionNameFileNameList, List<String> columnNames) + public Map<String, HoodieColumnRangeMetadata> getColumnsRange(List<Pair<String, String>> partitionNameFileNameList, List<String> columnNames) throws HoodieMetadataException { - return computeFileToColumnStatsMap(computeColumnStatsLookupKeys(partitionNameFileNameList, columnNames)); - } - - /** - * @param partitionNameFileNameList a list of partition and file name pairs for which column stats need to be retrieved - * @param columnNames list of column names for which stats are needed - * @return a list of column stats keys to look up in the metadata table col_stats partition. - */ - private List<String> computeColumnStatsLookupKeys( - final List<Pair<String, String>> partitionNameFileNameList, - final List<String> columnNames) - { - return columnNames.stream() - .flatMap(columnName -> partitionNameFileNameList.stream() - .map(partitionNameFileNamePair -> HoodieMetadataPayload.getColumnStatsIndexKey( - new PartitionIndexID(HoodieTableMetadataUtil.getColumnStatsIndexPartitionIdentifier(partitionNameFileNamePair.getLeft())), - new FileIndexID(partitionNameFileNamePair.getRight()), - new ColumnIndexID(columnName)))) - .toList(); - } - - /** - * @param columnStatsLookupKeys a map from column stats key to partition and file name pair - * @return a map from column name to merged HoodieMetadataColumnStats - */ - private Map<String, HoodieColumnRangeMetadata> computeFileToColumnStatsMap(List<String> columnStatsLookupKeys) - { - HoodieTimer timer = HoodieTimer.start(); - Map<String, HoodieRecord<HoodieMetadataPayload>> hoodieRecords = - getRecordsByKeys(columnStatsLookupKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath()); - metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer())); - return hoodieRecords.values().stream() - .collect(Collectors.groupingBy( - r -> r.getData().getColumnStatMetadata().get().getColumnName(), - Collectors.mapping(r -> r.getData().getColumnStatMetadata().get(), Collectors.toList()))) + Map<Pair<String, String>, List<HoodieMetadataColumnStats>> columnStatsMap = getColumnStats(partitionNameFileNameList, columnNames); + return columnStatsMap.values().stream().flatMap(Collection::stream).collect(Collectors.groupingBy( + HoodieMetadataColumnStats::getColumnName, + Collectors.mapping(colStats -> colStats, Collectors.toList()))) Review Comment: 🤖 nit: `Collectors.mapping(colStats -> colStats, Collectors.toList())` is just an identity transform — could you replace it with `Collectors.toList()` directly to reduce noise? <sub><i>- AI-generated; verify before applying. React 👍/👎 to flag quality.</i></sub> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
