zhangyue19921010 commented on code in PR #13060: URL: https://github.com/apache/hudi/pull/13060#discussion_r2028875514
########## hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/table/HoodieTableSource.java: ########## @@ -365,16 +368,16 @@ private PartitionPruners.PartitionPruner createPartitionPruner(List<ResolvedExpr .build(); } - private int getDataBucket(List<ResolvedExpression> dataFilters) { + private Option<Function<Integer, Integer>> getDataBucketFunc(List<ResolvedExpression> dataFilters) { if (!OptionsResolver.isBucketIndexType(conf) || dataFilters.isEmpty()) { - return PrimaryKeyPruners.BUCKET_ID_NO_PRUNING; + return Option.empty(); } Set<String> indexKeyFields = Arrays.stream(OptionsResolver.getIndexKeys(conf)).collect(Collectors.toSet()); List<ResolvedExpression> indexKeyFilters = dataFilters.stream().filter(expr -> ExpressionUtils.isEqualsLitExpr(expr, indexKeyFields)).collect(Collectors.toList()); if (!ExpressionUtils.isFilteringByAllFields(indexKeyFilters, indexKeyFields)) { - return PrimaryKeyPruners.BUCKET_ID_NO_PRUNING; + return Option.empty(); Review Comment: Add a new Spark UT `test("Test BucketID Pruning With Partition Bucket Index")`. Without this PR, it will throw an exception: ``` Expected Array([1111,3333.0,3333,2021-01-05]), but got Array() ScalaTestFailureLocation: org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase at (HoodieSparkSqlTestBase.scala:135) org.scalatest.exceptions.TestFailedException: Expected Array([1111,3333.0,3333,2021-01-05]), but got Array() ``` With this PR using the `Always load latest hashing config` logic, it will throw an exception: ``` Expected Array([1111,2222.0,2222,2021-01-05]), but got Array() ScalaTestFailureLocation: org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase at (HoodieSparkSqlTestBase.scala:135) org.scalatest.exceptions.TestFailedException: Expected Array([1111,2222.0,2222,2021-01-05]), but got Array() ``` ########## hudi-common/src/main/java/org/apache/hudi/common/model/PartitionBucketIndexHashingConfig.java: ########## @@ -196,24 +196,51 @@ public static Option<PartitionBucketIndexHashingConfig> 
loadHashingConfig(Hoodie /** * Get Latest committed hashing config instant to load. + * If instant is empty, then return latest hashing config instant */ - public static String getLatestHashingConfigInstantToLoad(HoodieTableMetaClient metaClient) { + public static Option<String> getHashingConfigInstantToLoad(HoodieTableMetaClient metaClient, Option<String> instant) { try { List<String> allCommittedHashingConfig = getCommittedHashingConfigInstants(metaClient); - return allCommittedHashingConfig.get(allCommittedHashingConfig.size() - 1); + if (instant.isPresent()) { + Option<String> res = getHashingConfigInstantToLoadBeforeOrOn(allCommittedHashingConfig, instant.get()); + // fall back to look up archived hashing config instant before return empty + return res.isPresent() ? res : getHashingConfigInstantToLoadBeforeOrOn(getArchiveHashingConfigInstants(metaClient), instant.get()); + } else { + return Option.of(allCommittedHashingConfig.get(allCommittedHashingConfig.size() - 1)); + } } catch (Exception e) { throw new HoodieException("Failed to get hashing config instant to load.", e); } } + private static Option<String> getHashingConfigInstantToLoadBeforeOrOn(List<String> hashingConfigInstants, String instant) { Review Comment: Add a new Spark UT `test("Test BucketID Pruning With Partition Bucket Index")`. Without this PR, it will throw an exception: ``` Expected Array([1111,3333.0,3333,2021-01-05]), but got Array() ScalaTestFailureLocation: org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase at (HoodieSparkSqlTestBase.scala:135) org.scalatest.exceptions.TestFailedException: Expected Array([1111,3333.0,3333,2021-01-05]), but got Array() ``` With this PR using the `Always load latest hashing config` logic, it will throw an exception: ``` Expected Array([1111,2222.0,2222,2021-01-05]), but got Array() ScalaTestFailureLocation: org.apache.spark.sql.hudi.common.HoodieSparkSqlTestBase at (HoodieSparkSqlTestBase.scala:135) org.scalatest.exceptions.TestFailedException: Expected 
Array([1111,2222.0,2222,2021-01-05]), but got Array() ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org