alexeykudinkin commented on code in PR #6016:
URL: https://github.com/apache/hudi/pull/6016#discussion_r931715112
##########
hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java:
##########
@@ -244,12 +255,7 @@ private Map<PartitionPath, FileStatus[]>
loadPartitionPathFiles() {
);
fetchedPartitionToFiles =
- FSUtils.getFilesInPartitions(
- engineContext,
- metadataConfig,
- basePath,
- fullPartitionPathsMapToFetch.keySet().toArray(new String[0]),
- fileSystemStorageConfig.getSpillableDir())
Review Comment:
Good call
##########
hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java:
##########
@@ -124,28 +125,28 @@ public static HoodieTableMetaClient init(Configuration
hadoopConf, String basePa
}
public static HoodieTableMetaClient init(Configuration hadoopConf, String
basePath, HoodieTableType tableType,
- Properties properties)
- throws IOException {
- properties = HoodieTableMetaClient.withPropertyBuilder()
- .setTableName(RAW_TRIPS_TEST_NAME)
- .setTableType(tableType)
- .setPayloadClass(HoodieAvroPayload.class)
- .fromProperties(properties)
- .build();
- return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf,
basePath, properties);
+ Properties properties) throws
IOException {
+ return init(hadoopConf, basePath, tableType, properties, null);
}
public static HoodieTableMetaClient init(Configuration hadoopConf, String
basePath, HoodieTableType tableType,
Properties properties, String
databaseName)
throws IOException {
- properties = HoodieTableMetaClient.withPropertyBuilder()
- .setDatabaseName(databaseName)
- .setTableName(RAW_TRIPS_TEST_NAME)
- .setTableType(tableType)
- .setPayloadClass(HoodieAvroPayload.class)
- .fromProperties(properties)
- .build();
- return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf,
basePath, properties);
+ HoodieTableMetaClient.PropertyBuilder builder =
+ HoodieTableMetaClient.withPropertyBuilder()
+ .setDatabaseName(databaseName)
+ .setTableName(RAW_TRIPS_TEST_NAME)
+ .setTableType(tableType)
+ .setPayloadClass(HoodieAvroPayload.class);
+
+ String keyGen =
properties.getProperty("hoodie.datasource.write.keygenerator.class");
+ if (!Objects.equals(keyGen,
"org.apache.hudi.keygen.NonpartitionedKeyGenerator")) {
+ builder.setPartitionFields("some_nonexistent_field");
Review Comment:
I don't think we should actually standardize on this one; it's just to stop
the bleeding in misconfigured tests
##########
hudi-common/src/main/java/org/apache/hudi/BaseHoodieTableFileIndex.java:
##########
@@ -324,6 +335,26 @@ private void doRefresh() {
LOG.info(String.format("Refresh table %s, spent: %d ms",
metaClient.getTableConfig().getTableName(), duration));
}
+ private Map<String, FileStatus[]>
getAllFilesInPartitionsUnchecked(Collection<String>
fullPartitionPathsMapToFetch) {
+ try {
+ return tableMetadata.getAllFilesInPartitions(new
ArrayList<>(fullPartitionPathsMapToFetch));
+ } catch (IOException e) {
+ throw new HoodieIOException("Failed to list partition paths for a
table", e);
+ }
+ }
+
+ private List<String> getAllPartitionPathsUnchecked() {
+ try {
+ if (partitionColumns.length == 0) {
+ return Collections.singletonList("");
Review Comment:
A non-partitioned table has exactly one partition, which we designate with ""
##########
hudi-common/src/main/java/org/apache/hudi/hadoop/CachingPath.java:
##########
@@ -32,11 +33,12 @@
* NOTE: This class is thread-safe
*/
@ThreadSafe
-public class CachingPath extends Path implements Serializable {
Review Comment:
Yep
##########
hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java:
##########
@@ -138,13 +144,17 @@ public FileStatus[] getAllFilesInPartition(Path
partitionPath)
}
}
- return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf,
dataBasePath, metadataConfig.shouldAssumeDatePartitioning())
+ return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf,
dataBasePath.toString(), metadataConfig.shouldAssumeDatePartitioning())
.getAllFilesInPartition(partitionPath);
}
@Override
public Map<String, FileStatus[]> getAllFilesInPartitions(List<String>
partitions)
throws IOException {
+ if (partitions.isEmpty()) {
+ return Collections.emptyMap();
Review Comment:
Agree — this is somewhat dissonant, but that's just the way things are: for
non-partitioned tables it's assumed that the only partition present is
identified by ""
##########
hudi-common/src/main/java/org/apache/hudi/common/table/HoodieTableMetaClient.java:
##########
@@ -398,7 +398,7 @@ public HoodieArchivedTimeline getArchivedTimeline(String
startTs) {
public void validateTableProperties(Properties properties) {
// Once meta fields are disabled, it cant be re-enabled for a given table.
if (!getTableConfig().populateMetaFields()
- && Boolean.parseBoolean((String)
properties.getOrDefault(HoodieTableConfig.POPULATE_META_FIELDS.key(),
HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))) {
+ && Boolean.parseBoolean((String)
properties.getOrDefault(HoodieTableConfig.POPULATE_META_FIELDS.key(),
HoodieTableConfig.POPULATE_META_FIELDS.defaultValue().toString()))) {
Review Comment:
It's not
##########
hudi-common/src/main/java/org/apache/hudi/common/table/log/block/HoodieHFileDataBlock.java:
##########
@@ -167,9 +164,8 @@ protected ClosableIterator<IndexedRecord>
deserializeRecords(byte[] content) thr
// Get schema from the header
Schema writerSchema = new
Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
- FileSystem fs = FSUtils.getFs(pathForReader.toString(), new
Configuration());
// Read the content
- HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(fs,
pathForReader, content, Option.of(writerSchema));
+ HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(null,
pathForReader, content, Option.of(writerSchema));
Review Comment:
Yeah, I checked it and it actually doesn't use `fs` at all
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/SparkHoodieTableFileIndex.scala:
##########
@@ -341,4 +340,9 @@ object SparkHoodieTableFileIndex {
override def invalidate(): Unit = cache.invalidateAll()
}
}
+
+ private def shouldValidatePartitionColumns(spark: SparkSession): Boolean = {
+ // NOTE: We can't use helper, method nor the config-entry to stay
compatible w/ Spark 2.4
+
spark.sessionState.conf.getConfString("spark.sql.sources.validatePartitionColumns",
"true").toBoolean
Review Comment:
We don't need to
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]