This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 13a8e5c7297 [HUDI-5348] Cache file slices in HoodieBackedTableMetadata (#7436)
13a8e5c7297 is described below
commit 13a8e5c729750ba5907d75df3d22473feaaa2a03
Author: Y Ethan Guo <[email protected]>
AuthorDate: Mon Dec 12 17:00:10 2022 -0800
[HUDI-5348] Cache file slices in HoodieBackedTableMetadata (#7436)
---
.../org/apache/hudi/metadata/HoodieBackedTableMetadata.java | 13 +++++++++++--
.../org/apache/hudi/metadata/HoodieTableMetadataUtil.java | 10 ++++++----
.../java/org/apache/hudi/utilities/TestHoodieIndexer.java | 7 +++++--
3 files changed, 22 insertions(+), 8 deletions(-)
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
index 7743a65bf05..e2fbc4e6716 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieBackedTableMetadata.java
@@ -40,6 +40,7 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
+import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.ClosableIterator;
import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.common.util.Option;
@@ -78,6 +79,7 @@ import static org.apache.hudi.common.util.ValidationUtils.checkArgument;
 import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_BLOOM_FILTERS;
 import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_COLUMN_STATS;
 import static org.apache.hudi.metadata.HoodieTableMetadataUtil.PARTITION_NAME_FILES;
+import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getFileSystemView;
/**
* Table metadata provided by an internal DFS backed Hudi metadata table.
@@ -92,6 +94,7 @@ public class HoodieBackedTableMetadata extends
BaseTableMetadata {
// Metadata table's timeline and metaclient
private HoodieTableMetaClient metadataMetaClient;
private HoodieTableConfig metadataTableConfig;
+ private HoodieTableFileSystemView metadataFileSystemView;
// should we reuse the open file handles, across calls
private final boolean reuse;
@@ -120,6 +123,7 @@ public class HoodieBackedTableMetadata extends
BaseTableMetadata {
} else if (this.metadataMetaClient == null) {
try {
this.metadataMetaClient =
HoodieTableMetaClient.builder().setConf(hadoopConf.get()).setBasePath(metadataBasePath).build();
+ this.metadataFileSystemView = getFileSystemView(metadataMetaClient);
this.metadataTableConfig = metadataMetaClient.getTableConfig();
this.isBloomFilterIndexEnabled =
metadataConfig.isBloomFilterIndexEnabled();
this.isColumnStatsIndexEnabled =
metadataConfig.isColumnStatsIndexEnabled();
@@ -127,11 +131,13 @@ public class HoodieBackedTableMetadata extends
BaseTableMetadata {
LOG.warn("Metadata table was not found at path " + metadataBasePath);
this.isMetadataTableEnabled = false;
this.metadataMetaClient = null;
+ this.metadataFileSystemView = null;
this.metadataTableConfig = null;
} catch (Exception e) {
LOG.error("Failed to initialize metadata table at path " +
metadataBasePath, e);
this.isMetadataTableEnabled = false;
this.metadataMetaClient = null;
+ this.metadataFileSystemView = null;
this.metadataTableConfig = null;
}
}
@@ -162,7 +168,8 @@ public class HoodieBackedTableMetadata extends
BaseTableMetadata {
// to scan all file-groups for all key-prefixes as each of these
might contain some
// records matching the key-prefix
List<FileSlice> partitionFileSlices =
-
HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient,
partitionName);
+ HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(
+ metadataMetaClient, metadataFileSystemView, partitionName);
return (shouldLoadInMemory ? HoodieListData.lazy(partitionFileSlices) :
engineContext.parallelize(partitionFileSlices))
.flatMap((SerializableFunction<FileSlice,
Iterator<HoodieRecord<HoodieMetadataPayload>>>) fileSlice -> {
@@ -379,7 +386,8 @@ public class HoodieBackedTableMetadata extends
BaseTableMetadata {
private Map<Pair<String, FileSlice>, List<String>>
getPartitionFileSliceToKeysMapping(final String partitionName, final
List<String> keys) {
// Metadata is in sync till the latest completed instant on the dataset
List<FileSlice> latestFileSlices =
-
HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient,
partitionName);
+ HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(
+ metadataMetaClient, metadataFileSystemView, partitionName);
Map<Pair<String, FileSlice>, List<String>> partitionFileSliceToKeysMap =
new HashMap<>();
for (String key : keys) {
@@ -646,6 +654,7 @@ public class HoodieBackedTableMetadata extends
BaseTableMetadata {
dataMetaClient.reloadActiveTimeline();
if (metadataMetaClient != null) {
metadataMetaClient.reloadActiveTimeline();
+ metadataFileSystemView = getFileSystemView(metadataMetaClient);
}
// the cached reader has max instant time restriction, they should be
cleared
// because the metadata timeline may have changed.
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
index 5896c1a5ebb..0ceb43b86c6 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
@@ -991,13 +991,15 @@ public class HoodieTableMetadataUtil {
* just before the compaction instant time. The list of file slices returned
is
* sorted in the correct order of file group name.
*
- * @param metaClient - Instance of {@link HoodieTableMetaClient}.
- * @param partition - The name of the partition whose file groups are to be
loaded.
+ * @param metaClient Instance of {@link HoodieTableMetaClient}.
+ * @param fsView Metadata table filesystem view.
+ * @param partition The name of the partition whose file groups are to be
loaded.
* @return List of latest file slices for all file groups in a given
partition.
*/
- public static List<FileSlice>
getPartitionLatestMergedFileSlices(HoodieTableMetaClient metaClient, String
partition) {
+ public static List<FileSlice> getPartitionLatestMergedFileSlices(
+ HoodieTableMetaClient metaClient, HoodieTableFileSystemView fsView,
String partition) {
LOG.info("Loading latest merged file slices for metadata table partition "
+ partition);
- return getPartitionFileSlices(metaClient, Option.empty(), partition, true);
+ return getPartitionFileSlices(metaClient, Option.of(fsView), partition,
true);
}
/**
diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java
index f5a0fadc87f..ac7b86f4cfa 100644
--- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java
+++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/TestHoodieIndexer.java
@@ -56,6 +56,7 @@ import java.util.stream.Stream;
 import static org.apache.hudi.common.table.HoodieTableMetaClient.reload;
 import static org.apache.hudi.common.table.timeline.HoodieInstant.State.REQUESTED;
+import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getFileSystemView;
 import static org.apache.hudi.metadata.HoodieTableMetadataUtil.metadataPartitionExists;
import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS;
import static org.apache.hudi.metadata.MetadataPartitionType.COLUMN_STATS;
@@ -175,7 +176,8 @@ public class TestHoodieIndexer extends
SparkClientFunctionalTestHarness implemen
HoodieTableMetaClient metadataMetaClient =
HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getMetaPath()
+ "/metadata").build();
List<FileSlice> partitionFileSlices =
-
HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient,
COLUMN_STATS.getPartitionPath());
+ HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(
+ metadataMetaClient, getFileSystemView(metadataMetaClient),
COLUMN_STATS.getPartitionPath());
assertEquals(partitionFileSlices.size(), colStatsFileGroupCount);
}
@@ -220,7 +222,8 @@ public class TestHoodieIndexer extends
SparkClientFunctionalTestHarness implemen
HoodieTableMetaClient metadataMetaClient =
HoodieTableMetaClient.builder().setConf(metaClient.getHadoopConf()).setBasePath(metaClient.getMetaPath()
+ "/metadata").build();
List<FileSlice> partitionFileSlices =
-
HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(metadataMetaClient,
COLUMN_STATS.getPartitionPath());
+ HoodieTableMetadataUtil.getPartitionLatestMergedFileSlices(
+ metadataMetaClient, getFileSystemView(metadataMetaClient),
COLUMN_STATS.getPartitionPath());
assertEquals(partitionFileSlices.size(),
HoodieMetadataConfig.METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT.defaultValue());
}