This is an automated email from the ASF dual-hosted git repository.
vinoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 684e12e [HUDI-1529] Add block size to the FileStatus objects returned
from metadata table to avoid too many file splits (#2451)
684e12e is described below
commit 684e12e9fcfa45c6ac922a84fb3116ac8142bc18
Author: Udit Mehrotra <[email protected]>
AuthorDate: Mon Jan 18 07:29:53 2021 -0800
[HUDI-1529] Add block size to the FileStatus objects returned from metadata
table to avoid too many file splits (#2451)
---
.../org/apache/hudi/metadata/TestHoodieBackedMetadata.java | 8 ++++++++
.../main/java/org/apache/hudi/metadata/BaseTableMetadata.java | 2 +-
.../java/org/apache/hudi/metadata/HoodieMetadataPayload.java | 10 +++++++---
3 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
index 3d770c7..16ee120 100644
--- a/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/metadata/TestHoodieBackedMetadata.java
@@ -801,6 +801,14 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// File sizes should be valid
Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));
+ // Block sizes should be valid
+ Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
+ List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
+ Collections.sort(fsBlockSizes);
+ List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
+ Collections.sort(metadataBlockSizes);
+ assertEquals(fsBlockSizes, metadataBlockSizes);
+
if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) {
LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray()));
LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray()));
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
index 4ae71de..de0a3c4 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/BaseTableMetadata.java
@@ -202,7 +202,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
throw new HoodieMetadataException("Metadata record for partition " + partitionName + " is inconsistent: "
+ hoodieRecord.get().getData());
}
- statuses = hoodieRecord.get().getData().getFileStatuses(partitionPath);
+ statuses = hoodieRecord.get().getData().getFileStatuses(hadoopConf.get(), partitionPath);
}
if (validateLookups) {
diff --git a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
index 0863f7e..9c6eb89 100644
--- a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
+++ b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieMetadataPayload.java
@@ -29,7 +29,9 @@ import org.apache.hudi.exception.HoodieMetadataException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
@@ -177,10 +179,12 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
/**
* Returns the files added as part of this record.
*/
- public FileStatus[] getFileStatuses(Path partitionPath) {
+ public FileStatus[] getFileStatuses(Configuration hadoopConf, Path partitionPath) throws IOException {
+ FileSystem fs = partitionPath.getFileSystem(hadoopConf);
+ long blockSize = fs.getDefaultBlockSize(partitionPath);
return filterFileInfoEntries(false)
- .map(e -> new FileStatus(e.getValue().getSize(), false, 0, 0, 0, 0, null, null, null,
- new Path(partitionPath, e.getKey())))
+ .map(e -> new FileStatus(e.getValue().getSize(), false, 0, blockSize, 0, 0,
+ null, null, null, new Path(partitionPath, e.getKey())))
.toArray(FileStatus[]::new);
}