manojpec commented on a change in pull request #4352:
URL: https://github.com/apache/hudi/pull/4352#discussion_r784651771
##########
File path: hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/metadata/SparkHoodieBackedTableMetadataWriter.java
##########
@@ -165,17 +167,40 @@ protected void commit(HoodieData<HoodieRecord> hoodieDataRecords, String partiti
   /**
    * Tag each record with the location in the given partition.
-   *
+   * <p>
    * The record is tagged with respective file slice's location based on its record key.
    */
-  private JavaRDD<HoodieRecord> prepRecords(JavaRDD<HoodieRecord> recordsRDD, String partitionName, int numFileGroups) {
-    List<FileSlice> fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, partitionName);
-    ValidationUtils.checkArgument(fileSlices.size() == numFileGroups,
-        String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), numFileGroups));
-
-    return recordsRDD.map(r -> {
-      FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups));
-      r.setCurrentLocation(new HoodieRecordLocation(slice.getBaseInstantTime(), slice.getFileId()));
-      return r;
-    });
+  private JavaRDD<HoodieRecord> prepRecords(Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap) {
+    // The result set
+    JavaRDD<HoodieRecord> rddAllPartitionRecords = null;
+
+    for (Map.Entry<MetadataPartitionType, HoodieData<HoodieRecord>> entry : partitionRecordsMap.entrySet()) {
+      final String partitionName = entry.getKey().partitionPath();
+      final int fileGroupCount = entry.getKey().getFileGroupCount();
+      HoodieData<HoodieRecord> records = entry.getValue();
+      JavaRDD<HoodieRecord> recordsRDD = (JavaRDD<HoodieRecord>) records.get();
+
+      List<FileSlice> fileSlices =
+          HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, partitionName);
+      ValidationUtils.checkArgument(fileSlices.size() == fileGroupCount,
+          String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), fileGroupCount));
+
+      JavaSparkContext jsc = ((HoodieSparkEngineContext) engineContext).getJavaSparkContext();
+      JavaRDD<HoodieRecord> rddSinglePartitionRecords = recordsRDD.map(r -> {
+        FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(),
Review comment:
Sure. This is not a problem in my testing.
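
For context, a minimal runnable sketch of the pattern the new prepRecords follows: hash each record key to a stable file group index, tag the record accordingly, and union the per-partition RDDs into one result. The hash function and the toy string "records" below are assumptions for illustration only; the real mapping is whatever HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex implements.

    import java.util.Arrays;
    import java.util.List;
    import java.util.Map;

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    public class PrepRecordsSketch {

      // Assumed stand-in for HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex:
      // a stable key -> [0, numFileGroups) mapping, so the same key always lands
      // in the same metadata file group.
      static int keyToFileGroup(String recordKey, int numFileGroups) {
        return Math.floorMod(recordKey.hashCode(), numFileGroups);
      }

      public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("prep-records-sketch").setMaster("local[2]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
          // Two hypothetical metadata partitions with different file group counts.
          Map<String, Integer> fileGroupCounts = Map.of("files", 1, "bloom_filters", 4);

          JavaRDD<String> allTagged = null;  // plays the role of rddAllPartitionRecords
          for (Map.Entry<String, Integer> e : fileGroupCounts.entrySet()) {
            final String partition = e.getKey();
            final int numFileGroups = e.getValue();
            List<String> keys = Arrays.asList("key-a", "key-b", "key-c");

            // Tag each record (a bare key here) with its target file group.
            JavaRDD<String> tagged = jsc.parallelize(keys)
                .map(k -> partition + "/fg-" + keyToFileGroup(k, numFileGroups) + "/" + k);

            // Accumulate the per-partition RDDs into a single result via union,
            // as the loop above does with rddAllPartitionRecords.
            allTagged = (allTagged == null) ? tagged : allTagged.union(tagged);
          }
          allTagged.collect().forEach(System.out::println);
        }
      }
    }

The null-seeded union accumulator mirrors rddAllPartitionRecords in the diff: each partition's records are tagged independently, then folded into one RDD for the single metadata table commit.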
##########
File path: hudi-common/src/main/avro/HoodieMetadata.avsc
##########
@@ -30,27 +30,118 @@
         "doc": "Type of the metadata record",
         "type": "int"
     },
-    { "name": "filesystemMetadata",
+    {
         "doc": "Contains information about partitions and files within the dataset",
-        "type": ["null", {
-           "type": "map",
-           "values": {
+        "name": "filesystemMetadata",
+        "type": [
+            "null",
+            {
+                "type": "map",
+                "values": {
+                    "type": "record",
+                    "name": "HoodieMetadataFileInfo",
+                    "fields": [
+                        {
+                            "name": "size",
+                            "type": "long",
+                            "doc": "Size of the file"
+                        },
+                        {
+                            "name": "isDeleted",
+                            "type": "boolean",
+                            "doc": "True if this file has been deleted"
+                        }
+                    ]
+                }
+            }
+        ]
+    },
+    {
+        "doc": "Metadata Index of bloom filters for all data files in the user table",
+        "name": "BloomFilterMetadata",
+        "type": [
+            "null",
+            {
+                "doc": "Data file bloom filter details",
+                "name": "HoodieMetadataBloomFilter",
                 "type": "record",
-                "name": "HoodieMetadataFileInfo",
                 "fields": [
                     {
-                        "name": "size",
-                        "type": "long",
-                        "doc": "Size of the file"
+                        "doc": "Bloom filter type code",
+                        "name": "type",
+                        "type": "string"
+                    },
+                    {
+                        "doc": "Instant timestamp when this metadata was created/updated",
+                        "name": "timestamp",
+                        "type": "string"
+                    },
+                    {
+                        "doc": "Bloom filter binary byte array",
+                        "name": "bloomFilter",
+                        "type": "bytes"
                     },
                     {
+                        "doc": "Bloom filter entry valid/deleted flag",
                         "name": "isDeleted",
-                        "type": "boolean",
-                        "doc": "True if this file has been deleted"
+                        "type": "boolean"
+                    },
+                    {
+                        "doc": "Reserved bytes for future use",
+                        "name": "reserved",
Review comment:
Fixed.
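
As a quick illustration of what the new schema admits, here is a hedged sketch that builds a HoodieMetadataBloomFilter record with Avro's generic API. The inline schema is trimmed to the fields fully visible in this hunk (the "reserved" field is omitted because its type is cut off in the excerpt), and every field value is made up for the example.

    import java.nio.ByteBuffer;

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.avro.generic.GenericRecordBuilder;

    public class BloomFilterRecordSketch {
      public static void main(String[] args) {
        // Inline copy of the HoodieMetadataBloomFilter record from the hunk
        // above, trimmed to the fields whose types are visible there.
        Schema schema = new Schema.Parser().parse(
            "{\"type\": \"record\", \"name\": \"HoodieMetadataBloomFilter\", \"fields\": ["
                + "{\"name\": \"type\", \"type\": \"string\"},"
                + "{\"name\": \"timestamp\", \"type\": \"string\"},"
                + "{\"name\": \"bloomFilter\", \"type\": \"bytes\"},"
                + "{\"name\": \"isDeleted\", \"type\": \"boolean\"}]}");

        // Field values are illustrative; Avro "bytes" fields take a ByteBuffer.
        GenericRecord bloom = new GenericRecordBuilder(schema)
            .set("type", "DYNAMIC_V0")           // example bloom filter type code
            .set("timestamp", "20220101000000")  // example instant timestamp
            .set("bloomFilter", ByteBuffer.wrap(new byte[] {0x01, 0x02}))
            .set("isDeleted", false)
            .build();
        System.out.println(bloom);
      }
    }

In production the record would come from Avro-generated SpecificRecord classes rather than the generic builder; the generic form is used here only so the sketch stands alone against the schema text above.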
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]