xushiyan commented on code in PR #8490:
URL: https://github.com/apache/hudi/pull/8490#discussion_r1184756545


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/HoodieIndexUtils.java:
##########
@@ -175,4 +186,72 @@ public static boolean checkIfValidCommit(HoodieTimeline 
commitTimeline, String c
     // 2) is less than the first commit ts in the timeline
     return !commitTimeline.empty() && 
commitTimeline.containsOrBeforeTimelineStarts(commitTs);
   }
+
+  public static <R> HoodieData<HoodieRecord<R>> 
getTaggedRecordsFromPartitionLocations(
+      HoodieData<Pair<String, HoodieRecordLocation>> partitionLocations, 
HoodieWriteConfig config, HoodieTable hoodieTable) {
+    final Option<String> instantTime = hoodieTable
+        .getMetaClient()
+        .getCommitsTimeline()
+        .filterCompletedInstants()
+        .lastInstant()
+        .map(HoodieInstant::getTimestamp);
+    return partitionLocations.flatMap(p -> {
+      String partitionPath = p.getLeft();
+      String fileId = p.getRight().getFileId();
+      return new HoodieMergedReadHandle(config, instantTime, hoodieTable, 
Pair.of(partitionPath, fileId))
+          .getMergedRecords().iterator();
+    });
+  }
+
+  public static <R> HoodieData<HoodieRecord<R>> mergeForPartitionUpdates(
+      HoodieData<Pair<HoodieRecord<R>, Option<Pair<String, 
HoodieRecordLocation>>>> taggedHoodieRecords, HoodieWriteConfig config, 
HoodieTable hoodieTable) {
+    // completely new records
+    HoodieData<HoodieRecord<R>> newRecords = taggedHoodieRecords.filter(p -> 
!p.getRight().isPresent()).map(Pair::getLeft);
+    // the records tagged to existing base files
+    HoodieData<HoodieRecord<R>> updatingRecords = taggedHoodieRecords.filter(p 
-> p.getRight().isPresent()).map(Pair::getLeft)
+        .distinctWithKey(HoodieRecord::getRecordKey, 
config.getGlobalIndexReconcileParallelism());
+    // the tagging partitions and locations
+    HoodieData<Pair<String, HoodieRecordLocation>> partitionLocations = 
taggedHoodieRecords
+        .filter(p -> p.getRight().isPresent())
+        .map(p -> p.getRight().get())
+        .distinct(config.getGlobalIndexReconcileParallelism());
+    // merged existing records with current locations being set
+    HoodieData<HoodieRecord<R>> existingRecords = 
getTaggedRecordsFromPartitionLocations(partitionLocations, config, hoodieTable);
+
+    TypedProperties updatedProps = 
HoodieAvroRecordMerger.Config.withLegacyOperatingModePreCombining(config.getProps());
+    HoodieData<HoodieRecord<R>> taggedUpdatingRecords = 
updatingRecords.mapToPair(r -> Pair.of(r.getRecordKey(), r))
+        .leftOuterJoin(existingRecords.mapToPair(r -> 
Pair.of(r.getRecordKey(), r)))
+        .values().flatMap(entry -> {
+          HoodieRecord<R> incoming = entry.getLeft();
+          Option<HoodieRecord<R>> existingOpt = entry.getRight();
+          if (!existingOpt.isPresent()) {
+            // existing record not found (e.g., due to delete log not merged 
to base file): tag as a new record
+            return Collections.singletonList(getTaggedRecord(incoming, 
Option.empty())).iterator();
+          }
+          HoodieRecord<R> existing = existingOpt.get();
+          if (incoming.getData() instanceof EmptyHoodieRecordPayload) {
+            // incoming is a delete: force tag the incoming to the old 
partition
+            return Collections.singletonList(getTaggedRecord(incoming, 
Option.of(existing.getCurrentLocation()))).iterator();
+          }
+          Schema existingSchema = HoodieAvroUtils.addMetadataFields(new 
Schema.Parser().parse(config.getSchema()), 
config.allowOperationMetadataField());

Review Comment:
   made clearer



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

Reply via email to