This is an automated email from the ASF dual-hosted git repository.
leesf pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 99b14a7 [HUDI-1918] Fix incorrect keyBy field that causes serious data
skew, to avoid multiple subtasks writing to a partition at the same time (#2972)
99b14a7 is described below
commit 99b14a78e38b081efbdda59243626c244335ea7c
Author: zhangminglei <[email protected]>
AuthorDate: Fri May 21 21:59:47 2021 +0800
[HUDI-1918] Fix incorrect keyBy field that causes serious data skew, to avoid
multiple subtasks writing to a partition at the same time (#2972)
---
.../src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java b/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java
index 88ab2b6..bd48563 100644
--- a/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java
+++ b/hudi-flink/src/main/java/org/apache/hudi/streamer/HoodieFlinkStreamer.java
@@ -88,8 +88,8 @@ public class HoodieFlinkStreamer {
.name("kafka_source")
.uid("uid_kafka_source")
.map(new RowDataToHoodieFunction<>(rowType, conf),
TypeInformation.of(HoodieRecord.class))
- // Key-by partition path, to avoid multiple subtasks write to a partition at the same time
- .keyBy(HoodieRecord::getPartitionPath)
+ // Key-by record key, to avoid multiple subtasks write to a partition at the same time
+ .keyBy(HoodieRecord::getRecordKey)
.transform(
"bucket_assigner",
TypeInformation.of(HoodieRecord.class),