This is an automated email from the ASF dual-hosted git repository.

danny0405 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new bb0621edee9 [HUDI-7957] Fix data skew when writing with bulk_insert + bucket_index enabled (#11578)
bb0621edee9 is described below

commit bb0621edee97507cf2460e8cb57b5307510b917e
Author: KnightChess <[email protected]>
AuthorDate: Wed Jul 10 16:08:33 2024 +0800

    [HUDI-7957] Fix data skew when writing with bulk_insert + bucket_index enabled (#11578)
---
 .../apache/spark/sql/BucketPartitionUtils.scala    | 14 +++++---
 .../hudi/common/util/hash}/BucketIndexUtil.java    | 37 ++++++++++------------
 .../sink/bucket/BucketStreamWriteFunction.java     |  2 +-
 .../sink/partitioner/BucketIndexPartitioner.java   |  2 +-
 4 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/BucketPartitionUtils.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/BucketPartitionUtils.scala
index 41b091e6ecf..7b197727eb9 100644
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/BucketPartitionUtils.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/BucketPartitionUtils.scala
@@ -19,8 +19,9 @@
 package org.apache.spark.sql
 
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.common.util.Functions
+import org.apache.hudi.common.util.hash.BucketIndexUtil
 import org.apache.hudi.index.bucket.BucketIdentifier
-
 import org.apache.spark.Partitioner
 import org.apache.spark.sql.catalyst.InternalRow
 
@@ -39,12 +40,15 @@ object BucketPartitionUtils {
 
     val getPartitionKey = getPartitionKeyExtractor()
     val partitioner = new Partitioner {
+
+      private val partitionIndexFunc: Functions.Function2[String, Integer, 
Integer] =
+        BucketIndexUtil.getPartitionIndexFunc(bucketNum, partitionNum)
+
       override def numPartitions: Int = partitionNum
 
-      override def getPartition(key: Any): Int = {
-        val t = key.asInstanceOf[(String, Int)]
-        val pw = (t._1.hashCode & Int.MaxValue) % partitionNum
-        BucketIdentifier.mod(t._2 + pw, partitionNum)
+      override def getPartition(value: Any): Int = {
+        val partitionKeyPair = value.asInstanceOf[(String, Int)]
+        partitionIndexFunc.apply(partitionKeyPair._1, partitionKeyPair._2)
       }
     }
     // use internalRow to avoid extra convert.
diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/BucketIndexUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java
similarity index 58%
rename from hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/BucketIndexUtil.java
rename to hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java
index cba7e22c68a..adfdd4540d8 100644
--- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/utils/BucketIndexUtil.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java
@@ -1,27 +1,24 @@
 /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  *
- *  * Licensed to the Apache Software Foundation (ASF) under one
- *  * or more contributor license agreements.  See the NOTICE file
- *  * distributed with this work for additional information
- *  * regarding copyright ownership.  The ASF licenses this file
- *  * to you under the Apache License, Version 2.0 (the
- *  * "License"); you may not use this file except in compliance
- *  * with the License.  You may obtain a copy of the License at
- *  *
- *  *      http://www.apache.org/licenses/LICENSE-2.0
- *  *
- *  * Unless required by applicable law or agreed to in writing, software
- *  * distributed under the License is distributed on an "AS IS" BASIS,
- *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  * See the License for the specific language governing permissions and
- *  * limitations under the License.
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
-package org.apache.hudi.sink.utils;
+package org.apache.hudi.common.util.hash;
 
 import org.apache.hudi.common.util.Functions;
-import org.apache.hudi.index.bucket.BucketIdentifier;
 
 /**
  * Utility class for bucket index.
@@ -43,20 +40,20 @@ public class BucketIndexUtil {
       return (partition, curBucket) -> {
         int partitionIndex = (partition.hashCode() & Integer.MAX_VALUE) / 
parallelism * bucketNum;
         int globalIndex = partitionIndex + curBucket;
-        return BucketIdentifier.mod(globalIndex, parallelism);
+        return globalIndex % parallelism;
       };
     } else {
       if (parallelism % bucketNum == 0) {
         return (partition, curBucket) -> {
           int partitionIndex = (partition.hashCode() & Integer.MAX_VALUE) / 
(parallelism / bucketNum) * bucketNum;
           int globalIndex = partitionIndex + curBucket;
-          return BucketIdentifier.mod(globalIndex, parallelism);
+          return globalIndex % parallelism;
         };
       } else {
         return (partition, curBucket) -> {
           int partitionIndex = (partition.hashCode() & Integer.MAX_VALUE) / 
(parallelism / bucketNum + 1) * bucketNum;
           int globalIndex = partitionIndex + curBucket;
-          return BucketIdentifier.mod(globalIndex, parallelism);
+          return globalIndex % parallelism;
         };
       }
     }
diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java
index 5229cc591fc..1475cf043b3 100644
--- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java
+++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/bucket/BucketStreamWriteFunction.java
@@ -22,11 +22,11 @@ import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.util.Functions;
+import org.apache.hudi.common.util.hash.BucketIndexUtil;
 import org.apache.hudi.configuration.FlinkOptions;
 import org.apache.hudi.configuration.OptionsResolver;
 import org.apache.hudi.index.bucket.BucketIdentifier;
 import org.apache.hudi.sink.StreamWriteFunction;
-import org.apache.hudi.sink.utils.BucketIndexUtil;
 
 import org.apache.flink.configuration.Configuration;
 import org.apache.flink.runtime.state.FunctionInitializationContext;
diff --git a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java
index d80ebf650c9..d86278fcc8e 100644
--- a/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java
+++ b/hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/sink/partitioner/BucketIndexPartitioner.java
@@ -20,8 +20,8 @@ package org.apache.hudi.sink.partitioner;
 
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.util.Functions;
+import org.apache.hudi.common.util.hash.BucketIndexUtil;
 import org.apache.hudi.index.bucket.BucketIdentifier;
-import org.apache.hudi.sink.utils.BucketIndexUtil;
 
 import org.apache.flink.api.common.functions.Partitioner;
 

Reply via email to