This is an automated email from the ASF dual-hosted git repository.
wombatukun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 760ecb51e9f [HUDI-9329] avoid obtaining an incorrect partition id due
to int type overflow in `BucketIndexUtil` (#13189)
760ecb51e9f is described below
commit 760ecb51e9f493cd69bfaf47ae0f06d2efd66260
Author: TheR1sing3un <[email protected]>
AuthorDate: Tue Apr 22 15:28:40 2025 +0800
[HUDI-9329] avoid obtaining an incorrect partition id due to int type
overflow in `BucketIndexUtil` (#13189)
---
.../java/org/apache/hudi/common/util/hash/BucketIndexUtil.java | 10 +++++++---
.../org/apache/hudi/common/util/hash/TestBucketIndexUtil.java | 2 ++
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java
index 8081e65871b..837439b5fcc 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/BucketIndexUtil.java
@@ -19,6 +19,7 @@
package org.apache.hudi.common.util.hash;
import org.apache.hudi.common.util.Functions;
+import org.apache.hudi.common.util.ValidationUtils;
/**
* Utility class for bucket index.
@@ -37,9 +38,12 @@ public class BucketIndexUtil {
*/
public static Functions.Function3<Integer, String, Integer, Integer> getPartitionIndexFunc(int parallelism) {
return (bucketNum, partition, curBucket) -> {
- int partitionIndex = (partition.hashCode() & Integer.MAX_VALUE) % parallelism * bucketNum;
- int globalIndex = partitionIndex + curBucket;
- return globalIndex % parallelism;
+ long partitionIndex = (partition.hashCode() & Integer.MAX_VALUE) % parallelism * (long) bucketNum;
+ long globalIndex = partitionIndex + curBucket;
+ int partitionId = (int) (globalIndex % parallelism);
+ ValidationUtils.checkArgument(partitionId >= 0 && partitionId < parallelism,
+ () -> "Partition id should be in range [0, " + parallelism + "), but got " + partitionId);
+ return partitionId;
};
}
}
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestBucketIndexUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestBucketIndexUtil.java
index 482227393eb..ea2f637b938 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestBucketIndexUtil.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestBucketIndexUtil.java
@@ -55,6 +55,7 @@ public class TestBucketIndexUtil {
argsList.add(Arguments.of(201, 100, true));
argsList.add(Arguments.of(400, 1000, true));
argsList.add(Arguments.of(401, 1000, true));
+ argsList.add(Arguments.of(65536, 65536, true));
return argsList.stream();
}
@@ -144,6 +145,7 @@ public class TestBucketIndexUtil {
private void checkResult(Map<Integer, Integer> parallelism2TaskCount, int parallelism, int bucketNumber, boolean partitioned, boolean needExpand) {
int sum = 0;
+ assertTrue(parallelism2TaskCount.keySet().stream().allMatch(id -> id >= 0 && id < parallelism));
for (int v : parallelism2TaskCount.values()) {
sum = sum + v;
}