[hudi] branch master updated: [HUDI-6369] Fix spacial curve with sample strategy fails when 0 or 1 rows only is incoming (#9053)

vbalaji Wed, 18 Oct 2023 10:10:34 -0700

This is an automated email from the ASF dual-hosted git repository.

vbalaji pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git



The following commit(s) were added to refs/heads/master by this push:
     new 2dc1c60ba47 [HUDI-6369] Fix spacial curve with sample strategy fails when 0 or 1 rows only is incoming (#9053)
2dc1c60ba47 is described below

commit 2dc1c60ba475b9596290e227fe33dc009d516b21
Author: Nicolas Paris <[email protected]>
AuthorDate: Wed Oct 18 19:10:22 2023 +0200

    [HUDI-6369] Fix spacial curve with sample strategy fails when 0 or 1 rows only is incoming (#9053)
    
    * [HUDI-6369] Fix spacial when empty or 1 row df
    
    * Rename unit test to follow conventions
    
    ---------
    
    Co-authored-by: Balaji Varadarajan <[email protected]>
---
 .../spark/sql/hudi/execution/RangeSample.scala     |  5 +-
 .../sql/hudi/execution/TestRangeSampleSort.java    | 58 ++++++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala
index f00bb90a441..898c8dc8209 100644
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/spark/sql/hudi/execution/RangeSample.scala
@@ -316,6 +316,8 @@ object RangeSampleSort {
         HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE.defaultValue.toString).toInt
       val sample = new RangeSample(zOrderBounds, sampleRdd)
       val rangeBounds = sample.getRangeBounds()
+      if (rangeBounds.size <= 1)
+        return df
       val sampleBounds = {
         val candidateColNumber = rangeBounds.head._1.length
         (0 to candidateColNumber - 1).map { i =>
@@ -479,6 +481,8 @@ object RangeSampleSort {
       val sample = new RangeSample(zOrderBounds, sampleRdd)
 
       val rangeBounds = sample.getRangeBounds()
+      if(rangeBounds.size <= 1)
+        return df
 
       implicit val ordering1 = lazyGeneratedOrderings(0)
 
@@ -536,4 +540,3 @@ object RangeSampleSort {
     }
   }
 }
-
diff --git a/hudi-client/hudi-spark-client/src/test/java/org/apache/spark/sql/hudi/execution/TestRangeSampleSort.java b/hudi-client/hudi-spark-client/src/test/java/org/apache/spark/sql/hudi/execution/TestRangeSampleSort.java
new file mode 100644
index 00000000000..cedf21d3c35
--- /dev/null
+++ b/hudi-client/hudi-spark-client/src/test/java/org/apache/spark/sql/hudi/execution/TestRangeSampleSort.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.spark.sql.hudi.execution;
+
+import org.apache.hudi.config.HoodieClusteringConfig;
+import org.apache.hudi.testutils.HoodieClientTestBase;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.Arrays;
+
+import scala.collection.JavaConversions;
+
+class TestRangeSampleSort extends HoodieClientTestBase {
+
+  @Test
+  void sortDataFrameBySampleSupportAllTypes() {
+    Dataset<Row> df = this.context.getSqlContext().sql("select 1 as id, array(2) as content");
+    for (int i = 0; i < 2; i++) {
+      final int limit = i;
+      Assertions.assertDoesNotThrow(() ->
+          RangeSampleSort$.MODULE$.sortDataFrameBySampleSupportAllTypes(df.limit(limit),
+              JavaConversions.asScalaBuffer(Arrays.asList("id", "content")), 1), "range sort shall not fail when 0 or 1 record incoming");
+    }
+  }
+
+  @Test
+  void sortDataFrameBySample() {
+    HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy = HoodieClusteringConfig.LayoutOptimizationStrategy.HILBERT;
+    Dataset<Row> df = this.context.getSqlContext().sql("select 1 as id, 2 as content");
+    for (int i = 0; i < 2; i++) {
+      final int limit = i;
+      Assertions.assertDoesNotThrow(() ->
+          RangeSampleSort$.MODULE$.sortDataFrameBySample(df.limit(limit), layoutOptStrategy,
+              JavaConversions.asScalaBuffer(Arrays.asList("id", "content")), 1), "range sort shall not fail when 0 or 1 record incoming");
+    }
+  }
+}

[hudi] branch master updated: [HUDI-6369] Fix spacial curve with sample strategy fails when 0 or 1 rows only is incoming (#9053)

Reply via email to