This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new eb3b380  [SPARK-34909][SQL] Fix conversion of negative to unsigned in conv()
eb3b380 is described below

commit eb3b3801153f2cd355ab2a4c21177ea8772d43e0
Author: Tim Armstrong <tim.armstr...@databricks.com>
AuthorDate: Wed Mar 31 12:58:29 2021 +0800

    [SPARK-34909][SQL] Fix conversion of negative to unsigned in conv()
    
    ### What changes were proposed in this pull request?
    Use `java.lang.Long.divideUnsigned()` to do integer division in `NumberConverter` to avoid a bug in `unsignedLongDiv` that produced invalid results.
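
    As a minimal sketch (not part of the patch), the replacement call treats both operands as unsigned 64-bit values, which is exactly the behaviour the removed `unsignedLongDiv` helper tried to emulate by hand; the two assertions below mirror the examples from that helper's doc comment.

    ```
    // Sketch (not from the patch): semantics of java.lang.Long.divideUnsigned (JDK 8+).
    // -1L reinterpreted as unsigned is 2^64 - 1, so dividing by 2 yields Long.MaxValue.
    assert(java.lang.Long.divideUnsigned(-1L, 2L) == Long.MaxValue)
    // For non-negative dividends it agrees with ordinary signed division.
    assert(java.lang.Long.divideUnsigned(6L, 3L) == 2L)
    ```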
    
    ### Why are the changes needed?
    The previous results are incorrect: the result of the query below should be 45012021522523134134555.
    ```
    scala> spark.sql("select conv('-10', 11, 7)").show(20, 150)
    +-----------------------+
    |       conv(-10, 11, 7)|
    +-----------------------+
    |4501202152252313413456|
    +-----------------------+
    scala> spark.sql("select hex(conv('-10', 11, 7))").show(20, 150)
    +----------------------------------------------+
    |                         hex(conv(-10, 11, 7))|
    +----------------------------------------------+
    |3435303132303231353232353233313334313334353600|
    +----------------------------------------------+
    ```
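
    For one concrete input pair (a worked example of mine, not taken from the patch), the removed formula disagrees with the true unsigned quotient by one because Java/Scala division truncates toward zero for a negative dividend; the `correct`/`buggy` names and the chosen operands are illustrative only.

    ```
    // Worked example (not from the patch): dividend -10L, divisor 7.
    // The unsigned value of -10L is 2^64 - 10 = 18446744073709551606, and its
    // quotient by 7 is 2635249153387078800.
    val correct = java.lang.Long.divideUnsigned(-10L, 7L)     // 2635249153387078800
    // The removed helper decomposed uval = x + 2*Long.MaxValue + 2 term by term,
    // but with x = -10L the partial results -10L / 7 == -1 and -10L % 7 == -3
    // truncate toward zero, and the remainder-correction term (-3 + 0 + 2) / 7
    // truncates to 0 instead of flooring to -1, leaving the result one too high.
    val buggy = -10L / 7 + 2 * (Long.MaxValue / 7) + 2 / 7 +
      (-10L % 7 + 2 * (Long.MaxValue % 7) + 2 % 7) / 7        // 2635249153387078801
    assert(buggy == correct + 1)
    ```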
    
    ### Does this PR introduce _any_ user-facing change?
    `conv()` will produce different results because the bug is fixed.
    
    ### How was this patch tested?
    Added a simple unit test.
    
    Closes #32006 from timarmstrong/conv-unsigned.
    
    Authored-by: Tim Armstrong <tim.armstr...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit 13b255fefd881beb68fd8bb6741c7f88318baf9b)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/util/NumberConverter.scala  | 26 +++-------------------
 .../sql/catalyst/util/NumberConverterSuite.scala   |  4 ++++
 2 files changed, 7 insertions(+), 23 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala
index 7dbdd1e..dca75e5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala
@@ -22,26 +22,6 @@ import org.apache.spark.unsafe.types.UTF8String
 object NumberConverter {
 
   /**
-   * Divide x by m as if x is an unsigned 64-bit integer. Examples:
-   * unsignedLongDiv(-1, 2) == Long.MAX_VALUE unsignedLongDiv(6, 3) == 2
-   * unsignedLongDiv(0, 5) == 0
-   *
-   * @param x is treated as unsigned
-   * @param m is treated as signed
-   */
-  private def unsignedLongDiv(x: Long, m: Int): Long = {
-    if (x >= 0) {
-      x / m
-    } else {
-      // Let uval be the value of the unsigned long with the same bits as x
-      // Two's complement => x = uval - 2*MAX - 2
-      // => uval = x + 2*MAX + 2
-      // Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c
-      x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m
-    }
-  }
-
-  /**
    * Decode v into value[].
    *
    * @param v is treated as an unsigned 64-bit integer
@@ -52,7 +32,7 @@ object NumberConverter {
     java.util.Arrays.fill(value, 0.asInstanceOf[Byte])
     var i = value.length - 1
     while (tmpV != 0) {
-      val q = unsignedLongDiv(tmpV, radix)
+      val q = java.lang.Long.divideUnsigned(tmpV, radix)
       value(i) = (tmpV - q * radix).asInstanceOf[Byte]
       tmpV = q
       i -= 1
@@ -69,12 +49,12 @@ object NumberConverter {
    */
   private def encode(radix: Int, fromPos: Int, value: Array[Byte]): Long = {
     var v: Long = 0L
-    val bound = unsignedLongDiv(-1 - radix, radix) // Possible overflow once
+    val bound = java.lang.Long.divideUnsigned(-1 - radix, radix) // Possible overflow once
     var i = fromPos
     while (i < value.length && value(i) >= 0) {
       if (v >= bound) {
         // Check for overflow
-        if (unsignedLongDiv(-1 - value(i), radix) < v) {
+        if (java.lang.Long.divideUnsigned(-1 - value(i), radix) < v) {
           return -1
         }
       }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala
index ec73f45..eb257b7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala
@@ -40,6 +40,10 @@ class NumberConverterSuite extends SparkFunSuite {
     checkConv("11abc", 10, 16, "B")
   }
 
+  test("SPARK-34909: convert negative to unsigned") {
+    checkConv("-10", 11, 7, "45012021522523134134555")
+  }
+
   test("byte to binary") {
     checkToBinary(0.toByte)
     checkToBinary(1.toByte)
