This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 4877e0ba0c91 [SPARK-55932] Fix XML to variant parser hang on negative 
scale
4877e0ba0c91 is described below

commit 4877e0ba0c91b250937e544b6b08727982e325eb
Author: Stevo Mitric <[email protected]>
AuthorDate: Wed Mar 11 09:05:48 2026 +0800

    [SPARK-55932] Fix XML to variant parser hang on negative scale
    
    ### What changes were proposed in this pull request?
    
    Add a guard before `setScale(0)`: if `scale < -MAX_DECIMAL16_PRECISION` 
(-38), skip the decimal path and store the value as a string. Any value with 
scale below -38 is guaranteed to exceed the 38-digit precision limit after 
normalization, so the guard is a pure optimization with no correctness impact.
    
    ### Why are the changes needed?
    When parsing XML or CSV data to Variant type, a decimal string with an 
extreme positive exponent (e.g., "1e9999999") causes the executor to hang 
indefinitely.
    
    Java's BigDecimal parses such strings into a compact representation 
(`unscaledValue=1`, `scale=-9999999`). The variant parser then calls 
`d.setScale(0)` to normalize the scale before storing in the variant binary 
format. This forces the JDK to compute `BigInteger.pow(10, 9999999)`, 
materializing a number with ~10 million digits via repeated multiplication — an 
operation that takes a very long time and effectively hangs the task.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Unit tests in this PR.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #54726 from stevomitric/stevomitric/fix-xml-parser.
    
    Lead-authored-by: Stevo Mitric <[email protected]>
    Co-authored-by: Stevo Mitric <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/csv/UnivocityParser.scala   | 27 ++++++++++++++--------
 .../spark/sql/catalyst/xml/StaxXmlParser.scala     | 20 ++++++++++------
 .../org/apache/spark/sql/CsvFunctionsSuite.scala   | 14 +++++++++++
 .../datasources/xml/XmlVariantSuite.scala          |  7 ++++++
 4 files changed, 51 insertions(+), 17 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
index 3f0ce387883c..2073c5922fea 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -496,17 +496,24 @@ class UnivocityParser(
       def parseDecimal(): DataType = {
         try {
           var d = decimalParser(s)
-          if (d.scale() < 0) {
-            d = d.setScale(0)
-          }
-          if (d.scale() <= VariantUtil.MAX_DECIMAL16_PRECISION &&
-            d.precision() <= VariantUtil.MAX_DECIMAL16_PRECISION) {
-            builder.appendDecimal(d)
-            // The actual decimal type doesn't matter. `appendDecimal` will 
use the smallest
-            // possible decimal type to store the value.
-            DecimalType.USER_DEFAULT
-          } else {
+          if (d.scale() < -VariantUtil.MAX_DECIMAL16_PRECISION) {
+            // Scale is so extremely negative that setScale(0) would require 
computing
+            // bigTenToThe(|scale|), which is prohibitively expensive. The 
resulting precision
+            // would also exceed MAX_DECIMAL16_PRECISION, so fall through to 
string.
             if (options.preferDate) parseDate() else parseTimestampNTZ()
+          } else {
+            if (d.scale() < 0) {
+              d = d.setScale(0)
+            }
+            if (d.scale() <= VariantUtil.MAX_DECIMAL16_PRECISION &&
+              d.precision() <= VariantUtil.MAX_DECIMAL16_PRECISION) {
+              builder.appendDecimal(d)
+              // The actual decimal type doesn't matter. `appendDecimal` will 
use the smallest
+              // possible decimal type to store the value.
+              DecimalType.USER_DEFAULT
+            } else {
+              if (options.preferDate) parseDate() else parseTimestampNTZ()
+            }
           }
         } catch {
           case NonFatal(_) =>
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 58038cc189e1..268b39b89d43 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -1284,13 +1284,19 @@ object StaxXmlParser {
     val decimalParser = ExprUtils.getDecimalParser(options.locale)
     try {
       var d = decimalParser(value)
-      if (d.scale() < 0) {
-        d = d.setScale(0)
-      }
-      if (d.scale <= VariantUtil.MAX_DECIMAL16_PRECISION &&
-        d.precision <= VariantUtil.MAX_DECIMAL16_PRECISION) {
-        builder.appendDecimal(d)
-        return
+      if (d.scale() < -VariantUtil.MAX_DECIMAL16_PRECISION) {
+        // Scale is so extremely negative that setScale(0) would require 
computing
+        // bigTenToThe(|scale|), which is prohibitively expensive. The 
resulting precision
+        // would also exceed MAX_DECIMAL16_PRECISION, so fall through to 
string.
+      } else {
+        if (d.scale() < 0) {
+          d = d.setScale(0)
+        }
+        if (d.scale <= VariantUtil.MAX_DECIMAL16_PRECISION &&
+          d.precision <= VariantUtil.MAX_DECIMAL16_PRECISION) {
+          builder.appendDecimal(d)
+          return
+        }
       }
     } catch {
       case NonFatal(_) =>
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
index e796ec0f0ea7..f8daf677ddae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
@@ -836,6 +836,20 @@ class CsvFunctionsSuite extends QueryTest with 
SharedSparkSession {
       Seq(Row(s"""{null, $largeInput}""")))
   }
 
+  test("from_csv with variant: extreme negative scale decimal does not hang") {
+    // A value like "1E99999" parses to a BigDecimal with scale=-99999.
+    // Calling setScale(0) on it would hang, so it should fall through to 
string.
+    val df = Seq("1E99999").toDF("value")
+    checkAnswer(
+      df.select(
+        from_csv(
+          $"value",
+          StructType.fromDDL("a variant"),
+          Map.empty[String, String]
+        ).cast("string")),
+      Seq(Row("""{"1E99999"}""")))
+  }
+
   test("SPARK-47497: the input of to_csv must be StructType") {
     val df = Seq(1, 2).toDF("value")
     checkError(
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
index 5738cd2a9927..d8c09ce5ab53 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
@@ -80,6 +80,13 @@ class XmlVariantSuite extends QueryTest with 
SharedSparkSession with TestXmlData
       xml = "<ROW><amount>1e40</amount></ROW>",
       expectedJsonStr = """{"amount":"1e40"}"""
     )
+    // Extreme negative scale: parsed as String to avoid hanging on 
setScale(0).
+    // "1e-99999" parses to a BigDecimal with scale=99999, which is fine 
(positive scale).
+    // "1e99999" parses to a BigDecimal with scale=-99999, triggering the 
guard.
+    testParser(
+      xml = "<ROW><amount>1e99999</amount></ROW>",
+      expectedJsonStr = """{"amount":"1e99999"}"""
+    )
 
     // Date -> String
     testParser(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to