This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 4877e0ba0c91 [SPARK-55932] Fix XML to variant parser hang on negative
scale
4877e0ba0c91 is described below
commit 4877e0ba0c91b250937e544b6b08727982e325eb
Author: Stevo Mitric <[email protected]>
AuthorDate: Wed Mar 11 09:05:48 2026 +0800
[SPARK-55932] Fix XML to variant parser hang on negative scale
### What changes were proposed in this pull request?
Add a guard before `setScale(0)`: if `scale < -MAX_DECIMAL16_PRECISION`
(-38), skip the decimal path and store the value as a string. Any value with
scale below -38 is guaranteed to exceed the 38-digit precision limit after
normalization, so the guard is a pure optimization with no correctness impact.
### Why are the changes needed?
When parsing XML or CSV data to Variant type, a decimal string with an
extreme positive exponent (e.g., "1e9999999") causes the executor to hang
indefinitely.
Java's BigDecimal parses such strings into a compact representation
(`unscaledValue=1`, `scale=-9999999`). The variant parser then calls
`d.setScale(0)` to normalize the scale before storing in the variant binary
format. This forces the JDK to compute `BigInteger.pow(10, 9999999)`,
materializing a number with ~10 million digits via repeated multiplication — an
operation that takes a very long time and effectively hangs the task.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Unit tests in this PR.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #54726 from stevomitric/stevomitric/fix-xml-parser.
Lead-authored-by: Stevo Mitric <[email protected]>
Co-authored-by: Stevo Mitric <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../spark/sql/catalyst/csv/UnivocityParser.scala | 27 ++++++++++++++--------
.../spark/sql/catalyst/xml/StaxXmlParser.scala | 20 ++++++++++------
.../org/apache/spark/sql/CsvFunctionsSuite.scala | 14 +++++++++++
.../datasources/xml/XmlVariantSuite.scala | 7 ++++++
4 files changed, 51 insertions(+), 17 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
index 3f0ce387883c..2073c5922fea 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -496,17 +496,24 @@ class UnivocityParser(
def parseDecimal(): DataType = {
try {
var d = decimalParser(s)
- if (d.scale() < 0) {
- d = d.setScale(0)
- }
- if (d.scale() <= VariantUtil.MAX_DECIMAL16_PRECISION &&
- d.precision() <= VariantUtil.MAX_DECIMAL16_PRECISION) {
- builder.appendDecimal(d)
- // The actual decimal type doesn't matter. `appendDecimal` will
use the smallest
- // possible decimal type to store the value.
- DecimalType.USER_DEFAULT
- } else {
+ if (d.scale() < -VariantUtil.MAX_DECIMAL16_PRECISION) {
+ // Scale is so extremely negative that setScale(0) would require
computing
+ // bigTenToThe(|scale|), which is prohibitively expensive. The
resulting precision
+ // would also exceed MAX_DECIMAL16_PRECISION, so fall through to
string.
if (options.preferDate) parseDate() else parseTimestampNTZ()
+ } else {
+ if (d.scale() < 0) {
+ d = d.setScale(0)
+ }
+ if (d.scale() <= VariantUtil.MAX_DECIMAL16_PRECISION &&
+ d.precision() <= VariantUtil.MAX_DECIMAL16_PRECISION) {
+ builder.appendDecimal(d)
+ // The actual decimal type doesn't matter. `appendDecimal` will
use the smallest
+ // possible decimal type to store the value.
+ DecimalType.USER_DEFAULT
+ } else {
+ if (options.preferDate) parseDate() else parseTimestampNTZ()
+ }
}
} catch {
case NonFatal(_) =>
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
index 58038cc189e1..268b39b89d43 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -1284,13 +1284,19 @@ object StaxXmlParser {
val decimalParser = ExprUtils.getDecimalParser(options.locale)
try {
var d = decimalParser(value)
- if (d.scale() < 0) {
- d = d.setScale(0)
- }
- if (d.scale <= VariantUtil.MAX_DECIMAL16_PRECISION &&
- d.precision <= VariantUtil.MAX_DECIMAL16_PRECISION) {
- builder.appendDecimal(d)
- return
+ if (d.scale() < -VariantUtil.MAX_DECIMAL16_PRECISION) {
+ // Scale is so extremely negative that setScale(0) would require
computing
+ // bigTenToThe(|scale|), which is prohibitively expensive. The
resulting precision
+ // would also exceed MAX_DECIMAL16_PRECISION, so fall through to
string.
+ } else {
+ if (d.scale() < 0) {
+ d = d.setScale(0)
+ }
+ if (d.scale <= VariantUtil.MAX_DECIMAL16_PRECISION &&
+ d.precision <= VariantUtil.MAX_DECIMAL16_PRECISION) {
+ builder.appendDecimal(d)
+ return
+ }
}
} catch {
case NonFatal(_) =>
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
index e796ec0f0ea7..f8daf677ddae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala
@@ -836,6 +836,20 @@ class CsvFunctionsSuite extends QueryTest with
SharedSparkSession {
Seq(Row(s"""{null, $largeInput}""")))
}
+ test("from_csv with variant: extreme negative scale decimal does not hang") {
+ // A value like "1E99999" parses to a BigDecimal with scale=-99999.
+ // Calling setScale(0) on it would hang, so it should fall through to
string.
+ val df = Seq("1E99999").toDF("value")
+ checkAnswer(
+ df.select(
+ from_csv(
+ $"value",
+ StructType.fromDDL("a variant"),
+ Map.empty[String, String]
+ ).cast("string")),
+ Seq(Row("""{"1E99999"}""")))
+ }
+
test("SPARK-47497: the input of to_csv must be StructType") {
val df = Seq(1, 2).toDF("value")
checkError(
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
index 5738cd2a9927..d8c09ce5ab53 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlVariantSuite.scala
@@ -80,6 +80,13 @@ class XmlVariantSuite extends QueryTest with
SharedSparkSession with TestXmlData
xml = "<ROW><amount>1e40</amount></ROW>",
expectedJsonStr = """{"amount":"1e40"}"""
)
+ // Extreme negative scale: parsed as String to avoid hanging on
setScale(0).
+ // "1e-99999" parses to a BigDecimal with scale=99999, which is fine
(positive scale).
+ // "1e99999" parses to a BigDecimal with scale=-99999, triggering the
guard.
+ testParser(
+ xml = "<ROW><amount>1e99999</amount></ROW>",
+ expectedJsonStr = """{"amount":"1e99999"}"""
+ )
// Date -> String
testParser(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]