This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e2a0171aeb43 [SPARK-45414][SQL][TESTS] Add regression tests for XML 
mixed type serialization
e2a0171aeb43 is described below

commit e2a0171aeb4370a2903940430707eb74db58d5ab
Author: David Roberts <[email protected]>
AuthorDate: Mon Feb 23 00:43:05 2026 +0800

    [SPARK-45414][SQL][TESTS] Add regression tests for XML mixed type serialization
    
    This adds two regression tests for SPARK-45414 to prevent reintroduction
    of the bug where string tag content was misplaced when writing XML with
    mixed column types (structs, arrays, and strings).
    
    Tests verify:
    - String columns between and after nested types write correctly
    - Attributes mixed with string elements serialize properly
    
    ### What changes were proposed in this pull request?
    Add two regression tests for SPARK-45414 to prevent reintroduction of the
    bug where string tag content could be misplaced when writing XML with
    mixed column types.
    
    ### Why are the changes needed?
    SPARK-45414 reported a bug in the spark-xml library where string content
    was misplaced when writing structs with mixed types. While this bug was
    fixed during the integration of spark-xml into Spark core, there were no
    tests ensuring that this specific scenario remains correct. These tests
    provide coverage for the bug scenario.
    
    ### Does this PR introduce _any_ user-facing change?
    No, this only adds test coverage.
    
    ### How was this patch tested?
    The new tests in `StaxXmlGeneratorSuite` were run successfully.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    Yes, co-authored with Claude Sonnet 4.5.
    
    Generated-by: Claude Sonnet 4.5
    
    Closes #53857 from jdavidroberts/spark-45414-xml-tests.
    
    Lead-authored-by: David Roberts <[email protected]>
    Co-authored-by: jdavidroberts <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../xml/parsers/StaxXmlGeneratorSuite.scala        | 106 +++++++++++++++++++++
 1 file changed, 106 insertions(+)

diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
index 1798d32d8a2c..fefe54c7bae9 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/parsers/StaxXmlGeneratorSuite.scala
@@ -75,4 +75,110 @@ final class StaxXmlGeneratorSuite extends 
SharedSparkSession {
     assert(df.collect().toSeq === newDf.collect().toSeq)
   }
 
+  // SPARK-45414: Test for string tag content misplacement issue (found in 
spark-xml library)
+  // with mixed column types
+  test("SPARK-45414: write mixed types with string columns between and after 
nested types") {
+    import org.apache.spark.sql.Row
+
+    // Schema interleaving string columns with a struct column and two array columns
+    // This reproduces the scenario from SPARK-45414 where string content gets 
misplaced
+    val schema = StructType(
+      Seq(
+        StructField("id", IntegerType, nullable = false),
+        StructField(
+          "metadata",
+          StructType(Seq(StructField("version", StringType), 
StructField("timestamp", LongType))),
+          nullable = true),
+        StructField("description", StringType, nullable = true), // String 
between nested types
+        StructField("tags", ArrayType(StringType), nullable = true),
+        StructField("color", StringType, nullable = true), // String after nested types
+        StructField("numbers", ArrayType(IntegerType), nullable = true)))
+
+    val data = Seq(
+      Row(1, Row("v1.0", 1000L), "MyDescription", Array("tag1", "tag2"), 
"Red", Array(1, 2, 3)),
+      Row(2, Row("v2.0", 2000L), "AnotherDescription", Array("tag3"), "Blue", 
Array(4, 5)))
+
+    val df = spark.createDataFrame(spark.sparkContext.parallelize(data), 
schema)
+
+    // Write the rows as XML under a fresh temp directory, with rowTag set to "item"
+    val targetFile =
+      
Files.createTempDirectory("StaxXmlGeneratorSuite").resolve("mixed-types.xml").toString
+    df.write.option("rowTag", "item").xml(targetFile)
+
+    // Read the output back with the same schema so each field can be checked by name
+    val readDf = spark.read.option("rowTag", 
"item").schema(schema).xml(targetFile)
+    val results = readDf.collect()
+
+    // Both input rows must come back
+    assert(results.length === 2)
+
+    // First row: every value must be in the field it was written from
+    assert(results(0).getAs[Int]("id") === 1)
+    val metadata1 = results(0).getAs[Row]("metadata")
+    assert(metadata1.getAs[String]("version") === "v1.0")
+    assert(metadata1.getAs[Long]("timestamp") === 1000L)
+    // Critical: ensure "MyDescription" is in description field, not in tags 
or color
+    assert(results(0).getAs[String]("description") === "MyDescription")
+    assert(results(0).getAs[Seq[String]]("tags") === Seq("tag1", "tag2"))
+    // Critical: "Red" must be read back from the color field, not a neighbouring one
+    assert(results(0).getAs[String]("color") === "Red")
+    assert(results(0).getAs[Seq[Int]]("numbers") === Seq(1, 2, 3))
+
+    // Second row: same per-field checks with the second row's values
+    assert(results(1).getAs[Int]("id") === 2)
+    val metadata2 = results(1).getAs[Row]("metadata")
+    assert(metadata2.getAs[String]("version") === "v2.0")
+    assert(metadata2.getAs[Long]("timestamp") === 2000L)
+    assert(results(1).getAs[String]("description") === "AnotherDescription")
+    assert(results(1).getAs[Seq[String]]("tags") === Seq("tag3"))
+    assert(results(1).getAs[String]("color") === "Blue")
+    assert(results(1).getAs[Seq[Int]]("numbers") === Seq(4, 5))
+  }
+
+  // SPARK-45414: Round-trip a row whose schema mixes "_"-prefixed fields with elements
+  test("SPARK-45414: write mixed types with attributes and string elements") {
+    import org.apache.spark.sql.Row
+
+    // Schema mixing attribute fields (names with the "_" prefix) and string elements
+    val schema = StructType(
+      Seq(
+        StructField("_id", IntegerType, nullable = false), // attribute
+        StructField(
+          "nested",
+          StructType(
+            Seq(
+              StructField("_attr1", StringType), // attribute
+              StructField("value", StringType))),
+          nullable = true),
+        StructField("description", StringType, nullable = true), // element
+        StructField("items", ArrayType(IntegerType), nullable = true),
+        StructField("name", StringType, nullable = true) // element at end
+      ))
+
+    val data = Seq(Row(100, Row("attrValue", "nestedValue"), "DescText", 
Array(1, 2), "ItemName"))
+
+    val df = spark.createDataFrame(spark.sparkContext.parallelize(data), 
schema)
+
+    val targetFile =
+      
Files.createTempDirectory("StaxXmlGeneratorSuite").resolve("mixed-attrs.xml").toString
+    df.write.option("rowTag", "record").option("attributePrefix", 
"_").xml(targetFile)
+
+    val readDf = spark.read
+      .option("rowTag", "record")
+      .option("attributePrefix", "_")
+      .schema(schema)
+      .xml(targetFile)
+    val results = readDf.collect()
+
+    assert(results.length === 1)
+    assert(results(0).getAs[Int]("_id") === 100)
+    val nested = results(0).getAs[Row]("nested")
+    assert(nested.getAs[String]("_attr1") === "attrValue")
+    assert(nested.getAs[String]("value") === "nestedValue")
+    // Critical: each string element must be read back from its own field
+    assert(results(0).getAs[String]("description") === "DescText")
+    assert(results(0).getAs[Seq[Int]]("items") === Seq(1, 2))
+    assert(results(0).getAs[String]("name") === "ItemName")
+  }
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to