This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 00d2b4fa2de [SPARK-44790][SQL] XML: to_xml implementation and bindings for python, connect and SQL
00d2b4fa2de is described below
commit 00d2b4fa2def948e7517bacfce7c75be6a37eb20
Author: Sandip Agarwala <[email protected]>
AuthorDate: Mon Oct 30 14:00:31 2023 +0900
[SPARK-44790][SQL] XML: to_xml implementation and bindings for python, connect and SQL
### What changes were proposed in this pull request?
to_xml: Converts a `StructType` to an XML output string.
Bindings for python, connect and SQL
### Why are the changes needed?
to_xml: Converts a `StructType` to an XML output string.
### Does this PR introduce _any_ user-facing change?
Yes, new to_xml API.
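For illustration, a minimal usage sketch of the new API (adapted from the docstring example added in this patch; it assumes an active `SparkSession` bound to `spark`, and the column names are illustrative):

```python
from pyspark.sql import Row
from pyspark.sql.functions import to_xml

# A struct column is converted to an XML string, one element per row,
# with the element name controlled by the "rowTag" option.
df = spark.createDataFrame([(1, Row(age=2, name="Alice"))], ("key", "value"))
df.select(to_xml(df.value, {"rowTag": "person"}).alias("xml")).show(truncate=False)
# Each output row is an XML string such as:
# <person>
#     <age>2</age>
#     <name>Alice</name>
# </person>
```

The same function is exposed in SQL (`SELECT to_xml(named_struct('a', 1, 'b', 2))`), in Spark Connect, and in the Scala/Java `functions` API.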
### How was this patch tested?
New unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #43503 from sandip-db/to_xml.
Authored-by: Sandip Agarwala <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../scala/org/apache/spark/sql/functions.scala | 31 +++
.../org/apache/spark/sql/FunctionTestSuite.scala | 6 +
.../source/reference/pyspark.sql/functions.rst | 1 +
python/pyspark/sql/connect/functions.py | 10 +
python/pyspark/sql/functions.py | 36 +++
.../sql/tests/connect/test_connect_function.py | 14 +
sql/catalyst/pom.xml | 4 +
.../sql/catalyst/analysis/FunctionRegistry.scala | 3 +-
.../sql/catalyst/expressions/xmlExpressions.scala | 90 ++++++-
.../spark/sql/catalyst/xml/StaxXmlGenerator.scala | 295 ++++++++++++---------
.../apache/spark/sql/catalyst/xml/XmlOptions.scala | 5 +
sql/core/pom.xml | 4 -
.../datasources/xml/XmlOutputWriter.scala | 53 +---
.../scala/org/apache/spark/sql/functions.scala | 30 +++
.../sql-functions/sql-expression-schema.md | 1 +
.../analyzer-results/xml-functions.sql.out | 122 +++++++++
.../resources/sql-tests/inputs/xml-functions.sql | 18 +-
.../sql-tests/results/xml-functions.sql.out | 134 ++++++++++
.../sql/execution/datasources/xml/XmlSuite.scala | 25 ++
19 files changed, 696 insertions(+), 186 deletions(-)
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
index 9c5adca7e28..1c8f5993d29 100644
--- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -7470,6 +7470,37 @@ object functions {
fnWithOptions("schema_of_xml", options.asScala.iterator, xml)
}
+ // scalastyle:off line.size.limit
+
+ /**
+ * (Java-specific) Converts a column containing a `StructType` into a XML string with the
+ * specified schema. Throws an exception, in the case of an unsupported type.
+ *
+ * @param e
+ * a column containing a struct.
+ * @param options
+ * options to control how the struct column is converted into a XML string. It accepts the
+ * same options as the XML data source. See <a href=
+ * "https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option"> Data
+ * Source Option</a> in the version you use.
+ * @group xml_funcs
+ * @since 4.0.0
+ */
+ // scalastyle:on line.size.limit
+ def to_xml(e: Column, options: java.util.Map[String, String]): Column =
+ fnWithOptions("to_xml", options.asScala.iterator, e)
+
+ /**
+ * Converts a column containing a `StructType` into a XML string with the specified schema.
+ * Throws an exception, in the case of an unsupported type.
+ *
+ * @param e
+ * a column containing a struct.
+ * @group xml_funcs
+ * @since 4.0.0
+ */
+ def to_xml(e: Column): Column = to_xml(e, Collections.emptyMap())
+
/**
* Returns the total number of elements in the array. The function returns null for null input.
*
diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala
index e350bde9946..748843ec991 100644
--- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala
+++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala
@@ -237,6 +237,12 @@ class FunctionTestSuite extends ConnectFunSuite {
from_xml(a, schema, Map.empty[String, String].asJava),
from_xml(a, schema, Collections.emptyMap[String, String]),
from_xml(a, lit(schema.json), Collections.emptyMap[String, String]))
+ testEquals(
+ "schema_of_xml",
+ schema_of_xml(lit("<p><a>1.0</a><b>test</b></p>")),
+ schema_of_xml("<p><a>1.0</a><b>test</b></p>"),
+ schema_of_xml(lit("<p><a>1.0</a><b>test</b></p>"), Collections.emptyMap()))
+ testEquals("to_xml", to_xml(a), to_xml(a, Collections.emptyMap[String, String]))
testEquals(
"from_avro",
diff --git a/python/docs/source/reference/pyspark.sql/functions.rst b/python/docs/source/reference/pyspark.sql/functions.rst
index 5e05dac7bc3..4dc10cc1556 100644
--- a/python/docs/source/reference/pyspark.sql/functions.rst
+++ b/python/docs/source/reference/pyspark.sql/functions.rst
@@ -547,6 +547,7 @@ XML Functions
from_xml
schema_of_xml
+ to_xml
xpath
xpath_boolean
xpath_double
diff --git a/python/pyspark/sql/connect/functions.py b/python/pyspark/sql/connect/functions.py
index f065e5391fe..38eb814247c 100644
--- a/python/pyspark/sql/connect/functions.py
+++ b/python/pyspark/sql/connect/functions.py
@@ -2200,6 +2200,16 @@ def to_json(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Co
to_json.__doc__ = pysparkfuncs.to_json.__doc__
+def to_xml(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column:
+ if options is None:
+ return _invoke_function("to_xml", _to_col(col))
+ else:
+ return _invoke_function("to_xml", _to_col(col), _options_to_col(options))
+
+
+to_xml.__doc__ = pysparkfuncs.to_xml.__doc__
+
+
def transform(
col: "ColumnOrName",
f: Union[Callable[[Column], Column], Callable[[Column, Column], Column]],
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 05c22685b09..869506a3558 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -13678,6 +13678,42 @@ def schema_of_xml(xml: "ColumnOrName", options: Optional[Dict[str, str]] = None)
return _invoke_function("schema_of_xml", col, _options_to_str(options))
+@_try_remote_functions
+def to_xml(col: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column:
+ """
+ Converts a column containing a :class:`StructType` into a XML string.
+ Throws an exception, in the case of an unsupported type.
+
+ .. versionadded:: 4.0.0
+
+ Parameters
+ ----------
+ col : :class:`~pyspark.sql.Column` or str
+ name of column containing a struct.
+ options: dict, optional
+ options to control converting. accepts the same options as the XML datasource.
+ See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option>`_
+ for the version you use.
+
+ .. # noqa
+
+ Returns
+ -------
+ :class:`~pyspark.sql.Column`
+ a XML string converted from given :class:`StructType`.
+
+ Examples
+ --------
+ >>> from pyspark.sql import Row
+ >>> data = [(1, Row(age=2, name='Alice'))]
+ >>> df = spark.createDataFrame(data, ("key", "value"))
+ >>> df.select(to_xml(df.value, {'rowTag':'person'}).alias("xml")).collect()
+ [Row(xml='<person>\\n    <age>2</age>\\n    <name>Alice</name>\\n</person>')]
+ """
+
+ return _invoke_function("to_xml", _to_java_column(col), _options_to_str(options))
+
+
@_try_remote_functions
def schema_of_csv(csv: "ColumnOrName", options: Optional[Dict[str, str]] = None) -> Column:
"""
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py
index bc0cf162648..9adae0f6f75 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -1879,6 +1879,10 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S
cdf.select(CF.from_xml("b", schema, {"mode": "FAILFAST"})),
sdf.select(SF.from_xml("b", schema, {"mode": "FAILFAST"})),
)
+ self.compare_by_show(
+ sdf.select(SF.to_xml(SF.struct(SF.from_xml("b", schema)), {"rowTag": "person"})),
+ sdf.select(SF.to_xml(SF.struct(SF.from_xml("b", schema)), {"rowTag": "person"})),
+ )
c_schema = CF.schema_of_xml(CF.lit("""<p><a>1</a></p>"""))
s_schema = SF.schema_of_xml(SF.lit("""<p><a>1</a></p>"""))
@@ -1923,6 +1927,16 @@ class SparkConnectFunctionTests(ReusedConnectTestCase, PandasOnSparkTestUtils, S
).toPandas(),
)
+ # test to_xml
+ self.compare_by_show(
+ cdf.select(CF.to_xml(CF.struct(CF.lit("a"), CF.lit("b")))),
+ sdf.select(SF.to_xml(SF.struct(SF.lit("a"), SF.lit("b")))),
+ )
+ self.compare_by_show(
+ cdf.select(CF.to_xml(CF.struct(CF.lit("a"), CF.lit("b")), {"mode": "FAILFAST"})),
+ sdf.select(SF.to_xml(SF.struct(SF.lit("a"), SF.lit("b")), {"mode": "FAILFAST"})),
+ )
+
def test_string_functions_one_arg(self):
query = """
SELECT * FROM VALUES
diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml
index 8f2b9ccffeb..e7f8cbe0fe6 100644
--- a/sql/catalyst/pom.xml
+++ b/sql/catalyst/pom.xml
@@ -113,6 +113,10 @@
<groupId>org.apache.ws.xmlschema</groupId>
<artifactId>xmlschema-core</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.glassfish.jaxb</groupId>
+ <artifactId>txw2</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.datasketches</groupId>
<artifactId>datasketches-java</artifactId>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
index 8be3199ef9b..1449764cdd5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala
@@ -834,7 +834,8 @@ object FunctionRegistry {
// Xml
expression[XmlToStructs]("from_xml"),
- expression[SchemaOfXml]("schema_of_xml")
+ expression[SchemaOfXml]("schema_of_xml"),
+ expression[StructsToXml]("to_xml")
)
val builtin: SimpleFunctionRegistry = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
index df63429ae33..047b669fc89 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
@@ -16,13 +16,15 @@
*/
package org.apache.spark.sql.catalyst.expressions
+import java.io.CharArrayWriter
+
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionDescription, ExprUtils, NullIntolerant, TimeZoneAwareExpression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.util.{ArrayData, FailFastMode, FailureSafeParser, GenericArrayData, PermissiveMode}
-import org.apache.spark.sql.catalyst.xml.{StaxXmlParser, ValidatorUtil, XmlInferSchema, XmlOptions}
+import org.apache.spark.sql.catalyst.xml.{StaxXmlGenerator, StaxXmlParser, ValidatorUtil, XmlInferSchema, XmlOptions}
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
@@ -227,3 +229,89 @@ case class SchemaOfXml(
override protected def withNewChildInternal(newChild: Expression): SchemaOfXml =
copy(child = newChild)
}
+
+/**
+ * Converts a [[StructType]] to a XML output string.
+ */
+// scalastyle:off line.size.limit
+@ExpressionDescription(
+ usage = "_FUNC_(expr[, options]) - Returns a XML string with a given struct value",
+ examples = """
+ Examples:
+ > SELECT _FUNC_(named_struct('a', 1, 'b', 2));
+ <ROW>
+ <a>1</a>
+ <b>2</b>
+ </ROW>
+ > SELECT _FUNC_(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy'));
+ <ROW>
+ <time>26/08/2015</time>
+ </ROW>
+ """,
+ since = "4.0.0",
+ group = "xml_funcs")
+// scalastyle:on line.size.limit
+case class StructsToXml(
+ options: Map[String, String],
+ child: Expression,
+ timeZoneId: Option[String] = None)
+ extends UnaryExpression
+ with TimeZoneAwareExpression
+ with CodegenFallback
+ with ExpectsInputTypes
+ with NullIntolerant {
+ override def nullable: Boolean = true
+
+ def this(options: Map[String, String], child: Expression) = this(options, child, None)
+
+ // Used in `FunctionRegistry`
+ def this(child: Expression) = this(Map.empty, child, None)
+
+ def this(child: Expression, options: Expression) =
+ this(
+ options = ExprUtils.convertToMapData(options),
+ child = child,
+ timeZoneId = None)
+
+ @transient
+ lazy val writer = new CharArrayWriter()
+
+ @transient
+ lazy val inputSchema: StructType = child.dataType match {
+ case st: StructType => st
+ case other =>
+ throw new IllegalArgumentException(s"Unsupported input type ${other.catalogString}")
+ }
+
+ @transient
+ lazy val gen = new StaxXmlGenerator(
+ inputSchema, writer, new XmlOptions(options, timeZoneId.get), false)
+
+ // This converts rows to the XML output according to the given schema.
+ @transient
+ lazy val converter: Any => UTF8String = {
+ def getAndReset(): UTF8String = {
+ gen.flush()
+ val xmlString = writer.toString
+ writer.reset()
+ UTF8String.fromString(xmlString)
+ }
+ (row: Any) =>
+ gen.write(row.asInstanceOf[InternalRow])
+ getAndReset()
+ }
+
+ override def dataType: DataType = StringType
+
+ override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
+ copy(timeZoneId = Option(timeZoneId))
+
+ override def nullSafeEval(value: Any): Any = converter(value)
+
+ override def inputTypes: Seq[AbstractDataType] = StructType :: Nil
+
+ override def prettyName: String = "to_xml"
+
+ override protected def withNewChildInternal(newChild: Expression): StructsToXml =
+ copy(child = newChild)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala
index f1cbc8996b0..4477cf50823 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlGenerator.scala
@@ -16,164 +16,201 @@
*/
package org.apache.spark.sql.catalyst.xml
+import java.io.Writer
import java.sql.Timestamp
-import javax.xml.stream.XMLStreamWriter
+import javax.xml.stream.XMLOutputFactory
import scala.collection.Map
+import com.sun.xml.txw2.output.IndentingXMLStreamWriter
+import org.apache.hadoop.shaded.com.ctc.wstx.api.WstxOutputProperties
+
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
-// This class is borrowed from Spark json datasource.
-private[sql] object StaxXmlGenerator {
+class StaxXmlGenerator(
+ schema: StructType,
+ writer: Writer,
+ options: XmlOptions,
+ validateStructure: Boolean = true) {
+
+ require(options.attributePrefix.nonEmpty,
+ "'attributePrefix' option should not be empty string.")
+ private val indentDisabled = options.indent == ""
+
+ private val gen = {
+ val factory = XMLOutputFactory.newInstance()
+ // to_xml disables structure validation to allow multiple root tags
+ factory.setProperty(WstxOutputProperties.P_OUTPUT_VALIDATE_STRUCTURE, validateStructure)
+ val xmlWriter = factory.createXMLStreamWriter(writer)
+ if (!indentDisabled) {
+ val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter)
+ indentingXmlWriter.setIndentStep(options.indent)
+ indentingXmlWriter
+ } else {
+ xmlWriter
+ }
+ }
+
+ private var rootElementWritten: Boolean = false
+ def writeDeclaration(): Unit = {
+ // Allow a root tag to be like "rootTag foo='bar'"
+ // This is hacky; won't deal correctly with spaces in attributes, but want
+ // to make this at least work for simple cases without much complication
+ val rootTagTokens = options.rootTag.split(" ")
+ val rootElementName = rootTagTokens.head
+ val rootAttributes: Map[String, String] =
+ if (rootTagTokens.length > 1) {
+ rootTagTokens.tail.map { kv =>
+ val Array(k, v) = kv.split("=")
+ k -> v.replaceAll("['\"]", "")
+ }.toMap
+ } else {
+ Map.empty
+ }
+ val declaration = options.declaration
+ if (declaration != null && declaration.nonEmpty) {
+ gen.writeProcessingInstruction("xml", declaration)
+ gen.writeCharacters("\n")
+ }
+ gen.writeStartElement(rootElementName)
+ rootAttributes.foreach { case (k, v) =>
+ gen.writeAttribute(k, v)
+ }
+ if (indentDisabled) {
+ gen.writeCharacters("\n")
+ }
+ rootElementWritten = true
+ }
+
+ def flush(): Unit = gen.flush()
+
+ def close(): Unit = {
+ if (rootElementWritten) {
+ gen.writeEndElement()
+ gen.close()
+ }
+ writer.close()
+ }
/**
* Transforms a single Row to XML
*
- * @param schema
- * the schema object used for conversion
- * @param writer
- * a XML writer object
- * @param options
- * options for XML datasource.
* @param row
- * The row to convert
+ * The row to convert
*/
- def apply(schema: StructType, writer: XMLStreamWriter, options: XmlOptions)(
- row: InternalRow): Unit = {
-
- require(
- options.attributePrefix.nonEmpty,
- "'attributePrefix' option should not be empty string.")
+ def write(row: InternalRow): Unit = {
+ writeChildElement(options.rowTag, schema, row)
+ if (indentDisabled) {
+ gen.writeCharacters("\n")
+ }
+ }
- def writeChildElement(name: String, dt: DataType, v: Any): Unit = (name, dt, v) match {
+ def writeChildElement(name: String, dt: DataType, v: Any): Unit = (name, dt, v) match {
+ // If this is meant to be value but in no child, write only a value
+ case (_, _, null) | (_, NullType, _) if options.nullValue == null =>
+ // Because usually elements having `null` do not exist, just do not write
+ // elements when given values are `null`.
+ case (_, _, _) if name == options.valueTag =>
// If this is meant to be value but in no child, write only a value
- case (_, _, null) | (_, NullType, _) if options.nullValue == null =>
- // Because usually elements having `null` do not exist, just do not write
- // elements when given values are `null`.
- case (_, _, _) if name == options.valueTag =>
- // If this is meant to be value but in no child, write only a value
- writeElement(dt, v, options)
- case (_, _, _) =>
- writer.writeStartElement(name)
- writeElement(dt, v, options)
- writer.writeEndElement()
- }
+ writeElement(dt, v, options)
+ case (_, _, _) =>
+ gen.writeStartElement(name)
+ writeElement(dt, v, options)
+ gen.writeEndElement()
+ }
- def writeChild(name: String, dt: DataType, v: Any): Unit = {
- (dt, v) match {
- // If this is meant to be attribute, write an attribute
- case (_, null) | (NullType, _)
- if name.startsWith(options.attributePrefix) && name != options.valueTag =>
- Option(options.nullValue).foreach {
- writer.writeAttribute(name.substring(options.attributePrefix.length), _)
- }
- case _ if name.startsWith(options.attributePrefix) && name != options.valueTag =>
- writer.writeAttribute(name.substring(options.attributePrefix.length), v.toString)
-
- // For ArrayType, we just need to write each as XML element.
- case (ArrayType(ty, _), v: ArrayData) =>
- (0 until v.numElements()).foreach { i =>
- writeChildElement(name, ty, v.get(i, ty))
- }
- // For other datatypes, we just write normal elements.
- case _ =>
- writeChildElement(name, dt, v)
- }
- }
+ def writeChild(name: String, dt: DataType, v: Any): Unit = {
+ (dt, v) match {
+ // If this is meant to be attribute, write an attribute
+ case (_, null) | (NullType, _)
+ if name.startsWith(options.attributePrefix) && name != options.valueTag =>
+ Option(options.nullValue).foreach {
+ gen.writeAttribute(name.substring(options.attributePrefix.length), _)
+ }
+ case _ if name.startsWith(options.attributePrefix) && name != options.valueTag =>
+ gen.writeAttribute(name.substring(options.attributePrefix.length), v.toString)
- def writeElement(dt: DataType, v: Any, options: XmlOptions): Unit = (dt, v) match {
- case (_, null) | (NullType, _) => writer.writeCharacters(options.nullValue)
- case (StringType, v: UTF8String) => writer.writeCharacters(v.toString)
- case (StringType, v: String) => writer.writeCharacters(v)
- case (TimestampType, v: Timestamp) =>
- writer.writeCharacters(options.timestampFormatterInWrite.format(v.toInstant()))
- case (TimestampType, v: Long) =>
- writer.writeCharacters(options.timestampFormatterInWrite.format(v))
- case (DateType, v: Int) =>
- writer.writeCharacters(options.dateFormatterInWrite.format(v))
- case (IntegerType, v: Int) => writer.writeCharacters(v.toString)
- case (ShortType, v: Short) => writer.writeCharacters(v.toString)
- case (FloatType, v: Float) => writer.writeCharacters(v.toString)
- case (DoubleType, v: Double) => writer.writeCharacters(v.toString)
- case (LongType, v: Long) => writer.writeCharacters(v.toString)
- case (DecimalType(), v: java.math.BigDecimal) => writer.writeCharacters(v.toString)
- case (DecimalType(), v: Decimal) => writer.writeCharacters(v.toString)
- case (ByteType, v: Byte) => writer.writeCharacters(v.toString)
- case (BooleanType, v: Boolean) => writer.writeCharacters(v.toString)
-
- // For the case roundtrip in reading and writing XML files, [[ArrayType]] cannot have
- // [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
- // this case only can happen when we convert a normal [[DataFrame]] to XML file.
- // When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is element name
- // for XML file.
+ // For ArrayType, we just need to write each as XML element.
case (ArrayType(ty, _), v: ArrayData) =>
(0 until v.numElements()).foreach { i =>
- writeChild(options.arrayElementName, ty, v.get(i, ty))
+ writeChildElement(name, ty, v.get(i, ty))
}
+ // For other datatypes, we just write normal elements.
+ case _ =>
+ writeChildElement(name, dt, v)
+ }
+ }
- case (MapType(_, vt, _), mv: Map[_, _]) =>
- val (attributes, elements) = mv.toSeq.partition { case (f, _) =>
- f.toString.startsWith(options.attributePrefix) && f.toString != options.valueTag
- }
- // We need to write attributes first before the value.
- (attributes ++ elements).foreach { case (k, v) =>
- writeChild(k.toString, vt, v)
- }
+ def writeElement(dt: DataType, v: Any, options: XmlOptions): Unit = (dt, v) match {
+ case (_, null) | (NullType, _) => gen.writeCharacters(options.nullValue)
+ case (StringType, v: UTF8String) => gen.writeCharacters(v.toString)
+ case (StringType, v: String) => gen.writeCharacters(v)
+ case (TimestampType, v: Timestamp) =>
+ gen.writeCharacters(options.timestampFormatterInWrite.format(v.toInstant()))
+ case (TimestampType, v: Long) =>
+ gen.writeCharacters(options.timestampFormatterInWrite.format(v))
+ case (DateType, v: Int) =>
+ gen.writeCharacters(options.dateFormatterInWrite.format(v))
+ case (IntegerType, v: Int) => gen.writeCharacters(v.toString)
+ case (ShortType, v: Short) => gen.writeCharacters(v.toString)
+ case (FloatType, v: Float) => gen.writeCharacters(v.toString)
+ case (DoubleType, v: Double) => gen.writeCharacters(v.toString)
+ case (LongType, v: Long) => gen.writeCharacters(v.toString)
+ case (DecimalType(), v: java.math.BigDecimal) => gen.writeCharacters(v.toString)
+ case (DecimalType(), v: Decimal) => gen.writeCharacters(v.toString)
+ case (ByteType, v: Byte) => gen.writeCharacters(v.toString)
+ case (BooleanType, v: Boolean) => gen.writeCharacters(v.toString)
+
+ // For the case roundtrip in reading and writing XML files, [[ArrayType]] cannot have
+ // [[ArrayType]] as element type. It always wraps the element with [[StructType]]. So,
+ // this case only can happen when we convert a normal [[DataFrame]] to XML file.
+ // When [[ArrayType]] has [[ArrayType]] as elements, it is confusing what is element name
+ // for XML file.
+ case (ArrayType(ty, _), v: ArrayData) =>
+ (0 until v.numElements()).foreach { i =>
+ writeChild(options.arrayElementName, ty, v.get(i, ty))
+ }
- case (mt: MapType, mv: MapData) => writeMapData(mt, mv)
+ case (MapType(_, vt, _), mv: Map[_, _]) =>
+ val (attributes, elements) = mv.toSeq.partition { case (f, _) =>
+ f.toString.startsWith(options.attributePrefix) && f.toString != options.valueTag
+ }
+ // We need to write attributes first before the value.
+ (attributes ++ elements).foreach { case (k, v) =>
+ writeChild(k.toString, vt, v)
+ }
- case (st: StructType, r: InternalRow) =>
- val (attributes, elements) = st.zip(r.toSeq(st)).partition { case (f, _) =>
- f.name.startsWith(options.attributePrefix) && f.name != options.valueTag
- }
- // We need to write attributes first before the value.
- (attributes ++ elements).foreach { case (field, value) =>
- writeChild(field.name, field.dataType, value)
- }
+ case (mt: MapType, mv: MapData) => writeMapData(mt, mv)
- case (_, _) =>
- throw new IllegalArgumentException(
- s"Failed to convert value $v (class of ${v.getClass}) in type $dt to XML.")
- }
-
- def writeMapData(mapType: MapType, map: MapData): Unit = {
- val keyArray = map.keyArray()
- val valueArray = map.valueArray()
- // write attributes first
- Seq(true, false).foreach { writeAttribute =>
- (0 until map.numElements()).foreach { i =>
- val key = keyArray.get(i, mapType.keyType).toString
- val isAttribute = key.startsWith(options.attributePrefix) && key != options.valueTag
- if (writeAttribute == isAttribute) {
- writeChild(key, mapType.valueType, valueArray.get(i, mapType.valueType))
- }
- }
+ case (st: StructType, r: InternalRow) =>
+ val (attributes, elements) = st.zip(r.toSeq(st)).partition { case (f, _) =>
+ f.name.startsWith(options.attributePrefix) && f.name != options.valueTag
+ }
+ // We need to write attributes first before the value.
+ (attributes ++ elements).foreach { case (field, value) =>
+ writeChild(field.name, field.dataType, value)
}
- }
- val rowSeq = row.toSeq(schema)
- val (attributes, elements) = schema.zip(rowSeq).partition { case (f, _) =>
- f.name.startsWith(options.attributePrefix) && f.name != options.valueTag
- }
- // Writing attributes
- writer.writeStartElement(options.rowTag)
- attributes.foreach {
- case (f, v) if v == null || f.dataType == NullType =>
- Option(options.nullValue).foreach {
- writer.writeAttribute(f.name.substring(options.attributePrefix.length), _)
+ case (_, _) =>
+ throw new IllegalArgumentException(
+ s"Failed to convert value $v (class of ${v.getClass}) in type $dt to XML.")
+ }
+
+ def writeMapData(mapType: MapType, map: MapData): Unit = {
+ val keyArray = map.keyArray()
+ val valueArray = map.valueArray()
+ // write attributes first
+ Seq(true, false).foreach { writeAttribute =>
+ (0 until map.numElements()).foreach { i =>
+ val key = keyArray.get(i, mapType.keyType).toString
+ val isAttribute = key.startsWith(options.attributePrefix) && key != options.valueTag
+ if (writeAttribute == isAttribute) {
+ writeChild(key, mapType.valueType, valueArray.get(i, mapType.valueType))
}
- case (f, v) =>
- writer.writeAttribute(f.name.substring(options.attributePrefix.length), v.toString)
+ }
}
- // Writing elements
- val (names, values) = elements.unzip
- val elementSchema = StructType(schema.filter(names.contains))
-
- val elementRow = InternalRow.fromSeq(rowSeq.filter(values.contains))
- writeElement(elementSchema, elementRow, options)
- writer.writeEndElement()
}
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
index 763aa877ca0..7d049fdd82b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala
@@ -100,6 +100,9 @@ private[sql] class XmlOptions(
val wildcardColName =
parameters.getOrElse(WILDCARD_COL_NAME, XmlOptions.DEFAULT_WILDCARD_COL_NAME)
val ignoreNamespace = getBool(IGNORE_NAMESPACE, false)
+ // setting indent to "" disables indentation in the generated XML.
+ // Each row will be written in a new line.
+ val indent = parameters.getOrElse(INDENT, DEFAULT_INDENT)
/**
* Infer columns with all valid date entries as date type (otherwise inferred as string or
@@ -198,6 +201,7 @@ private[sql] object XmlOptions extends DataSourceOptions {
val DEFAULT_CHARSET: String = StandardCharsets.UTF_8.name
val DEFAULT_NULL_VALUE: String = null
val DEFAULT_WILDCARD_COL_NAME = "xs_any"
+ val DEFAULT_INDENT = " "
val ROW_TAG = newOption("rowTag")
val ROOT_TAG = newOption("rootTag")
val DECLARATION = newOption("declaration")
@@ -222,6 +226,7 @@ private[sql] object XmlOptions extends DataSourceOptions {
val DATE_FORMAT = newOption("dateFormat")
val TIMESTAMP_FORMAT = newOption("timestampFormat")
val TIME_ZONE = newOption("timeZone")
+ val INDENT = newOption("indent")
// Options with alternative
val ENCODING = "encoding"
val CHARSET = "charset"
diff --git a/sql/core/pom.xml b/sql/core/pom.xml
index b2b8398c9d5..8fabfd4699d 100644
--- a/sql/core/pom.xml
+++ b/sql/core/pom.xml
@@ -145,10 +145,6 @@
<groupId>org.apache.ws.xmlschema</groupId>
<artifactId>xmlschema-core</artifactId>
</dependency>
- <dependency>
- <groupId>org.glassfish.jaxb</groupId>
- <artifactId>txw2</artifactId>
- </dependency>
<dependency>
<groupId>org.apache.xbean</groupId>
<artifactId>xbean-asm9-shaded</artifactId>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlOutputWriter.scala
index cde866dcedf..ac3dfb287ad 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlOutputWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlOutputWriter.scala
@@ -17,15 +17,13 @@
package org.apache.spark.sql.execution.datasources.xml
import java.nio.charset.Charset
-import javax.xml.stream.XMLOutputFactory
-import com.sun.xml.txw2.output.IndentingXMLStreamWriter
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.internal.Logging
-import org.apache.spark.sql.catalyst.{xml, InternalRow}
-import org.apache.spark.sql.catalyst.xml.XmlOptions
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.xml.{StaxXmlGenerator, XmlOptions}
import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriter}
import org.apache.spark.sql.types.StructType
@@ -35,59 +33,20 @@ class XmlOutputWriter(
context: TaskAttemptContext,
options: XmlOptions) extends OutputWriter with Logging {
- private val DEFAULT_INDENT = " "
private val charset = Charset.forName(options.charset)
private val writer = CodecStreams.createOutputStreamWriter(context, new Path(path), charset)
- private val factory = XMLOutputFactory.newInstance()
- private val xmlWriter = factory.createXMLStreamWriter(writer)
- private val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter)
- indentingXmlWriter.setIndentStep(DEFAULT_INDENT)
- // Allow a root tag to be like "rootTag foo='bar'"
- // This is hacky; won't deal correctly with spaces in attributes, but want
- // to make this at least work for simple cases without much complication
- private val rootTagTokens = options.rootTag.split(" ")
- private val rootElementName = rootTagTokens.head
- private val rootAttributes: Map[String, String] =
- if (rootTagTokens.length > 1) {
- rootTagTokens.tail.map { kv =>
- val Array(k, v) = kv.split("=")
- k -> v.replaceAll("['\"]", "")
- }.toMap
- } else {
- Map.empty
- }
- private val declaration = options.declaration
-
-
- // private val gen = new UnivocityGenerator(dataSchema, writer, params)
+ private val gen = new StaxXmlGenerator(dataSchema, writer, options)
private var firstRow: Boolean = true
-
override def write(row: InternalRow): Unit = {
if (firstRow) {
- if (declaration != null && declaration.nonEmpty) {
- indentingXmlWriter.writeProcessingInstruction("xml", declaration)
- indentingXmlWriter.writeCharacters("\n")
- }
- indentingXmlWriter.writeStartElement(rootElementName)
- rootAttributes.foreach { case (k, v) =>
- indentingXmlWriter.writeAttribute(k, v)
- }
+ gen.writeDeclaration()
firstRow = false
}
- xml.StaxXmlGenerator(
- dataSchema,
- indentingXmlWriter,
- options)(row)
+ gen.write(row)
}
- override def close(): Unit = {
- if (!firstRow) {
- indentingXmlWriter.writeEndElement()
- indentingXmlWriter.close()
- }
- writer.close()
- }
+ override def close(): Unit = gen.close()
}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 34e18cdf27a..b5e40fe35cf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -7218,6 +7218,36 @@ object functions {
withExpr(SchemaOfXml(xml.expr, options.asScala.toMap))
}
+ // scalastyle:off line.size.limit
+
+ /**
+ * (Java-specific) Converts a column containing a `StructType` into a XML string with
+ * the specified schema. Throws an exception, in the case of an unsupported type.
+ *
+ * @param e a column containing a struct.
+ * @param options options to control how the struct column is converted into a XML string.
+ * It accepts the same options as the XML data source.
+ * See
+ * <a href=
+ * "https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option">
+ * Data Source Option</a> in the version you use.
+ * @group xml_funcs
+ * @since 4.0.0
+ */
+ // scalastyle:on line.size.limit
+ def to_xml(e: Column, options: java.util.Map[String, String]): Column =
+ fnWithOptions("to_xml", options.asScala.iterator, e)
+
+ /**
+ * Converts a column containing a `StructType` into a XML string with the specified schema.
+ * Throws an exception, in the case of an unsupported type.
+ *
+ * @param e a column containing a struct.
+ * @group xml_funcs
+ * @since 4.0.0
+ */
+ def to_xml(e: Column): Column = to_xml(e, Map.empty[String, String].asJava)
+
/**
* A transform for timestamps and dates to partition data into years.
*
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 42907b52cda..017cc474ea0 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -318,6 +318,7 @@
| org.apache.spark.sql.catalyst.expressions.StringTrimRight | rtrim | SELECT rtrim(' SparkSQL ') | struct<rtrim( SparkSQL ):string> |
| org.apache.spark.sql.catalyst.expressions.StructsToCsv | to_csv | SELECT to_csv(named_struct('a', 1, 'b', 2)) | struct<to_csv(named_struct(a, 1, b, 2)):string> |
| org.apache.spark.sql.catalyst.expressions.StructsToJson | to_json | SELECT to_json(named_struct('a', 1, 'b', 2)) | struct<to_json(named_struct(a, 1, b, 2)):string> |
+| org.apache.spark.sql.catalyst.expressions.StructsToXml | to_xml | SELECT to_xml(named_struct('a', 1, 'b', 2)) | struct<to_xml(named_struct(a, 1, b, 2)):string> |
| org.apache.spark.sql.catalyst.expressions.Substring | substr | SELECT substr('Spark SQL', 5) | struct<substr(Spark SQL, 5, 2147483647):string> |
| org.apache.spark.sql.catalyst.expressions.Substring | substring | SELECT substring('Spark SQL', 5) | struct<substring(Spark SQL, 5, 2147483647):string> |
| org.apache.spark.sql.catalyst.expressions.SubstringIndex | substring_index | SELECT substring_index('www.apache.org', '.', 2) | struct<substring_index(www.apache.org, ., 2):string> |
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out
index e62f4aab344..51cf3d976f6 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/xml-functions.sql.out
@@ -1,4 +1,126 @@
-- Automatically generated by SQLQueryTestSuite
+-- !query
+select to_xml(named_struct('a', 1, 'b', 2), map('indent', ''))
+-- !query analysis
+Project [to_xml((indent,), named_struct(a, 1, b, 2), Some(America/Los_Angeles)) AS to_xml(named_struct(a, 1, b, 2))#x]
++- OneRowRelation
+
+
+-- !query
+select to_xml(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy', 'indent', ''))
+-- !query analysis
+Project [to_xml((timestampFormat,dd/MM/yyyy), (indent,), named_struct(time, to_timestamp(2015-08-26, Some(yyyy-MM-dd), TimestampType, Some(America/Los_Angeles), false)), Some(America/Los_Angeles)) AS to_xml(named_struct(time, to_timestamp(2015-08-26, yyyy-MM-dd)))#x]
++- OneRowRelation
+
+
+-- !query
+select to_xml(array(named_struct('a', 1, 'b', 2)))
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
+ "sqlState" : "42K09",
+ "messageParameters" : {
+ "inputSql" : "\"array(named_struct(a, 1, b, 2))\"",
+ "inputType" : "\"ARRAY<STRUCT<a: INT, b: INT>>\"",
+ "paramIndex" : "1",
+ "requiredType" : "\"STRUCT\"",
+ "sqlExpr" : "\"to_xml(array(named_struct(a, 1, b, 2)))\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 50,
+ "fragment" : "to_xml(array(named_struct('a', 1, 'b', 2)))"
+ } ]
+}
+
+
+-- !query
+select to_xml(map('a', 1))
+-- !query analysis
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
+ "sqlState" : "42K09",
+ "messageParameters" : {
+ "inputSql" : "\"map(a, 1)\"",
+ "inputType" : "\"MAP<STRING, INT>\"",
+ "paramIndex" : "1",
+ "requiredType" : "\"STRUCT\"",
+ "sqlExpr" : "\"to_xml(map(a, 1))\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 26,
+ "fragment" : "to_xml(map('a', 1))"
+ } ]
+}
+
+
+-- !query
+select to_xml(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE'))
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+ "errorClass" : "INVALID_OPTIONS.NON_MAP_FUNCTION",
+ "sqlState" : "42K06",
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 79,
+ "fragment" : "to_xml(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE'))"
+ } ]
+}
+
+
+-- !query
+select to_xml(named_struct('a', 1, 'b', 2), map('mode', 1))
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+ "errorClass" : "INVALID_OPTIONS.NON_STRING_TYPE",
+ "sqlState" : "42K06",
+ "messageParameters" : {
+ "mapType" : "\"MAP<STRING, INT>\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 59,
+ "fragment" : "to_xml(named_struct('a', 1, 'b', 2), map('mode', 1))"
+ } ]
+}
+
+
+-- !query
+select to_xml()
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+ "errorClass" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
+ "sqlState" : "42605",
+ "messageParameters" : {
+ "actualNum" : "0",
+ "docroot" : "https://spark.apache.org/docs/latest",
+ "expectedNum" : "[1, 2]",
+ "functionName" : "`to_xml`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 15,
+ "fragment" : "to_xml()"
+ } ]
+}
+
+
-- !query
select from_xml('<p><a>1</a></p>', 'a INT')
-- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/xml-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/xml-functions.sql
index cdf56712b11..7e3d21ef753 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/xml-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/xml-functions.sql
@@ -1,4 +1,14 @@
--- from_json
+-- to_xml
+select to_xml(named_struct('a', 1, 'b', 2), map('indent', ''));
+select to_xml(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy', 'indent', ''));
+-- Check if errors handled
+select to_xml(array(named_struct('a', 1, 'b', 2)));
+select to_xml(map('a', 1));
+select to_xml(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE'));
+select to_xml(named_struct('a', 1, 'b', 2), map('mode', 1));
+select to_xml();
+
+-- from_xml
select from_xml('<p><a>1</a></p>', 'a INT');
select from_xml('<p><time>26/08/2015</time></p>', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy'));
-- Check if errors handled
@@ -11,15 +21,15 @@ select from_xml();
-- Clean up
DROP VIEW IF EXISTS xmlTable;
--- from_json - complex types
+-- from_xml - complex types
select from_xml('<p><a>1</a></p>', 'struct<a:array<int>>');
select from_xml('<p><a>1</a><b>"2"</b></p>', 'struct<a:int,b:string>');
--- infer schema of json literal
+-- infer schema of xml literal
select schema_of_xml('<p><a>1</a><b>"2"</b></p>');
select from_xml('<p><a>1</a><a>2</a><a>3</a></p>', schema_of_xml('<p><a>1</a><a>2</a></p>'));
--- from_json - array type
+-- from_xml - array type
select from_xml('<p><a>1</a><a>2</a></p>', 'struct<a:array<int>>');
select from_xml('<p><a>1</a><a>"2"</a></p>', 'struct<a:array<int>>');
select from_xml('<p><a>1</a><a></a></p>', 'struct<a:array<int>>');
diff --git a/sql/core/src/test/resources/sql-tests/results/xml-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/xml-functions.sql.out
index 61e8e9c8662..704addb7a93 100644
--- a/sql/core/src/test/resources/sql-tests/results/xml-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/xml-functions.sql.out
@@ -1,4 +1,138 @@
-- Automatically generated by SQLQueryTestSuite
+-- !query
+select to_xml(named_struct('a', 1, 'b', 2), map('indent', ''))
+-- !query schema
+struct<to_xml(named_struct(a, 1, b, 2)):string>
+-- !query output
+<ROW><a>1</a><b>2</b></ROW>
+
+
+-- !query
+select to_xml(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy', 'indent', ''))
+-- !query schema
+struct<to_xml(named_struct(time, to_timestamp(2015-08-26, yyyy-MM-dd))):string>
+-- !query output
+<ROW><time>26/08/2015</time></ROW>
+
+
+-- !query
+select to_xml(array(named_struct('a', 1, 'b', 2)))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
+ "sqlState" : "42K09",
+ "messageParameters" : {
+ "inputSql" : "\"array(named_struct(a, 1, b, 2))\"",
+ "inputType" : "\"ARRAY<STRUCT<a: INT, b: INT>>\"",
+ "paramIndex" : "1",
+ "requiredType" : "\"STRUCT\"",
+ "sqlExpr" : "\"to_xml(array(named_struct(a, 1, b, 2)))\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 50,
+ "fragment" : "to_xml(array(named_struct('a', 1, 'b', 2)))"
+ } ]
+}
+
+
+-- !query
+select to_xml(map('a', 1))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.ExtendedAnalysisException
+{
+ "errorClass" : "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE",
+ "sqlState" : "42K09",
+ "messageParameters" : {
+ "inputSql" : "\"map(a, 1)\"",
+ "inputType" : "\"MAP<STRING, INT>\"",
+ "paramIndex" : "1",
+ "requiredType" : "\"STRUCT\"",
+ "sqlExpr" : "\"to_xml(map(a, 1))\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 26,
+ "fragment" : "to_xml(map('a', 1))"
+ } ]
+}
+
+
+-- !query
+select to_xml(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE'))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+ "errorClass" : "INVALID_OPTIONS.NON_MAP_FUNCTION",
+ "sqlState" : "42K06",
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 79,
+ "fragment" : "to_xml(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE'))"
+ } ]
+}
+
+
+-- !query
+select to_xml(named_struct('a', 1, 'b', 2), map('mode', 1))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+ "errorClass" : "INVALID_OPTIONS.NON_STRING_TYPE",
+ "sqlState" : "42K06",
+ "messageParameters" : {
+ "mapType" : "\"MAP<STRING, INT>\""
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 59,
+ "fragment" : "to_xml(named_struct('a', 1, 'b', 2), map('mode', 1))"
+ } ]
+}
+
+
+-- !query
+select to_xml()
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+ "errorClass" : "WRONG_NUM_ARGS.WITHOUT_SUGGESTION",
+ "sqlState" : "42605",
+ "messageParameters" : {
+ "actualNum" : "0",
+ "docroot" : "https://spark.apache.org/docs/latest",
+ "expectedNum" : "[1, 2]",
+ "functionName" : "`to_xml`"
+ },
+ "queryContext" : [ {
+ "objectType" : "",
+ "objectName" : "",
+ "startIndex" : 8,
+ "stopIndex" : 15,
+ "fragment" : "to_xml()"
+ } ]
+}
+
+
-- !query
select from_xml('<p><a>1</a></p>', 'a INT')
-- !query schema
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
index 7e5817bc3a0..20600848019 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -1287,6 +1287,31 @@ class XmlSuite extends QueryTest with SharedSparkSession {
assert(result.select("decoded").head().get(0) === Row(null, null))
}
+ test("from_xml to to_xml round trip") {
+ val xmlData = Seq(
+ "<person><age>100</age><name>Alice</name></person>",
+ "<person><age>100</age><name>Alice</name></person>",
+ "<person><age>100</age><name>Alice</name></person>")
+ val df = xmlData.toDF("xmlString")
+ val xmlSchema = schema_of_xml(xmlData.head)
+
+ val df2 = df.withColumn("parsed",
+ from_xml(df.col("xmlString"), xmlSchema))
+ val df3 = df2.select(to_xml($"parsed", Map("rowTag" -> "person").asJava))
+ val xmlResult = df3.collect().map(_.getString(0).replaceAll("\\s+", ""))
+ assert(xmlData.sortBy(_.toString) === xmlResult.sortBy(_.toString))
+ }
+
+ test("to_xml to from_xml round trip") {
+ val df = spark.read.option("rowTag", "ROW").xml(getTestResourcePath(resDir + "cars.xml"))
+ val df1 = df.select(to_xml(struct("*")).as("xmlString"))
+ val schema = schema_of_xml(df1.select("xmlString").head().getString(0))
+ val df2 = df1.select(from_xml($"xmlString", schema).as("fromXML"))
+ val df3 = df2.select(col("fromXML.*"))
+ assert(df3.collect().length === 3)
+ checkAnswer(df3, df)
+ }
+
test("decimals with scale greater than precision") {
val spark = this.spark;
import spark.implicits._
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]