This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new ae518ecb7068 [SPARK-47218][SQL] XML: Changed SchemaOfXml to fail on
DROPMALFORMED mode
ae518ecb7068 is described below
commit ae518ecb7068347f70d947255eb54fdfd5ec8d48
Author: Yousof Hosny <[email protected]>
AuthorDate: Mon Mar 11 08:40:19 2024 +0900
[SPARK-47218][SQL] XML: Changed SchemaOfXml to fail on DROPMALFORMED mode
### What changes were proposed in this pull request?
Changed schema_of_xml to fail with an error in DROPMALFORMED mode, to
avoid creating schemas out of invalid XML.
### Why are the changes needed?
DROPMALFORMED parse mode implies silently dropping the malformed record. But
SchemaOfXml is expected to return a schema and may not have a valid schema to
return for a malformed record. So DROPMALFORMED cannot be supported.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Unit test.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #45379 from yhosny/xml-parsemode-error.
Authored-by: Yousof Hosny <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../sql/catalyst/expressions/xmlExpressions.scala | 8 +++--
.../sql/execution/datasources/xml/XmlSuite.scala | 36 ++++++++++++++++++++++
2 files changed, 42 insertions(+), 2 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
index 800515ca84b5..8cc1c3a89745 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/xmlExpressions.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
-import org.apache.spark.sql.catalyst.util.{ArrayData, FailFastMode,
FailureSafeParser, GenericArrayData, PermissiveMode}
+import org.apache.spark.sql.catalyst.util.{ArrayData, DropMalformedMode,
FailFastMode, FailureSafeParser, GenericArrayData, PermissiveMode}
import org.apache.spark.sql.catalyst.xml.{StaxXmlGenerator, StaxXmlParser,
ValidatorUtil, XmlInferSchema, XmlOptions}
import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryErrorsBase}
import org.apache.spark.sql.internal.SQLConf
@@ -189,8 +189,12 @@ case class SchemaOfXml(
private lazy val xmlFactory = xmlOptions.buildXmlFactory()
@transient
- private lazy val xmlInferSchema =
+ private lazy val xmlInferSchema = {
+ if (xmlOptions.parseMode == DropMalformedMode) {
+ throw QueryCompilationErrors.parseModeUnsupportedError("schema_of_xml",
xmlOptions.parseMode)
+ }
new XmlInferSchema(xmlOptions, caseSensitive =
SQLConf.get.caseSensitiveAnalysis)
+ }
@transient
private lazy val xml = child.eval().asInstanceOf[UTF8String]
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
index 2194f76e7da6..d7dc96184dab 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -1302,6 +1302,42 @@ class XmlSuite
assert(result.select("decoded._corrupt_record").head().getString(0).nonEmpty)
}
+ test("schema_of_xml with DROPMALFORMED parse error test") {
+ val e = intercept[AnalysisException] {
+ spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode',
'DROPMALFORMED'))""")
+ .collect()
+ }
+ checkError(
+ exception = e,
+ errorClass = "_LEGACY_ERROR_TEMP_1099",
+ parameters = Map(
+ "funcName" -> "schema_of_xml",
+ "mode" -> "DROPMALFORMED",
+ "permissiveMode" -> "PERMISSIVE",
+ "failFastMode" -> FailFastMode.name)
+ )
+ }
+
+ test("schema_of_xml with FAILFAST parse error test") {
+ val e = intercept[SparkException] {
+ spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode',
'FAILFAST'))""")
+ .collect()
+ }
+ checkError(
+ exception = e,
+ errorClass = "_LEGACY_ERROR_TEMP_2165",
+ parameters = Map(
+ "failFastMode" -> FailFastMode.name)
+ )
+ }
+
+ test("schema_of_xml with PERMISSIVE check no error test") {
+ val s = spark.sql(s"""SELECT schema_of_xml('<ROW><a>1<ROW>', map('mode',
'PERMISSIVE'))""")
+ .collect()
+ assert(s.head.get(0) == "STRUCT<_corrupt_record: STRING>")
+ }
+
+
test("from_xml with PERMISSIVE parse mode with no corrupt col schema") {
// XML contains error
val xmlData =
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]