This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 462b3dd2bf39 [SPARK-46667][SQL] XML: Throw error on multiple XML data
source
462b3dd2bf39 is described below
commit 462b3dd2bf392af4248b77c8e196ce8fcd1be20c
Author: Sandip Agarwala <[email protected]>
AuthorDate: Fri Jan 12 09:24:19 2024 +0900
[SPARK-46667][SQL] XML: Throw error on multiple XML data source
### What changes were proposed in this pull request?
Spark-XML library users will notice some changes with built-in XML.
Notably, some of the character data that was earlier dropped by spark-xml will
be faithfully parsed by the built-in XML. Support for new data types like
DecimalType, TimestampNTZ, etc. have also been added. A few spark-xml users may
still want to continue using the library.
Rather than implicitly switching to the built-in XML, this PR throws an
error when an external data source is detected for XML format.
Users can continue using the old spark-xml library by specifying the full
package name. Alternatively, they can remove spark-xml from the classpath and
switch to built-in XML.
### Why are the changes needed?
Same as above
### Does this PR introduce _any_ user-facing change?
Yes
### How was this patch tested?
New unit test
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #44685 from sandip-db/xml-multiple-data-source.
Authored-by: Sandip Agarwala <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
common/utils/src/main/resources/error/error-classes.json | 6 ++++++
docs/sql-error-conditions.md | 6 ++++++
.../org/apache/spark/sql/errors/QueryCompilationErrors.scala | 11 +++++++++++
.../apache/spark/sql/execution/datasources/DataSource.scala | 10 ++++++++--
4 files changed, 31 insertions(+), 2 deletions(-)
diff --git a/common/utils/src/main/resources/error/error-classes.json
b/common/utils/src/main/resources/error/error-classes.json
index e770b9c7053e..19817ced3356 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -2661,6 +2661,12 @@
],
"sqlState" : "42K0E"
},
+ "MULTIPLE_XML_DATA_SOURCE" : {
+ "message" : [
+ "Detected multiple data sources with the name <provider>
(<sourceNames>). Please specify the fully qualified class name or remove
<externalSource> from the classpath."
+ ],
+ "sqlState" : "42710"
+ },
"MULTI_SOURCES_UNSUPPORTED_FOR_EXPRESSION" : {
"message" : [
"The expression <expr> does not support more than one source."
diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md
index db8ecf5b2a30..ef12f6d03c06 100644
--- a/docs/sql-error-conditions.md
+++ b/docs/sql-error-conditions.md
@@ -1505,6 +1505,12 @@ The query does not include a GROUP BY clause. Add GROUP
BY or turn it into the w
Cannot specify time travel in both the time travel clause and options.
+### MULTIPLE_XML_DATA_SOURCE
+
+[SQLSTATE:
42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
+
+Detected multiple data sources with the name `<provider>` (`<sourceNames>`).
Please specify the fully qualified class name or remove `<externalSource>` from
the classpath.
+
### MULTI_SOURCES_UNSUPPORTED_FOR_EXPRESSION
[SQLSTATE:
42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 91d18788fd4c..e0740a325358 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -3881,6 +3881,17 @@ private[sql] object QueryCompilationErrors extends
QueryErrorsBase with Compilat
messageParameters = Map("provider" -> provider))
}
+ def foundMultipleXMLDataSourceError(provider: String,
+ sourceNames: Seq[String],
+ externalSource: String): Throwable = {
+ new AnalysisException(
+ errorClass = "MULTIPLE_XML_DATA_SOURCE",
+ messageParameters = Map("provider" -> provider,
+ "sourceNames" -> sourceNames.mkString(", "),
+ "externalSource" -> externalSource)
+ )
+ }
+
def xmlRowTagRequiredError(optionName: String): Throwable = {
new AnalysisException(
errorClass = "XML_ROW_TAG_MISSING",
diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index decc20c52531..5190075f652b 100644
---
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -686,9 +686,15 @@ object DataSource extends Logging {
// There are multiple registered aliases for the input. If there is
single datasource
// that has "org.apache.spark" package in the prefix, we use it
considering it is an
// internal datasource within Spark.
- val sourceNames = sources.map(_.getClass.getName)
+ val sourceNames = sources.map(_.getClass.getName).sortBy(_.toString)
val internalSources =
sources.filter(_.getClass.getName.startsWith("org.apache.spark"))
- if (internalSources.size == 1) {
+ if (provider.equalsIgnoreCase("xml") && sources.size == 2) {
+ val externalSource = sources.filterNot(_.getClass.getName
+
.startsWith("org.apache.spark.sql.execution.datasources.xml.XmlFileFormat")
+ ).head.getClass
+ throw QueryCompilationErrors
+ .foundMultipleXMLDataSourceError(provider1, sourceNames,
externalSource.getName)
+ } else if (internalSources.size == 1) {
logWarning(s"Multiple sources found for $provider1
(${sourceNames.mkString(", ")}), " +
s"defaulting to the internal datasource
(${internalSources.head.getClass.getName}).")
internalSources.head.getClass
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]