This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 462b3dd2bf39 [SPARK-46667][SQL] XML: Throw error on multiple XML data 
source
462b3dd2bf39 is described below

commit 462b3dd2bf392af4248b77c8e196ce8fcd1be20c
Author: Sandip Agarwala <[email protected]>
AuthorDate: Fri Jan 12 09:24:19 2024 +0900

    [SPARK-46667][SQL] XML: Throw error on multiple XML data source
    
    ### What changes were proposed in this pull request?
    Spark-XML library users will notice some changes with the built-in XML. 
Notably, some of the character data that was earlier dropped by spark-xml will 
be faithfully parsed by the built-in XML. Support for new data types like 
DecimalType, TimestampNTZType, etc. has also been added. A few spark-xml users 
may still want to continue using the library.
    
    Rather than implicitly switching to the built-in XML, this PR throws an 
error when an external data source is detected for XML format.
    
    Users can continue using the old spark-xml library by specifying the full 
package name. Alternatively, they can remove spark-xml from the classpath and 
switch to built-in XML.
    
    ### Why are the changes needed?
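    Same as above: implicitly switching existing spark-xml users to the 
built-in XML could silently change how their data is parsed, so the conflict 
is reported as an explicit error instead.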
    
    ### Does this PR introduce _any_ user-facing change?
    Yes
    
    ### How was this patch tested?
    New unit test
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #44685 from sandip-db/xml-multiple-data-source.
    
    Authored-by: Sandip Agarwala <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 common/utils/src/main/resources/error/error-classes.json      |  6 ++++++
 docs/sql-error-conditions.md                                  |  6 ++++++
 .../org/apache/spark/sql/errors/QueryCompilationErrors.scala  | 11 +++++++++++
 .../apache/spark/sql/execution/datasources/DataSource.scala   | 10 ++++++++--
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-classes.json 
b/common/utils/src/main/resources/error/error-classes.json
index e770b9c7053e..19817ced3356 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -2661,6 +2661,12 @@
     ],
     "sqlState" : "42K0E"
   },
+  "MULTIPLE_XML_DATA_SOURCE" : {
+    "message" : [
+      "Detected multiple data sources with the name <provider> 
(<sourceNames>). Please specify the fully qualified class name or remove 
<externalSource> from the classpath."
+    ],
+    "sqlState" : "42710"
+  },
   "MULTI_SOURCES_UNSUPPORTED_FOR_EXPRESSION" : {
     "message" : [
       "The expression <expr> does not support more than one source."
diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md
index db8ecf5b2a30..ef12f6d03c06 100644
--- a/docs/sql-error-conditions.md
+++ b/docs/sql-error-conditions.md
@@ -1505,6 +1505,12 @@ The query does not include a GROUP BY clause. Add GROUP 
BY or turn it into the w
 
 Cannot specify time travel in both the time travel clause and options.
 
+### MULTIPLE_XML_DATA_SOURCE
+
+[SQLSTATE: 
42710](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
+
+Detected multiple data sources with the name `<provider>` (`<sourceNames>`). 
Please specify the fully qualified class name or remove `<externalSource>` from 
the classpath.
+
 ### MULTI_SOURCES_UNSUPPORTED_FOR_EXPRESSION
 
 [SQLSTATE: 
42K0E](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
index 91d18788fd4c..e0740a325358 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala
@@ -3881,6 +3881,17 @@ private[sql] object QueryCompilationErrors extends 
QueryErrorsBase with Compilat
       messageParameters = Map("provider" -> provider))
   }
 
+  def foundMultipleXMLDataSourceError(provider: String,
+      sourceNames: Seq[String],
+      externalSource: String): Throwable = {
+    new AnalysisException(
+      errorClass = "MULTIPLE_XML_DATA_SOURCE",
+      messageParameters = Map("provider" -> provider,
+        "sourceNames" -> sourceNames.mkString(", "),
+        "externalSource" -> externalSource)
+    )
+  }
+
   def xmlRowTagRequiredError(optionName: String): Throwable = {
     new AnalysisException(
       errorClass = "XML_ROW_TAG_MISSING",
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
index decc20c52531..5190075f652b 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
@@ -686,9 +686,15 @@ object DataSource extends Logging {
           // There are multiple registered aliases for the input. If there is 
single datasource
           // that has "org.apache.spark" package in the prefix, we use it 
considering it is an
           // internal datasource within Spark.
-          val sourceNames = sources.map(_.getClass.getName)
+          val sourceNames = sources.map(_.getClass.getName).sortBy(_.toString)
           val internalSources = 
sources.filter(_.getClass.getName.startsWith("org.apache.spark"))
-          if (internalSources.size == 1) {
+          if (provider.equalsIgnoreCase("xml") && sources.size == 2) {
+            val externalSource = sources.filterNot(_.getClass.getName
+              
.startsWith("org.apache.spark.sql.execution.datasources.xml.XmlFileFormat")
+            ).head.getClass
+            throw QueryCompilationErrors
+              .foundMultipleXMLDataSourceError(provider1, sourceNames, 
externalSource.getName)
+          } else if (internalSources.size == 1) {
             logWarning(s"Multiple sources found for $provider1 
(${sourceNames.mkString(", ")}), " +
               s"defaulting to the internal datasource 
(${internalSources.head.getClass.getName}).")
             internalSources.head.getClass


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to