This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 60393202ccdb [SPARK-45225][SQL] XML: XSD file URL support
60393202ccdb is described below
commit 60393202ccdb12d5c25c68dfc96a93ab4c897b6b
Author: Sandip Agarwala <[email protected]>
AuthorDate: Wed Sep 20 13:19:06 2023 +0900
[SPARK-45225][SQL] XML: XSD file URL support
### What changes were proposed in this pull request?
Add support for reading the XSD file from a URL.
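A hedged usage sketch (not part of this commit): with this change, the XSD used for row validation can be referenced by a URL that is resolved through the Hadoop FileSystem API, or shipped with the job via `sc.addFile`. The option names and paths below are illustrative assumptions, not taken from this patch.

```scala
// Sketch only: option names and paths are assumptions for illustration.
// Case 1: point rowValidationXSDPath at a URL on a Hadoop-compatible file system.
val people = spark.read
  .format("xml")
  .option("rowTag", "person")
  .option("rowValidationXSDPath", "hdfs://namenode:8020/schemas/person.xsd")
  .load("hdfs://namenode:8020/data/people.xml")

// Case 2: distribute the XSD with the job and refer to it by file name;
// the loader falls back to SparkFiles.get for names added via sc.addFile.
spark.sparkContext.addFile("s3a://my-bucket/schemas/person.xsd")
val people2 = spark.read
  .format("xml")
  .option("rowTag", "person")
  .option("rowValidationXSDPath", "person.xsd")
  .load("s3a://my-bucket/data/people.xml")
```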
### Why are the changes needed?
Today the XSD used for row validation must resolve to a local file path, either as specified or after being distributed with `sc.addFile`. Resolving the path through the Hadoop FileSystem API also allows the XSD to be read from a URL on any Hadoop-compatible file system.
### Does this PR introduce _any_ user-facing change?
Yes, the XSD file for row validation can now be specified as a URL.
### How was this patch tested?
Unit test
Manual test
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #43000 from sandip-db/xml-xsd-url-master.
Authored-by: Sandip Agarwala <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
.../spark/sql/catalyst/xml/ValidatorUtil.scala | 29 +++++++++++++++-------
1 file changed, 20 insertions(+), 9 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala
index 6509842fc6d1..f8b546332c2a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/ValidatorUtil.scala
@@ -16,32 +16,43 @@
  */
 package org.apache.spark.sql.catalyst.xml
 
-import java.nio.file.Paths
 import javax.xml.XMLConstants
+import javax.xml.transform.stream.StreamSource
 import javax.xml.validation.{Schema, SchemaFactory}
 
 import com.google.common.cache.{CacheBuilder, CacheLoader}
+import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkFiles
+import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.util.Utils
 
 /**
  * Utilities for working with XSD validation.
  */
 private[sql] object ValidatorUtil {
-
   // Parsing XSDs may be slow, so cache them by path:
   private val cache = CacheBuilder.newBuilder().softValues().build(
     new CacheLoader[String, Schema] {
       override def load(key: String): Schema = {
-        // Handle case where file exists as specified
-        var path = Paths.get(key)
-        if (!path.toFile.exists()) {
-          // Handle case where it was added with sc.addFile
-          path = Paths.get(SparkFiles.get(key))
+        val in = try {
+          // Handle case where file exists as specified
+          val fs = Utils.getHadoopFileSystem(key, SparkHadoopUtil.get.conf)
+          fs.open(new Path(key))
+        } catch {
+          case _: Throwable =>
+            // Handle case where it was added with sc.addFile
+            val addFileUrl = SparkFiles.get(key)
+            val fs = Utils.getHadoopFileSystem(addFileUrl, SparkHadoopUtil.get.conf)
+            fs.open(new Path(addFileUrl))
+        }
+        try {
+          val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
+          schemaFactory.newSchema(new StreamSource(in))
+        } finally {
+          in.close()
         }
-        val schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
-        schemaFactory.newSchema(path.toFile)
       }
     })
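For context, the cached `javax.xml.validation.Schema` is what downstream code uses to validate individual records. A minimal, hedged sketch of that usage with standard JDK APIs follows; `validateRecord` is a hypothetical helper, not part of this commit.

```scala
import java.io.StringReader
import javax.xml.transform.stream.StreamSource
import javax.xml.validation.Schema
import org.xml.sax.SAXException

// Hypothetical helper: validate one XML record against a (cached) Schema.
def validateRecord(schema: Schema, xml: String): Boolean = {
  // Validators are cheap to create; the expensive Schema parse is what the cache avoids.
  val validator = schema.newValidator()
  try {
    validator.validate(new StreamSource(new StringReader(xml)))
    true
  } catch {
    case _: SAXException => false // record does not conform to the XSD
  }
}
```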
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]