This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push: new b3ca63c [SPARK-28266][SQL] convertToLogicalRelation should not interpret `path` property when reading Hive tables b3ca63c is described below commit b3ca63c3ed5e8c9728738ab6b1bc143c5d0d6219 Author: Shardul Mahadik <smaha...@linkedin.com> AuthorDate: Wed Jul 21 22:40:39 2021 +0800 [SPARK-28266][SQL] convertToLogicalRelation should not interpret `path` property when reading Hive tables ### What changes were proposed in this pull request? For non-datasource Hive tables, e.g. tables written outside of Spark (through Hive or Trino), we have certain optimizations in Spark where we use Spark ORC and Parquet datasources to read these tables ([Ref](https://github.com/apache/spark/blob/fbf53dee37129a493a4e5d5a007625b35f44fbda/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala#L128)) rather than using the Hive serde. If such a table contains a `path` property, Spark will try to list this path property in addition to the table location when creating an `InMemoryFileIndex`. ([Ref](https://github.com/apache/spark/blob/fbf53dee37129a493a4e5d5a007625b35f44fbda/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala#L575)) This can lead to wrong data if `path` property points to a directory location or an error if `path` is not a location. A concrete example is provided in [S [...] Since these tables were not written through Spark, Spark should not interpret this `path` property as it can be set by an external system with a different meaning. ### Why are the changes needed? For better compatibility with Hive tables generated by other platforms (non-Spark) ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added unit test Closes #33328 from shardulm94/spark-28266. 
Authored-by: Shardul Mahadik <smaha...@linkedin.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 685c3fd05bf8e9d85ea9b33d4e28807d436cd5ca) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/hive/HiveMetastoreCatalog.scala | 4 +- .../spark/sql/hive/HiveMetastoreCatalogSuite.scala | 47 ++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index a89243c..c67bc7d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -244,7 +244,9 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log paths = rootPath.toString :: Nil, userSpecifiedSchema = Option(updatedTable.dataSchema), bucketSpec = None, - options = options, + // Do not interpret the 'path' option at all when tables are read using the Hive + // source, since the URIs will already have been read from the table's LOCATION. 
+ options = options.filter { case (k, _) => !k.equalsIgnoreCase("path") }, className = fileType).resolveRelation(), table = updatedTable) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 1a6f684..af3d455 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -363,4 +363,51 @@ class DataSourceWithHiveMetastoreCatalogSuite } }) } + + Seq( + "parquet" -> ( + "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", + HiveUtils.CONVERT_METASTORE_PARQUET.key), + "orc" -> ( + "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + HiveUtils.CONVERT_METASTORE_ORC.key) + ).foreach { case (format, (serde, formatConvertConf)) => + test("SPARK-28266: convertToLogicalRelation should not interpret `path` property when " + + s"reading Hive tables using $format file format") { + withTempPath(dir => { + val baseDir = dir.getAbsolutePath + withSQLConf(formatConvertConf -> "true") { + + withTable("t1") { + hiveClient.runSqlHive( + s""" + |CREATE TABLE t1 (id bigint) + |ROW FORMAT SERDE '$serde' + |WITH SERDEPROPERTIES ('path'='someNonLocationValue') + |STORED AS $format LOCATION '$baseDir' + |""".stripMargin) + + assertResult(0) { + spark.sql("SELECT * FROM t1").count() + } + } + + spark.range(3).selectExpr("id").write.format(format).save(baseDir) + withTable("t2") { + hiveClient.runSqlHive( + s""" + |CREATE TABLE t2 (id bigint) + |ROW FORMAT SERDE '$serde' + |WITH SERDEPROPERTIES ('path'='$baseDir') + |STORED AS $format LOCATION '$baseDir' + |""".stripMargin) + + assertResult(3) { + spark.sql("SELECT * FROM t2").count() + } + } + } + }) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: 
commits-h...@spark.apache.org