Repository: spark Updated Branches: refs/heads/master 264b0f36c -> f18b905f6
[SPARK-21457][SQL] ExternalCatalog.listPartitions should correctly handle partition values with dot ## What changes were proposed in this pull request? When we list partitions from hive metastore with a partial partition spec, we are expecting exact matching according to the partition values. However, hive treats dot specially and match any single character for dot. We should do an extra filter to drop unexpected partitions. ## How was this patch tested? new regression test. Author: Wenchen Fan <[email protected]> Closes #18671 from cloud-fan/hive. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f18b905f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f18b905f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f18b905f Branch: refs/heads/master Commit: f18b905f6cace7686ef169fda7de474079d0af23 Parents: 264b0f3 Author: Wenchen Fan <[email protected]> Authored: Tue Jul 18 15:56:16 2017 -0700 Committer: gatorsmile <[email protected]> Committed: Tue Jul 18 15:56:16 2017 -0700 ---------------------------------------------------------------------- .../sql/catalyst/catalog/ExternalCatalogUtils.scala | 12 ++++++++++++ .../spark/sql/catalyst/catalog/InMemoryCatalog.scala | 12 ------------ .../sql/catalyst/catalog/ExternalCatalogSuite.scala | 12 ++++++++++++ .../org/apache/spark/sql/hive/HiveExternalCatalog.scala | 12 +++++++++++- 4 files changed, 35 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/f18b905f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala index 1fc3a65..50f32e8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala @@ -159,6 +159,18 @@ object ExternalCatalogUtils { } } } + + /** + * Returns true if `spec1` is a partial partition spec w.r.t. `spec2`, e.g. PARTITION (a=1) is a + * partial partition spec w.r.t. PARTITION (a=1,b=2). + */ + def isPartialPartitionSpec( + spec1: TablePartitionSpec, + spec2: TablePartitionSpec): Boolean = { + spec1.forall { + case (partitionColumn, value) => spec2(partitionColumn) == value + } + } } object CatalogUtils { http://git-wip-us.apache.org/repos/asf/spark/blob/f18b905f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala index d253c72..37e9eea 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala @@ -553,18 +553,6 @@ class InMemoryCatalog( } } - /** - * Returns true if `spec1` is a partial partition spec w.r.t. `spec2`, e.g. PARTITION (a=1) is a - * partial partition spec w.r.t. PARTITION (a=1,b=2). - */ - private def isPartialPartitionSpec( - spec1: TablePartitionSpec, - spec2: TablePartitionSpec): Boolean = { - spec1.forall { - case (partitionColumn, value) => spec2(partitionColumn) == value - } - } - override def listPartitionsByFilter( db: String, table: String, http://git-wip-us.apache.org/repos/asf/spark/blob/f18b905f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala index 66e895a..94593ef 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogSuite.scala @@ -448,6 +448,18 @@ abstract class ExternalCatalogSuite extends SparkFunSuite with BeforeAndAfterEac assert(catalog.listPartitions("db2", "tbl2", Some(Map("a" -> "unknown"))).isEmpty) } + test("SPARK-21457: list partitions with special chars") { + val catalog = newBasicCatalog() + assert(catalog.listPartitions("db2", "tbl1").isEmpty) + + val part1 = CatalogTablePartition(Map("a" -> "1", "b" -> "i+j"), storageFormat) + val part2 = CatalogTablePartition(Map("a" -> "1", "b" -> "i.j"), storageFormat) + catalog.createPartitions("db2", "tbl1", Seq(part1, part2), ignoreIfExists = false) + + assert(catalog.listPartitions("db2", "tbl1", Some(part1.spec)).map(_.spec) == Seq(part1.spec)) + assert(catalog.listPartitions("db2", "tbl1", Some(part2.spec)).map(_.spec) == Seq(part2.spec)) + } + test("list partitions by filter") { val tz = TimeZone.getDefault.getID val catalog = newBasicCatalog() http://git-wip-us.apache.org/repos/asf/spark/blob/f18b905f/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala ---------------------------------------------------------------------- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 306b380..70d7dd2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -1088,9 +1088,19 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat table: String, partialSpec: Option[TablePartitionSpec] = None): Seq[CatalogTablePartition] = withClient { val partColNameMap = buildLowerCasePartColNameMap(getTable(db, table)) - client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => + val res = client.getPartitions(db, table, partialSpec.map(lowerCasePartitionSpec)).map { part => part.copy(spec = restorePartitionSpec(part.spec, partColNameMap)) } + + partialSpec match { + // This might be a bug of Hive: When the partition value inside the partial partition spec + // contains dot, and we ask Hive to list partitions w.r.t. the partial partition spec, Hive + // treats dot as matching any single character and may return more partitions than we + // expected. Here we do an extra filter to drop unexpected partitions. + case Some(spec) if spec.exists(_._2.contains(".")) => + res.filter(p => isPartialPartitionSpec(spec, p.spec)) + case _ => res + } } override def listPartitionsByFilter( --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
