[hudi] 06/37: [HUDI-6732] Allow wildcards from Spark-SQL entrypoints for drop partition DDL (#9491)

pwason Wed, 13 Sep 2023 00:34:38 -0700

This is an automated email from the ASF dual-hosted git repository.

pwason pushed a commit to branch release-0.14.0
in repository https://gitbox.apache.org/repos/asf/hudi.git


commit 4bc418449577d8b529216d3405d25f46738ed173
Author: voonhous <[email protected]>
AuthorDate: Fri Sep 1 13:54:27 2023 +0800

    [HUDI-6732] Allow wildcards from Spark-SQL entrypoints for drop partition DDL (#9491)
---
 .../org/apache/hudi/HoodieSparkSqlWriter.scala     |  6 ++--
 .../sql/hudi/TestAlterTableDropPartition.scala     | 36 ++++++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
index cf78e514dda..6d0ce7d16bf 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
@@ -606,7 +606,8 @@ object HoodieSparkSqlWriter {
    */
   private def resolvePartitionWildcards(partitions: List[String], jsc: JavaSparkContext, cfg: HoodieConfig, basePath: String): List[String] = {
     //find out if any of the input partitions have wildcards
-    var (wildcardPartitions, fullPartitions) = partitions.partition(partition => partition.contains("*"))
+    //note:spark-sql may url-encode special characters (* -> %2A)
+    var (wildcardPartitions, fullPartitions) = partitions.partition(partition => partition.matches(".*(\\*|%2A).*"))
 
     if (wildcardPartitions.nonEmpty) {
       //get list of all partitions
@@ -621,7 +622,8 @@ object HoodieSparkSqlWriter {
         //prevent that from happening. Any text inbetween \\Q and \\E is considered literal
         //So we start the string with \\Q and end with \\E and then whenever we find a * we add \\E before
         //and \\Q after so all other characters besides .* will be enclosed between a set of \\Q \\E
-        val regexPartition = "^\\Q" + partition.replace("*", "\\E.*\\Q") + "\\E$"
+        val wildcardToken: String = if (partition.contains("*")) "*" else "%2A"
+        val regexPartition = "^\\Q" + partition.replace(wildcardToken, "\\E.*\\Q") + "\\E$"
 
         //filter all partitions with the regex and append the result to the list of full partitions
         fullPartitions = List.concat(fullPartitions,allPartitions.filter(_.matches(regexPartition)))
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala
index 2261e83f7f9..b421732d270 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala
@@ -620,4 +620,40 @@ class TestAlterTableDropPartition extends HoodieSparkSqlTestBase {
       checkExceptionContain(s"ALTER TABLE $tableName DROP PARTITION($partition)")(errMsg)
     }
   }
+
+  test("Test drop partition with wildcards") {
+    withRecordType()(withTempDir { tmp =>
+      Seq("cow", "mor").foreach { tableType =>
+        val tableName = generateTableName
+        spark.sql(
+          s"""
+             |create table $tableName (
+             |  id int,
+             |  name string,
+             |  price double,
+             |  ts long,
+             |  partition_date_col string
+             |) using hudi
+             | location '${tmp.getCanonicalPath}/$tableName'
+             | tblproperties (
+             |  primaryKey ='id',
+             |  type = '$tableType',
+             |  preCombineField = 'ts'
+             | ) partitioned by (partition_date_col)
+         """.stripMargin)
+        spark.sql(s"insert into $tableName values " +
+          s"(1, 'a1', 10, 1000, '2023-08-01'), (2, 'a2', 10, 1000, '2023-08-02'), (3, 'a3', 10, 1000, '2023-09-01')")
+        checkAnswer(s"show partitions $tableName")(
+          Seq("partition_date_col=2023-08-01"),
+          Seq("partition_date_col=2023-08-02"),
+          Seq("partition_date_col=2023-09-01")
+        )
+        spark.sql(s"alter table $tableName drop partition(partition_date_col='2023-08-*')")
+        // show partitions will still return all partitions for tests, use select distinct as a stop-gap
+        checkAnswer(s"select distinct partition_date_col from $tableName")(
+          Seq("2023-09-01")
+        )
+      }
+    })
+  }
 }

[hudi] 06/37: [HUDI-6732] Allow wildcards from Spark-SQL entrypoints for drop partition DDL (#9491)

Reply via email to