This is an automated email from the ASF dual-hosted git repository. pwason pushed a commit to branch release-0.14.0 in repository https://gitbox.apache.org/repos/asf/hudi.git
commit 4bc418449577d8b529216d3405d25f46738ed173 Author: voonhous <[email protected]> AuthorDate: Fri Sep 1 13:54:27 2023 +0800 [HUDI-6732] Allow wildcards from Spark-SQL entrypoints for drop partition DDL (#9491) --- .../org/apache/hudi/HoodieSparkSqlWriter.scala | 6 ++-- .../sql/hudi/TestAlterTableDropPartition.scala | 36 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala index cf78e514dda..6d0ce7d16bf 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala @@ -606,7 +606,8 @@ object HoodieSparkSqlWriter { */ private def resolvePartitionWildcards(partitions: List[String], jsc: JavaSparkContext, cfg: HoodieConfig, basePath: String): List[String] = { //find out if any of the input partitions have wildcards - var (wildcardPartitions, fullPartitions) = partitions.partition(partition => partition.contains("*")) + //note:spark-sql may url-encode special characters (* -> %2A) + var (wildcardPartitions, fullPartitions) = partitions.partition(partition => partition.matches(".*(\\*|%2A).*")) if (wildcardPartitions.nonEmpty) { //get list of all partitions @@ -621,7 +622,8 @@ object HoodieSparkSqlWriter { //prevent that from happening. Any text inbetween \\Q and \\E is considered literal //So we start the string with \\Q and end with \\E and then whenever we find a * we add \\E before //and \\Q after so all other characters besides .* will be enclosed between a set of \\Q \\E - val regexPartition = "^\\Q" + partition.replace("*", "\\E.*\\Q") + "\\E$" + val wildcardToken: String = if (partition.contains("*")) "*" else "%2A" + val regexPartition = "^\\Q" + partition.replace(wildcardToken, "\\E.*\\Q") + "\\E$" //filter all partitions with the regex and append the result to the list of full partitions fullPartitions = List.concat(fullPartitions,allPartitions.filter(_.matches(regexPartition))) diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala index 2261e83f7f9..b421732d270 100644 --- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala +++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestAlterTableDropPartition.scala @@ -620,4 +620,40 @@ class TestAlterTableDropPartition extends HoodieSparkSqlTestBase { checkExceptionContain(s"ALTER TABLE $tableName DROP PARTITION($partition)")(errMsg) } } + + test("Test drop partition with wildcards") { + withRecordType()(withTempDir { tmp => + Seq("cow", "mor").foreach { tableType => + val tableName = generateTableName + spark.sql( + s""" + |create table $tableName ( + | id int, + | name string, + | price double, + | ts long, + | partition_date_col string + |) using hudi + | location '${tmp.getCanonicalPath}/$tableName' + | tblproperties ( + | primaryKey ='id', + | type = '$tableType', + | preCombineField = 'ts' + | ) partitioned by (partition_date_col) + """.stripMargin) + spark.sql(s"insert into $tableName values " + + s"(1, 'a1', 10, 1000, '2023-08-01'), (2, 'a2', 10, 1000, '2023-08-02'), (3, 'a3', 10, 1000, '2023-09-01')") + checkAnswer(s"show partitions $tableName")( + Seq("partition_date_col=2023-08-01"), + Seq("partition_date_col=2023-08-02"), + Seq("partition_date_col=2023-09-01") + ) + spark.sql(s"alter table $tableName drop partition(partition_date_col='2023-08-*')") + // show partitions will still return all partitions for tests, use select distinct as a stop-gap + checkAnswer(s"select distinct partition_date_col from $tableName")( + Seq("2023-09-01") + ) + } + }) + } }
