parisni commented on issue #3670:
URL: https://github.com/apache/hudi/issues/3670#issuecomment-926644355
hi @xushiyan, see the example and explanation below
```scala
// --- Step 1: create a basic parquet table used as the seed data ---------------
// `last_update_time` doubles as the Hudi precombine field later on.
val inputDF = Seq(
  ("100", "2015-01-01", "2015-01-01T13:51:39.340396Z"),
  ("101", "2015-01-01", "2015-01-01T12:14:58.597216Z"),
  ("102", "2015-01-01", "2015-01-01T13:51:40.417052Z"),
  ("103", "2015-01-01", "2015-01-01T13:51:40.519832Z"),
  ("104", "2015-01-02", "2015-01-01T12:15:00.512679Z"),
  ("105", "2015-01-02", "2015-01-01T13:51:42.248818Z")
).toDF("id", "creation_date", "last_update_time")
  .withColumn("creation_date", expr("cast(creation_date as date)"))
  .withColumn("id", expr("cast(id as bigint)"))
inputDF.write.format("parquet").saveAsTable("test_hudi_partitionned")

// --- Step 2: create a SQL-managed Hudi table (CTAS) from the parquet table ----
// FIX: the CTAS previously selected from `test_hudi_partition`, which was never
// created (the parquet table above is `test_hudi_partitionned`), so the repro
// failed before reaching the delete step.
spark.sql(
  """
  create table if not exists test_hudi_partition_sql using hudi
  location 's3://test-bucket/test/test_hudi_partitionned_sql'
  options (
    type = 'mor',
    primaryKey = 'id',
    preCombineField = 'last_update_time'
  )
  partitioned by (creation_date)
  as select id, last_update_time, creation_date
  from test_hudi_partitionned
  """)

// --- Step 3: attempt to delete every row via the datasource API ---------------
val tableName = "test_hudi_partition_sql"
val hudiOptions = Map[String, String](
  HoodieWriteConfig.TABLE_NAME -> tableName,
  DataSourceWriteOptions.OPERATION_OPT_KEY -> "delete",
  // NOTE(review): the table was created with type = 'mor' above but is written
  // here as COPY_ON_WRITE — confirm which table type this repro intends.
  DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY -> "COPY_ON_WRITE",
  DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "id",
  DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "creation_date",
  DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "last_update_time",
  DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY -> "default",
  DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY -> "true",
  // FIX: hive sync must target the table actually being written; it previously
  // pointed at "test_hudi_partition", a different (nonexistent) table.
  DataSourceWriteOptions.HIVE_TABLE_OPT_KEY -> tableName,
  DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY -> "creation_date",
  DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY ->
    classOf[MultiPartKeysValueExtractor].getName,
  // FIX: dropped the trailing comma — it does not compile on Scala < 2.12.2
  // (e.g. Spark builds on Scala 2.11).
  "hoodie.datasource.hive_sync.mode" -> "hms"
)

// Feed the table's own full contents back as a delete payload: this should
// remove every row.
spark.sql("select * from test_hudi_partition_sql")
  .write
  .format("org.apache.hudi")
  .options(hudiOptions)
  .option(DataSourceWriteOptions.OPERATION_OPT_KEY,
    DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL)
  .mode(SaveMode.Append)
  .save("s3://test-bucket/test/test_hudi_partitionned_sql")

/*
// ISSUE: this should delete all the rows, but does not work — a subsequent
// select still returns all six records:
+-------------------+--------------------+------------------+----------------------+--------------------+---+----------------+-------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id|last_update_time|creation_date|
+-------------------+--------------------+------------------+----------------------+--------------------+---+----------------+-------------+
|     20210913144107| 20210913144107_0_88|            id:103|  creation_date=201...|bf32dc7e-5252-489...|103|           world|   2015-01-01|
|     20210913144107| 20210913144107_1_91|            id:101|  creation_date=201...|7afd8fd6-3710-4dc...|101|           world|   2015-01-01|
|     20210913144107| 20210913144107_3_90|            id:102|  creation_date=201...|ff511a06-eb3a-492...|102|           world|   2015-01-01|
|     20210913144107| 20210913144107_2_89|            id:100|  creation_date=201...|cd505e2c-2178-48f...|100|           world|   2015-01-01|
|     20210913144107| 20210913144107_4_92|            id:105|  creation_date=201...|708c9bc1-aef5-4f0...|105|           world|   2015-01-02|
|     20210913144107| 20210913144107_5_93|            id:104|  creation_date=201...|045a0fdd-3387-406...|104|           world|   2015-01-02|
+-------------------+--------------------+------------------+----------------------+--------------------+---+----------------+-------------+
*/
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]