parisni opened a new issue, #8893:
URL: https://github.com/apache/hudi/issues/8893
0.13.1 does not honor `hoodie.datasource.write.hive_style_partitioning` unless it
is specified explicitly for the delete operation. Previous versions likely read
the setting from `hoodie.properties`. As a result, deletion can fail silently.
```scala
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.hive.MultiPartKeysValueExtractor

// Reproduction script, meant to be pasted into spark-shell (`spark` is the
// session object provided by the shell).
val path = "/tmp/tbl"

// Single-row dataset: one record key, one precombine value, one partition value.
val dt = spark.sql("""
select cast(1 as bigint) as list_id, 1 as _hudi_last_update, '123' as
_hudi_partition
""")

// Options shared by the insert and the delete write.
val commonOptions = Map[String, String](
  HoodieWriteConfig.TABLE_NAME -> "tbl",
  DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY -> "COPY_ON_WRITE", // COW rewrite the file
  DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "list_id",
  DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "_hudi_partition",
  DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "_hudi_last_update",
  DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY -> "false",
  "hoodie.index.type" -> "SIMPLE",
  "hoodie.metadata.enable" -> "true",
  "hoodie.datasource.write.keygenerator.class" ->
    "org.apache.hudi.keygen.ComplexKeyGenerator"
)

// Step 1: create the table with hive-style partitioning enabled explicitly.
val insertOptions = commonOptions ++ Map[String, String](
  DataSourceWriteOptions.OPERATION_OPT_KEY -> "insert",
  "hoodie.datasource.write.hive_style_partitioning" -> "true"
)
dt.write
  .format("org.apache.hudi")
  .options(insertOptions)
  .mode(SaveMode.Overwrite)
  .save(path)
spark.read.format("hudi").load(path).show(false)

// Step 2: delete the same record. The hive_style_partitioning option is
// deliberately NOT passed here; that omission is what this report is about.
val deleteOptions = commonOptions ++ Map[String, String](
  DataSourceWriteOptions.OPERATION_OPT_KEY -> "delete"
)
dt.dropDuplicates("list_id")
  .select("list_id", "_hudi_partition")
  .write
  .format("org.apache.hudi")
  .options(deleteOptions)
  .mode(SaveMode.Append)
  .save(path)

// The row written in step 1 should no longer be listed if the delete was applied.
spark.read.format("hudi").load(path).show(false)
```
## In `0.13.1` deletion is NOT applied:
```
+-------------------+---------------------+------------------+----------------------+--------------------------------------------------------------------------+-------+-----------------+---------------+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|list_id|_hudi_last_update|_hudi_partition|
+-------------------+---------------------+------------------+----------------------+--------------------------------------------------------------------------+-------+-----------------+---------------+
|20230606112459697 |20230606112459697_0_0|list_id:1
|_hudi_partition=123
|010e5fcf-2848-49a5-88a2-d62dc9442f06-0_0-189-165_20230606112459697.parquet|1
|1 |123 |
+-------------------+---------------------+------------------+----------------------+--------------------------------------------------------------------------+-------+-----------------+---------------+
+-------------------+---------------------+------------------+----------------------+--------------------------------------------------------------------------+-------+-----------------+---------------+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|list_id|_hudi_last_update|_hudi_partition|
+-------------------+---------------------+------------------+----------------------+--------------------------------------------------------------------------+-------+-----------------+---------------+
|20230606112459697 |20230606112459697_0_0|list_id:1
|_hudi_partition=123
|010e5fcf-2848-49a5-88a2-d62dc9442f06-0_0-189-165_20230606112459697.parquet|1
|1 |123 |
+-------------------+---------------------+------------------+----------------------+--------------------------------------------------------------------------+-------+-----------------+---------------+
```
## In `0.12.3` deletion is applied:
```
+-------------------+---------------------+------------------+----------------------+------------------------------------------------------------------------+-------+-----------------+---------------+
|_hoodie_commit_time|_hoodie_commit_seqno
|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name
|list_id|_hudi_last_update|_hudi_partition|
+-------------------+---------------------+------------------+----------------------+------------------------------------------------------------------------+-------+-----------------+---------------+
|20230606112748141 |20230606112748141_0_0|list_id:1
|_hudi_partition=123
|b9d5fc08-7fdd-4ba9-a675-ea8eaf402257-0_0-17-15_20230606112748141.parquet|1
|1 |123 |
+-------------------+---------------------+------------------+----------------------+------------------------------------------------------------------------+-------+-----------------+---------------+
23/06/06 11:27:55 WARN DataSourceOptionsHelper$:
hoodie.datasource.write.storage.type is deprecated and will be removed in a
later release; Please use hoodie.datasource.write.table.type
+-------------------+--------------------+------------------+----------------------+-----------------+-------+-----------------+---------------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|_hoodie_file_name|list_id|_hudi_last_update|_hudi_partition|
+-------------------+--------------------+------------------+----------------------+-----------------+-------+-----------------+---------------+
+-------------------+--------------------+------------------+----------------------+-----------------+-------+-----------------+---------------+
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]