[
https://issues.apache.org/jira/browse/HUDI-3282?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
董可伦 updated HUDI-3282:
----------------------
Description:
{{hudi 0.11.0 master build
spark: 2.4.5}}
hive create database test_hudi;
spark-shell --master yarn --deploy-mode client --executor-memory 2G
--num-executors 3 --executor-cores 2 --driver-memory 4G --driver-cores 2 --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--principal .. --keytab .. import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.QuickstartUtils.{DataGenerator, convertToStringList,
getQuickstartWriteConfigs} import
org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import
org.apache.spark.sql.SaveMode._ import org.apache.spark.sql.{SaveMode,
SparkSession} import org.apache.spark.sql.functions.lit import
org.apache.hudi.DataSourceReadOptions._ import
org.apache.hudi.config.HoodieWriteConfig import
org.apache.hudi.keygen.SimpleKeyGenerator import
org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodiePayloadProps}
import org.apache.hudi.io.HoodieMergeHandle import
org.apache.hudi.common.table.HoodieTableConfig import
org.apache.spark.sql.functions._ import spark.implicits._ val df = Seq((1,
"a1", 10, 1000, "2022-01-19")).toDF("id", "name", "value", "ts", "dt")
df.write.format("hudi"). option(HoodieWriteConfig.TBL_NAME.key,
"test_hudi_table_sync_hive"). option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL).
option(RECORDKEY_FIELD.key, "id"). option(PRECOMBINE_FIELD.key, "ts").
option(KEYGENERATOR_CLASS_OPT_KEY,
"org.apache.hudi.keygen.NonpartitionedKeyGenerator").
option("hoodie.datasource.write.partitionpath.field", "").
option("hoodie.metadata.enable", false). option(KEYGENERATOR_CLASS_OPT_KEY,
"org.apache.hudi.keygen.ComplexKeyGenerator"). option(META_SYNC_ENABLED.key(),
true). option(HIVE_USE_JDBC.key(), false). option(HIVE_DATABASE.key(),
"test_hudi"). option(HIVE_AUTO_CREATE_DATABASE.key(), true).
option(HIVE_TABLE.key(), "test_hudi_table_sync_hive").
option(HIVE_PARTITION_EXTRACTOR_CLASS.key(),
"org.apache.hudi.hive.MultiPartKeysValueExtractor"). mode("overwrite").
save("/test_hudi/test_hudi_table_sync_hive")
{{# hoodie.properties
hoodie.table.precombine.field=ts
hoodie.table.partition.fields=
hoodie.table.type=COPY_ON_WRITE
hoodie.archivelog.folder=archived
hoodie.populate.meta.fields=true
hoodie.timeline.layout.version=1
hoodie.table.version=3
hoodie.table.recordkey.fields=id
hoodie.table.base.file.format=PARQUET
hoodie.table.timeline.timezone=LOCAL
hoodie.table.keygenerator.class=org.apache.hudi.keygen.ComplexKeyGenerator
hoodie.table.name=test_hudi_table_sync_hive
hoodie.datasource.write.hive_style_partitioning=false}}
hive
show create table test_hudi_table_sync_hive;
+----------------------------------------------------+ | createtab_stmt |
+----------------------------------------------------+ | CREATE EXTERNAL TABLE
`test_hudi_table_sync_hive`( | | `_hoodie_commit_time` string, | |
`_hoodie_commit_seqno` string, | | `_hoodie_record_key` string, | |
`_hoodie_partition_path` string, | | `_hoodie_file_name` string, | | `id` int,
| | `name` string, | | `value` int, | | `ts` int, | | `dt` string) | | ROW
FORMAT SERDE | | 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
| | WITH SERDEPROPERTIES ( | | 'hoodie.query.as.ro.table'='false', | |
'path'='/test_hudi/test_hudi_table_sync_hive') | | STORED AS INPUTFORMAT | |
'org.apache.hudi.hadoop.HoodieParquetInputFormat' | | OUTPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | | LOCATION |
| 'hdfs://cluster1/test_hudi/test_hudi_table_sync_hive' | | TBLPROPERTIES ( | |
'last_commit_time_sync'='20220119110215185', | |
'spark.sql.sources.provider'='hudi', | |
'spark.sql.sources.schema.numParts'='1', | |
'spark.sql.sources.schema.part.0'='{"type":"struct","fields":[\{"name":"_hoodie_commit_time","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_commit_seqno","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_record_key","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_partition_path","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_file_name","type":"string","nullable":true,"metadata":{}},\{"name":"id","type":"integer","nullable":false,"metadata":{}},\{"name":"name","type":"string","nullable":true,"metadata":{}},\{"name":"value","type":"integer","nullable":false,"metadata":{}},\{"name":"ts","type":"integer","nullable":false,"metadata":{}},\{"name":"dt","type":"string","nullable":true,"metadata":{}}]}',
| | 'transient_lastDdlTime'='1642561355') |
+----------------------------------------------------+ 28 rows selected (0.429
seconds)
spark-sql --master yarn --deploy-mode client --conf
'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
--principal .. --keytab .. delete from test_hudi.test_hudi_table_sync_hive
where id=1;
was:h1. Fix delete exception for Spark SQL when syncing Hive
> Fix delete exception for Spark SQL when syncing Hive
> ----------------------------------------------------
>
> Key: HUDI-3282
> URL: https://issues.apache.org/jira/browse/HUDI-3282
> Project: Apache Hudi
> Issue Type: Bug
> Components: hive-sync, spark-sql
> Affects Versions: 0.10.0
> Reporter: 董可伦
> Assignee: 董可伦
> Priority: Major
> Labels: pull-request-available
> Fix For: 0.10.1
>
>
> {{hudi 0.11.0 master build
> spark: 2.4.5}}
> hive create database test_hudi;
> spark-shell --master yarn --deploy-mode client --executor-memory 2G
> --num-executors 3 --executor-cores 2 --driver-memory 4G --driver-cores 2
> --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
> --principal .. --keytab .. import org.apache.hudi.DataSourceWriteOptions._
> import org.apache.hudi.QuickstartUtils.{DataGenerator, convertToStringList,
> getQuickstartWriteConfigs} import
> org.apache.hudi.config.HoodieWriteConfig.TBL_NAME import
> org.apache.spark.sql.SaveMode._ import org.apache.spark.sql.{SaveMode,
> SparkSession} import org.apache.spark.sql.functions.lit import
> org.apache.hudi.DataSourceReadOptions._ import
> org.apache.hudi.config.HoodieWriteConfig import
> org.apache.hudi.keygen.SimpleKeyGenerator import
> org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodiePayloadProps}
> import org.apache.hudi.io.HoodieMergeHandle import
> org.apache.hudi.common.table.HoodieTableConfig import
> org.apache.spark.sql.functions._ import spark.implicits._ val df = Seq((1,
> "a1", 10, 1000, "2022-01-19")).toDF("id", "name", "value", "ts", "dt")
> df.write.format("hudi"). option(HoodieWriteConfig.TBL_NAME.key,
> "test_hudi_table_sync_hive"). option(TABLE_TYPE.key, COW_TABLE_TYPE_OPT_VAL).
> option(RECORDKEY_FIELD.key, "id"). option(PRECOMBINE_FIELD.key, "ts").
> option(KEYGENERATOR_CLASS_OPT_KEY,
> "org.apache.hudi.keygen.NonpartitionedKeyGenerator").
> option("hoodie.datasource.write.partitionpath.field", "").
> option("hoodie.metadata.enable", false). option(KEYGENERATOR_CLASS_OPT_KEY,
> "org.apache.hudi.keygen.ComplexKeyGenerator").
> option(META_SYNC_ENABLED.key(), true). option(HIVE_USE_JDBC.key(), false).
> option(HIVE_DATABASE.key(), "test_hudi").
> option(HIVE_AUTO_CREATE_DATABASE.key(), true). option(HIVE_TABLE.key(),
> "test_hudi_table_sync_hive"). option(HIVE_PARTITION_EXTRACTOR_CLASS.key(),
> "org.apache.hudi.hive.MultiPartKeysValueExtractor"). mode("overwrite").
> save("/test_hudi/test_hudi_table_sync_hive")
> {{# hoodie.properties
> hoodie.table.precombine.field=ts
> hoodie.table.partition.fields=
> hoodie.table.type=COPY_ON_WRITE
> hoodie.archivelog.folder=archived
> hoodie.populate.meta.fields=true
> hoodie.timeline.layout.version=1
> hoodie.table.version=3
> hoodie.table.recordkey.fields=id
> hoodie.table.base.file.format=PARQUET
> hoodie.table.timeline.timezone=LOCAL
> hoodie.table.keygenerator.class=org.apache.hudi.keygen.ComplexKeyGenerator
> hoodie.table.name=test_hudi_table_sync_hive
> hoodie.datasource.write.hive_style_partitioning=false}}
> hive
> show create table test_hudi_table_sync_hive;
> +----------------------------------------------------+ | createtab_stmt |
> +----------------------------------------------------+ | CREATE EXTERNAL
> TABLE `test_hudi_table_sync_hive`( | | `_hoodie_commit_time` string, | |
> `_hoodie_commit_seqno` string, | | `_hoodie_record_key` string, | |
> `_hoodie_partition_path` string, | | `_hoodie_file_name` string, | | `id`
> int, | | `name` string, | | `value` int, | | `ts` int, | | `dt` string) | |
> ROW FORMAT SERDE | |
> 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' | | WITH
> SERDEPROPERTIES ( | | 'hoodie.query.as.ro.table'='false', | |
> 'path'='/test_hudi/test_hudi_table_sync_hive') | | STORED AS INPUTFORMAT | |
> 'org.apache.hudi.hadoop.HoodieParquetInputFormat' | | OUTPUTFORMAT | |
> 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | | LOCATION
> | | 'hdfs://cluster1/test_hudi/test_hudi_table_sync_hive' | | TBLPROPERTIES (
> | | 'last_commit_time_sync'='20220119110215185', | |
> 'spark.sql.sources.provider'='hudi', | |
> 'spark.sql.sources.schema.numParts'='1', | |
> 'spark.sql.sources.schema.part.0'='{"type":"struct","fields":[\{"name":"_hoodie_commit_time","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_commit_seqno","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_record_key","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_partition_path","type":"string","nullable":true,"metadata":{}},\{"name":"_hoodie_file_name","type":"string","nullable":true,"metadata":{}},\{"name":"id","type":"integer","nullable":false,"metadata":{}},\{"name":"name","type":"string","nullable":true,"metadata":{}},\{"name":"value","type":"integer","nullable":false,"metadata":{}},\{"name":"ts","type":"integer","nullable":false,"metadata":{}},\{"name":"dt","type":"string","nullable":true,"metadata":{}}]}',
> | | 'transient_lastDdlTime'='1642561355') |
> +----------------------------------------------------+ 28 rows selected
> (0.429 seconds)
> spark-sql --master yarn --deploy-mode client --conf
> 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
> --principal .. --keytab .. delete from test_hudi.test_hudi_table_sync_hive
> where id=1;
--
This message was sent by Atlassian Jira
(v8.20.1#820001)