ad1happy2go commented on issue #8416:
URL: https://github.com/apache/hudi/issues/8416#issuecomment-1506677225
@luyongbiao I was able to reproduce the bug with
spark.sql.codegen.wholeStage set to false on Spark 3.1 and 3.2.
But when I tried a later Spark version, i.e. 3.3.2, with the same Hudi version,
the issue no longer occurs. So it looks like a fix has been made on the Spark
side in the later version. Can you please confirm by upgrading your Spark
version? Below is the equivalent Scala version of your code.
Scala shell code follows.
Add this conf when launching spark-shell:
--conf 'spark.sql.codegen.wholeStage=false'
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.hudi.common.model.HoodieRecord
// Location of the Hudi table used by this reproduction.
val tableAPath = "file:///tmp/hudi_trips_mor_issue_8416_try_user3"
// NOTE: tableName is declared but not used by the writes in this script,
// which pass the literal "tableA" as TABLE_NAME.
val tableName = "hudi_trips_mor_issue_8416_try_user3"
// Seed data: five rows with ids 1..5 and both text fields set to 'mock'.
// Each string literal is kept on a single line: a plain "..." literal cannot
// span lines in Scala, so the mail-wrapped form does not compile.
var dataset = spark.sql("select 1 id, 'mock' field1, 'mock' field2"
  + "\nunion all select 2 id, 'mock' field1, 'mock' field2"
  + "\nunion all select 3 id, 'mock' field1, 'mock' field2"
  + "\nunion all select 4 id, 'mock' field1, 'mock' field2"
  + "\nunion all select 5 id, 'mock' field1, 'mock' field2");
// Add the timestamp column that the writes below pass as the precombine field.
dataset = dataset.withColumn("lake_update_date", current_timestamp());
// First write: save the seed rows to tableAPath in Append mode, keyed on "id",
// with an empty partition path field and "lake_update_date" as precombine field.
// Trailing dots keep the chain open so it can be pasted line by line in the shell.
dataset.write.format("org.apache.hudi").
  option(TABLE_TYPE.key, MOR_TABLE_TYPE_OPT_VAL).
  option(OPERATION_OPT_KEY, UPSERT_OPERATION_OPT_VAL).
  option(RECORDKEY_FIELD_OPT_KEY, "id").
  option(PARTITIONPATH_FIELD_OPT_KEY, "").
  option(PRECOMBINE_FIELD_PROP, "lake_update_date").
  option(TABLE_NAME, "tableA").
  mode(Append).
  save(tableAPath)
// Print the row count of the in-memory seed DataFrame (not of the table).
print(dataset.count())
// Read the table back with the snapshot query type.
var dataset2 = spark.read.format("org.apache.hudi").
  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_SNAPSHOT_OPT_VAL).
  load(tableAPath)
// Keep only ids 1 and 2, overwrite field2 with "mock2", and refresh the
// timestamp column so these rows carry a newer lake_update_date.
dataset2 = dataset2.
  filter("id = 1 or id = 2").
  withColumn("field2", lit("mock2")).
  withColumn("lake_update_date", current_timestamp())
// Second write: save the two modified rows to the same table path, using the
// same options as the first write.
dataset2.write.format("org.apache.hudi").
  option(TABLE_TYPE.key, MOR_TABLE_TYPE_OPT_VAL).
  option(OPERATION_OPT_KEY, UPSERT_OPERATION_OPT_VAL).
  option(RECORDKEY_FIELD_OPT_KEY, "id").
  option(PARTITIONPATH_FIELD_OPT_KEY, "").
  option(PRECOMBINE_FIELD_PROP, "lake_update_date").
  option(TABLE_NAME, "tableA").
  mode(Append).
  save(tableAPath)
// Final step: count the rows of a snapshot read of the table after both writes.
spark.read.format("org.apache.hudi").
  option(QUERY_TYPE_OPT_KEY, QUERY_TYPE_SNAPSHOT_OPT_VAL).
  load(tableAPath).
  count()
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]