[
https://issues.apache.org/jira/browse/HUDI-9791?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
ASF GitHub Bot updated HUDI-9791:
---------------------------------
Labels: pull-request-available (was: )
> MDT breaks with hfile reader changes when writing with master using table
> version 6
> -----------------------------------------------------------------------------------
>
> Key: HUDI-9791
> URL: https://issues.apache.org/jira/browse/HUDI-9791
> Project: Apache Hudi
> Issue Type: Bug
> Components: metadata
> Reporter: Jonathan Vexler
> Assignee: Lin Liu
> Priority: Blocker
> Labels: pull-request-available
> Fix For: 1.1.0
>
> Attachments: hfile.error
>
>
> Exception: [^hfile.error]
> Summary:
> # Write an insert and an update to a MOR (merge-on-read) table with Spark 3.1 / Hudi 0.14.1
> # write an update with spark 3.5 with current master
> `adda6950e0aaa7353add88ee2fc0499d7135ee33` using write table version 6
> # Read the table with Spark 3.1 / Hudi 0.14.1 and get the exception
> The hoodie.properties still says table version is 6
> Here is my runscript:
> {code:java}
> set_spark 3.1
> hudi_spark_shell -p -v 0.14.1
> import scala.collection.JavaConversions._
> import org.apache.spark.sql.SaveMode._
> import org.apache.hudi.DataSourceReadOptions._
> import org.apache.hudi.DataSourceWriteOptions._
> import org.apache.hudi.common.table.HoodieTableConfig._
> import org.apache.hudi.config.HoodieWriteConfig._
> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._
> import org.apache.hudi.common.model.HoodieRecord
> import spark.implicits._
> val tableName = "trips_table"
> val basePath = "file:///tmp/trips_table"
> val columns = Seq("ts","uuid","rider","driver","fare","city")
> val data =
>
> Seq((1695159649087L,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"),
>
> (1695091554788L,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70
> ,"san_francisco"),
>
> (1695046462179L,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90
> ,"san_francisco"),
>
> (1695516137016L,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"
> ),
>
> (1695115999911L,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai"));
> var inserts = spark.createDataFrame(data).toDF(columns:_*)
> inserts.write.format("hudi").
> option("hoodie.datasource.write.partitionpath.field", "city").
> option("hoodie.table.name", tableName).
> option("hoodie.metadata.index.column.stats.enable", "true").
> option("hoodie.datasource.write.table.type", "MERGE_ON_READ").
> mode(Overwrite).
> save(basePath)
> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider" ===
> "rider-D").withColumn("fare", col("fare") * 10)
> updatesDf.write.format("hudi").
> option("hoodie.datasource.write.operation", "upsert").
> option("hoodie.datasource.write.partitionpath.field", "city").
> option("hoodie.table.name", tableName).
> option("hoodie.metadata.index.column.stats.enable", "true").
> option("hoodie.datasource.write.table.type", "MERGE_ON_READ").
> mode(Append).
> save(basePath)
> //exit
> set_spark 3.5
> hudi_spark_shell -j
> import scala.collection.JavaConversions._
> import org.apache.spark.sql.SaveMode._
> import org.apache.hudi.DataSourceReadOptions._
> import org.apache.hudi.DataSourceWriteOptions._
> import org.apache.hudi.common.table.HoodieTableConfig._
> import org.apache.hudi.config.HoodieWriteConfig._
> import org.apache.hudi.keygen.constant.KeyGeneratorOptions._
> import org.apache.hudi.common.model.HoodieRecord
> import spark.implicits._
> val tableName = "trips_table"
> val basePath = "file:///tmp/trips_table"
> val updatesDf = spark.read.format("hudi").load(basePath).filter($"rider" ===
> "rider-D").withColumn("fare", col("fare") * 10)
> updatesDf.write.format("hudi").option("hoodie.datasource.write.operation",
> "upsert").option("hoodie.datasource.write.partitionpath.field",
> "city").option("hoodie.table.name",
> tableName).option("hoodie.metadata.index.column.stats.enable",
> "true").option("hoodie.write.table.version",
> "6").option("hoodie.datasource.write.table.type",
> "MERGE_ON_READ").mode(Append).save(basePath)
> //exit
> set_spark 3.1
> hudi_spark_shell -p -v 0.14.1
> spark.read.format("hudi").option("hoodie.metadata.enable",
> "true").option("hoodie.enable.data.skipping",
> "true").option("hoodie.metadata.index.column.stats.enable",
> "true").load("/tmp/trips_table").filter("fare > 100").show(100,false) {code}
> Command for running 0.14.1 with Spark 3.1 using the spark-shell --packages option:
> {code:java}
> /Users/jon/Documents/sparkroot/spark-3.1.3-bin-hadoop3.2/bin/spark-shell
> --packages org.apache.hudi:hudi-spark3.1-bundle_2.12:0.14.1 --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
> --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
> 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' --conf
> 'spark.sql.catalogImplementation=in-memory' {code}
> Command for running with current master on Spark 3.5:
> {code:java}
> /Users/jon/Documents/sparkroot/spark-3.5.2-bin-hadoop3/bin/spark-shell --jars
> /Users/jon/git/hudi-versions/current/spark3.5/hudi/packaging/hudi-spark-bundle/target/hudi-spark3.5-bundle_2.12-1.1.0-SNAPSHOT.jar
> --conf
> 'spark.sql.extensions=org.apache.spark.sql.hudi.HoodieSparkSessionExtension'
> --conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer' --conf
> 'spark.kryo.registrator=org.apache.spark.HoodieSparkKryoRegistrar' --conf
> 'spark.sql.catalog.spark_catalog=org.apache.spark.sql.hudi.catalog.HoodieCatalog'
> --conf 'spark.sql.catalogImplementation=in-memory' {code}
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)