[
https://issues.apache.org/jira/browse/HUDI-6201?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17724316#comment-17724316
]
Jonathan Vexler commented on HUDI-6201:
---------------------------------------
Here is a test that can be run in the Spark shell; it fails in roughly 1/4 of runs.
Data: [^base.tar.gz] [^base001.tar.gz]
{code:scala}
import org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider
import
org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector
import org.apache.hudi.{DataSourceWriteOptions, HoodieDataSourceHelpers}
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
import org.apache.hudi.keygen.SimpleKeyGenerator
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.types._
import java.nio.file.{Paths, Files}
// Reproduction script for HUDI-6201; paste into spark-shell (it relies on the
// shell-provided `spark` session and on the imports above).
//
// Every run bootstraps a fresh Hudi table from `srcPath`, appends `updateDf`
// to it, reads the table back and counts the run as bad when any row has a
// NULL `_hoodie_is_deleted` value (in the issue's sample output the broken
// rows are the ones whose non-metadata columns, including this one, are null).
val iter = 3
val srcPath = "/Users/jon/Documents/timelineFail/base"
val targetPath = "/Users/jon/Documents/timelineFail/target/test" + iter + "/"
val updateDf =
  spark.read.format("parquet").load("/Users/jon/Documents/timelineFail/base001")

// Options shared by the bootstrap write and the follow-up append. They do not
// depend on the run index, so they are built once outside the loop.
val commonWriteOpts = Map(
  DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "_row_key",
  DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "partition_path",
  DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "timestamp",
  DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key -> "true",
  HoodieWriteConfig.KEYGENERATOR_CLASS_NAME.key -> classOf[SimpleKeyGenerator].getName
)

// Counters are kept as top-level vars so they can still be inspected from the
// shell after (or during) the loop.
var timesBad = 0
var total = 0

// NOTE: `0 to 100` is inclusive, so this performs 101 runs.
for (i <- 0 to 100) {
  // Each run writes to its own table directory.
  val basePath = targetPath + i

  // Step 1: bootstrap. The DataFrame itself is empty; presumably the bootstrap
  // operation takes its input from BOOTSTRAP_BASE_PATH_PROP instead — confirm
  // against the Hudi bootstrap documentation.
  val bootstrapDF = spark.emptyDataFrame
  bootstrapDF.write.format("hudi")
    .options(commonWriteOpts)
    .option(HoodieWriteConfig.TABLE_NAME, "hoodie_test")
    .option(DataSourceWriteOptions.OPERATION_OPT_KEY,
      DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
    .option(DataSourceWriteOptions.TABLE_TYPE.key,
      DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
    .option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, srcPath)
    .option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR,
      classOf[MetadataOnlyBootstrapModeSelector].getName)
    .mode(SaveMode.Overwrite)
    .save(basePath)

  // Step 2: append the update batch on top of the bootstrapped table.
  updateDf.write.format("hudi")
    .options(commonWriteOpts)
    .mode(SaveMode.Append)
    .save(basePath)

  // Step 3: read back and look for rows with a NULL `_hoodie_is_deleted`.
  val result =
    spark.read.format("hudi").load(basePath).filter("_hoodie_is_deleted is NULL")
  total += 1
  if (result.count() > 0) {
    println("Failure in iter " + iter + " run " + i)
    timesBad += 1
  }
}

println("Failure rate is: " + timesBad.toDouble/total.toDouble)
{code}
> Timeline server sometimes does not send bootstrap base path for a skeleton
> file
> -------------------------------------------------------------------------------
>
> Key: HUDI-6201
> URL: https://issues.apache.org/jira/browse/HUDI-6201
> Project: Apache Hudi
> Issue Type: Bug
> Components: bootstrap, timeline-server
> Reporter: Jonathan Vexler
> Priority: Major
> Attachments: TestBootstrapRead.java, base.tar.gz, base001.tar.gz
>
>
> [^TestBootstrapRead.java] In the attached file, enable the timeline server
> ('hoodie.embed.timeline.server'). The test will occasionally fail in metadata or
> mixed mode because some records are null in every column except the metadata columns:
> {code:java}
> +-------------------+---------------------+------------------------------------+-------------------------+------------------------------------------------------------------------+------------------+------------------------------------+------------------+-------------------+-------------+------------+----------+------------------+----------+-------------------+-------------------+------------------------+-------------------+--------+----------+--------------+---------+--------------------+---------+--------------------------+----------+|_hoodie_commit_time|_hoodie_commit_seqno
> |_hoodie_record_key |_hoodie_partition_path
> |_hoodie_file_name
> |_hoodie_is_deleted|_row_key |begin_lat
> |begin_lon
> |city_to_state|current_date|current_ts|distance_in_meters|driver |end_lat
> |end_lon |fare |height
> |nation |partition |partition_path|rider |seconds_since_epoch
> |timestamp|tip_history |weight
> |+-------------------+---------------------+------------------------------------+-------------------------+------------------------------------------------------------------------+------------------+------------------------------------+------------------+-------------------+-------------+------------+----------+------------------+----------+-------------------+-------------------+------------------------+-------------------+--------+----------+--------------+---------+--------------------+---------+--------------------------+----------+|00000000000001
> |00000000000001_4_0
> |876743b0-f5e7-4289-b13b-1a0404d94380|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||00000000000001
> |00000000000001_4_1
> |00923d1a-58fc-4d42-8953-4a47b47d738f|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||20230510125841762
> |20230510125841762_1_2|b318c482-8e43-4614-bdab-80946d5a9f53|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|false
>
> |b318c482-8e43-4614-bdab-80946d5a9f53|0.5285807377766387|0.12835359814395741|[CA]
> |12 |1047178778|521899450
> |driver-001|0.41394620067559684|0.08532822423986208|[42.25978252084417,
> USD]|[0, 0, 7, -91, -36]|[Canada]|2015-03-16|2015-03-16
> |rider-001|-2845295541651788027|0 |[[32.10533813167099,
> USD]]|0.59076524||00000000000001 |00000000000001_4_3
> |4dcc72d7-0878-41c2-a85d-3e6374b88bb8|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||00000000000001
> |00000000000001_4_4
> |cfa79530-fc9f-42de-a181-34c06e79d9c5|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||00000000000001
> |00000000000001_4_5
> |4d2a4755-83a7-4201-9b65-0148752d55b7|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||00000000000001
> |00000000000001_4_6
> |76429084-f78e-4c6d-a70d-088cb5d955aa|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||00000000000001
> |00000000000001_4_7
> |8436456b-9858-45cb-8a07-bdae536a2d17|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null ||00000000000001
> |00000000000001_4_8
> |b6ca00ed-a4f4-4db5-a269-6be250b64caa|partition_path=2015-03-16|7e1dea56-e88c-4072-be61-f4ae01feaaa3_1-138-381_20230510125841762.parquet|null
> |null |null |null
> |null |null |null |null |null
> |null |null |null |null
> |null |null |null |null |null
> |null |null |null
> |+-------------------+---------------------+------------------------------------+-------------------------+------------------------------------------------------------------------+------------------+------------------------------------+------------------+-------------------+-------------+------------+----------+------------------+----------+-------------------+-------------------+------------------------+-------------------+--------+----------+--------------+---------+--------------------+---------+--------------------------+----------++-------------------+--------------------+------------------------------------+-------------------------+-------------------------------------------------------------------+
> {code}
> This happens because the bootstrap base file is not set, so HoodieMergeHelper
> does not process the file as a bootstrap skeleton file.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)