[
https://issues.apache.org/jira/browse/HUDI-8551?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Y Ethan Guo updated HUDI-8551:
------------------------------
Remaining Estimate: 12h
Original Estimate: 12h
> Allow no precombine field in MOR table
> --------------------------------------
>
> Key: HUDI-8551
> URL: https://issues.apache.org/jira/browse/HUDI-8551
> Project: Apache Hudi
> Issue Type: Sub-task
> Reporter: Y Ethan Guo
> Assignee: Y Ethan Guo
> Priority: Blocker
> Labels: pull-request-available
> Fix For: 1.0.1
>
> Original Estimate: 12h
> Remaining Estimate: 12h
>
> A MOR table created through SQL without a precombine field fails to be read.
> We should still allow a MOR table to have no precombine field: such a table
> should be treated as using natural/commit-time ordering under the default
> EVENT_TIME_ORDERING mode (i.e., the ordering/precombine value is set to 0).
> A repro follows, with a sketch of the missing-field guard after it.
> {code:java}
> CREATE DATABASE testing_partial_updates;
>
> CREATE TABLE testing_partial_updates.table1 (
>   ts BIGINT,
>   uuid STRING,
>   rider STRING,
>   driver STRING,
>   fare DOUBLE,
>   city STRING
> ) USING HUDI
> LOCATION 'file:///Users/ethan/Work/tmp/hudi-1.0.0-testing/partial-update/table1'
> TBLPROPERTIES (
>   type = 'mor',
>   primaryKey = 'uuid'
> )
> PARTITIONED BY (city);
>
> INSERT INTO testing_partial_updates.table1 VALUES
>   (1695159649087,'334e26e9-8355-45cc-97c6-c31daf0df330','rider-A','driver-K',19.10,'san_francisco'),
>   (1695091554788,'e96c4396-3fad-413a-a942-4cb36106d721','rider-C','driver-M',27.70,'san_francisco'),
>   (1695046462179,'9909a8b1-2d15-4d3d-8ec9-efc48c536a00','rider-D','driver-L',33.90,'san_francisco'),
>   (1695332066204,'1dced545-862b-4ceb-8b43-d2a568f6616b','rider-E','driver-O',93.50,'san_francisco'),
>   (1695516137016,'e3cf430c-889d-4015-bc98-59bdce1e530c','rider-F','driver-P',34.15,'sao_paulo'),
>   (1695376420876,'7a84095f-737f-40bc-b62f-6b69664712d2','rider-G','driver-Q',43.40,'sao_paulo'),
>   (1695173887231,'3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04','rider-I','driver-S',41.06,'chennai'),
>   (1695115999911,'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa','rider-J','driver-T',17.85,'chennai');
>
> CREATE TABLE merge_source1 (ts BIGINT, uuid STRING, fare DOUBLE, city STRING) USING parquet;
>
> INSERT INTO merge_source1 VALUES
>   (1695159649090, '334e26e9-8355-45cc-97c6-c31daf0df330', 25.20, 'san_francisco'),
>   (1695173887240, '3eeb61f7-c2b0-4636-99bd-5d7a5a1d2c04', 50.00, 'chennai');
>
> SET hoodie.merge.small.file.group.candidates.limit = 0;
>
> MERGE INTO testing_partial_updates.table1 AS target
> USING (
>   SELECT * FROM merge_source1
> ) source
> ON target.uuid = source.uuid
> WHEN MATCHED THEN
>   UPDATE SET ts = source.ts, fare = source.fare;
> spark-sql (default)> select * from testing_partial_updates.table1;
> 24/11/15 16:40:31 ERROR Executor: Exception in task 2.0 in stage 52.0 (TID 98)
> org.apache.spark.SparkException: Encountered error while reading file file:///Users/ethan/Work/tmp/hudi-1.0.0-testing/partial-update/table1/city=chennai/56770d17-8431-49b3-852d-07cb693db466-0_2-11-33_20241115163103124.parquet. Details:
> at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:864)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:293)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
> at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
> at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
> at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
> at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
> at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
> at org.apache.spark.scheduler.Task.run(Task.scala:141)
> at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
> at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
> at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
> at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
> at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748)
> Caused by: java.lang.NullPointerException
> at scala.collection.immutable.StringLike.split(StringLike.scala:266)
> at scala.collection.immutable.StringLike.split$(StringLike.scala:265)
> at scala.collection.immutable.StringOps.split(StringOps.scala:33)
> at org.apache.spark.sql.HoodieUnsafeRowUtils$.composeNestedFieldPath(HoodieUnsafeRowUtils.scala:97)
> at org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:105)
> at org.apache.spark.sql.HoodieInternalRowUtils$$anon$3.apply(HoodieInternalRowUtils.scala:103)
> at java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
> at org.apache.spark.sql.HoodieInternalRowUtils$.getCachedPosList(HoodieInternalRowUtils.scala:103)
> at org.apache.spark.sql.HoodieInternalRowUtils.getCachedPosList(HoodieInternalRowUtils.scala)
> at org.apache.hudi.common.model.HoodieSparkRecord.getOrderingValue(HoodieSparkRecord.java:319)
> at org.apache.hudi.DefaultSparkRecordMerger.partialMerge(DefaultSparkRecordMerger.java:113)
> at org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.merge(HoodieBaseFileGroupRecordBuffer.java:388)
> at org.apache.hudi.common.table.read.HoodiePositionBasedFileGroupRecordBuffer.hasNextBaseRecord(HoodiePositionBasedFileGroupRecordBuffer.java:227)
> at org.apache.hudi.common.table.read.HoodieKeyBasedFileGroupRecordBuffer.doHasNext(HoodieKeyBasedFileGroupRecordBuffer.java:135)
> at org.apache.hudi.common.table.read.HoodieBaseFileGroupRecordBuffer.hasNext(HoodieBaseFileGroupRecordBuffer.java:149)
> at org.apache.hudi.common.table.read.HoodieFileGroupReader.hasNext(HoodieFileGroupReader.java:235)
> at org.apache.hudi.common.table.read.HoodieFileGroupReader$HoodieFileGroupReaderIterator.hasNext(HoodieFileGroupReader.java:289)
> at org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedParquetFileFormat$$anon$1.hasNext(HoodieFileGroupReaderBasedParquetFileFormat.scala:273)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
> at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)
> ... 19 more
> {code}
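>
> The NPE comes from HoodieSparkRecord.getOrderingValue handing a null
> precombine field path to HoodieUnsafeRowUtils.composeNestedFieldPath, which
> then calls split on it. Below is a minimal sketch of the intended guard,
> assuming a simplified Map-based row; OrderingValueSketch,
> DEFAULT_ORDERING_VALUE, and the method shape are hypothetical illustration,
> not the actual Hudi code.
> {code:java}
> import java.util.Map;
>
> // Hypothetical sketch, not the Hudi implementation: when no precombine
> // field is configured, fall back to a constant ordering value of 0
> // (natural/commit-time ordering under EVENT_TIME_ORDERING) instead of
> // splitting a null field path, which is the NPE in the trace above.
> public class OrderingValueSketch {
>
>   // 0 means "no event-time ordering": all records compare equal on the
>   // ordering value, so commit (arrival) order decides the merge winner.
>   private static final Comparable<?> DEFAULT_ORDERING_VALUE = 0;
>
>   static Comparable<?> getOrderingValue(String precombineField,
>                                         Map<String, Object> row) {
>     if (precombineField == null || precombineField.isEmpty()) {
>       // Guard: never call split() on a null/empty field path.
>       return DEFAULT_ORDERING_VALUE;
>     }
>     // Simplified stand-in for composeNestedFieldPath + field lookup.
>     Object value = row.get(precombineField);
>     return value == null ? DEFAULT_ORDERING_VALUE : (Comparable<?>) value;
>   }
> }
> {code}
> With a guard like this in the merge path, the MERGE repro above should merge
> by commit time instead of failing inside composeNestedFieldPath.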
--
This message was sent by Atlassian Jira
(v8.20.10#820010)