[
https://issues.apache.org/jira/browse/HUDI-8552?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Lin Liu reassigned HUDI-8552:
-----------------------------
Assignee: Y Ethan Guo (was: Lin Liu)
> Fix compaction with partial updates
> -----------------------------------
>
> Key: HUDI-8552
> URL: https://issues.apache.org/jira/browse/HUDI-8552
> Project: Apache Hudi
> Issue Type: Bug
> Reporter: Y Ethan Guo
> Assignee: Y Ethan Guo
> Priority: Blocker
> Labels: pull-request-available
> Fix For: 1.0.0
>
>
> Compaction fails with the new partial-update support in the new log format.
> Branch with tests that reproduce the errors:
> [https://github.com/apache/hudi/pull/12289]
> A minimal Avro-level sketch of the suspected failure mode follows the log below.
> {code:java}
> spark-sql (default)> MERGE INTO testing_partial_updates.table2 as target
> > using (
> > select * from merge_source1
> > ) source
> > on target.uuid = source.uuid
> > when matched then
> > update set ts = source.ts, fare = source.fare
> > ;
> 24/11/15 17:23:11 WARN HoodieFileIndex: Data skipping requires both Metadata Table and at least one of Column Stats Index, Record Level Index, or Functional Index to be enabled as well! (isMetadataTableEnabled = false, isColumnStatsIndexEnabled = false, isRecordIndexApplicable = false, isFunctionalIndexEnabled = false, isBucketIndexEnable = false, isPartitionStatsIndexEnabled = false, isBloomFiltersIndexEnabled = false)
> 24/11/15 17:23:15 ERROR SimpleExecutor: Failed consuming records
> org.apache.hudi.exception.HoodieUpsertException: Failed to combine/merge new record with old value in storage, for new record {HoodieRecord{key=HoodieKey { recordKey=334e26e9-8355-45cc-97c6-c31daf0df330 partitionPath=city=san_francisco}, currentLocation='null', newLocation='null'}}, old value {HoodieRecord{key=null, currentLocation='null', newLocation='null'}}
>   at org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:376)
>   at org.apache.hudi.table.action.commit.BaseMergeHelper$UpdateHandler.consume(BaseMergeHelper.java:54)
>   at org.apache.hudi.table.action.commit.BaseMergeHelper$UpdateHandler.consume(BaseMergeHelper.java:44)
>   at org.apache.hudi.common.util.queue.SimpleExecutor.execute(SimpleExecutor.java:69)
>   at org.apache.hudi.table.action.commit.HoodieMergeHelper.runMerge(HoodieMergeHelper.java:149)
>   at org.apache.hudi.table.HoodieSparkTable.runMerge(HoodieSparkTable.java:149)
>   at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.handleUpdateInternal(HoodieSparkCopyOnWriteTable.java:246)
>   at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.handleUpdate(HoodieSparkCopyOnWriteTable.java:241)
>   at org.apache.hudi.table.action.compact.CompactionExecutionHelper.writeFileAndGetWriteStats(CompactionExecutionHelper.java:64)
>   at org.apache.hudi.table.action.compact.HoodieCompactor.compact(HoodieCompactor.java:238)
>   at org.apache.hudi.table.action.compact.HoodieCompactor.lambda$compact$988df80a$1(HoodieCompactor.java:133)
>   at org.apache.spark.api.java.JavaPairRDD$.$anonfun$toScalaFunction$1(JavaPairRDD.scala:1070)
>   at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
>   at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
>   at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
>   at org.apache.spark.storage.memory.MemoryStore.putIterator(MemoryStore.scala:223)
>   at org.apache.spark.storage.memory.MemoryStore.putIteratorAsBytes(MemoryStore.scala:352)
>   at org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1614)
>   at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1524)
>   at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1588)
>   at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1389)
>   at org.apache.spark.storage.BlockManager.getOrElseUpdateRDDBlock(BlockManager.scala:1343)
>   at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:379)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
>   at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
>   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
>   at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
>   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
>   at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
>   at org.apache.spark.scheduler.Task.run(Task.scala:141)
>   at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
>   at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
>   at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
>   at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
>   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
>   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>   at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.avro.AvroRuntimeException: Malformed data. Length is negative: -26
>   at org.apache.avro.io.BinaryDecoder.readString(BinaryDecoder.java:308)
>   at org.apache.avro.io.ResolvingDecoder.readString(ResolvingDecoder.java:208)
>   at org.apache.avro.generic.GenericDatumReader.readString(GenericDatumReader.java:470)
>   at org.apache.avro.generic.GenericDatumReader.readString(GenericDatumReader.java:460)
>   at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:192)
>   at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:161)
>   at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:188)
>   at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:161)
>   at org.apache.avro.generic.GenericDatumReader.readField(GenericDatumReader.java:260)
>   at org.apache.avro.generic.GenericDatumReader.readRecord(GenericDatumReader.java:248)
>   at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:180)
>   at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:161)
>   at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:154)
>   at org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro(HoodieAvroUtils.java:259)
>   at org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro(HoodieAvroUtils.java:247)
>   at org.apache.hudi.avro.HoodieAvroUtils.bytesToAvro(HoodieAvroUtils.java:239)
>   at org.apache.hudi.common.model.DefaultHoodieRecordPayload.combineAndGetUpdateValue(DefaultHoodieRecordPayload.java:68)
>   at org.apache.hudi.common.model.HoodieAvroRecordMerger.combineAndGetUpdateValue(HoodieAvroRecordMerger.java:62)
>   at org.apache.hudi.common.model.HoodieAvroRecordMerger.merge(HoodieAvroRecordMerger.java:47)
>   at org.apache.hudi.io.HoodieMergeHandle.write(HoodieMergeHandle.java:358)
>   ... 37 more
> {code}
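> The {{Caused by}} points at {{HoodieAvroUtils.bytesToAvro}} inside {{DefaultHoodieRecordPayload.combineAndGetUpdateValue}}: the partial-update payload carries only the columns touched by the MERGE, but it appears to be decoded against the full table schema. Below is a minimal, self-contained sketch of that failure mode at the Avro level. The {{trip}}/{{uuid}}/{{fare}}/{{ts}} schema and the field ordering are assumptions for illustration (loosely mirroring the repro's columns), not the actual table schema.
> {code:java}
> import java.io.ByteArrayOutputStream;
>
> import org.apache.avro.Schema;
> import org.apache.avro.SchemaBuilder;
> import org.apache.avro.generic.GenericData;
> import org.apache.avro.generic.GenericDatumReader;
> import org.apache.avro.generic.GenericDatumWriter;
> import org.apache.avro.generic.GenericRecord;
> import org.apache.avro.io.BinaryEncoder;
> import org.apache.avro.io.DecoderFactory;
> import org.apache.avro.io.EncoderFactory;
>
> public class PartialUpdateSchemaMismatch {
>   public static void main(String[] args) throws Exception {
>     // Hypothetical full table schema (illustration only).
>     Schema fullSchema = SchemaBuilder.record("trip").fields()
>         .requiredString("uuid")
>         .requiredDouble("fare")
>         .requiredLong("ts")
>         .endRecord();
>
>     // Partial-update schema: only the columns touched by the MERGE.
>     Schema partialSchema = SchemaBuilder.record("trip").fields()
>         .requiredDouble("fare")
>         .requiredLong("ts")
>         .endRecord();
>
>     // Serialize the update with the partial schema, as a partial-update
>     // log record would be written.
>     GenericRecord partial = new GenericData.Record(partialSchema);
>     partial.put("fare", 27.7d);
>     partial.put("ts", 1731700991000L);
>     ByteArrayOutputStream out = new ByteArrayOutputStream();
>     BinaryEncoder enc = EncoderFactory.get().binaryEncoder(out, null);
>     new GenericDatumWriter<GenericRecord>(partialSchema).write(partial, enc);
>     enc.flush();
>     byte[] bytes = out.toByteArray();
>
>     // Deserialize with the FULL schema as both writer and reader schema,
>     // mirroring a bytesToAvro(bytes, fullSchema) call on partial bytes.
>     // The decoder expects a string length for "uuid" but lands on the
>     // first byte of the double "fare" (0x33), which zigzag-decodes to -26.
>     GenericDatumReader<GenericRecord> reader =
>         new GenericDatumReader<>(fullSchema, fullSchema);
>     try {
>       reader.read(null, DecoderFactory.get().binaryDecoder(bytes, null));
>     } catch (Exception e) {
>       // org.apache.avro.AvroRuntimeException: Malformed data. Length is negative: -26
>       System.out.println(e);
>     }
>   }
> }
> {code}
> With these example values the decoder fails with exactly the {{Length is negative: -26}} seen above, since 27.7 serializes with a leading byte of 0x33 (zigzag-decoded: -26). Whatever shape the actual fix takes, the partial-update bytes have to be decoded with the (partial) schema they were written with before being merged into the full row.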
--
This message was sent by Atlassian Jira
(v8.20.10#820010)