lokeshj1703 opened a new pull request, #9618: URL: https://github.com/apache/hudi/pull/9618
### Change Logs Sometimes we see some flakiness around parquet inline reading. Ref: https://dev.azure.com/apache-hudi-ci-org/785b6ef4-2f42-4a89-8f0e-5f0d7039a0cc/_apis/build/builds/19457/logs/8 ``` 2023-08-25T05:00:14.1359469Z 1389627 [Executor task launch worker for task 1.0 in stage 4124.0 (TID 5621)] ERROR org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader [] - Got exception when reading log file 2023-08-25T05:00:14.1360427Z org.apache.hudi.exception.HoodieException: unable to read next record from parquet file 2023-08-25T05:00:14.1361525Z at org.apache.hudi.common.util.ParquetReaderIterator.hasNext(ParquetReaderIterator.java:54) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1362403Z at org.apache.hudi.common.util.collection.MappingIterator.hasNext(MappingIterator.java:39) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1363340Z at org.apache.hudi.common.util.collection.MappingIterator.hasNext(MappingIterator.java:39) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1364854Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.processDataBlock(AbstractHoodieLogRecordReader.java:625) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1365985Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.processQueuedBlocksForInstant(AbstractHoodieLogRecordReader.java:667) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1367473Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternalV1(AbstractHoodieLogRecordReader.java:362) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1368371Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:220) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1369127Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:201) 
~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1369901Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:117) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1370633Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:76) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1371380Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:466) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1372312Z at org.apache.hudi.LogFileIterator$.scanLog(Iterators.scala:371) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1372915Z at org.apache.hudi.LogFileIterator.<init>(Iterators.scala:110) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1373549Z at org.apache.hudi.RecordMergingFileIterator.<init>(Iterators.scala:201) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1374172Z at org.apache.hudi.RecordMergingFileIterator.<init>(Iterators.scala:212) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1374809Z at org.apache.hudi.RecordMergingFileIterator.<init>(Iterators.scala:217) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1375480Z at org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:109) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1376156Z at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1376653Z at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1377283Z at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1377837Z 
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1378323Z at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1378855Z at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1379397Z at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1379899Z at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1380446Z at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1381328Z at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1381902Z at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1382409Z at org.apache.spark.scheduler.Task.run(Task.scala:131) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1383130Z at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1383688Z at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1384226Z at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1384631Z at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_382] 2023-08-25T05:00:14.1385010Z at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_382] 2023-08-25T05:00:14.1385313Z at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_382] 2023-08-25T05:00:14.1386139Z Caused by: 
org.apache.parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file inlinefs://tmp/junit8419288775999068556/continuous_mor/2015/03/16/.daf7ee3a-506b-464b-b78f-776c76d7e47a-0_20230825050006621.log.1_1-4089-5572/file/?start_offset=1788&length=50947 2023-08-25T05:00:14.1387069Z at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:254) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1387683Z at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:132) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1388227Z at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:136) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1388947Z at org.apache.hudi.common.util.ParquetReaderIterator.hasNext(ParquetReaderIterator.java:49) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1389273Z ... 34 more 2023-08-25T05:00:14.1389655Z Caused by: org.apache.parquet.io.ParquetDecodingException: The requested schema is not compatible with the file schema. 
incompatible types: required fixed_len_byte_array(5) height (DECIMAL(10,6)) != required int64 height (DECIMAL(10,6)) 2023-08-25T05:00:14.1390406Z at org.apache.parquet.io.ColumnIOFactory$ColumnIOCreatorVisitor.incompatibleSchema(ColumnIOFactory.java:101) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1391060Z at org.apache.parquet.io.ColumnIOFactory$ColumnIOCreatorVisitor.visit(ColumnIOFactory.java:93) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1391655Z at org.apache.parquet.schema.PrimitiveType.accept(PrimitiveType.java:602) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1392271Z at org.apache.parquet.io.ColumnIOFactory$ColumnIOCreatorVisitor.visitChildren(ColumnIOFactory.java:83) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1392925Z at org.apache.parquet.io.ColumnIOFactory$ColumnIOCreatorVisitor.visit(ColumnIOFactory.java:57) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1393517Z at org.apache.parquet.schema.MessageType.accept(MessageType.java:55) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1394091Z at org.apache.parquet.io.ColumnIOFactory.getColumnIO(ColumnIOFactory.java:162) ~[parquet-column-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1394744Z at org.apache.parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:135) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1395437Z at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:225) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1396044Z at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:132) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1396601Z at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:136) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1397241Z at org.apache.hudi.common.util.ParquetReaderIterator.hasNext(ParquetReaderIterator.java:49) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 
2023-08-25T05:00:14.1397572Z ... 34 more 2023-08-25T05:00:14.1413335Z 1389628 [Executor task launch worker for task 2.0 in stage 4124.0 (TID 5622)] ERROR org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader [] - Got exception when reading log file 2023-08-25T05:00:14.1414164Z org.apache.hudi.exception.HoodieException: unable to read next record from parquet file 2023-08-25T05:00:14.1417364Z at org.apache.hudi.common.util.ParquetReaderIterator.hasNext(ParquetReaderIterator.java:54) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1419044Z at org.apache.hudi.common.util.collection.MappingIterator.hasNext(MappingIterator.java:39) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1419938Z at org.apache.hudi.common.util.collection.MappingIterator.hasNext(MappingIterator.java:39) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1421149Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.processDataBlock(AbstractHoodieLogRecordReader.java:625) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1422075Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.processQueuedBlocksForInstant(AbstractHoodieLogRecordReader.java:667) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1423012Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternalV1(AbstractHoodieLogRecordReader.java:362) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1423879Z at org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:220) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1425472Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:201) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1426992Z at 
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:117) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1427800Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:76) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1428569Z at org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:466) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1429241Z at org.apache.hudi.LogFileIterator$.scanLog(Iterators.scala:371) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1429848Z at org.apache.hudi.LogFileIterator.<init>(Iterators.scala:110) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1430475Z at org.apache.hudi.RecordMergingFileIterator.<init>(Iterators.scala:201) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1431098Z at org.apache.hudi.RecordMergingFileIterator.<init>(Iterators.scala:212) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1431731Z at org.apache.hudi.RecordMergingFileIterator.<init>(Iterators.scala:217) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1432392Z at org.apache.hudi.HoodieMergeOnReadRDD.compute(HoodieMergeOnReadRDD.scala:109) ~[hudi-spark-common_2.12-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1432960Z at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1433459Z at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1433994Z at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1434520Z at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) 
~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1435014Z at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1435538Z at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1436053Z at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1436846Z at org.apache.spark.rdd.RDD.iterator(RDD.scala:337) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1437415Z at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1438006Z at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1438563Z at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1439081Z at org.apache.spark.scheduler.Task.run(Task.scala:131) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1439643Z at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1440181Z at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1491) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1440781Z at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509) ~[spark-core_2.12-3.2.3.jar:3.2.3] 2023-08-25T05:00:14.1441180Z at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_382] 2023-08-25T05:00:14.1441542Z at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_382] 2023-08-25T05:00:14.1441862Z at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_382] 2023-08-25T05:00:14.1442808Z Caused by: org.apache.parquet.io.ParquetDecodingException: Can not read value at 0 in block -1 in file 
inlinefs://tmp/junit8419288775999068556/continuous_mor/2015/03/17/.76b836b0-2aae-4518-9987-c879a3a9422b-0_20230825050006621.log.1_2-4089-5573/file/?start_offset=1788&length=54731 2023-08-25T05:00:14.1443600Z at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:254) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1444188Z at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:132) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1444747Z at org.apache.parquet.hadoop.ParquetReader.read(ParquetReader.java:136) ~[parquet-hadoop-1.12.2.jar:1.12.2] 2023-08-25T05:00:14.1445384Z at org.apache.hudi.common.util.ParquetReaderIterator.hasNext(ParquetReaderIterator.java:49) ~[hudi-common-1.0.0-SNAPSHOT.jar:1.0.0-SNAPSHOT] 2023-08-25T05:00:14.1445700Z ... 34 more 2023-08-25T05:00:14.1446104Z Caused by: org.apache.parquet.io.ParquetDecodingException: The requested schema is not compatible with the file schema. incompatible types: required fixed_len_byte_array(5) height (DECIMAL(10,6)) != required int64 height (DECIMAL(10,6)) ``` When HoodieRecordType is set to SPARK in DeltaStreamer test, the necessary changes are made for DeltaStreamer configs but not for reading via DataSource APIs. The PR adds a `hudiOpts` field to Base Deltastreamer test class so that the APIs always consider the hudiOpts during datasource read. ### Impact NA ### Risk level (write none, low medium or high below) low ### Documentation Update NA ### Contributor's checklist - [ ] Read through [contributor's guide](https://hudi.apache.org/contribute/how-to-contribute) - [ ] Change Logs and Impact were stated clearly - [ ] Adequate tests were added if applicable - [ ] CI passed -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: [email protected]. For queries about this service, please contact Infrastructure at: [email protected]
