jlloh commented on issue #8191:
URL: https://github.com/apache/hudi/issues/8191#issuecomment-1627759070
Seeing something similar for Flink 1.16 with Hudi 0.13.1, COW insert with
metadata enabled. Problem seems to occur ~4 hours after the job has been
running. The job is an inline clustering job. After disabling metadata, the job
is able to proceed.
Configurations:
```
"table.table": "COPY_ON_WRITE"
"write.operation": "insert"
"write.insert.cluster": "true"
"hoodie.datasource.write.hive_style_partitioning": "true"
"metadata.enabled": "true"
"hoodie.datasource.write.hive_style_partitioning": "true"
"hoodie.parquet.max.file.size": "104857600"
"hoodie.parquet.small.file.limit": "20971520"
"clustering.plan.strategy.small.file.limit": "100"
```
Files:
~211 parquet files per partition across 4 hourly partitions when the issue
started happening and the job failed to continue. The bucket assigner task is
the one that hits this error. I have tried both hourly and daily partitions but
both jobs seem to eventually fail and not able to recover with metadata enabled.
Full stacktrace:
```
org.apache.hudi.exception.HoodieMetadataException: Failed to retrieve files
in partition
s3a://<bucket_name>/folder_name/local_year=2023/local_month=07/local_day=08
from metadata
at
org.apache.hudi.metadata.BaseTableMetadata.getAllFilesInPartition(BaseTableMetadata.java:152)
at
org.apache.hudi.metadata.HoodieMetadataFileSystemView.listPartition(HoodieMetadataFileSystemView.java:69)
at
org.apache.hudi.common.table.view.AbstractTableFileSystemView.lambda$ensurePartitionLoadedCorrectly$16(AbstractTableFileSystemView.java:432)
at
java.util.concurrent.ConcurrentHashMap.computeIfAbsent(ConcurrentHashMap.java:1660)
at
org.apache.hudi.common.table.view.AbstractTableFileSystemView.ensurePartitionLoadedCorrectly(AbstractTableFileSystemView.java:423)
at
org.apache.hudi.common.table.view.AbstractTableFileSystemView.getLatestBaseFilesBeforeOrOn(AbstractTableFileSystemView.java:660)
at
org.apache.hudi.common.table.view.PriorityBasedFileSystemView.execute(PriorityBasedFileSystemView.java:104)
at
org.apache.hudi.common.table.view.PriorityBasedFileSystemView.getLatestBaseFilesBeforeOrOn(PriorityBasedFileSystemView.java:145)
at
org.apache.hudi.sink.partitioner.profile.WriteProfile.smallFilesProfile(WriteProfile.java:208)
at
org.apache.hudi.sink.partitioner.profile.WriteProfile.getSmallFiles(WriteProfile.java:191)
at
org.apache.hudi.sink.partitioner.BucketAssigner.getSmallFileAssign(BucketAssigner.java:179)
at
org.apache.hudi.sink.partitioner.BucketAssigner.addInsert(BucketAssigner.java:137)
at
org.apache.hudi.sink.partitioner.BucketAssignFunction.getNewRecordLocation(BucketAssignFunction.java:215)
at
org.apache.hudi.sink.partitioner.BucketAssignFunction.processRecord(BucketAssignFunction.java:200)
at
org.apache.hudi.sink.partitioner.BucketAssignFunction.processElement(BucketAssignFunction.java:162)
at
org.apache.flink.streaming.api.operators.KeyedProcessOperator.processElement(KeyedProcessOperator.java:83)
at
org.apache.flink.streaming.runtime.tasks.OneInputStreamTask$StreamTaskNetworkOutput.emitRecord(OneInputStreamTask.java:233)
at
org.apache.flink.streaming.runtime.io.AbstractStreamTaskNetworkInput.processElement(AbstractStreamTaskNetworkInput.java:134)
at
org.apache.flink.streaming.runtime.io.AbstractStreamTaskNetworkInput.emitNext(AbstractStreamTaskNetworkInput.java:105)
at
org.apache.flink.streaming.runtime.io.StreamOneInputProcessor.processInput(StreamOneInputProcessor.java:65)
at
org.apache.flink.streaming.runtime.tasks.StreamTask.processInput(StreamTask.java:542)
at
org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:231)
at
org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:831)
at
org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:780)
at
org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:935)
at
org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:914)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:728)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:550)
at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.hudi.exception.HoodieException: Exception when reading
log file
at
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternalV1(AbstractHoodieLogRecordReader.java:374)
at
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternal(AbstractHoodieLogRecordReader.java:223)
at
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.performScan(HoodieMergedLogRecordScanner.java:198)
at
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:114)
at
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner.<init>(HoodieMergedLogRecordScanner.java:73)
at
org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner$Builder.build(HoodieMergedLogRecordScanner.java:464)
at
org.apache.hudi.metadata.HoodieMetadataLogRecordReader$Builder.build(HoodieMetadataLogRecordReader.java:218)
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.getLogRecordScanner(HoodieBackedTableMetadata.java:546)
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.openReaders(HoodieBackedTableMetadata.java:447)
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.getOrCreateReaders(HoodieBackedTableMetadata.java:432)
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.lambda$getRecordsByKeys$3(HoodieBackedTableMetadata.java:239)
at java.util.HashMap.forEach(HashMap.java:1290)
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.getRecordsByKeys(HoodieBackedTableMetadata.java:237)
at
org.apache.hudi.metadata.HoodieBackedTableMetadata.getRecordByKey(HoodieBackedTableMetadata.java:152)
at
org.apache.hudi.metadata.BaseTableMetadata.fetchAllFilesInPartition(BaseTableMetadata.java:339)
at
org.apache.hudi.metadata.BaseTableMetadata.getAllFilesInPartition(BaseTableMetadata.java:150)
... 28 more
Caused by: org.apache.hudi.exception.HoodieIOException: unable to initialize
read with log file
at
org.apache.hudi.common.table.log.HoodieLogFormatReader.hasNext(HoodieLogFormatReader.java:113)
at
org.apache.hudi.common.table.log.AbstractHoodieLogRecordReader.scanInternalV1(AbstractHoodieLogRecordReader.java:247)
... 43 more
Caused by: java.io.InterruptedIOException: getFileStatus on
s3a://<redacted>/.hoodie/metadata/files/.files-0000_00000000000000.log.2_0-1-0:
com.amazonaws.SdkClientException: Unable to execute HTTP request: Timeout
waiting for connection from pool
at
org.apache.hadoop.fs.s3a.S3AUtils.translateInterruptedException(S3AUtils.java:352)
at
org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:177)
at
org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:151)
at
org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2278)
at
org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2226)
at
org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2160)
at org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:727)
at
org.apache.hudi.common.fs.HoodieWrapperFileSystem.open(HoodieWrapperFileSystem.java:203)
at
org.apache.hudi.common.table.log.HoodieLogFileReader.getFSDataInputStream(HoodieLogFileReader.java:498)
at
org.apache.hudi.common.table.log.HoodieLogFileReader.<init>(HoodieLogFileReader.java:118)
at
org.apache.hudi.common.table.log.HoodieLogFormatReader.hasNext(HoodieLogFormatReader.java:110)
... 44 more
Caused by: com.amazonaws.SdkClientException: Unable to execute HTTP request:
Timeout waiting for connection from pool
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleRetryableException(AmazonHttpClient.java:1216)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1162)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:811)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:779)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:753)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:713)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:695)
at
com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:559)
at
com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:539)
at
com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5445)
at
com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5392)
at
com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1368)
at
org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$4(S3AFileSystem.java:1307)
at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:322)
at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:285)
at
org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:1304)
at
org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2264)
... 51 more
Caused by: org.apache.http.conn.ConnectionPoolTimeoutException: Timeout
waiting for connection from pool
at
org.apache.http.impl.conn.PoolingHttpClientConnectionManager.leaseConnection(PoolingHttpClientConnectionManager.java:316)
at
org.apache.http.impl.conn.PoolingHttpClientConnectionManager$1.get(PoolingHttpClientConnectionManager.java:282)
at sun.reflect.GeneratedMethodAccessor70.invoke(Unknown Source)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
com.amazonaws.http.conn.ClientConnectionRequestFactory$Handler.invoke(ClientConnectionRequestFactory.java:70)
at com.amazonaws.http.conn.$Proxy56.get(Unknown Source)
at
org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:190)
at
org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186)
at
org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185)
at
org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83)
at
org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56)
at
com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1343)
at
com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1154)
... 66 more
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]