[ 
https://issues.apache.org/jira/browse/HUDI-5428?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17651368#comment-17651368
 ] 

Sagar Sumit commented on HUDI-5428:
-----------------------------------

Enabled CloudWatch metrics for the connection pool's available/leased/pending 
counts. There is no connection leak: the available pool replenishes soon after 
the benchmark and there are no pending connections. However, there are quite a 
few HEAD requests with the metadata table, and this is being investigated in 
HUDI-5429. Code for benchmarking (with Spark 3.3.0 on EMR 6.8.0):
{code:java}
// Benchmark script meant to be run statement-by-statement in a shell session that
// provides `sc` and `spark`.
// NOTE(review): the Hudi classes and the Scala<->Java collection converters
// (`asJava` / `asScala`) are assumed to be imported already -- the snippet does not
// show its imports.
val engineContext = new HoodieSparkEngineContext(sc)

////////////////////////////////////////////
// WITH METADATA AND COLUMN STATS ENABLED //
////////////////////////////////////////////
val metadataConfig =
  HoodieMetadataConfig.newBuilder.enable(true).withMetadataIndexColumnStats(true).build
val datasetBasePath = "s3a://hudi/table/basepath/"

// Build the metadata-table reader exactly once. The original snippet declared this
// same val twice with identical arguments, which constructed a redundant second
// reader and would not compile outside a REPL.
val hoodieBackedTableMetadata = new HoodieBackedTableMetadata(
  engineContext, metadataConfig, datasetBasePath, "/tmp/smp1/")

// List the partition paths that start with the "2021" prefix.
val prefixes = List("2021").asJava
val partitionPaths =
  hoodieBackedTableMetadata.getPartitionPathsWithPrefixes(prefixes)

// The single partition whose latest base files feed the column-stats lookup below.
val partitionName = "2021-10"

val metaClient =
  HoodieTableMetaClient.builder.setConf(sc.hadoopConfiguration).setBasePath(datasetBasePath).setLoadActiveTimelineOnLoad(true).build

val fileSystemView = FileSystemViewManager.createInMemoryFileSystemView(
  engineContext, metaClient, metadataConfig)

// (partition name, base file name) pairs for every latest base file in the partition.
val partitionBaseFilesPairs =
  fileSystemView.getLatestBaseFiles(partitionName).iterator.asScala.map(
    baseFile => Pair.of(partitionName, baseFile.getFileName)).toList

val partitionField = metaClient.getTableConfig.getPartitionFieldProp

// The measured operation: column-stats lookup for the collected pairs on the
// table's partition field, wrapped in spark.time.
spark.time(hoodieBackedTableMetadata.getColumnStats(
  partitionBaseFilesPairs.asJava, partitionField))


// ---------------------------------------------------------------------------
// WITHOUT METADATA ENABLED
// Repeats the listing steps with both the metadata table and its column-stats
// index switched off in the config.
// ---------------------------------------------------------------------------
val metadataConfig = HoodieMetadataConfig.newBuilder.
  enable(false).
  withMetadataIndexColumnStats(false).
  build

val hoodieBackedTableMetadata = new HoodieBackedTableMetadata(
  engineContext,
  metadataConfig,
  datasetBasePath,
  "/tmp/smp1/")

// Partition listing restricted to the "2021" prefix.
val prefixes = List("2021").asJava
val partitionPaths = hoodieBackedTableMetadata.getPartitionPathsWithPrefixes(prefixes)

val metaClient = HoodieTableMetaClient.builder.
  setConf(sc.hadoopConfiguration).
  setBasePath(datasetBasePath).
  setLoadActiveTimelineOnLoad(true).
  build

val fileSystemView = FileSystemViewManager.createInMemoryFileSystemView(
  engineContext,
  metaClient,
  metadataConfig)

// `partitionName` is not defined in this section; it must already be bound in the
// session. Each latest base file is paired with that partition name.
val partitionBaseFilesPairs = fileSystemView.getLatestBaseFiles(partitionName).iterator.asScala.map { file =>
  Pair.of(partitionName, file.getFileName)
}.toList {code}

> Investigate S3 connection leaks w/ MDT 
> ---------------------------------------
>
>                 Key: HUDI-5428
>                 URL: https://issues.apache.org/jira/browse/HUDI-5428
>             Project: Apache Hudi
>          Issue Type: Bug
>          Components: metadata
>            Reporter: sivabalan narayanan
>            Assignee: Sagar Sumit
>            Priority: Blocker
>             Fix For: 0.13.0
>
>
> Quite a few reports on S3 connection leaks w/ MDT
> [https://github.com/apache/hudi/issues/5482]
> [https://github.com/apache/hudi/issues/5767]
> [https://github.com/apache/hudi/issues/7130]
>  
> we need to triage them and fix any connection leaks



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to