Limess commented on issue #9814:
URL: https://github.com/apache/hudi/issues/9814#issuecomment-2213775585
After upgrading to 0.14.1 this is still occurring. This didn't happen until
we enabled the metadata table.
Any recommendations to mitigate this? The cluster is pretty large using this
config:
```
[
{
"Classification": "spark",
"Properties": {
"maximizeResourceAllocation": "false"
}
},
{
"Classification": "spark-defaults",
"Properties": {
"spark.default.parallelism": "3352",
"spark.driver.cores": "4",
"spark.driver.extraJavaOptions": "-XX:+UseG1GC
-XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark
-XX:InitiatingHeapOccupancyPercent=35",
"spark.driver.memory": "25g",
"spark.driver.memoryOverhead": "3g",
"spark.dynamicAllocation.enabled": "false",
"spark.executor.cores": "4",
"spark.executor.extraJavaOptions": "-XX:+UseG1GC
-XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark
-XX:InitiatingHeapOccupancyPercent=35",
"spark.executor.instances": "419",
"spark.executor.maxNumFailures": "100",
"spark.executor.memory": "25g",
"spark.executor.memoryOverhead": "3g",
"spark.executor.processTreeMetrics.enabled": "true",
"spark.executorEnv.PEX_INHERIT_PATH": "fallback",
"spark.hadoop.fs.s3.connection.maximum": "1000",
"spark.hadoop.fs.s3a.connection.maximum": "1000",
"spark.kryoserializer.buffer.max": "256m",
"spark.metrics.namespace": "spark",
"spark.rdd.compress": "true",
"spark.scheduler.mode": "FAIR",
"spark.serializer": "org.apache.spark.serializer.KryoSerializer",
"spark.shuffle.service.enabled": "true",
"spark.sql.adaptive.coalescePartitions.enabled": "true",
"spark.sql.shuffle.partitions": "3352",
"spark.task.maxFailures": "10",
"spark.ui.prometheus.enabled": "true",
"spark.yarn.appMasterEnv.PEX_INHERIT_PATH": "fallback",
"spark.yarn.maxAppAttempts": "1"
}
},
{
"Classification": "spark-log4j2",
"Properties": {
"logger.hudi.level": "INFO",
"logger.hudi.name": "org.apache.hudi"
}
},
{
"Classification": "spark-metrics",
"Properties": {
"*.sink.prometheusServlet.class":
"org.apache.spark.metrics.sink.PrometheusServlet",
"*.sink.prometheusServlet.path": "/metrics/prometheus",
"applications.sink.prometheusServlet.path":
"/metrics/applications/prometheus",
"driver.source.jvm.class": "org.apache.spark.metrics.source.JvmSource",
"executor.source.jvm.class":
"org.apache.spark.metrics.source.JvmSource",
"master.sink.prometheusServlet.path": "/metrics/master/prometheus",
"master.source.jvm.class": "org.apache.spark.metrics.source.JvmSource",
"worker.source.jvm.class": "org.apache.spark.metrics.source.JvmSource"
}
},
{
"Classification": "capacity-scheduler",
"Properties": {
"yarn.scheduler.capacity.resource-calculator":
"org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator"
}
},
{
"Classification": "yarn-site",
"Properties": {
"yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":
"99.0",
"yarn.nodemanager.pmem-check-enabled": "false",
"yarn.nodemanager.vmem-check-enabled": "false"
}
},
{
"Classification": "emrfs-site",
"Properties": {
"fs.s3.maxConnections": "1000"
}
},
{
"Classification": "hive-site",
"Properties": {
"hive.metastore.client.factory.class":
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
}
},
{
"Classification": "hdfs-site",
"Properties": {
"dfs.replication": "2"
}
},
{
"Classification": "presto-connector-hive",
"Properties": {
"hive.metastore.glue.datacatalog.enabled": "true",
"hive.parquet.use-column-names": "true"
}
},
{
"Classification": "spark-hive-site",
"Properties": {
"hive.metastore.client.factory.class":
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
}
},
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"PYSPARK_PYTHON": "./data_platform_spark_jobs.pex"
}
}
],
"Properties": {}
},
{
"Classification": "hadoop-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"HADOOP_DATANODE_OPTS":
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7001:/etc/hadoop/conf/hdfs_jmx_config_datanode.yaml
-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false
-Dcom.sun.management.jmxremote.authenticate=false
-Dcom.sun.management.jmxremote.port=50103",
"HADOOP_NAMENODE_OPTS":
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7001:/etc/hadoop/conf/hdfs_jmx_config_namenode.yaml
-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false
-Dcom.sun.management.jmxremote.authenticate=false
-Dcom.sun.management.jmxremote.port=50103"
}
}
],
"Properties": {}
},
{
"Classification": "yarn-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"YARN_NODEMANAGER_OPTS":
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7005:/etc/hadoop/conf/yarn_jmx_config_node_manager.yaml
-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false
-Dcom.sun.management.jmxremote.authenticate=false
-Dcom.sun.management.jmxremote.port=50111",
"YARN_RESOURCEMANAGER_OPTS":
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7005:/etc/hadoop/conf/yarn_jmx_config_resource_manager.yaml
-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false
-Dcom.sun.management.jmxremote.authenticate=false
-Dcom.sun.management.jmxremote.port=50111"
}
}
],
"Properties": {}
},
{
"Classification": "hudi-defaults",
"Properties": {
"hoodie.archive.async": "true",
"hoodie.bulkinsert.sort.mode": "GLOBAL_SORT",
"hoodie.clean.async": "true",
"hoodie.cleaner.commits.retained": "1",
"hoodie.cleaner.policy.failed.writes": "LAZY",
"hoodie.datasource.hive_sync.support_timestamp": "true",
"hoodie.datasource.meta.sync.glue.metadata_file_listing": "true",
"hoodie.enable.data.skipping": "true",
"hoodie.filesystem.view.remote.retry.enable": "true",
"hoodie.keep.max.commits": "15",
"hoodie.keep.min.commits": "10",
"hoodie.metadata.index.bloom.filter.enable": "true",
"hoodie.metadata.index.column.stats.enable": "true",
"hoodie.metrics.on": "true",
"hoodie.metrics.reporter.type": "PROMETHEUS",
"hoodie.parquet.compression.codec": "snappy",
"hoodie.parquet.max.file.size": "536870912",
"hoodie.parquet.small.file.limit": "429496729",
"hoodie.write.concurrency.early.conflict.detection.enable": "true",
"hoodie.write.concurrency.mode": "optimistic_concurrency_control",
"hoodie.write.lock.dynamodb.billing_mode": "PAY_PER_REQUEST",
"hoodie.write.lock.dynamodb.endpoint_url":
"dynamodb.eu-west-1.amazonaws.com",
"hoodie.write.lock.dynamodb.region": "eu-west-1",
"hoodie.write.lock.dynamodb.table": "data-platform-hudi-locks",
"hoodie.write.lock.provider":
"org.apache.hudi.aws.transaction.lock.DynamoDBBasedLockProvider"
}
}
]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]