Limess commented on issue #9814:
URL: https://github.com/apache/hudi/issues/9814#issuecomment-2213775585

   After upgrading to 0.14.1 this is still occurring. This didn't happen until 
enabling metadata.
   
   Any recommendations to mitigate this? The cluster is pretty large, using this 
config:
   
   ```
   [
     {
       "Classification": "spark",
       "Properties": {
         "maximizeResourceAllocation": "false"
       }
     },
     {
       "Classification": "spark-defaults",
       "Properties": {
         "spark.default.parallelism": "3352",
         "spark.driver.cores": "4",
         "spark.driver.extraJavaOptions": "-XX:+UseG1GC 
-XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark 
-XX:InitiatingHeapOccupancyPercent=35",
         "spark.driver.memory": "25g",
         "spark.driver.memoryOverhead": "3g",
         "spark.dynamicAllocation.enabled": "false",
         "spark.executor.cores": "4",
         "spark.executor.extraJavaOptions": "-XX:+UseG1GC 
-XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark 
-XX:InitiatingHeapOccupancyPercent=35",
         "spark.executor.instances": "419",
         "spark.executor.maxNumFailures": "100",
         "spark.executor.memory": "25g",
         "spark.executor.memoryOverhead": "3g",
         "spark.executor.processTreeMetrics.enabled": "true",
         "spark.executorEnv.PEX_INHERIT_PATH": "fallback",
         "spark.hadoop.fs.s3.connection.maximum": "1000",
         "spark.hadoop.fs.s3a.connection.maximum": "1000",
         "spark.kryoserializer.buffer.max": "256m",
         "spark.metrics.namespace": "spark",
         "spark.rdd.compress": "true",
         "spark.scheduler.mode": "FAIR",
         "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
         "spark.shuffle.service.enabled": "true",
         "spark.sql.adaptive.coalescePartitions.enabled": "true",
         "spark.sql.shuffle.partitions": "3352",
         "spark.task.maxFailures": "10",
         "spark.ui.prometheus.enabled": "true",
         "spark.yarn.appMasterEnv.PEX_INHERIT_PATH": "fallback",
         "spark.yarn.maxAppAttempts": "1"
       }
     },
     {
       "Classification": "spark-log4j2",
       "Properties": {
         "logger.hudi.level": "INFO",
         "logger.hudi.name": "org.apache.hudi"
       }
     },
     {
       "Classification": "spark-metrics",
       "Properties": {
         "*.sink.prometheusServlet.class": 
"org.apache.spark.metrics.sink.PrometheusServlet",
         "*.sink.prometheusServlet.path": "/metrics/prometheus",
         "applications.sink.prometheusServlet.path": 
"/metrics/applications/prometheus",
         "driver.source.jvm.class": "org.apache.spark.metrics.source.JvmSource",
         "executor.source.jvm.class": 
"org.apache.spark.metrics.source.JvmSource",
         "master.sink.prometheusServlet.path": "/metrics/master/prometheus",
         "master.source.jvm.class": "org.apache.spark.metrics.source.JvmSource",
         "worker.source.jvm.class": "org.apache.spark.metrics.source.JvmSource"
       }
     },
     {
       "Classification": "capacity-scheduler",
       "Properties": {
      "yarn.scheduler.capacity.resource-calculator": "org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator"
       }
     },
     {
       "Classification": "yarn-site",
       "Properties": {
         
"yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":
 "99.0",
         "yarn.nodemanager.pmem-check-enabled": "false",
         "yarn.nodemanager.vmem-check-enabled": "false"
       }
     },
     {
       "Classification": "emrfs-site",
       "Properties": {
         "fs.s3.maxConnections": "1000"
       }
     },
     {
       "Classification": "hive-site",
       "Properties": {
         "hive.metastore.client.factory.class": 
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
       }
     },
     {
       "Classification": "hdfs-site",
       "Properties": {
         "dfs.replication": "2"
       }
     },
     {
       "Classification": "presto-connector-hive",
       "Properties": {
         "hive.metastore.glue.datacatalog.enabled": "true",
         "hive.parquet.use-column-names": "true"
       }
     },
     {
       "Classification": "spark-hive-site",
       "Properties": {
         "hive.metastore.client.factory.class": 
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory"
       }
     },
     {
       "Classification": "spark-env",
       "Configurations": [
         {
           "Classification": "export",
           "Properties": {
             "PYSPARK_PYTHON": "./data_platform_spark_jobs.pex"
           }
         }
       ],
       "Properties": {}
     },
     {
       "Classification": "hadoop-env",
       "Configurations": [
         {
           "Classification": "export",
           "Properties": {
             "HADOOP_DATANODE_OPTS": 
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7001:/etc/hadoop/conf/hdfs_jmx_config_datanode.yaml
 -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false 
-Dcom.sun.management.jmxremote.authenticate=false 
-Dcom.sun.management.jmxremote.port=50103",
             "HADOOP_NAMENODE_OPTS": 
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7001:/etc/hadoop/conf/hdfs_jmx_config_namenode.yaml
 -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false 
-Dcom.sun.management.jmxremote.authenticate=false 
-Dcom.sun.management.jmxremote.port=50103"
           }
         }
       ],
       "Properties": {}
     },
     {
       "Classification": "yarn-env",
       "Configurations": [
         {
           "Classification": "export",
           "Properties": {
             "YARN_NODEMANAGER_OPTS": 
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7005:/etc/hadoop/conf/yarn_jmx_config_node_manager.yaml
 -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false 
-Dcom.sun.management.jmxremote.authenticate=false 
-Dcom.sun.management.jmxremote.port=50111",
             "YARN_RESOURCEMANAGER_OPTS": 
"-javaagent:/etc/prometheus/jmx_prometheus_javaagent.jar=7005:/etc/hadoop/conf/yarn_jmx_config_resource_manager.yaml
 -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false 
-Dcom.sun.management.jmxremote.authenticate=false 
-Dcom.sun.management.jmxremote.port=50111"
           }
         }
       ],
       "Properties": {}
     },
     {
       "Classification": "hudi-defaults",
       "Properties": {
         "hoodie.archive.async": "true",
         "hoodie.bulkinsert.sort.mode": "GLOBAL_SORT",
         "hoodie.clean.async": "true",
         "hoodie.cleaner.commits.retained": "1",
         "hoodie.cleaner.policy.failed.writes": "LAZY",
         "hoodie.datasource.hive_sync.support_timestamp": "true",
         "hoodie.datasource.meta.sync.glue.metadata_file_listing": "true",
         "hoodie.enable.data.skipping": "true",
         "hoodie.filesystem.view.remote.retry.enable": "true",
         "hoodie.keep.max.commits": "15",
         "hoodie.keep.min.commits": "10",
         "hoodie.metadata.index.bloom.filter.enable": "true",
         "hoodie.metadata.index.column.stats.enable": "true",
         "hoodie.metrics.on": "true",
         "hoodie.metrics.reporter.type": "PROMETHEUS",
         "hoodie.parquet.compression.codec": "snappy",
         "hoodie.parquet.max.file.size": "536870912",
         "hoodie.parquet.small.file.limit": "429496729",
         "hoodie.write.concurrency.early.conflict.detection.enable": "true",
         "hoodie.write.concurrency.mode": "optimistic_concurrency_control",
         "hoodie.write.lock.dynamodb.billing_mode": "PAY_PER_REQUEST",
         "hoodie.write.lock.dynamodb.endpoint_url": 
"dynamodb.eu-west-1.amazonaws.com",
         "hoodie.write.lock.dynamodb.region": "eu-west-1",
         "hoodie.write.lock.dynamodb.table": "data-platform-hudi-locks",
         "hoodie.write.lock.provider": 
"org.apache.hudi.aws.transaction.lock.DynamoDBBasedLockProvider"
       }
     }
   ]
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to