This is an automated email from the ASF dual-hosted git repository. fanng pushed a commit to branch stats_job in repository https://gitbox.apache.org/repos/asf/gravitino.git
commit 4f09b90f0d3760c1067f6267b5c90609c1036180 Author: fanng <[email protected]> AuthorDate: Thu Mar 5 15:54:06 2026 +0800 Adjust update-stats defaults to 128MB target and 32MB small-file threshold - Change builtin update-stats job default target_file_size_bytes to 134217728 (128MB) - Use a fixed small_files threshold of 33554432 (32MB) in table/partition stats SQL - Align submit-update-stats command default target size and related tests --- .../jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java | 12 +++++++----- .../maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java | 13 +++++++++---- .../optimizer/command/SubmitUpdateStatsJobCommand.java | 2 +- .../gravitino/maintenance/optimizer/TestOptimizerCmd.java | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java b/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java index 7f47273363..c6172acce5 100644 --- a/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java +++ b/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java @@ -63,7 +63,8 @@ public class IcebergUpdateStatsAndMetricsJob implements BuiltInJob { private static final String VERSION = "v1"; private static final String DEFAULT_STATISTICS_UPDATER = "gravitino-statistics-updater"; private static final String DEFAULT_METRICS_UPDATER = "gravitino-metrics-updater"; - private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 100_000L; + private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 128L * 1024 * 1024; + private static final long SMALL_FILE_THRESHOLD_BYTES = 32L * 1024 * 1024; private static final String DEFAULT_UPDATE_MODE = UpdateMode.ALL.modeName; private static final String CUSTOM_STAT_PREFIX = "custom-"; @@ -277,7 +278,7 @@ public class IcebergUpdateStatsAndMetricsJob implements BuiltInJob { + "SUM(CASE WHEN content = 1 THEN 1 ELSE 0 END) AS position_delete_files, " + "SUM(CASE WHEN content = 2 THEN 1 ELSE 0 END) AS equality_delete_files, " + "SUM(CASE WHEN file_size_in_bytes < " - + targetFileSizeBytes + + SMALL_FILE_THRESHOLD_BYTES + " THEN 1 ELSE 0 END) AS small_files, " + "AVG(POWER(" + targetFileSizeBytes @@ -300,7 +301,7 @@ public class IcebergUpdateStatsAndMetricsJob implements BuiltInJob { + "SUM(CASE WHEN content = 1 THEN 1 ELSE 0 END) AS position_delete_files, " + "SUM(CASE WHEN content = 2 THEN 1 ELSE 0 END) AS equality_delete_files, " + "SUM(CASE WHEN file_size_in_bytes < " - + targetFileSizeBytes + + SMALL_FILE_THRESHOLD_BYTES + " THEN 1 ELSE 0 END) AS small_files, " + "AVG(POWER(" + targetFileSizeBytes @@ -603,8 +604,9 @@ public class IcebergUpdateStatsAndMetricsJob implements BuiltInJob { + "\\n" + "Optional Options:\\n" + " --update-mode <stats|metrics|all> Update behavior mode, default: all\\n" - + " --target-file-size-bytes <bytes> Small-file threshold and MSE target\\n" - + " Default: 100000\\n" + + " --target-file-size-bytes <bytes> MSE target file size in bytes\\n" + + " Default: 134217728 (128MB)\\n" + + " small_files threshold is fixed at 33554432 (32MB)\\n" + " --updater-options <json> JSON map for updater and repository settings\\n" + " Example: '{\"gravitino_uri\":\"http://localhost:8090\",\\n" + " \"metalake\":\"test\",\"statistics_updater\":\"gravitino-statistics-updater\",\\n" diff --git a/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java b/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java index 8b2bbb79e8..cbbc27fce5 100644 --- a/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java +++ b/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java @@ -87,21 +87,26 @@ public class TestIcebergUpdateStatsJob { @Test public void testBuildStatsSql() { - String tableSql = IcebergUpdateStatsAndMetricsJob.buildTableStatsSql("cat", "db.tbl", 100000L); + String tableSql = + IcebergUpdateStatsAndMetricsJob.buildTableStatsSql("cat", "db.tbl", 134_217_728L); String partitionSql = - IcebergUpdateStatsAndMetricsJob.buildPartitionStatsSql("cat", "db.tbl", 100000L); + IcebergUpdateStatsAndMetricsJob.buildPartitionStatsSql("cat", "db.tbl", 134_217_728L); assertTrue(tableSql.contains("FROM cat.db.tbl.files")); assertTrue(tableSql.contains("AS datafile_mse")); + assertTrue(tableSql.contains("file_size_in_bytes < 33554432")); + assertTrue(tableSql.contains("134217728 - LEAST(134217728, file_size_in_bytes)")); assertTrue(partitionSql.contains("FROM cat.db.tbl.files")); assertTrue(partitionSql.contains("GROUP BY partition")); assertTrue(partitionSql.startsWith("SELECT partition")); + assertTrue(partitionSql.contains("file_size_in_bytes < 33554432")); + assertTrue(partitionSql.contains("134217728 - LEAST(134217728, file_size_in_bytes)")); } @Test public void testParseTargetFileSize() { - assertEquals(100000L, IcebergUpdateStatsAndMetricsJob.parseTargetFileSize(null)); - assertEquals(100000L, IcebergUpdateStatsAndMetricsJob.parseTargetFileSize("")); + assertEquals(134_217_728L, IcebergUpdateStatsAndMetricsJob.parseTargetFileSize(null)); + assertEquals(134_217_728L, IcebergUpdateStatsAndMetricsJob.parseTargetFileSize("")); assertEquals(2048L, IcebergUpdateStatsAndMetricsJob.parseTargetFileSize("2048")); assertThrows( IllegalArgumentException.class, diff --git a/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java b/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java index 0f7d6e5cca..dd0bc3cc8f 100644 --- a/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java +++ b/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java @@ -42,7 +42,7 @@ public class SubmitUpdateStatsJobCommand implements OptimizerCommandExecutor { private static final String JOB_TEMPLATE_NAME = "builtin-iceberg-update-stats"; private static final String DEFAULT_UPDATE_MODE = "stats"; - private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 100_000L; + private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 128L * 1024 * 1024; private static final String OPTION_UPDATER_OPTIONS = "updater-options"; private static final String OPTION_SPARK_CONF = "spark-conf"; diff --git a/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java b/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java index b18d73be1f..b450ad9916 100644 --- a/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java +++ b/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java @@ -526,7 +526,7 @@ class TestOptimizerCmd { "gravitino.optimizer.gravitinoMetalake = test", "gravitino.optimizer.gravitinoDefaultCatalog = rest", "gravitino.optimizer.jobSubmitterConfig.update_mode = stats", - "gravitino.optimizer.jobSubmitterConfig.target_file_size_bytes = 100000", + "gravitino.optimizer.jobSubmitterConfig.target_file_size_bytes = 134217728", "gravitino.optimizer.jobSubmitterConfig.updater_options = " + "{\"gravitino_uri\":\"http://localhost:8090\",\"metalake\":\"test\"}", "gravitino.optimizer.jobSubmitterConfig.spark_conf = "
