This is an automated email from the ASF dual-hosted git repository.

fanng pushed a commit to branch stats_job
in repository https://gitbox.apache.org/repos/asf/gravitino.git

commit 4f09b90f0d3760c1067f6267b5c90609c1036180
Author: fanng <[email protected]>
AuthorDate: Thu Mar 5 15:54:06 2026 +0800

    Adjust update-stats defaults to 128MB target and 32MB small-file threshold
    
    - Change builtin update-stats job default target_file_size_bytes to 134217728 (128MB)
    
    - Use a fixed small_files threshold of 33554432 (32MB) in table/partition stats SQL
    
    - Align submit-update-stats command default target size and related tests
---
 .../jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java       | 12 +++++++-----
 .../maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java | 13 +++++++++----
 .../optimizer/command/SubmitUpdateStatsJobCommand.java      |  2 +-
 .../gravitino/maintenance/optimizer/TestOptimizerCmd.java   |  2 +-
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git 
a/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java
 
b/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java
index 7f47273363..c6172acce5 100644
--- 
a/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java
+++ 
b/maintenance/jobs/src/main/java/org/apache/gravitino/maintenance/jobs/iceberg/IcebergUpdateStatsAndMetricsJob.java
@@ -63,7 +63,8 @@ public class IcebergUpdateStatsAndMetricsJob implements 
BuiltInJob {
   private static final String VERSION = "v1";
   private static final String DEFAULT_STATISTICS_UPDATER = 
"gravitino-statistics-updater";
   private static final String DEFAULT_METRICS_UPDATER = 
"gravitino-metrics-updater";
-  private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 100_000L;
+  private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 128L * 1024 * 
1024;
+  private static final long SMALL_FILE_THRESHOLD_BYTES = 32L * 1024 * 1024;
   private static final String DEFAULT_UPDATE_MODE = UpdateMode.ALL.modeName;
   private static final String CUSTOM_STAT_PREFIX = "custom-";
 
@@ -277,7 +278,7 @@ public class IcebergUpdateStatsAndMetricsJob implements 
BuiltInJob {
         + "SUM(CASE WHEN content = 1 THEN 1 ELSE 0 END) AS 
position_delete_files, "
         + "SUM(CASE WHEN content = 2 THEN 1 ELSE 0 END) AS 
equality_delete_files, "
         + "SUM(CASE WHEN file_size_in_bytes < "
-        + targetFileSizeBytes
+        + SMALL_FILE_THRESHOLD_BYTES
         + " THEN 1 ELSE 0 END) AS small_files, "
         + "AVG(POWER("
         + targetFileSizeBytes
@@ -300,7 +301,7 @@ public class IcebergUpdateStatsAndMetricsJob implements 
BuiltInJob {
         + "SUM(CASE WHEN content = 1 THEN 1 ELSE 0 END) AS 
position_delete_files, "
         + "SUM(CASE WHEN content = 2 THEN 1 ELSE 0 END) AS 
equality_delete_files, "
         + "SUM(CASE WHEN file_size_in_bytes < "
-        + targetFileSizeBytes
+        + SMALL_FILE_THRESHOLD_BYTES
         + " THEN 1 ELSE 0 END) AS small_files, "
         + "AVG(POWER("
         + targetFileSizeBytes
@@ -603,8 +604,9 @@ public class IcebergUpdateStatsAndMetricsJob implements 
BuiltInJob {
             + "\\n"
             + "Optional Options:\\n"
             + "  --update-mode <stats|metrics|all> Update behavior mode, 
default: all\\n"
-            + "  --target-file-size-bytes <bytes>   Small-file threshold and 
MSE target\\n"
-            + "                                     Default: 100000\\n"
+            + "  --target-file-size-bytes <bytes>   MSE target file size in 
bytes\\n"
+            + "                                     Default: 134217728 
(128MB)\\n"
+            + "                                     small_files threshold is 
fixed at 33554432 (32MB)\\n"
             + "  --updater-options <json>           JSON map for updater and 
repository settings\\n"
             + "                                     Example: '{\"gravitino_uri\":\"http://localhost:8090\",\\n"
             + "                                     
\"metalake\":\"test\",\"statistics_updater\":\"gravitino-statistics-updater\",\\n"
diff --git 
a/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java
 
b/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java
index 8b2bbb79e8..cbbc27fce5 100644
--- 
a/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java
+++ 
b/maintenance/jobs/src/test/java/org/apache/gravitino/maintenance/jobs/iceberg/TestIcebergUpdateStatsJob.java
@@ -87,21 +87,26 @@ public class TestIcebergUpdateStatsJob {
 
   @Test
   public void testBuildStatsSql() {
-    String tableSql = 
IcebergUpdateStatsAndMetricsJob.buildTableStatsSql("cat", "db.tbl", 100000L);
+    String tableSql =
+        IcebergUpdateStatsAndMetricsJob.buildTableStatsSql("cat", "db.tbl", 
134_217_728L);
     String partitionSql =
-        IcebergUpdateStatsAndMetricsJob.buildPartitionStatsSql("cat", 
"db.tbl", 100000L);
+        IcebergUpdateStatsAndMetricsJob.buildPartitionStatsSql("cat", 
"db.tbl", 134_217_728L);
 
     assertTrue(tableSql.contains("FROM cat.db.tbl.files"));
     assertTrue(tableSql.contains("AS datafile_mse"));
+    assertTrue(tableSql.contains("file_size_in_bytes < 33554432"));
+    assertTrue(tableSql.contains("134217728 - LEAST(134217728, 
file_size_in_bytes)"));
     assertTrue(partitionSql.contains("FROM cat.db.tbl.files"));
     assertTrue(partitionSql.contains("GROUP BY partition"));
     assertTrue(partitionSql.startsWith("SELECT partition"));
+    assertTrue(partitionSql.contains("file_size_in_bytes < 33554432"));
+    assertTrue(partitionSql.contains("134217728 - LEAST(134217728, 
file_size_in_bytes)"));
   }
 
   @Test
   public void testParseTargetFileSize() {
-    assertEquals(100000L, 
IcebergUpdateStatsAndMetricsJob.parseTargetFileSize(null));
-    assertEquals(100000L, 
IcebergUpdateStatsAndMetricsJob.parseTargetFileSize(""));
+    assertEquals(134_217_728L, 
IcebergUpdateStatsAndMetricsJob.parseTargetFileSize(null));
+    assertEquals(134_217_728L, 
IcebergUpdateStatsAndMetricsJob.parseTargetFileSize(""));
     assertEquals(2048L, 
IcebergUpdateStatsAndMetricsJob.parseTargetFileSize("2048"));
     assertThrows(
         IllegalArgumentException.class,
diff --git 
a/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java
 
b/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java
index 0f7d6e5cca..dd0bc3cc8f 100644
--- 
a/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java
+++ 
b/maintenance/optimizer/src/main/java/org/apache/gravitino/maintenance/optimizer/command/SubmitUpdateStatsJobCommand.java
@@ -42,7 +42,7 @@ public class SubmitUpdateStatsJobCommand implements 
OptimizerCommandExecutor {
 
   private static final String JOB_TEMPLATE_NAME = 
"builtin-iceberg-update-stats";
   private static final String DEFAULT_UPDATE_MODE = "stats";
-  private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 100_000L;
+  private static final long DEFAULT_TARGET_FILE_SIZE_BYTES = 128L * 1024 * 
1024;
   private static final String OPTION_UPDATER_OPTIONS = "updater-options";
   private static final String OPTION_SPARK_CONF = "spark-conf";
 
diff --git 
a/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java
 
b/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java
index b18d73be1f..b450ad9916 100644
--- 
a/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java
+++ 
b/maintenance/optimizer/src/test/java/org/apache/gravitino/maintenance/optimizer/TestOptimizerCmd.java
@@ -526,7 +526,7 @@ class TestOptimizerCmd {
                 "gravitino.optimizer.gravitinoMetalake = test",
                 "gravitino.optimizer.gravitinoDefaultCatalog = rest",
                 "gravitino.optimizer.jobSubmitterConfig.update_mode = stats",
-                "gravitino.optimizer.jobSubmitterConfig.target_file_size_bytes 
= 100000",
+                "gravitino.optimizer.jobSubmitterConfig.target_file_size_bytes 
= 134217728",
                 "gravitino.optimizer.jobSubmitterConfig.updater_options = "
                     + "{\"gravitino_uri\":\"http://localhost:8090\",\"metalake\":\"test\"}",
                 "gravitino.optimizer.jobSubmitterConfig.spark_conf = "

Reply via email to