This is an automated email from the ASF dual-hosted git repository.

jshao pushed a commit to branch branch-1.1
in repository https://gitbox.apache.org/repos/asf/gravitino.git


The following commit(s) were added to refs/heads/branch-1.1 by this push:
     new 8f80781bbe [Cherry-pick to branch-1.1] [#9650] 
improvement(statistics): Add `maxStatisticsPerUpdate` configuration for Lance 
partition storage (#10149) (#10160)
8f80781bbe is described below

commit 8f80781bbe87ab85a34490c6d774c550783c7eab
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Mar 4 15:56:42 2026 +0800

    [Cherry-pick to branch-1.1] [#9650] improvement(statistics): Add 
`maxStatisticsPerUpdate` configuration for Lance partition storage (#10149) 
(#10160)
    
    **Cherry-pick Information:**
    - Original commit: 615568a1637c36cab67eac15d1a94e12767cd202
    - Target branch: `branch-1.1`
    - Status: ✅ Clean cherry-pick (no conflicts)
    
    Co-authored-by: roryqi <[email protected]>
---
 .../storage/LancePartitionStatisticStorage.java    | 22 ++++++
 .../TestLancePartitionStatisticStorage.java        | 81 ++++++++++++++++++++++
 docs/manage-statistics-in-gravitino.md             | 21 +++---
 3 files changed, 114 insertions(+), 10 deletions(-)

diff --git 
a/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
 
b/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
index c6e2398b29..a2bfb0c720 100644
--- 
a/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
+++ 
b/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
@@ -97,6 +97,8 @@ public class LancePartitionStatisticStorage implements 
PartitionStatisticStorage
   private static final long DEFAULT_METADATA_FILE_CACHE_SIZE = 100L * 1024; // 
100KB
   private static final String INDEX_CACHE_SIZE = "indexCacheSizeBytes";
   private static final long DEFAULT_INDEX_CACHE_SIZE = 100L * 1024; // 100KB
+  private static final String MAX_STATISTICS_PER_UPDATE = 
"maxStatisticsPerUpdate";
+  private static final int DEFAULT_MAX_STATISTICS_PER_UPDATE = 100;
   // The schema is `table_id`, `partition_name`,  `statistic_name`, 
`statistic_value`, `audit_info`
   private static final String TABLE_ID_COLUMN = "table_id";
   private static final String PARTITION_NAME_COLUMN = "partition_name";
@@ -124,6 +126,7 @@ public class LancePartitionStatisticStorage implements 
PartitionStatisticStorage
   private final int readBatchSize;
   private final long metadataFileCacheSize;
   private final long indexCacheSize;
+  private final int maxStatisticsPerUpdate;
   private final ScheduledThreadPoolExecutor scheduler;
 
   private final EntityStore entityStore = 
GravitinoEnv.getInstance().entityStore();
@@ -177,6 +180,14 @@ public class LancePartitionStatisticStorage implements 
PartitionStatisticStorage
         indexCacheSize > 0,
         "Lance partition statistics storage indexCacheSizeBytes must be 
positive");
 
+    this.maxStatisticsPerUpdate =
+        Integer.parseInt(
+            properties.getOrDefault(
+                MAX_STATISTICS_PER_UPDATE, 
String.valueOf(DEFAULT_MAX_STATISTICS_PER_UPDATE)));
+    Preconditions.checkArgument(
+        maxStatisticsPerUpdate > 0,
+        "Lance partition statistics storage maxStatisticsPerUpdate must be 
positive");
+
     this.properties = properties;
     if (datasetCacheSize != 0) {
       this.scheduler =
@@ -234,6 +245,17 @@ public class LancePartitionStatisticStorage implements 
PartitionStatisticStorage
   @Override
   public void updateStatistics(
       String metalake, List<MetadataObjectStatisticsUpdate> 
statisticsToUpdate) throws IOException {
+    int totalStatisticsCount =
+        statisticsToUpdate.stream()
+            .flatMap(update -> update.partitionUpdates().stream())
+            .mapToInt(partitionUpdate -> partitionUpdate.statistics().size())
+            .sum();
+    Preconditions.checkArgument(
+        totalStatisticsCount <= maxStatisticsPerUpdate,
+        "Total statistics count %s exceeds the maximum limit %s",
+        totalStatisticsCount,
+        maxStatisticsPerUpdate);
+
     try {
       //  TODO: The small updates and deletion may cause performance issues. 
The storage need to add
       // compaction operations.
diff --git 
a/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
 
b/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
index bafe0b4b94..6f2527a24b 100644
--- 
a/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
+++ 
b/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
@@ -378,6 +378,87 @@ public class TestLancePartitionStatisticStorage {
     storage.close();
   }
 
+  @Test
+  public void testExceedMaxStatisticsPerUpdateLimit() throws Exception {
+    PartitionStatisticStorageFactory factory = new 
LancePartitionStatisticStorageFactory();
+
+    String metalakeName = "metalake";
+    String catalogName = "catalog";
+    String schemaName = "schema";
+    String tableName = "table";
+
+    MetadataObject metadataObject =
+        MetadataObjects.of(
+            Lists.newArrayList(catalogName, schemaName, tableName), 
MetadataObject.Type.TABLE);
+
+    EntityStore entityStore = mock(EntityStore.class);
+    TableEntity tableEntity = mock(TableEntity.class);
+    when(entityStore.get(any(), any(), any())).thenReturn(tableEntity);
+    when(tableEntity.id()).thenReturn(101L);
+    FieldUtils.writeField(GravitinoEnv.getInstance(), "entityStore", 
entityStore, true);
+
+    String location = 
Files.createTempDirectory("lance_stats_exceed_test").toString();
+    Map<String, String> properties = Maps.newHashMap();
+    properties.put("location", location);
+    // Default limit is 100, generate 101 statistics to exceed the limit
+
+    LancePartitionStatisticStorage storage =
+        (LancePartitionStatisticStorage) factory.create(properties);
+
+    try {
+      // Generate 101 statistics which exceeds the default limit of 100
+      int count = 101;
+      Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>> 
originData =
+          generateData(metadataObject, count, 1);
+      Map<MetadataObject, List<PartitionStatisticsUpdate>> statisticsToUpdate =
+          convertData(originData);
+
+      List<MetadataObjectStatisticsUpdate> objectUpdates = 
Lists.newArrayList();
+      for (Map.Entry<MetadataObject, List<PartitionStatisticsUpdate>> entry :
+          statisticsToUpdate.entrySet()) {
+        objectUpdates.add(MetadataObjectStatisticsUpdate.of(entry.getKey(), 
entry.getValue()));
+      }
+
+      IllegalArgumentException exception =
+          Assertions.assertThrows(
+              IllegalArgumentException.class,
+              () -> storage.updateStatistics(metalakeName, objectUpdates));
+      Assertions.assertTrue(exception.getMessage().contains("exceeds the 
maximum limit"));
+    } finally {
+      FileUtils.deleteDirectory(new File(location + "/" + tableEntity.id() + 
".lance"));
+      storage.close();
+    }
+  }
+
+  @Test
+  public void testInvalidMaxStatisticsPerUpdateConfiguration() throws 
Exception {
+    String location = 
Files.createTempDirectory("lance_stats_invalid_config_test").toString();
+
+    try {
+      // Test that zero value throws an exception
+      Map<String, String> properties = Maps.newHashMap();
+      properties.put("location", location);
+      properties.put("maxStatisticsPerUpdate", "0");
+
+      IllegalArgumentException exception =
+          Assertions.assertThrows(
+              IllegalArgumentException.class, () -> new 
LancePartitionStatisticStorage(properties));
+      Assertions.assertTrue(
+          exception.getMessage().contains("maxStatisticsPerUpdate must be 
positive"));
+
+      // Test that negative value throws an exception
+      properties.put("maxStatisticsPerUpdate", "-1");
+
+      exception =
+          Assertions.assertThrows(
+              IllegalArgumentException.class, () -> new 
LancePartitionStatisticStorage(properties));
+      Assertions.assertTrue(
+          exception.getMessage().contains("maxStatisticsPerUpdate must be 
positive"));
+    } finally {
+      FileUtils.deleteDirectory(new File(location));
+    }
+  }
+
   private Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>> 
generateData(
       MetadataObject metadataObject, int count, int partitions) {
     Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>> 
statisticsToUpdate =
diff --git a/docs/manage-statistics-in-gravitino.md 
b/docs/manage-statistics-in-gravitino.md
index 139790fd39..401293c59e 100644
--- a/docs/manage-statistics-in-gravitino.md
+++ b/docs/manage-statistics-in-gravitino.md
@@ -245,16 +245,17 @@ For example, if you set an extra property `foo` to `bar` 
for Lance storage optio
 For Lance remote storage, you can refer to the document 
[here](https://lancedb.github.io/lance/usage/storage/).
 
 
-| Configuration item                                                   | 
Description                          | Default value                        | 
Required | Since version |
-|----------------------------------------------------------------------|--------------------------------------|--------------------------------------|----------|---------------|
-| `gravitino.stats.partition.storageOption.location`                   | The 
location of Lance files          | `${GRAVITINO_HOME}/data/lance`       | No    
   | 1.0.0         |
-| `gravitino.stats.partition.storageOption.maxRowsPerFile`             | The 
maximum rows per file            | `1000000`                            | No    
   | 1.0.0         |
-| `gravitino.stats.partition.storageOption.maxBytesPerFile`            | The 
maximum bytes per file           | `104857600`                          | No    
   | 1.0.0         |
-| `gravitino.stats.partition.storageOption.maxRowsPerGroup`            | The 
maximum rows per group           | `1000000`                            | No    
   | 1.0.0         |
-| `gravitino.stats.partition.storageOption.readBatchSize`              | The 
batch record number when reading | `10000`                              | No    
   | 1.0.0         |
-| `gravitino.stats.partition.storageOption.datasetCacheSize`           | size 
of dataset cache for Lance      | `0`, It means we don't use the cache | No     
  | 1.0.0         |
-| `gravitino.stats.partition.storageOption.metadataFileCacheSizeBytes` | The 
Lance's metadata file cache size | `102400`                             | No    
   | 1.0.0         |
-| `gravitino.stats.partition.storageOption.indexCacheSizeBytes`        | The 
Lance's index cache size         | `102400`                             | No    
   | 1.0.0         |
+| Configuration item                                                   | 
Description                                                | Default value      
                  | Required | Since version |
+|----------------------------------------------------------------------|------------------------------------------------------------|--------------------------------------|----------|---------------|
+| `gravitino.stats.partition.storageOption.location`                   | The 
location of Lance files                                | 
`${GRAVITINO_HOME}/data/lance`       | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.maxRowsPerFile`             | The 
maximum rows per file                                  | `1000000`              
              | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.maxBytesPerFile`            | The 
maximum bytes per file                                 | `104857600`            
              | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.maxRowsPerGroup`            | The 
maximum rows per group                                 | `1000000`              
              | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.readBatchSize`              | The 
batch record number when reading                       | `10000`                
              | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.datasetCacheSize`           | size 
of dataset cache for Lance                            | `0`, It means we don't 
use the cache | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.metadataFileCacheSizeBytes` | The 
Lance's metadata file cache size                       | `102400`               
              | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.indexCacheSizeBytes`        | The 
Lance's index cache size                               | `102400`               
              | No       | 1.0.0         |
+| `gravitino.stats.partition.storageOption.maxStatisticsPerUpdate`     | 
Maximum number of statistics allowed per update operation  | `100`              
                  | No       | 1.2.0         |
 
 If you have many tables with a small number of partitions, you should set a 
smaller metadataFileCacheSizeBytes and indexCacheSizeBytes.
 

Reply via email to