This is an automated email from the ASF dual-hosted git repository.
jshao pushed a commit to branch branch-1.1
in repository https://gitbox.apache.org/repos/asf/gravitino.git
The following commit(s) were added to refs/heads/branch-1.1 by this push:
new 8f80781bbe [Cherry-pick to branch-1.1] [#9650]
improvement(statistics): Add `maxStatisticsPerUpdate` configuration for Lance
partition storage (#10149) (#10160)
8f80781bbe is described below
commit 8f80781bbe87ab85a34490c6d774c550783c7eab
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Mar 4 15:56:42 2026 +0800
[Cherry-pick to branch-1.1] [#9650] improvement(statistics): Add
`maxStatisticsPerUpdate` configuration for Lance partition storage (#10149)
(#10160)
**Cherry-pick Information:**
- Original commit: 615568a1637c36cab67eac15d1a94e12767cd202
- Target branch: `branch-1.1`
- Status: ✅ Clean cherry-pick (no conflicts)
Co-authored-by: roryqi <[email protected]>
---
.../storage/LancePartitionStatisticStorage.java | 22 ++++++
.../TestLancePartitionStatisticStorage.java | 81 ++++++++++++++++++++++
docs/manage-statistics-in-gravitino.md | 21 +++---
3 files changed, 114 insertions(+), 10 deletions(-)
diff --git
a/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
b/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
index c6e2398b29..a2bfb0c720 100644
---
a/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
+++
b/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java
@@ -97,6 +97,8 @@ public class LancePartitionStatisticStorage implements
PartitionStatisticStorage
private static final long DEFAULT_METADATA_FILE_CACHE_SIZE = 100L * 1024; // 100KB
private static final String INDEX_CACHE_SIZE = "indexCacheSizeBytes";
private static final long DEFAULT_INDEX_CACHE_SIZE = 100L * 1024; // 100KB
+ private static final String MAX_STATISTICS_PER_UPDATE =
"maxStatisticsPerUpdate";
+ private static final int DEFAULT_MAX_STATISTICS_PER_UPDATE = 100;
// The schema is `table_id`, `partition_name`, `statistic_name`, `statistic_value`, `audit_info`
private static final String TABLE_ID_COLUMN = "table_id";
private static final String PARTITION_NAME_COLUMN = "partition_name";
@@ -124,6 +126,7 @@ public class LancePartitionStatisticStorage implements
PartitionStatisticStorage
private final int readBatchSize;
private final long metadataFileCacheSize;
private final long indexCacheSize;
+ private final int maxStatisticsPerUpdate;
private final ScheduledThreadPoolExecutor scheduler;
private final EntityStore entityStore =
GravitinoEnv.getInstance().entityStore();
@@ -177,6 +180,14 @@ public class LancePartitionStatisticStorage implements
PartitionStatisticStorage
indexCacheSize > 0,
"Lance partition statistics storage indexCacheSizeBytes must be positive");
+ this.maxStatisticsPerUpdate =
+ Integer.parseInt(
+ properties.getOrDefault(
+ MAX_STATISTICS_PER_UPDATE,
String.valueOf(DEFAULT_MAX_STATISTICS_PER_UPDATE)));
+ Preconditions.checkArgument(
+ maxStatisticsPerUpdate > 0,
+ "Lance partition statistics storage maxStatisticsPerUpdate must be positive");
+
this.properties = properties;
if (datasetCacheSize != 0) {
this.scheduler =
@@ -234,6 +245,17 @@ public class LancePartitionStatisticStorage implements
PartitionStatisticStorage
@Override
public void updateStatistics(
String metalake, List<MetadataObjectStatisticsUpdate>
statisticsToUpdate) throws IOException {
+ int totalStatisticsCount =
+ statisticsToUpdate.stream()
+ .flatMap(update -> update.partitionUpdates().stream())
+ .mapToInt(partitionUpdate -> partitionUpdate.statistics().size())
+ .sum();
+ Preconditions.checkArgument(
+ totalStatisticsCount <= maxStatisticsPerUpdate,
+ "Total statistics count %s exceeds the maximum limit %s",
+ totalStatisticsCount,
+ maxStatisticsPerUpdate);
+
try {
// TODO: The small updates and deletion may cause performance issues. The storage need to add
// compaction operations.
diff --git
a/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
b/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
index bafe0b4b94..6f2527a24b 100644
---
a/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
+++
b/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java
@@ -378,6 +378,87 @@ public class TestLancePartitionStatisticStorage {
storage.close();
}
+ @Test
+ public void testExceedMaxStatisticsPerUpdateLimit() throws Exception {
+ PartitionStatisticStorageFactory factory = new
LancePartitionStatisticStorageFactory();
+
+ String metalakeName = "metalake";
+ String catalogName = "catalog";
+ String schemaName = "schema";
+ String tableName = "table";
+
+ MetadataObject metadataObject =
+ MetadataObjects.of(
+ Lists.newArrayList(catalogName, schemaName, tableName),
MetadataObject.Type.TABLE);
+
+ EntityStore entityStore = mock(EntityStore.class);
+ TableEntity tableEntity = mock(TableEntity.class);
+ when(entityStore.get(any(), any(), any())).thenReturn(tableEntity);
+ when(tableEntity.id()).thenReturn(101L);
+ FieldUtils.writeField(GravitinoEnv.getInstance(), "entityStore",
entityStore, true);
+
+ String location =
Files.createTempDirectory("lance_stats_exceed_test").toString();
+ Map<String, String> properties = Maps.newHashMap();
+ properties.put("location", location);
+ // Default limit is 100, generate 101 statistics to exceed the limit
+
+ LancePartitionStatisticStorage storage =
+ (LancePartitionStatisticStorage) factory.create(properties);
+
+ try {
+ // Generate 101 statistics which exceeds the default limit of 100
+ int count = 101;
+ Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>>
originData =
+ generateData(metadataObject, count, 1);
+ Map<MetadataObject, List<PartitionStatisticsUpdate>> statisticsToUpdate =
+ convertData(originData);
+
+ List<MetadataObjectStatisticsUpdate> objectUpdates =
Lists.newArrayList();
+ for (Map.Entry<MetadataObject, List<PartitionStatisticsUpdate>> entry :
+ statisticsToUpdate.entrySet()) {
+ objectUpdates.add(MetadataObjectStatisticsUpdate.of(entry.getKey(),
entry.getValue()));
+ }
+
+ IllegalArgumentException exception =
+ Assertions.assertThrows(
+ IllegalArgumentException.class,
+ () -> storage.updateStatistics(metalakeName, objectUpdates));
+ Assertions.assertTrue(exception.getMessage().contains("exceeds the maximum limit"));
+ } finally {
+ FileUtils.deleteDirectory(new File(location + "/" + tableEntity.id() +
".lance"));
+ storage.close();
+ }
+ }
+
+ @Test
+ public void testInvalidMaxStatisticsPerUpdateConfiguration() throws
Exception {
+ String location =
Files.createTempDirectory("lance_stats_invalid_config_test").toString();
+
+ try {
+ // Test that zero value throws an exception
+ Map<String, String> properties = Maps.newHashMap();
+ properties.put("location", location);
+ properties.put("maxStatisticsPerUpdate", "0");
+
+ IllegalArgumentException exception =
+ Assertions.assertThrows(
+ IllegalArgumentException.class, () -> new
LancePartitionStatisticStorage(properties));
+ Assertions.assertTrue(
+ exception.getMessage().contains("maxStatisticsPerUpdate must be positive"));
+
+ // Test that negative value throws an exception
+ properties.put("maxStatisticsPerUpdate", "-1");
+
+ exception =
+ Assertions.assertThrows(
+ IllegalArgumentException.class, () -> new
LancePartitionStatisticStorage(properties));
+ Assertions.assertTrue(
+ exception.getMessage().contains("maxStatisticsPerUpdate must be positive"));
+ } finally {
+ FileUtils.deleteDirectory(new File(location));
+ }
+ }
+
private Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>>
generateData(
MetadataObject metadataObject, int count, int partitions) {
Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>>
statisticsToUpdate =
diff --git a/docs/manage-statistics-in-gravitino.md
b/docs/manage-statistics-in-gravitino.md
index 139790fd39..401293c59e 100644
--- a/docs/manage-statistics-in-gravitino.md
+++ b/docs/manage-statistics-in-gravitino.md
@@ -245,16 +245,17 @@ For example, if you set an extra property `foo` to `bar`
for Lance storage optio
For Lance remote storage, you can refer to the document
[here](https://lancedb.github.io/lance/usage/storage/).
-| Configuration item | Description | Default value | Required | Since version |
-|----------------------------------------------------------------------|--------------------------------------|--------------------------------------|----------|---------------|
-| `gravitino.stats.partition.storageOption.location` | The location of Lance files | `${GRAVITINO_HOME}/data/lance` | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.maxRowsPerFile` | The maximum rows per file | `1000000` | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.maxBytesPerFile` | The maximum bytes per file | `104857600` | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.maxRowsPerGroup` | The maximum rows per group | `1000000` | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.readBatchSize` | The batch record number when reading | `10000` | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.datasetCacheSize` | size of dataset cache for Lance | `0`, It means we don't use the cache | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.metadataFileCacheSizeBytes` | The Lance's metadata file cache size | `102400` | No | 1.0.0 |
-| `gravitino.stats.partition.storageOption.indexCacheSizeBytes` | The Lance's index cache size | `102400` | No | 1.0.0 |
+| Configuration item | Description | Default value | Required | Since version |
+|----------------------------------------------------------------------|------------------------------------------------------------|--------------------------------------|----------|---------------|
+| `gravitino.stats.partition.storageOption.location` | The location of Lance files | `${GRAVITINO_HOME}/data/lance` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.maxRowsPerFile` | The maximum rows per file | `1000000` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.maxBytesPerFile` | The maximum bytes per file | `104857600` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.maxRowsPerGroup` | The maximum rows per group | `1000000` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.readBatchSize` | The batch record number when reading | `10000` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.datasetCacheSize` | size of dataset cache for Lance | `0`, It means we don't use the cache | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.metadataFileCacheSizeBytes` | The Lance's metadata file cache size | `102400` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.indexCacheSizeBytes` | The Lance's index cache size | `102400` | No | 1.0.0 |
+| `gravitino.stats.partition.storageOption.maxStatisticsPerUpdate` | Maximum number of statistics allowed per update operation | `100` | No | 1.2.0 |
If you have many tables with a small number of partitions, you should set a
smaller metadataFileCacheSizeBytes and indexCacheSizeBytes.