This is an automated email from the ASF dual-hosted git repository. github-bot pushed a commit to branch cherry-pick-615568a1-to-branch-1.1 in repository https://gitbox.apache.org/repos/asf/gravitino.git
commit a0e43c1138eafe715705a594802193ee5be14366 Author: roryqi <[email protected]> AuthorDate: Tue Mar 3 22:04:09 2026 +0800 [#9650] improvement(statistics): Add `maxStatisticsPerUpdate` configuration for Lance partition storage (#10149) ### What changes were proposed in this pull request? - Add MAX_STATISTICS_PER_UPDATE configuration with default value of 100 - Validate total statistics count in updateStatistics method - Add test cases for exceeding limit and invalid configuration ### Why are the changes needed? Fix: #9650 ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added test cases. --- .../storage/LancePartitionStatisticStorage.java | 22 ++++++ .../TestLancePartitionStatisticStorage.java | 81 ++++++++++++++++++++++ docs/manage-statistics-in-gravitino.md | 21 +++--- 3 files changed, 114 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java b/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java index c6e2398b29..a2bfb0c720 100644 --- a/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java +++ b/core/src/main/java/org/apache/gravitino/stats/storage/LancePartitionStatisticStorage.java @@ -97,6 +97,8 @@ public class LancePartitionStatisticStorage implements PartitionStatisticStorage private static final long DEFAULT_METADATA_FILE_CACHE_SIZE = 100L * 1024; // 100KB private static final String INDEX_CACHE_SIZE = "indexCacheSizeBytes"; private static final long DEFAULT_INDEX_CACHE_SIZE = 100L * 1024; // 100KB + private static final String MAX_STATISTICS_PER_UPDATE = "maxStatisticsPerUpdate"; + private static final int DEFAULT_MAX_STATISTICS_PER_UPDATE = 100; // The schema is `table_id`, `partition_name`, `statistic_name`, `statistic_value`, `audit_info` private static final String TABLE_ID_COLUMN = "table_id"; private static final String PARTITION_NAME_COLUMN = "partition_name"; @@ -124,6 +126,7 @@ public class LancePartitionStatisticStorage implements PartitionStatisticStorage private final int readBatchSize; private final long metadataFileCacheSize; private final long indexCacheSize; + private final int maxStatisticsPerUpdate; private final ScheduledThreadPoolExecutor scheduler; private final EntityStore entityStore = GravitinoEnv.getInstance().entityStore(); @@ -177,6 +180,14 @@ public class LancePartitionStatisticStorage implements PartitionStatisticStorage indexCacheSize > 0, "Lance partition statistics storage indexCacheSizeBytes must be positive"); + this.maxStatisticsPerUpdate = + Integer.parseInt( + properties.getOrDefault( + MAX_STATISTICS_PER_UPDATE, String.valueOf(DEFAULT_MAX_STATISTICS_PER_UPDATE))); + Preconditions.checkArgument( + maxStatisticsPerUpdate > 0, + "Lance partition statistics storage maxStatisticsPerUpdate must be positive"); + this.properties = properties; if (datasetCacheSize != 0) { this.scheduler = @@ -234,6 +245,17 @@ public class LancePartitionStatisticStorage implements PartitionStatisticStorage @Override public void updateStatistics( String metalake, List<MetadataObjectStatisticsUpdate> statisticsToUpdate) throws IOException { + int totalStatisticsCount = + statisticsToUpdate.stream() + .flatMap(update -> update.partitionUpdates().stream()) + .mapToInt(partitionUpdate -> partitionUpdate.statistics().size()) + .sum(); + Preconditions.checkArgument( + totalStatisticsCount <= maxStatisticsPerUpdate, + "Total statistics count %s exceeds the maximum limit %s", + totalStatisticsCount, + maxStatisticsPerUpdate); + try { // TODO: The small updates and deletion may cause performance issues. The storage need to add // compaction operations. diff --git a/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java b/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java index bafe0b4b94..6f2527a24b 100644 --- a/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java +++ b/core/src/test/java/org/apache/gravitino/stats/storage/TestLancePartitionStatisticStorage.java @@ -378,6 +378,87 @@ public class TestLancePartitionStatisticStorage { storage.close(); } + @Test + public void testExceedMaxStatisticsPerUpdateLimit() throws Exception { + PartitionStatisticStorageFactory factory = new LancePartitionStatisticStorageFactory(); + + String metalakeName = "metalake"; + String catalogName = "catalog"; + String schemaName = "schema"; + String tableName = "table"; + + MetadataObject metadataObject = + MetadataObjects.of( + Lists.newArrayList(catalogName, schemaName, tableName), MetadataObject.Type.TABLE); + + EntityStore entityStore = mock(EntityStore.class); + TableEntity tableEntity = mock(TableEntity.class); + when(entityStore.get(any(), any(), any())).thenReturn(tableEntity); + when(tableEntity.id()).thenReturn(101L); + FieldUtils.writeField(GravitinoEnv.getInstance(), "entityStore", entityStore, true); + + String location = Files.createTempDirectory("lance_stats_exceed_test").toString(); + Map<String, String> properties = Maps.newHashMap(); + properties.put("location", location); + // Default limit is 100, generate 101 statistics to exceed the limit + + LancePartitionStatisticStorage storage = + (LancePartitionStatisticStorage) factory.create(properties); + + try { + // Generate 101 statistics which exceeds the default limit of 100 + int count = 101; + Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>> originData = + generateData(metadataObject, count, 1); + Map<MetadataObject, List<PartitionStatisticsUpdate>> statisticsToUpdate = + convertData(originData); + + List<MetadataObjectStatisticsUpdate> objectUpdates = Lists.newArrayList(); + for (Map.Entry<MetadataObject, List<PartitionStatisticsUpdate>> entry : + statisticsToUpdate.entrySet()) { + objectUpdates.add(MetadataObjectStatisticsUpdate.of(entry.getKey(), entry.getValue())); + } + + IllegalArgumentException exception = + Assertions.assertThrows( + IllegalArgumentException.class, + () -> storage.updateStatistics(metalakeName, objectUpdates)); + Assertions.assertTrue(exception.getMessage().contains("exceeds the maximum limit")); + } finally { + FileUtils.deleteDirectory(new File(location + "/" + tableEntity.id() + ".lance")); + storage.close(); + } + } + + @Test + public void testInvalidMaxStatisticsPerUpdateConfiguration() throws Exception { + String location = Files.createTempDirectory("lance_stats_invalid_config_test").toString(); + + try { + // Test that zero value throws an exception + Map<String, String> properties = Maps.newHashMap(); + properties.put("location", location); + properties.put("maxStatisticsPerUpdate", "0"); + + IllegalArgumentException exception = + Assertions.assertThrows( + IllegalArgumentException.class, () -> new LancePartitionStatisticStorage(properties)); + Assertions.assertTrue( + exception.getMessage().contains("maxStatisticsPerUpdate must be positive")); + + // Test that negative value throws an exception + properties.put("maxStatisticsPerUpdate", "-1"); + + exception = + Assertions.assertThrows( + IllegalArgumentException.class, () -> new LancePartitionStatisticStorage(properties)); + Assertions.assertTrue( + exception.getMessage().contains("maxStatisticsPerUpdate must be positive")); + } finally { + FileUtils.deleteDirectory(new File(location)); + } + } + private Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>> generateData( MetadataObject metadataObject, int count, int partitions) { Map<MetadataObject, Map<String, Map<String, StatisticValue<?>>>> statisticsToUpdate = diff --git a/docs/manage-statistics-in-gravitino.md b/docs/manage-statistics-in-gravitino.md index 139790fd39..401293c59e 100644 --- a/docs/manage-statistics-in-gravitino.md +++ b/docs/manage-statistics-in-gravitino.md @@ -245,16 +245,17 @@ For example, if you set an extra property `foo` to `bar` for Lance storage optio For Lance remote storage, you can refer to the document [here](https://lancedb.github.io/lance/usage/storage/). -| Configuration item | Description | Default value | Required | Since version | -|----------------------------------------------------------------------|--------------------------------------|--------------------------------------|----------|---------------| -| `gravitino.stats.partition.storageOption.location` | The location of Lance files | `${GRAVITINO_HOME}/data/lance` | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.maxRowsPerFile` | The maximum rows per file | `1000000` | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.maxBytesPerFile` | The maximum bytes per file | `104857600` | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.maxRowsPerGroup` | The maximum rows per group | `1000000` | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.readBatchSize` | The batch record number when reading | `10000` | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.datasetCacheSize` | size of dataset cache for Lance | `0`, It means we don't use the cache | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.metadataFileCacheSizeBytes` | The Lance's metadata file cache size | `102400` | No | 1.0.0 | -| `gravitino.stats.partition.storageOption.indexCacheSizeBytes` | The Lance's index cache size | `102400` | No | 1.0.0 | +| Configuration item | Description | Default value | Required | Since version | +|----------------------------------------------------------------------|------------------------------------------------------------|--------------------------------------|----------|---------------| +| `gravitino.stats.partition.storageOption.location` | The location of Lance files | `${GRAVITINO_HOME}/data/lance` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.maxRowsPerFile` | The maximum rows per file | `1000000` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.maxBytesPerFile` | The maximum bytes per file | `104857600` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.maxRowsPerGroup` | The maximum rows per group | `1000000` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.readBatchSize` | The batch record number when reading | `10000` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.datasetCacheSize` | size of dataset cache for Lance | `0`, It means we don't use the cache | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.metadataFileCacheSizeBytes` | The Lance's metadata file cache size | `102400` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.indexCacheSizeBytes` | The Lance's index cache size | `102400` | No | 1.0.0 | +| `gravitino.stats.partition.storageOption.maxStatisticsPerUpdate` | Maximum number of statistics allowed per update operation | `100` | No | 1.2.0 | If you have many tables with a small number of partitions, you should set a smaller metadataFileCacheSizeBytes and indexCacheSizeBytes.
