This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new d2ce7ae5d3 [python] Fix local-cache silently bypassed in REST catalog
(#7981)
d2ce7ae5d3 is described below
commit d2ce7ae5d3c853ed523f87fab0244aed531c1896
Author: XiaoHongbo <[email protected]>
AuthorDate: Wed May 27 09:09:41 2026 +0800
[python] Fix local-cache silently bypassed in REST catalog (#7981)
Two issues prevent the local cache from working safely in REST catalog
mode:
1. With `data-token.enabled=true` (DLF mode), setting
`local-cache.enabled=true` had no effect: `RESTTokenFileIO` was returned
without the caching wrapper, so reads silently went uncached.
2. When `local-cache.max-size` was unset, the size limit fell back to
2^63-1 bytes, so the cache could grow without bound, risking a full disk
(disk cache) or OOM (memory cache).
This PR fixes them by:
- Routing `RESTTokenFileIO` through
`CachingFileIO.wrap_with_caching_if_needed`.
- Defaulting `local-cache.max-size` to 256 MB (memory) / 10 GB (disk)
when unset.
---
.../pypaimon/catalog/rest/rest_catalog.py | 12 +++--
.../pypaimon/filesystem/caching_file_io.py | 9 +++-
.../pypaimon/tests/caching_file_io_test.py | 55 ++++++++++++++++++++++
.../pypaimon/tests/rest/test_fuse_local_path.py | 1 +
4 files changed, 71 insertions(+), 6 deletions(-)
diff --git a/paimon-python/pypaimon/catalog/rest/rest_catalog.py
b/paimon-python/pypaimon/catalog/rest/rest_catalog.py
index fe1fab1b63..d6e89d50b9 100644
--- a/paimon-python/pypaimon/catalog/rest/rest_catalog.py
+++ b/paimon-python/pypaimon/catalog/rest/rest_catalog.py
@@ -646,16 +646,18 @@ class RESTCatalog(Catalog):
Get FileIO for data access, supporting FUSE local path mapping.
"""
if self._fuse_resolver is not None:
- return self._fuse_resolver.get_file_io(
+ file_io = self._fuse_resolver.get_file_io(
table_path, identifier, self.data_token_enabled,
rest_token_file_io_factory=lambda: RESTTokenFileIO(
identifier, table_path, self.context.options),
default_file_io_factory=lambda:
self.file_io_from_options(table_path),
)
-
- # Fallback to original logic
- return RESTTokenFileIO(identifier, table_path, self.context.options) \
- if self.data_token_enabled else
self.file_io_from_options(table_path)
+ elif self.data_token_enabled:
+ file_io = RESTTokenFileIO(identifier, table_path,
self.context.options)
+ else:
+ file_io = self.file_io_from_options(table_path)
+ return CachingFileIO.wrap_with_caching_if_needed(
+ file_io, self.context.options, self._cache_manager)
def load_table(self,
identifier: Identifier,
diff --git a/paimon-python/pypaimon/filesystem/caching_file_io.py
b/paimon-python/pypaimon/filesystem/caching_file_io.py
index d583adb4e0..778614c56c 100644
--- a/paimon-python/pypaimon/filesystem/caching_file_io.py
+++ b/paimon-python/pypaimon/filesystem/caching_file_io.py
@@ -310,6 +310,10 @@ class CachingFileIO(FileIO):
else:
self._whitelist = whitelist
+ # Fallback caps when local-cache.max-size is unset (memory shares the
heap).
+ _DEFAULT_MEMORY_CACHE_MAX_SIZE = 256 * 1024 * 1024
+ _DEFAULT_DISK_CACHE_MAX_SIZE = 10 * 1024 * 1024 * 1024
+
@staticmethod
def create_cache_manager(options):
"""Creates a cache manager from options, or returns None if caching is
not enabled."""
@@ -319,11 +323,14 @@ class CachingFileIO(FileIO):
return None
cache_dir = opts.local_cache_dir()
max_size_opt = opts.local_cache_max_size()
- max_size = max_size_opt.get_bytes() if max_size_opt is not None else
(2 ** 63 - 1)
block_size = opts.local_cache_block_size().get_bytes()
if cache_dir is not None:
+ max_size = (max_size_opt.get_bytes() if max_size_opt is not None
+ else CachingFileIO._DEFAULT_DISK_CACHE_MAX_SIZE)
return LocalDiskCacheManager(cache_dir, max_size, block_size)
else:
+ max_size = (max_size_opt.get_bytes() if max_size_opt is not None
+ else CachingFileIO._DEFAULT_MEMORY_CACHE_MAX_SIZE)
return LocalMemoryCacheManager(max_size, block_size)
@staticmethod
diff --git a/paimon-python/pypaimon/tests/caching_file_io_test.py
b/paimon-python/pypaimon/tests/caching_file_io_test.py
index 5f22c8934e..12288ef120 100644
--- a/paimon-python/pypaimon/tests/caching_file_io_test.py
+++ b/paimon-python/pypaimon/tests/caching_file_io_test.py
@@ -409,6 +409,61 @@ class CachingFileIOTest(unittest.TestCase):
self.assertTrue(os.path.exists(out_path))
self.assertEqual(table, pa.parquet.read_table(out_path))
+ def test_file_io_for_data_wraps_cache_when_data_token_enabled(self):
+ from pypaimon.catalog.rest.rest_catalog import RESTCatalog
+ from pypaimon.common.identifier import Identifier
+ from pypaimon.common.options.options import Options
+
+ catalog = MagicMock(spec=RESTCatalog)
+ catalog.data_token_enabled = True
+ catalog.fuse_enabled = False
+ catalog._fuse_resolver = None
+ catalog.context = MagicMock()
+ catalog.context.options = Options({
+ 'local-cache.enabled': 'true',
+ 'local-cache.dir': self.cache_dir,
+ 'local-cache.whitelist': 'meta,global-index,data',
+ })
+ catalog._cache_manager = CachingFileIO.create_cache_manager(
+ catalog.context.options)
+ catalog.file_io_for_data = RESTCatalog.file_io_for_data.__get__(
+ catalog, RESTCatalog)
+
+ file_io = catalog.file_io_for_data(
+ "oss://catalog/db1/table1", Identifier.create("db1", "table1"))
+
+ self.assertIsInstance(
+ file_io, CachingFileIO,
+ msg="Cache wrap should apply even when data-token.enabled=true; "
+ "currently bypassed in DLF mode (RESTTokenFileIO returned).")
+
+ def test_default_memory_cache_max_size_capped(self):
+ from pypaimon.common.options.options import Options
+ from pypaimon.filesystem.caching_file_io import LocalMemoryCacheManager
+
+ cache = CachingFileIO.create_cache_manager(Options({
+ 'local-cache.enabled': 'true',
+ }))
+ self.assertIsInstance(cache, LocalMemoryCacheManager)
+ self.assertEqual(
+ cache._max_size_bytes, 256 * 1024 * 1024,
+ msg="Memory cache without explicit max-size should default to "
+ "256 MB, not unlimited (OOM risk).")
+
+ def test_default_disk_cache_max_size_capped(self):
+ from pypaimon.common.options.options import Options
+ from pypaimon.filesystem.caching_file_io import LocalDiskCacheManager
+
+ cache = CachingFileIO.create_cache_manager(Options({
+ 'local-cache.enabled': 'true',
+ 'local-cache.dir': self.cache_dir,
+ }))
+ self.assertIsInstance(cache, LocalDiskCacheManager)
+ self.assertEqual(
+ cache._max_size_bytes, 10 * 1024 * 1024 * 1024,
+ msg="Disk cache without explicit max-size should default to "
+ "10 GB, not unlimited (disk-full risk).")
+
class ConfigOptionsTest(unittest.TestCase):
diff --git a/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
b/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
index 3235240c17..fb4c01430e 100644
--- a/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
+++ b/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
@@ -65,6 +65,7 @@ class TestFuseLocalPath(unittest.TestCase):
catalog = MagicMock(spec=RESTCatalog)
catalog.fuse_enabled = enabled
catalog.data_token_enabled = False
+ catalog._cache_manager = None
catalog.rest_api = MagicMock()
catalog.context = MagicMock()
catalog.context.options = options