This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new d2ce7ae5d3 [python] Fix local-cache silently bypassed in REST catalog (#7981)
d2ce7ae5d3 is described below

commit d2ce7ae5d3c853ed523f87fab0244aed531c1896
Author: XiaoHongbo <[email protected]>
AuthorDate: Wed May 27 09:09:41 2026 +0800

    [python] Fix local-cache silently bypassed in REST catalog (#7981)
    
    Two issues prevent the local cache from working safely in REST catalog
    mode:
    
    1. With `data-token.enabled=true` (DLF mode), enabling
       `local-cache.enabled=true` had no effect — reads silently went
       uncached.
    2. When `local-cache.max-size` was unset, the cache could grow without
       bound (default 2^63-1 bytes), risking a full disk or OOM.
    
    This PR fixes them by:
    - Routing `RESTTokenFileIO` through
      `CachingFileIO.wrap_with_caching_if_needed`.
    - Defaulting `local-cache.max-size` to 256 MB (memory) / 10 GB (disk)
      when unset.
---
 .../pypaimon/catalog/rest/rest_catalog.py          | 12 +++--
 .../pypaimon/filesystem/caching_file_io.py         |  9 +++-
 .../pypaimon/tests/caching_file_io_test.py         | 55 ++++++++++++++++++++++
 .../pypaimon/tests/rest/test_fuse_local_path.py    |  1 +
 4 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/paimon-python/pypaimon/catalog/rest/rest_catalog.py 
b/paimon-python/pypaimon/catalog/rest/rest_catalog.py
index fe1fab1b63..d6e89d50b9 100644
--- a/paimon-python/pypaimon/catalog/rest/rest_catalog.py
+++ b/paimon-python/pypaimon/catalog/rest/rest_catalog.py
@@ -646,16 +646,18 @@ class RESTCatalog(Catalog):
         Get FileIO for data access, supporting FUSE local path mapping.
         """
         if self._fuse_resolver is not None:
-            return self._fuse_resolver.get_file_io(
+            file_io = self._fuse_resolver.get_file_io(
                 table_path, identifier, self.data_token_enabled,
                 rest_token_file_io_factory=lambda: RESTTokenFileIO(
                     identifier, table_path, self.context.options),
                 default_file_io_factory=lambda: 
self.file_io_from_options(table_path),
             )
-
-        # Fallback to original logic
-        return RESTTokenFileIO(identifier, table_path, self.context.options) \
-            if self.data_token_enabled else 
self.file_io_from_options(table_path)
+        elif self.data_token_enabled:
+            file_io = RESTTokenFileIO(identifier, table_path, 
self.context.options)
+        else:
+            file_io = self.file_io_from_options(table_path)
+        return CachingFileIO.wrap_with_caching_if_needed(
+            file_io, self.context.options, self._cache_manager)
 
     def load_table(self,
                    identifier: Identifier,
diff --git a/paimon-python/pypaimon/filesystem/caching_file_io.py 
b/paimon-python/pypaimon/filesystem/caching_file_io.py
index d583adb4e0..778614c56c 100644
--- a/paimon-python/pypaimon/filesystem/caching_file_io.py
+++ b/paimon-python/pypaimon/filesystem/caching_file_io.py
@@ -310,6 +310,10 @@ class CachingFileIO(FileIO):
         else:
             self._whitelist = whitelist
 
+    # Fallback caps when local-cache.max-size is unset (memory shares the 
heap).
+    _DEFAULT_MEMORY_CACHE_MAX_SIZE = 256 * 1024 * 1024
+    _DEFAULT_DISK_CACHE_MAX_SIZE = 10 * 1024 * 1024 * 1024
+
     @staticmethod
     def create_cache_manager(options):
         """Creates a cache manager from options, or returns None if caching is 
not enabled."""
@@ -319,11 +323,14 @@ class CachingFileIO(FileIO):
             return None
         cache_dir = opts.local_cache_dir()
         max_size_opt = opts.local_cache_max_size()
-        max_size = max_size_opt.get_bytes() if max_size_opt is not None else 
(2 ** 63 - 1)
         block_size = opts.local_cache_block_size().get_bytes()
         if cache_dir is not None:
+            max_size = (max_size_opt.get_bytes() if max_size_opt is not None
+                        else CachingFileIO._DEFAULT_DISK_CACHE_MAX_SIZE)
             return LocalDiskCacheManager(cache_dir, max_size, block_size)
         else:
+            max_size = (max_size_opt.get_bytes() if max_size_opt is not None
+                        else CachingFileIO._DEFAULT_MEMORY_CACHE_MAX_SIZE)
             return LocalMemoryCacheManager(max_size, block_size)
 
     @staticmethod
diff --git a/paimon-python/pypaimon/tests/caching_file_io_test.py 
b/paimon-python/pypaimon/tests/caching_file_io_test.py
index 5f22c8934e..12288ef120 100644
--- a/paimon-python/pypaimon/tests/caching_file_io_test.py
+++ b/paimon-python/pypaimon/tests/caching_file_io_test.py
@@ -409,6 +409,61 @@ class CachingFileIOTest(unittest.TestCase):
         self.assertTrue(os.path.exists(out_path))
         self.assertEqual(table, pa.parquet.read_table(out_path))
 
+    def test_file_io_for_data_wraps_cache_when_data_token_enabled(self):
+        from pypaimon.catalog.rest.rest_catalog import RESTCatalog
+        from pypaimon.common.identifier import Identifier
+        from pypaimon.common.options.options import Options
+
+        catalog = MagicMock(spec=RESTCatalog)
+        catalog.data_token_enabled = True
+        catalog.fuse_enabled = False
+        catalog._fuse_resolver = None
+        catalog.context = MagicMock()
+        catalog.context.options = Options({
+            'local-cache.enabled': 'true',
+            'local-cache.dir': self.cache_dir,
+            'local-cache.whitelist': 'meta,global-index,data',
+        })
+        catalog._cache_manager = CachingFileIO.create_cache_manager(
+            catalog.context.options)
+        catalog.file_io_for_data = RESTCatalog.file_io_for_data.__get__(
+            catalog, RESTCatalog)
+
+        file_io = catalog.file_io_for_data(
+            "oss://catalog/db1/table1", Identifier.create("db1", "table1"))
+
+        self.assertIsInstance(
+            file_io, CachingFileIO,
+            msg="Cache wrap should apply even when data-token.enabled=true; "
+                "currently bypassed in DLF mode (RESTTokenFileIO returned).")
+
+    def test_default_memory_cache_max_size_capped(self):
+        from pypaimon.common.options.options import Options
+        from pypaimon.filesystem.caching_file_io import LocalMemoryCacheManager
+
+        cache = CachingFileIO.create_cache_manager(Options({
+            'local-cache.enabled': 'true',
+        }))
+        self.assertIsInstance(cache, LocalMemoryCacheManager)
+        self.assertEqual(
+            cache._max_size_bytes, 256 * 1024 * 1024,
+            msg="Memory cache without explicit max-size should default to "
+                "256 MB, not unlimited (OOM risk).")
+
+    def test_default_disk_cache_max_size_capped(self):
+        from pypaimon.common.options.options import Options
+        from pypaimon.filesystem.caching_file_io import LocalDiskCacheManager
+
+        cache = CachingFileIO.create_cache_manager(Options({
+            'local-cache.enabled': 'true',
+            'local-cache.dir': self.cache_dir,
+        }))
+        self.assertIsInstance(cache, LocalDiskCacheManager)
+        self.assertEqual(
+            cache._max_size_bytes, 10 * 1024 * 1024 * 1024,
+            msg="Disk cache without explicit max-size should default to "
+                "10 GB, not unlimited (disk-full risk).")
+
 
 class ConfigOptionsTest(unittest.TestCase):
 
diff --git a/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py 
b/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
index 3235240c17..fb4c01430e 100644
--- a/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
+++ b/paimon-python/pypaimon/tests/rest/test_fuse_local_path.py
@@ -65,6 +65,7 @@ class TestFuseLocalPath(unittest.TestCase):
         catalog = MagicMock(spec=RESTCatalog)
         catalog.fuse_enabled = enabled
         catalog.data_token_enabled = False
+        catalog._cache_manager = None
         catalog.rest_api = MagicMock()
         catalog.context = MagicMock()
         catalog.context.options = options

Reply via email to