jayceslesar commented on code in PR #2951:
URL: https://github.com/apache/iceberg-python/pull/2951#discussion_r2729018999
##########
pyiceberg/manifest.py:
##########
@@ -892,15 +891,53 @@ def __hash__(self) -> int:
return hash(self.manifest_path)
-# Global cache for manifest lists
-_manifest_cache: LRUCache[Any, tuple[ManifestFile, ...]] =
LRUCache(maxsize=128)
+# Global cache for ManifestFile objects, keyed by manifest_path.
+# This deduplicates ManifestFile objects across manifest lists, which commonly
+# share manifests after append operations.
+_manifest_cache: LRUCache[str, ManifestFile] = LRUCache(maxsize=512)
+
+# Lock for thread-safe cache access
+_manifest_cache_lock = threading.RLock()
-@cached(cache=_manifest_cache, key=lambda io, manifest_list:
hashkey(manifest_list), lock=threading.RLock())
def _manifests(io: FileIO, manifest_list: str) -> tuple[ManifestFile, ...]:
- """Read and cache manifests from the given manifest list, returning a
tuple to prevent modification."""
+ """Read manifests from a manifest list, deduplicating ManifestFile objects
via cache.
+
+ Caches individual ManifestFile objects by manifest_path. This is
memory-efficient
+ because consecutive manifest lists typically share most of their manifests:
+
+ ManifestList1: [ManifestFile1]
+ ManifestList2: [ManifestFile1, ManifestFile2]
+ ManifestList3: [ManifestFile1, ManifestFile2, ManifestFile3]
+
+ With per-ManifestFile caching, each ManifestFile is stored once and reused.
+
+ Note: The manifest list file is re-read on each call. This is intentional
to
+ keep the implementation simple and avoid O(N²) memory growth from caching
+ overlapping manifest list tuples. Re-reading is cheap since manifest lists
+ are small metadata files.
+
+ Args:
+ io: FileIO instance for reading the manifest list.
+ manifest_list: Path to the manifest list file.
+
+ Returns:
+ A tuple of ManifestFile objects.
+ """
file = io.new_input(manifest_list)
- return tuple(read_manifest_list(file))
+ manifest_files = list(read_manifest_list(file))
Review Comment:
why do you need to materialize the iterator here?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]