This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git


The following commit(s) were added to refs/heads/main by this push:
     new 9ba1d2a92 feat(python): add configurable size guardrails (#3429)
9ba1d2a92 is described below

commit 9ba1d2a9228d2cfb3e6bb34e3e04af8c6dfec01f
Author: Eyad Hazem <[email protected]>
AuthorDate: Fri Mar 6 11:46:44 2026 +0200

    feat(python): add configurable size guardrails (#3429)
    
    This commit:
    - provides configurable size guardrails for untrusted payloads (lists,
    sets, tuples, maps, and strings)
    - tests size guardrails
    
    ## Why?
    
    There are currently no configurable limits for payload-driven lengths.
    Untrusted string/map/list lengths can trigger large allocations and
    memory pressure.
    
    ## What does this PR do?
    
    - Adds guardrail fields in Python runtime configuration.
    - Enforces length limits for lists, tuples, sets, dicts (maps), and
    string bytes.
    - Raises an exception when a configured limit is exceeded.
    
    ## Related issues
    
    - Fixes #3416
---
 python/pyfory/_fory.py                      |  26 +++++-
 python/pyfory/buffer.pxi                    |  15 ++-
 python/pyfory/collection.pxi                |  12 +++
 python/pyfory/collection.py                 |   6 ++
 python/pyfory/serialization.pyx             |  21 ++++-
 python/pyfory/tests/test_size_guardrails.py | 136 ++++++++++++++++++++++++++++
 6 files changed, 209 insertions(+), 7 deletions(-)

diff --git a/python/pyfory/_fory.py b/python/pyfory/_fory.py
index b57f639da..f7b941d59 100644
--- a/python/pyfory/_fory.py
+++ b/python/pyfory/_fory.py
@@ -160,6 +160,8 @@ class Fory:
         "_output_stream",
         "field_nullable",
         "policy",
+        "max_collection_size",
+        "max_binary_size",
     )
 
     def __init__(
@@ -172,6 +174,8 @@ class Fory:
         policy: DeserializationPolicy = None,
         field_nullable: bool = False,
         meta_compressor=None,
+        max_collection_size: int = 1_000_000,
+        max_binary_size: int = 64 * 1024 * 1024,
     ):
         """
         Initialize a Fory serialization instance.
@@ -210,6 +214,17 @@ class Fory:
             field_nullable: Treat all dataclass fields as nullable regardless 
of
                 Optional annotation.
 
+            max_collection_size: Maximum allowed size for collections (lists, 
sets, tuples)
+                and maps (dicts) during deserialization. This limit is used to 
prevent
+                out-of-memory attacks from malicious payloads that claim 
extremely large
+                collection sizes, as collections preallocate memory based on 
the declared
+                size. Raises an exception if exceeded. Default is 1,000,000.
+
+            max_binary_size: Maximum allowed size in bytes for binary data 
reads during
+                deserialization (default: 64 MB). Raises an exception if a 
single binary
+                read exceeds this limit, preventing out-of-memory attacks from 
malicious
+                payloads that claim extremely large binary sizes.
+
         Example:
             >>> # Python-native mode with reference tracking
             >>> fory = Fory(ref=True)
@@ -235,7 +250,8 @@ class Fory:
         self.serialization_context = SerializationContext(fory=self, 
scoped_meta_share_enabled=compatible)
         self.type_resolver.initialize()
 
-        self.buffer = Buffer.allocate(32)
+        self.max_binary_size = max_binary_size
+        self.buffer = Buffer.allocate(32, max_binary_size=max_binary_size)
         self.buffer_callback = None
         self._buffers = None
         self._unsupported_callback = None
@@ -243,6 +259,7 @@ class Fory:
         self.is_peer_out_of_band_enabled = False
         self.max_depth = max_depth
         self.depth = 0
+        self.max_collection_size = max_collection_size
         self._output_stream = None
 
     def register(
@@ -621,7 +638,7 @@ class Fory:
         assert self.depth == 0, "Nested deserialization should use 
read_ref/read_no_ref."
         self.depth += 1
         if isinstance(buffer, bytes):
-            buffer = Buffer(buffer)
+            buffer = Buffer(buffer, max_binary_size=self.max_binary_size)
         if unsupported_objects is not None:
             self._unsupported_objects = iter(unsupported_objects)
         reader_index = buffer.get_reader_index()
@@ -666,6 +683,7 @@ class Fory:
         """Internal method to read without modifying read_ref_ids."""
         if serializer is None:
             serializer = self.type_resolver.read_type_info(buffer).serializer
+
         self.inc_depth()
         o = serializer.read(buffer)
         self.dec_depth()
@@ -812,6 +830,10 @@ class ThreadSafeFory:
         strict (bool): Whether to require type registration. Defaults to True.
         compatible (bool): Whether to enable compatible mode. Defaults to 
False.
         max_depth (int): Maximum depth for deserialization. Defaults to 50.
+        max_collection_size (int): Maximum allowed size for collections and 
maps during
+            deserialization. Defaults to 1,000,000.
+        max_binary_size (int): Maximum allowed size in bytes for binary data 
reads during
+            deserialization. Defaults to 64 MB.
 
     Example:
         >>> import pyfury
diff --git a/python/pyfory/buffer.pxi b/python/pyfory/buffer.pxi
index 2b98353f4..9424189bf 100644
--- a/python/pyfory/buffer.pxi
+++ b/python/pyfory/buffer.pxi
@@ -124,9 +124,11 @@ cdef class Buffer:
         object output_stream
         Py_ssize_t shape[1]
         Py_ssize_t stride[1]
+        int32_t max_binary_size
 
-    def __init__(self,  data not None, int32_t offset=0, length=None):
+    def __init__(self,  data not None, int32_t offset=0, length=None, int32_t 
max_binary_size= 64 * 1024 * 1024):
         self.data = data
+        self.max_binary_size = max_binary_size
         cdef int32_t buffer_len = len(data)
         cdef int length_
         if length is None:
@@ -146,7 +148,7 @@ cdef class Buffer:
         self.output_stream = None
 
     @classmethod
-    def from_stream(cls, stream not None, uint32_t buffer_size=4096):
+    def from_stream(cls, stream not None, uint32_t buffer_size=4096, int32_t 
max_binary_size=64 * 1024 * 1024):
         cdef CBuffer* stream_buffer
         cdef c_string stream_error
         if Fory_PyCreateBufferFromStream(
@@ -156,6 +158,7 @@ cdef class Buffer:
         if stream_buffer == NULL:
             raise ValueError("failed to create stream buffer")
         cdef Buffer buffer = Buffer.__new__(Buffer)
+        buffer.max_binary_size = max_binary_size
         buffer.c_buffer = move(deref(stream_buffer))
         del stream_buffer
         buffer.data = stream
@@ -167,6 +170,7 @@ cdef class Buffer:
     @staticmethod
     cdef Buffer wrap(shared_ptr[CBuffer] c_buffer):
         cdef Buffer buffer = Buffer.__new__(Buffer)
+        buffer.max_binary_size = 64 * 1024 * 1024
         cdef CBuffer* ptr = c_buffer.get()
         buffer.c_buffer = CBuffer(ptr.data(), ptr.size(), False)
         cdef _SharedBufferOwner owner = 
_SharedBufferOwner.__new__(_SharedBufferOwner)
@@ -178,11 +182,12 @@ cdef class Buffer:
         return buffer
 
     @classmethod
-    def allocate(cls, int32_t size):
+    def allocate(cls, int32_t size, int32_t max_binary_size=64 * 1024 * 1024):
         cdef CBuffer* buf = allocate_buffer(size)
         if buf == NULL:
             raise MemoryError("out of memory")
         cdef Buffer buffer = Buffer.__new__(Buffer)
+        buffer.max_binary_size = max_binary_size
         buffer.c_buffer = move(deref(buf))
         del buf
         buffer.data = None
@@ -407,6 +412,10 @@ cdef class Buffer:
     cpdef inline bytes read_bytes(self, int32_t length):
         if length == 0:
             return b""
+
+        if length > self.max_binary_size:
+            raise ValueError(f"Binary size {length} exceeds the configured 
limit of {self.max_binary_size}")
+
         cdef bytes py_bytes = PyBytes_FromStringAndSize(NULL, length)
         if py_bytes is None:
             raise MemoryError("out of memory")
diff --git a/python/pyfory/collection.pxi b/python/pyfory/collection.pxi
index 150f354e6..ecca3bff3 100644
--- a/python/pyfory/collection.pxi
+++ b/python/pyfory/collection.pxi
@@ -393,6 +393,9 @@ cdef class ListSerializer(CollectionSerializer):
         cdef MapRefResolver ref_resolver = self.fory.ref_resolver
         cdef TypeResolver type_resolver = self.fory.type_resolver
         cdef int32_t len_ = buffer.read_var_uint32()
+        # Check size limit before PyList_New preallocation to prevent OOM 
attacks
+        if len_ > self.fory.max_collection_size:
+            raise ValueError(f"List size {len_} exceeds the configured limit 
of {self.fory.max_collection_size}")
         cdef list list_ = PyList_New(len_)
         if len_ == 0:
             return list_
@@ -493,6 +496,9 @@ cdef class TupleSerializer(CollectionSerializer):
         cdef MapRefResolver ref_resolver = self.fory.ref_resolver
         cdef TypeResolver type_resolver = self.fory.type_resolver
         cdef int32_t len_ = buffer.read_var_uint32()
+        # Check size limit before PyTuple_New preallocation to prevent OOM 
attacks
+        if len_ > self.fory.max_collection_size:
+            raise ValueError(f"Tuple size {len_} exceeds the configured limit 
of {self.fory.max_collection_size}")
         cdef tuple tuple_ = PyTuple_New(len_)
         if len_ == 0:
             return tuple_
@@ -575,6 +581,9 @@ cdef class SetSerializer(CollectionSerializer):
         cdef set instance = set()
         ref_resolver.reference(instance)
         cdef int32_t len_ = buffer.read_var_uint32()
+        # Check size limit to prevent OOM attacks from malicious payloads
+        if len_ > self.fory.max_collection_size:
+            raise ValueError(f"Set size {len_} exceeds the configured limit of 
{self.fory.max_collection_size}")
         if len_ == 0:
             return instance
         cdef int8_t collect_flag = buffer.read_int8()
@@ -897,6 +906,9 @@ cdef class MapSerializer(Serializer):
         cdef MapRefResolver ref_resolver = self.ref_resolver
         cdef TypeResolver type_resolver = self.type_resolver
         cdef int32_t size = buffer.read_var_uint32()
+        # Check size limit before _PyDict_NewPresized preallocation to prevent 
OOM attacks
+        if size > self.fory.max_collection_size:
+            raise ValueError(f"Map size {size} exceeds the configured limit of 
{self.fory.max_collection_size}")
         cdef dict map_ = _PyDict_NewPresized(size)
         ref_resolver.reference(map_)
         cdef int32_t ref_id
diff --git a/python/pyfory/collection.py b/python/pyfory/collection.py
index 5bb88cbaa..b44fec026 100644
--- a/python/pyfory/collection.py
+++ b/python/pyfory/collection.py
@@ -167,6 +167,9 @@ class CollectionSerializer(Serializer):
 
     def read(self, buffer):
         len_ = buffer.read_var_uint32()
+        # Check size limit before collection preallocation to prevent OOM 
attacks
+        if len_ > self.fory.max_collection_size:
+            raise ValueError(f"Collection size {len_} exceeds the configured 
limit of {self.fory.max_collection_size}")
         collection_ = self.new_instance(self.type_)
         if len_ == 0:
             return collection_
@@ -481,6 +484,9 @@ class MapSerializer(Serializer):
         ref_resolver = self.ref_resolver
         type_resolver = self.type_resolver
         size = buffer.read_var_uint32()
+        # Check size limit to prevent OOM attacks from malicious payloads
+        if size > fory.max_collection_size:
+            raise ValueError(f"Map size {size} exceeds the configured limit of 
{fory.max_collection_size}")
         map_ = {}
         ref_resolver.reference(map_)
         chunk_header = 0
diff --git a/python/pyfory/serialization.pyx b/python/pyfory/serialization.pyx
index dea890fbc..d44363564 100644
--- a/python/pyfory/serialization.pyx
+++ b/python/pyfory/serialization.pyx
@@ -1078,6 +1078,8 @@ cdef class Fory:
     cdef public bint is_peer_out_of_band_enabled
     cdef int32_t max_depth
     cdef int32_t depth
+    cdef public int32_t max_collection_size
+    cdef public int32_t max_binary_size
     cdef object _output_stream
 
     def __init__(
@@ -1090,6 +1092,8 @@ cdef class Fory:
             max_depth: int = 50,
             field_nullable: bool = False,
             meta_compressor=None,
+            max_collection_size: int = 1_000_000,
+            max_binary_size: int = 64 * 1024 * 1024,
     ):
         """
         Initialize a Fory serialization instance.
@@ -1128,6 +1132,17 @@ cdef class Fory:
             field_nullable: Treat all dataclass fields as nullable regardless 
of
                 Optional annotation.
 
+            max_collection_size: Maximum allowed size for collections (lists, 
sets, tuples)
+                and maps (dicts) during deserialization. This limit is used to 
prevent
+                out-of-memory attacks from malicious payloads that claim 
extremely large
+                collection sizes, as collections preallocate memory based on 
the declared
+                size. Raises an exception if exceeded. Default is 1,000,000.
+
+            max_binary_size: Maximum allowed size in bytes for binary data 
reads during
+                deserialization (default: 64 MB). Raises an exception if a 
single binary
+                read exceeds this limit, preventing out-of-memory attacks from 
malicious
+                payloads that claim extremely large binary sizes.
+
         Example:
             >>> # Python-native mode with reference tracking
             >>> fory = Fory(ref=True)
@@ -1149,7 +1164,8 @@ cdef class Fory:
         self.type_resolver = TypeResolver(self, meta_share=compatible, 
meta_compressor=meta_compressor)
         self.serialization_context = SerializationContext(fory=self, 
scoped_meta_share_enabled=compatible)
         self.type_resolver.initialize()
-        self.buffer = Buffer.allocate(32)
+        self.max_binary_size = max_binary_size
+        self.buffer = Buffer.allocate(32, max_binary_size=max_binary_size)
         self.buffer_callback = None
         self._buffers = None
         self._unsupported_callback = None
@@ -1157,6 +1173,7 @@ cdef class Fory:
         self.is_peer_out_of_band_enabled = False
         self.depth = 0
         self.max_depth = max_depth
+        self.max_collection_size = max_collection_size
         self._output_stream = None
 
     def register_serializer(self, cls: Union[type, TypeVar], Serializer 
serializer):
@@ -1508,7 +1525,7 @@ cdef class Fory:
         """
         try:
             if type(buffer) == bytes:
-                buffer = Buffer(buffer)
+                buffer = Buffer(buffer, max_binary_size=self.max_binary_size)
             return self._deserialize(buffer, buffers, unsupported_objects)
         finally:
             self.reset_read()
diff --git a/python/pyfory/tests/test_size_guardrails.py 
b/python/pyfory/tests/test_size_guardrails.py
new file mode 100644
index 000000000..700b2baa2
--- /dev/null
+++ b/python/pyfory/tests/test_size_guardrails.py
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Test max_collection_size and max_binary_size guardrails to prevent OOM attacks
+from malicious payloads.
+
+Collections preallocate memory based on declared size, so they need guardrails.
+Binary reads are guarded by max_binary_size on the Buffer.
+"""
+
+from dataclasses import dataclass
+from typing import List
+
+import pytest
+
+import pyfory
+from pyfory import Fory
+from pyfory.serialization import Buffer
+
+
+def roundtrip(data, limit, xlang=False, ref=False):
+    """Serialize and deserialize with given collection size limit."""
+    writer = Fory(xlang=xlang, ref=ref)
+    reader = Fory(xlang=xlang, ref=ref, max_collection_size=limit)
+    return reader.deserialize(writer.serialize(data))
+
+
+def roundtrip_binary(data, max_binary_size, xlang=False, ref=False):
+    """Serialize and deserialize with given binary size limit."""
+    writer = Fory(xlang=xlang, ref=ref)
+    reader = Fory(xlang=xlang, ref=ref, max_binary_size=max_binary_size)
+    return reader.deserialize(writer.serialize(data))
+
+
+class TestCollectionSizeLimit:
+    """Collections (list/set/dict) preallocate memory, so need size limits."""
+
+    @pytest.mark.parametrize("xlang", [False, True])
+    @pytest.mark.parametrize(
+        "data,limit",
+        [
+            ([1, 2, 3], 10),  # list within limit
+            ({1, 2, 3}, 10),  # set within limit
+            ({"a": 1}, 10),  # dict within limit
+            ([], 0),  # empty list ok
+            (set(), 0),  # empty set ok
+            ({}, 0),  # empty dict ok
+        ],
+    )
+    def test_within_limit_succeeds(self, xlang, data, limit):
+        assert roundtrip(data, limit, xlang=xlang) == data
+
+    @pytest.mark.parametrize("xlang", [False, True])
+    @pytest.mark.parametrize(
+        "data,limit",
+        [
+            (list(range(10)), 5),  # list exceeds
+            (set(range(10)), 5),  # set exceeds
+            ({str(i): i for i in range(10)}, 5),  # dict exceeds
+            ([[1], list(range(10))], 5),  # nested inner exceeds
+        ],
+    )
+    def test_exceeds_limit_fails(self, xlang, data, limit):
+        with pytest.raises(ValueError, match="exceeds the configured limit"):
+            roundtrip(data, limit, xlang=xlang)
+
+    @pytest.mark.parametrize("ref", [False, True])
+    @pytest.mark.parametrize(
+        "data,limit,should_fail",
+        [
+            ((1, 2, 3), 10, False),
+            (tuple(range(10)), 5, True),
+        ],
+    )
+    def test_tuple_limit(self, ref, data, limit, should_fail):
+        """Tuple only works in xlang=False mode."""
+        if should_fail:
+            with pytest.raises(ValueError, match="exceeds the configured 
limit"):
+                roundtrip(data, limit, xlang=False, ref=ref)
+        else:
+            assert roundtrip(data, limit, xlang=False, ref=ref) == data
+
+    def test_default_limit_is_one_million(self):
+        assert Fory().max_collection_size == 1_000_000
+
+    def test_dataclass_list_field_exceeds_limit(self):
+        @dataclass
+        class Container:
+            items: List[pyfory.int32]
+
+        writer = Fory(xlang=True)
+        writer.register(Container)
+        reader = Fory(xlang=True, max_collection_size=5)
+        reader.register(Container)
+
+        with pytest.raises(ValueError, match="exceeds the configured limit"):
+            
reader.deserialize(writer.serialize(Container(items=list(range(10)))))
+
+
+class TestBinarySizeLimit:
+    """Binary reads are guarded by max_binary_size on the Buffer."""
+
+    def test_default_limit_is_64mib(self):
+        assert Fory().max_binary_size == 64 * 1024 * 1024
+
+    @pytest.mark.parametrize("xlang", [False, True])
+    def test_within_limit_succeeds(self, xlang):
+        assert roundtrip_binary(b"x" * 100, max_binary_size=1024, xlang=xlang) 
== b"x" * 100
+
+    @pytest.mark.parametrize("xlang", [False, True])
+    def test_exceeds_limit_fails(self, xlang):
+        with pytest.raises(ValueError, match="exceeds the configured limit"):
+            roundtrip_binary(b"x" * 200, max_binary_size=100, xlang=xlang)
+
+    def test_from_stream_respects_limit(self):
+        import io
+
+        payload = Fory().serialize(b"x" * 200)
+        buf = Buffer.from_stream(io.BytesIO(payload), max_binary_size=100)
+        with pytest.raises(ValueError, match="exceeds the configured limit"):
+            Fory(max_binary_size=100).deserialize(buf)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to