This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git


The following commit(s) were added to refs/heads/main by this push:
     new 9295e58a perf(python): Pre-allocate size for the dictionary (#1949)
9295e58a is described below

commit 9295e58ae706a3734f6fa80704c41502ba73131e
Author: penguin_wwy <[email protected]>
AuthorDate: Thu Nov 21 00:34:34 2024 +0800

    perf(python): Pre-allocate size for the dictionary (#1949)
    
    ## What does this PR do?
    
    Pre-allocate memory for the dictionary based on the data size to avoid
    resizing and improve deserialization performance.
    
    ## Related issues
    
    ## Does this PR introduce any user-facing change?
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    ```
    # python format
    fury_large_dict: Mean +- std dev: [dict_base] 548 us +- 33 us -> [dict_resize] 531 us +- 33 us: 1.03x faster
    
    # xlang format
    fury_large_dict: Mean +- std dev: [dict_xlang_base] 550 us +- 39 us -> [dict_xlang_resize] 527 us +- 35 us: 1.05x faster
    ```
---
 integration_tests/cpython_benchmark/fury_benchmark.py | 4 ++++
 python/pyfury/_serialization.pyx                      | 7 ++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/integration_tests/cpython_benchmark/fury_benchmark.py b/integration_tests/cpython_benchmark/fury_benchmark.py
index 8b493aec..01be1d49 100644
--- a/integration_tests/cpython_benchmark/fury_benchmark.py
+++ b/integration_tests/cpython_benchmark/fury_benchmark.py
@@ -64,6 +64,7 @@ DICT = {
     "view_count": 7,
     "zip": "",
 }
+LARGE_DICT = {str(i): i for i in range(2**10 + 1)}
 
 TUPLE = (
     [
@@ -177,6 +178,9 @@ def micro_benchmark():
     runner.parse_args()
     language = pyfury.Language.XLANG if args.xlang else pyfury.Language.PYTHON
     runner.bench_func("fury_dict", fury_object, language, not args.no_ref, DICT)
+    runner.bench_func(
+        "fury_large_dict", fury_object, language, not args.no_ref, LARGE_DICT
+    )
     runner.bench_func(
         "fury_dict_group", fury_object, language, not args.no_ref, DICT_GROUP
     )
diff --git a/python/pyfury/_serialization.pyx b/python/pyfury/_serialization.pyx
index 0175f615..ce1443c6 100644
--- a/python/pyfury/_serialization.pyx
+++ b/python/pyfury/_serialization.pyx
@@ -76,6 +76,7 @@ cdef extern from *:
     """
     object int2obj(int64_t obj_addr)
     int64_t obj2int(object obj_ref)
+    dict _PyDict_NewPresized(Py_ssize_t minused)
 
 
 cdef int8_t NULL_FLAG = -3
@@ -2081,9 +2082,9 @@ cdef class MapSerializer(Serializer):
     cpdef inline read(self, Buffer buffer):
         cdef MapRefResolver ref_resolver = self.ref_resolver
         cdef ClassResolver class_resolver = self.class_resolver
-        cdef dict map_ = {}
-        ref_resolver.reference(map_)
         cdef int32_t len_ = buffer.read_varint32()
+        cdef dict map_ = _PyDict_NewPresized(len_)
+        ref_resolver.reference(map_)
         cdef int32_t ref_id
         cdef ClassInfo key_classinfo
         cdef ClassInfo value_classinfo
@@ -2131,7 +2132,7 @@ cdef class MapSerializer(Serializer):
 
     cpdef inline xread(self, Buffer buffer):
         cdef int32_t len_ = buffer.read_varint32()
-        cdef dict map_ = {}
+        cdef dict map_ = _PyDict_NewPresized(len_)
         self.fury.ref_resolver.reference(map_)
         for i in range(len_):
             k = self.fury.xdeserialize_ref(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to