This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git


The following commit(s) were added to refs/heads/main by this push:
     new 4313a5dcb perf(python): optimize pyfory perf (#2829)
4313a5dcb is described below

commit 4313a5dcb7453ccfeac7d43479eabfab63968232
Author: Shawn Yang <[email protected]>
AuthorDate: Fri Oct 24 14:13:10 2025 +0800

    perf(python): optimize pyfory perf (#2829)
    
    <!--
    **Thanks for contributing to Apache Fory™.**
    
    **If this is your first time opening a PR on fory, you can refer to
    
[CONTRIBUTING.md](https://github.com/apache/fory/blob/main/CONTRIBUTING.md).**
    
    Contribution Checklist
    
    - The **Apache Fory™** community has requirements on the naming of PR
    titles. You can also find instructions in
    [CONTRIBUTING.md](https://github.com/apache/fory/blob/main/CONTRIBUTING.md).
    
    - Apache Fory™ has a strong focus on performance. If the PR you submit
    will have an impact on performance, please benchmark it first and
    provide the benchmark result here.
    -->
    
    ## Why?
    
    <!-- Describe the purpose of this PR. -->
    
    ## What does this PR do?
    
    optimize pyfory perf:
    - remove simd utf16 check
    - optimize cython Buffer by access pointer directly
    - use load factor 0.5 for abseil hashmap
    
    ## Related issues
    
    <!--
    Is there any related issue? If this PR closes them you can say
    fix/closes:
    
    - #xxxx0
    - #xxxx1
    - Fixes #xxxx2
    -->
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fory/issues/new/choose) describing the
    need to do so and update the document if necessary.
    
    Delete section if not applicable.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    if it will have impact on performance, the code reviewer will explain
    it), be sure to attach a benchmark data here.
    
    Delete section if not applicable.
    -->
---
 cpp/fory/benchmark/benchmark_string_util.cc        | 150 ++++++++
 cpp/fory/util/string_util.h                        |  69 ++--
 integration_tests/cpython_benchmark/README.md      | 237 ++++++++++++-
 .../cpython_benchmark/fory_benchmark.py            | 380 +++++++++++++++++----
 .../cpython_benchmark/requirements.txt             |   1 -
 python/pyfory/_util.pxd                            |   1 +
 python/pyfory/_util.pyx                            | 110 +++---
 python/pyfory/serialization.pyx                    |  42 +--
 python/pyfory/serializer.py                        |  16 +-
 9 files changed, 788 insertions(+), 218 deletions(-)

diff --git a/cpp/fory/benchmark/benchmark_string_util.cc 
b/cpp/fory/benchmark/benchmark_string_util.cc
index 1226a9c9a..535e5014c 100644
--- a/cpp/fory/benchmark/benchmark_string_util.cc
+++ b/cpp/fory/benchmark/benchmark_string_util.cc
@@ -347,6 +347,20 @@ bool utf16HasSurrogatePairs_BaseLine(const std::u16string 
&str) {
   return false;
 }
 
+// Generate test strings of various sizes for threshold benchmarking
+const std::vector<std::u16string> test_utf16_strings_small_16 =
+    generateUTF16String(num_tests, 16);
+const std::vector<std::u16string> test_utf16_strings_small_32 =
+    generateUTF16String(num_tests, 32);
+const std::vector<std::u16string> test_utf16_strings_small_64 =
+    generateUTF16String(num_tests, 64);
+const std::vector<std::u16string> test_utf16_strings_medium_128 =
+    generateUTF16String(num_tests, 128);
+const std::vector<std::u16string> test_utf16_strings_medium_256 =
+    generateUTF16String(num_tests, 256);
+const std::vector<std::u16string> test_utf16_strings_large_512 =
+    generateUTF16String(num_tests, 512);
+
 // Benchmark function for checking if a UTF-16 string contains surrogate pairs
 static void BM_Utf16HasSurrogatePairs_BaseLine(benchmark::State &state) {
   for (auto _ : state) {
@@ -372,6 +386,142 @@ static void 
BM_Utf16HasSurrogatePairs_FORY(benchmark::State &state) {
 
 BENCHMARK(BM_Utf16HasSurrogatePairs_FORY);
 
+// Benchmarks for different string sizes to determine optimal threshold
+static void BM_Utf16HasSurrogatePairs_BaseLine_Size16(benchmark::State &state) 
{
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_small_16) {
+      bool result = utf16HasSurrogatePairs_BaseLine(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine_Size16);
+
+static void BM_Utf16HasSurrogatePairs_FORY_Size16(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_small_16) {
+      bool result = fory::utf16HasSurrogatePairs(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FORY_Size16);
+
+static void BM_Utf16HasSurrogatePairs_BaseLine_Size32(benchmark::State &state) 
{
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_small_32) {
+      bool result = utf16HasSurrogatePairs_BaseLine(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine_Size32);
+
+static void BM_Utf16HasSurrogatePairs_FORY_Size32(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_small_32) {
+      bool result = fory::utf16HasSurrogatePairs(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FORY_Size32);
+
+static void BM_Utf16HasSurrogatePairs_BaseLine_Size64(benchmark::State &state) 
{
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_small_64) {
+      bool result = utf16HasSurrogatePairs_BaseLine(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine_Size64);
+
+static void BM_Utf16HasSurrogatePairs_FORY_Size64(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_small_64) {
+      bool result = fory::utf16HasSurrogatePairs(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FORY_Size64);
+
+static void
+BM_Utf16HasSurrogatePairs_BaseLine_Size128(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_medium_128) {
+      bool result = utf16HasSurrogatePairs_BaseLine(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine_Size128);
+
+static void BM_Utf16HasSurrogatePairs_FORY_Size128(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_medium_128) {
+      bool result = fory::utf16HasSurrogatePairs(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FORY_Size128);
+
+static void
+BM_Utf16HasSurrogatePairs_BaseLine_Size256(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_medium_256) {
+      bool result = utf16HasSurrogatePairs_BaseLine(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine_Size256);
+
+static void BM_Utf16HasSurrogatePairs_FORY_Size256(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_medium_256) {
+      bool result = fory::utf16HasSurrogatePairs(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FORY_Size256);
+
+static void
+BM_Utf16HasSurrogatePairs_BaseLine_Size512(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_large_512) {
+      bool result = utf16HasSurrogatePairs_BaseLine(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine_Size512);
+
+static void BM_Utf16HasSurrogatePairs_FORY_Size512(benchmark::State &state) {
+  for (auto _ : state) {
+    for (const std::u16string &str : test_utf16_strings_large_512) {
+      bool result = fory::utf16HasSurrogatePairs(str);
+      benchmark::DoNotOptimize(result);
+    }
+  }
+}
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FORY_Size512);
+
 /*
  * TEST Utf16ToUtf8
  */
diff --git a/cpp/fory/util/string_util.h b/cpp/fory/util/string_util.h
index 4a4a41940..7c34993f2 100644
--- a/cpp/fory/util/string_util.h
+++ b/cpp/fory/util/string_util.h
@@ -133,23 +133,11 @@ inline bool isLatin1(const uint16_t *data, size_t length) 
{
                           length % VECTOR_SIZE);
 }
 inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
-  constexpr size_t VECTOR_SIZE = 16;
-  const auto *ptr = reinterpret_cast<const __m256i *>(data);
-  const auto *end = ptr + length / VECTOR_SIZE;
-  const __m256i lower_bound = _mm256_set1_epi16(0xD800);
-  const __m256i higher_bound = _mm256_set1_epi16(0xDFFF);
-
-  for (; ptr < end; ++ptr) {
-    __m256i vec = _mm256_loadu_si256(ptr);
-    __m256i mask1 = _mm256_cmpgt_epi16(vec, lower_bound);
-    __m256i mask2 = _mm256_cmpgt_epi16(higher_bound, vec);
-    __m256i result = _mm256_and_si256(mask1, mask2);
-    if (!_mm256_testz_si256(result, result))
-      return true;
-  }
-
-  return hasSurrogatePairFallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE,
-                                  length % VECTOR_SIZE);
+  // Direct fallback implementation - SIMD versions were consistently slower
+  // due to early-exit characteristics: surrogate pairs are rare and when
+  // present, often appear early in strings, making SIMD setup overhead
+  // outweigh any vectorization benefits.
+  return hasSurrogatePairFallback(data, length);
 }
 
 #elif defined(FORY_HAS_NEON)
@@ -182,18 +170,11 @@ inline bool isLatin1(const uint16_t *data, size_t length) 
{
 }
 
 inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
-  size_t i = 0;
-  uint16x8_t lower_bound = vdupq_n_u16(0xD800);
-  uint16x8_t higher_bound = vdupq_n_u16(0xDFFF);
-  for (; i + 7 < length; i += 8) {
-    uint16x8_t chunk = vld1q_u16(data + i);
-    uint16x8_t mask1 = vcgeq_u16(chunk, lower_bound);
-    uint16x8_t mask2 = vcleq_u16(chunk, higher_bound);
-    if (vmaxvq_u16(mask1 & mask2)) {
-      return true; // Detected a high surrogate
-    }
-  }
-  return hasSurrogatePairFallback(data + i, length - i);
+  // Direct fallback implementation - SIMD versions were consistently slower
+  // due to early-exit characteristics: surrogate pairs are rare and when
+  // present, often appear early in strings, making SIMD setup overhead
+  // outweigh any vectorization benefits.
+  return hasSurrogatePairFallback(data, length);
 }
 #elif defined(FORY_HAS_SSE2)
 inline bool isAscii(const char *data, size_t length) {
@@ -227,19 +208,11 @@ inline bool isLatin1(const uint16_t *data, size_t length) 
{
 }
 
 inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
-  size_t i = 0;
-  __m128i lower_bound = _mm_set1_epi16(0xd7ff);
-  __m128i higher_bound = _mm_set1_epi16(0xe000);
-  for (; i + 7 < length; i += 8) {
-    __m128i chunk =
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + i));
-    __m128i cmp1 = _mm_cmpgt_epi16(chunk, lower_bound);
-    __m128i cmp2 = _mm_cmpgt_epi16(higher_bound, chunk);
-    if (_mm_movemask_epi8(_mm_and_si128(cmp1, cmp2)) != 0) {
-      return true; // Detected a surrogate
-    }
-  }
-  return hasSurrogatePairFallback(data + i, length - i);
+  // Direct fallback implementation - SIMD versions were consistently slower
+  // due to early-exit characteristics: surrogate pairs are rare and when
+  // present, often appear early in strings, making SIMD setup overhead
+  // outweigh any vectorization benefits.
+  return hasSurrogatePairFallback(data, length);
 }
 #else
 inline bool isAscii(const char *data, size_t length) {
@@ -266,10 +239,14 @@ inline bool isLatin1(const std::u16string &str) {
 }
 
 inline bool utf16HasSurrogatePairs(const std::u16string &str) {
-  // Get the data pointer
-  const std::uint16_t *data =
-      reinterpret_cast<const std::uint16_t *>(str.data());
-  return utf16HasSurrogatePairs(data, str.size());
+  // Inline implementation for best performance
+  for (size_t i = 0; i < str.size(); ++i) {
+    auto c = str[i];
+    if (c >= 0xD800 && c <= 0xDFFF) {
+      return true;
+    }
+  }
+  return false;
 }
 
 } // namespace fory
diff --git a/integration_tests/cpython_benchmark/README.md 
b/integration_tests/cpython_benchmark/README.md
index 9a7b56647..1cc20d5ec 100644
--- a/integration_tests/cpython_benchmark/README.md
+++ b/integration_tests/cpython_benchmark/README.md
@@ -1,37 +1,244 @@
 # Apache Fory™ CPython Benchmark
 
-Microbenchmark for Apache Fory™ serialization in cpython
+Microbenchmark comparing Apache Fory™ and Pickle serialization performance in 
CPython.
 
-## Benchmark
+## Quick Start
 
-Step 1: Install Apache Fory™ into Python
+### Step 1: Install Apache Fory™ into Python
 
-Step 2: Install the dependencies required for the benchmark script
+Follow the installation instructions from the main documentation.
+
+### Step 2: Execute the benchmark script
 
 ```bash
-pip install -r requirements.txt
+python fory_benchmark.py
 ```
 
-Step 3: Execute the benchmark script
+This will run all benchmarks with both Fory and Pickle serializers using 
default settings.
+
+## Usage
+
+### Basic Usage
 
 ```bash
+# Run all benchmarks with both Fory and Pickle
 python fory_benchmark.py
+
+# Run all benchmarks without reference tracking
+python fory_benchmark.py --no-ref
+
+# Run specific benchmarks
+python fory_benchmark.py --benchmarks dict,large_dict,complex
+
+# Compare only Fory performance
+python fory_benchmark.py --serializers fory
+
+# Compare only Pickle performance
+python fory_benchmark.py --serializers pickle
+
+# Run with more iterations for better accuracy
+python fory_benchmark.py --iterations 50 --repeat 10
+
+# Debug with pure Python mode
+python fory_benchmark.py --disable-cython --benchmarks dict
+```
+
+## Command-Line Options
+
+### Benchmark Selection
+
+#### `--benchmarks BENCHMARK_LIST`
+
+Comma-separated list of benchmarks to run. Default: `all`
+
+Available benchmarks:
+
+- `dict` - Small dictionary serialization (28 fields with mixed types)
+- `large_dict` - Large dictionary (2^10 + 1 entries)
+- `dict_group` - Group of 3 dictionaries
+- `tuple` - Small tuple with nested list
+- `large_tuple` - Large tuple (2^20 + 1 integers)
+- `large_float_tuple` - Large tuple of floats (2^20 + 1 elements)
+- `large_boolean_tuple` - Large tuple of booleans (2^20 + 1 elements)
+- `list` - Nested lists (10x10x10 structure)
+- `large_list` - Large list (2^20 + 1 integers)
+- `complex` - Complex dataclass objects with nested structures
+
+Examples:
+
+```bash
+# Run only dictionary benchmarks
+python fory_benchmark.py --benchmarks dict,large_dict,dict_group
+
+# Run only large data benchmarks
+python fory_benchmark.py --benchmarks large_dict,large_tuple,large_list
+
+# Run only the complex object benchmark
+python fory_benchmark.py --benchmarks complex
 ```
 
-### fory options
+#### `--serializers SERIALIZER_LIST`
 
-`--xlang` specify using cross-language mode, otherwise choose python mode
+Comma-separated list of serializers to benchmark. Default: `all`
 
-`--no-ref` specify ref tracking is true
+Available serializers:
 
-`--disable-cython` disable cython serialization
+- `fory` - Apache Fory™ serialization
+- `pickle` - Python's built-in pickle serialization
 
-### pyperf options
+Examples:
 
-`--affinity CPU_LIST` specify CPU affinity for worker processes
+```bash
+# Compare both serializers (default)
+python fory_benchmark.py --serializers fory,pickle
+
+# Benchmark only Fory
+python fory_benchmark.py --serializers fory
+
+# Benchmark only Pickle
+python fory_benchmark.py --serializers pickle
+```
+
+### Fory Configuration
+
+#### `--no-ref`
+
+Disable reference tracking for Fory. By default, Fory tracks references to 
handle shared and circular references.
+
+```bash
+# Run without reference tracking
+python fory_benchmark.py --no-ref
+```
+
+#### `--disable-cython`
+
+Use pure Python mode instead of Cython serialization for Fory. Useful for 
debugging protocol issues.
+
+```bash
+# Use pure Python serialization
+python fory_benchmark.py --disable-cython
+```
+
+### Benchmark Parameters
+
+These options control the benchmark measurement process:
+
+#### `--warmup N`
+
+Number of warmup iterations before measurement starts. Default: `3`
+
+```bash
+python fory_benchmark.py --warmup 5
+```
 
-`-o FILENAME, --output FILENAME` write results encoded to JSON into FILENAME
+#### `--iterations N`
 
-`--profile PROFILE` collect profile data using cProfile and output to the 
given file
+Number of measurement iterations to collect. Default: `20`
 
-`--help` to get more `pyperf` options
+```bash
+python fory_benchmark.py --iterations 50
+```
+
+#### `--repeat N`
+
+Number of times to repeat each iteration. Default: `5`
+
+```bash
+python fory_benchmark.py --repeat 10
+```
+
+#### `--number N`
+
+Number of times to call the serialization function per measurement (inner 
loop). Default: `100`
+
+```bash
+python fory_benchmark.py --number 1000
+```
+
+#### `--help`
+
+Display help message and exit.
+
+```bash
+python fory_benchmark.py --help
+```
+
+## Examples
+
+### Running Specific Comparisons
+
+```bash
+# Compare Fory and Pickle on dictionary benchmarks
+python fory_benchmark.py --benchmarks dict,large_dict,dict_group
+
+# Compare performance without reference tracking
+python fory_benchmark.py --no-ref
+
+# Test only Fory with high precision
+python fory_benchmark.py --serializers fory --iterations 100 --repeat 10
+```
+
+### Performance Tuning
+
+```bash
+# Quick test with fewer iterations
+python fory_benchmark.py --warmup 1 --iterations 5 --repeat 3
+
+# High-precision benchmark
+python fory_benchmark.py --warmup 10 --iterations 100 --repeat 10
+
+# Benchmark large data structures with more inner loop iterations
+python fory_benchmark.py --benchmarks large_list,large_tuple --number 1000
+```
+
+### Debugging and Development
+
+```bash
+# Debug protocol issues with pure Python mode
+python fory_benchmark.py --disable-cython --benchmarks dict
+
+# Test complex objects only
+python fory_benchmark.py --benchmarks complex --iterations 10
+
+# Compare Fory with and without ref tracking
+python fory_benchmark.py --serializers fory --benchmarks dict
+python fory_benchmark.py --serializers fory --benchmarks dict --no-ref
+```
+
+## Output Format
+
+The benchmark script provides three sections of output:
+
+1. **Progress**: Real-time progress as each benchmark runs
+2. **Summary**: Table of all results showing mean time and standard deviation
+3. **Speedup**: Comparison table showing Fory speedup vs Pickle (only when 
both serializers are tested)
+
+Example output:
+
+```
+Benchmarking 3 benchmark(s) with 2 serializer(s)
+Warmup: 3, Iterations: 20, Repeat: 5, Inner loop: 100
+Fory reference tracking: enabled
+================================================================================
+
+Running fory_dict... 12.34 us ± 0.56 us
+Running pickle_dict... 45.67 us ± 1.23 us
+...
+
+================================================================================
+SUMMARY
+================================================================================
+Serializer      Benchmark                 Mean                 Std Dev
+--------------------------------------------------------------------------------
+fory            dict                      12.34 us             0.56 us
+pickle          dict                      45.67 us             1.23 us
+...
+
+================================================================================
+SPEEDUP (Fory vs Pickle)
+================================================================================
+Benchmark                 Fory                 Pickle               Speedup
+--------------------------------------------------------------------------------
+dict                      12.34 us             45.67 us             3.70x
+...
+```
diff --git a/integration_tests/cpython_benchmark/fory_benchmark.py 
b/integration_tests/cpython_benchmark/fory_benchmark.py
index 5b413a561..48859b514 100644
--- a/integration_tests/cpython_benchmark/fory_benchmark.py
+++ b/integration_tests/cpython_benchmark/fory_benchmark.py
@@ -15,15 +15,75 @@
 # specific language governing permissions and limitations
 # under the License.
 
+"""Apache Fory™ vs Pickle CPython Benchmark Suite
+
+Microbenchmark comparing Apache Fory™ and Pickle serialization performance in 
CPython.
+
+Usage:
+    python fory_benchmark.py [OPTIONS]
+
+Benchmark Options:
+    --benchmarks BENCHMARK_LIST
+        Comma-separated list of benchmarks to run. Default: all
+        Available: dict, large_dict, dict_group, tuple, large_tuple,
+                   large_float_tuple, large_boolean_tuple, list, large_list, 
complex
+
+    --serializers SERIALIZER_LIST
+        Comma-separated list of serializers to benchmark. Default: all
+        Available: fory, pickle
+        Example: --serializers fory,pickle
+
+    --no-ref
+        Disable reference tracking for Fory (enabled by default)
+
+    --warmup N
+        Number of warmup iterations (default: 3)
+
+    --iterations N
+        Number of benchmark iterations (default: 20)
+
+    --repeat N
+        Number of times to repeat each iteration (default: 5)
+
+    --number N
+        Number of times to call function per measurement (inner loop, default: 
100)
+
+    --help
+        Show help message and exit
+
+Examples:
+    # Run all benchmarks with both Fory and Pickle
+    python fory_benchmark.py
+
+    # Run specific benchmarks with both serializers
+    python fory_benchmark.py --benchmarks dict,large_dict,complex
+
+    # Compare only Fory performance
+    python fory_benchmark.py --serializers fory
+
+    # Compare only Pickle performance
+    python fory_benchmark.py --serializers pickle
+
+    # Run without reference tracking for Fory
+    python fory_benchmark.py --no-ref
+
+    # Run with more iterations for better accuracy
+    python fory_benchmark.py --iterations 50 --repeat 10
+
+    # Debug with pure Python mode
+    python fory_benchmark.py --disable-cython --benchmarks dict
+"""
+
 import argparse
 from dataclasses import dataclass
 import datetime
-import os
+import pickle
 import random
+import statistics
 import sys
+import timeit
 from typing import Any, Dict, List
 import pyfory
-import pyperf
 
 
 # The benchmark case is rewritten from pyperformance bm_pickle
@@ -117,21 +177,21 @@ class ComplexObject1:
     f1: Any = None
     f2: str = None
     f3: List[str] = None
-    f4: Dict[pyfory.Int8Type, pyfory.Int32Type] = None
-    f5: pyfory.Int8Type = None
-    f6: pyfory.Int16Type = None
-    f7: pyfory.Int32Type = None
-    f8: pyfory.Int64Type = None
-    f9: pyfory.Float32Type = None
-    f10: pyfory.Float64Type = None
-    f11: pyfory.Int16ArrayType = None
-    f12: List[pyfory.Int16Type] = None
+    f4: Dict[pyfory.int8, pyfory.int32] = None
+    f5: pyfory.int8 = None
+    f6: pyfory.int16 = None
+    f7: pyfory.int32 = None
+    f8: pyfory.int64 = None
+    f9: pyfory.float32 = None
+    f10: pyfory.float64 = None
+    f11: pyfory.int16_array = None
+    f12: List[pyfory.int16] = None
 
 
 @dataclass
 class ComplexObject2:
     f1: Any
-    f2: Dict[pyfory.Int8Type, pyfory.Int32Type]
+    f2: Dict[pyfory.int8, pyfory.int32]
 
 
 COMPLEX_OBJECT = ComplexObject1(
@@ -148,88 +208,260 @@ COMPLEX_OBJECT = ComplexObject1(
     f11=[-1, 4],
 )
 
+# Global fory instances
+fory_with_ref = pyfory.Fory(ref=True)
+fory_without_ref = pyfory.Fory(ref=False)
+
+# Register all custom types on both instances
+for fory_instance in (fory_with_ref, fory_without_ref):
+    fory_instance.register_type(ComplexObject1)
+    fory_instance.register_type(ComplexObject2)
 
-def fory_object(xlang, ref, obj):
-    fory = pyfory.Fory(xlang=xlang, ref=ref)
+
+def fory_object(ref, obj):
+    fory = fory_with_ref if ref else fory_without_ref
     binary = fory.serialize(obj)
     fory.deserialize(binary)
 
 
-def fory_data_class(xlang, ref, obj, register_callable):
-    fory = pyfory.Fory(xlang=xlang, ref=ref)
-    register_callable(fory)
+def fory_data_class(ref, obj):
+    fory = fory_with_ref if ref else fory_without_ref
     binary = fory.serialize(obj)
     fory.deserialize(binary)
 
 
-def benchmark_args():
-    parser = argparse.ArgumentParser(description="Fory Benchmark")
-    parser.add_argument("--xlang", action="store_true", default=False)
-    parser.add_argument("--no-ref", action="store_true", default=False)
-    parser.add_argument("--disable-cython", action="store_true", default=False)
+def pickle_object(obj):
+    binary = pickle.dumps(obj)
+    pickle.loads(binary)
 
-    if "--help" in sys.argv:
-        parser.print_help()
-        return None
-    args, unknown_args = parser.parse_known_args()
-    sys.argv = sys.argv[:1] + unknown_args
-    return args
 
+def pickle_data_class(obj):
+    binary = pickle.dumps(obj)
+    pickle.loads(binary)
 
-def micro_benchmark():
-    args = benchmark_args()
-    runner = pyperf.Runner()
-    if args and args.disable_cython:
-        os.environ["ENABLE_FORY_CYTHON_SERIALIZATION"] = "0"
-        sys.argv += ["--inherit-environ", "ENABLE_FORY_CYTHON_SERIALIZATION"]
-    runner.parse_args()
-    xlang = args.xlang
-    runner.bench_func("fory_dict", fory_object, xlang, not args.no_ref, DICT)
-    runner.bench_func(
-        "fory_large_dict", fory_object, xlang, not args.no_ref, LARGE_DICT
+
+def benchmark_args():
+    """Parse command line arguments"""
+    parser = argparse.ArgumentParser(description="Fory vs Pickle Benchmark")
+    parser.add_argument(
+        "--no-ref",
+        action="store_true",
+        default=False,
+        help="Disable reference tracking for Fory",
     )
-    runner.bench_func(
-        "fory_dict_group", fory_object, xlang, not args.no_ref, DICT_GROUP
+    parser.add_argument(
+        "--disable-cython",
+        action="store_true",
+        default=False,
+        help="Use pure Python mode for Fory",
     )
-    runner.bench_func("fory_tuple", fory_object, xlang, not args.no_ref, TUPLE)
-    runner.bench_func(
-        "fory_large_tuple", fory_object, xlang, not args.no_ref, LARGE_TUPLE
+    parser.add_argument(
+        "--benchmarks",
+        type=str,
+        default="all",
+        help="Comma-separated list of benchmarks to run. Available: dict, 
large_dict, "
+        "dict_group, tuple, large_tuple, large_float_tuple, 
large_boolean_tuple, "
+        "list, large_list, complex. Default: all",
     )
-    runner.bench_func(
-        "fory_large_float_tuple",
-        fory_object,
-        xlang,
-        not args.no_ref,
-        LARGE_FLOAT_TUPLE,
+    parser.add_argument(
+        "--serializers",
+        type=str,
+        default="all",
+        help="Comma-separated list of serializers to benchmark. Available: 
fory, pickle. Default: all",
     )
-    runner.bench_func(
-        "fory_large_boolean_tuple",
-        fory_object,
-        xlang,
-        not args.no_ref,
-        LARGE_BOOLEAN_TUPLE,
+    parser.add_argument(
+        "--warmup",
+        type=int,
+        default=3,
+        help="Number of warmup iterations (default: 3)",
     )
-    runner.bench_func("fory_list", fory_object, xlang, not args.no_ref, LIST)
-    runner.bench_func(
-        "fory_large_list", fory_object, xlang, not args.no_ref, LARGE_LIST
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=20,
+        help="Number of benchmark iterations (default: 20)",
     )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=5,
+        help="Number of times to repeat each iteration (default: 5)",
+    )
+    parser.add_argument(
+        "--number",
+        type=int,
+        default=100,
+        help="Number of times to call function per measurement (inner loop, 
default: 10000)",
+    )
+    return parser.parse_args()
+
+
+def run_benchmark(func, *args, warmup=3, iterations=20, repeat=5, 
number=10000):
+    """Run a benchmark and return timing statistics
+
+    Args:
+        func: Function to benchmark
+        *args: Arguments to pass to func
+        warmup: Number of warmup iterations
+        iterations: Number of measurement iterations
+        repeat: Number of times to repeat each measurement
+        number: Number of times to call func per measurement (inner loop)
+
+    Returns:
+        (mean_time_per_call, stdev_time_per_call)
+    """
+    # Warmup
+    for _ in range(warmup):
+        for _ in range(number):
+            func(*args)
+
+    # Benchmark - run func 'number' times per measurement
+    times = []
+    for _ in range(iterations):
+        timer = timeit.Timer(lambda: func(*args))
+        iteration_times = timer.repeat(repeat=repeat, number=number)
+        # Convert total time to time per call
+        times.extend([t / number for t in iteration_times])
 
-    def register_complex(fory):
-        if args.xlang:
-            fory.register_type(ComplexObject1, 
typename="example.ComplexObject1")
-            fory.register_type(ComplexObject2, 
typename="example.ComplexObject2")
-        else:
-            fory.register_type(ComplexObject1)
-            fory.register_type(ComplexObject2)
-
-    runner.bench_func(
-        "fory_complex",
-        fory_data_class,
-        xlang,
-        not args.no_ref,
-        COMPLEX_OBJECT,
-        register_complex,
+    mean = statistics.mean(times)
+    stdev = statistics.stdev(times) if len(times) > 1 else 0
+    return mean, stdev
+
+
+def format_time(seconds):
+    """Format time in human-readable units"""
+    if seconds < 1e-6:
+        return f"{seconds * 1e9:.2f} ns"
+    elif seconds < 1e-3:
+        return f"{seconds * 1e6:.2f} us"
+    elif seconds < 1:
+        return f"{seconds * 1e3:.2f} ms"
+    else:
+        return f"{seconds:.2f} s"
+
+
+def micro_benchmark():
+    args = benchmark_args()
+    ref = not args.no_ref
+
+    # Define benchmark data and functions
+    benchmark_data = {
+        "dict": (DICT, fory_object, pickle_object),
+        "large_dict": (LARGE_DICT, fory_object, pickle_object),
+        "dict_group": (DICT_GROUP, fory_object, pickle_object),
+        "tuple": (TUPLE, fory_object, pickle_object),
+        "large_tuple": (LARGE_TUPLE, fory_object, pickle_object),
+        "large_float_tuple": (LARGE_FLOAT_TUPLE, fory_object, pickle_object),
+        "large_boolean_tuple": (LARGE_BOOLEAN_TUPLE, fory_object, 
pickle_object),
+        "list": (LIST, fory_object, pickle_object),
+        "large_list": (LARGE_LIST, fory_object, pickle_object),
+        "complex": (COMPLEX_OBJECT, fory_data_class, pickle_data_class),
+    }
+
+    # Determine which benchmarks to run
+    if args.benchmarks == "all":
+        selected_benchmarks = list(benchmark_data.keys())
+    else:
+        selected_benchmarks = [b.strip() for b in args.benchmarks.split(",")]
+        # Validate benchmark names
+        invalid = [b for b in selected_benchmarks if b not in benchmark_data]
+        if invalid:
+            print(f"Error: Invalid benchmark names: {', '.join(invalid)}")
+            print(f"Available benchmarks: {', '.join(benchmark_data.keys())}")
+            sys.exit(1)
+
+    # Determine which serializers to run
+    available_serializers = {"fory", "pickle"}
+    if args.serializers == "all":
+        selected_serializers = ["fory", "pickle"]
+    else:
+        selected_serializers = [s.strip() for s in args.serializers.split(",")]
+        # Validate serializer names
+        invalid = [s for s in selected_serializers if s not in 
available_serializers]
+        if invalid:
+            print(f"Error: Invalid serializer names: {', '.join(invalid)}")
+            print(f"Available serializers: {', '.join(available_serializers)}")
+            sys.exit(1)
+
+    print(
+        f"\nBenchmarking {len(selected_benchmarks)} benchmark(s) with 
{len(selected_serializers)} serializer(s)"
+    )
+    print(
+        f"Warmup: {args.warmup}, Iterations: {args.iterations}, Repeat: 
{args.repeat}, Inner loop: {args.number}"
     )
+    print(f"Fory reference tracking: {'enabled' if ref else 'disabled'}")
+    print("=" * 80)
+
+    # Run selected benchmarks with selected serializers
+    results = []
+    for benchmark_name in selected_benchmarks:
+        data, fory_func, pickle_func = benchmark_data[benchmark_name]
+
+        if "fory" in selected_serializers:
+            print(f"\nRunning fory_{benchmark_name}...", end=" ", flush=True)
+            mean, stdev = run_benchmark(
+                fory_func,
+                ref,
+                data,
+                warmup=args.warmup,
+                iterations=args.iterations,
+                repeat=args.repeat,
+                number=args.number,
+            )
+            results.append(("fory", benchmark_name, mean, stdev))
+            print(f"{format_time(mean)} ± {format_time(stdev)}")
+
+        if "pickle" in selected_serializers:
+            print(f"Running pickle_{benchmark_name}...", end=" ", flush=True)
+            mean, stdev = run_benchmark(
+                pickle_func,
+                data,
+                warmup=args.warmup,
+                iterations=args.iterations,
+                repeat=args.repeat,
+                number=args.number,
+            )
+            results.append(("pickle", benchmark_name, mean, stdev))
+            print(f"{format_time(mean)} ± {format_time(stdev)}")
+
+    # Print summary
+    print("\n" + "=" * 80)
+    print("SUMMARY")
+    print("=" * 80)
+    print(f"{'Serializer':<15} {'Benchmark':<25} {'Mean':<20} {'Std Dev':<20}")
+    print("-" * 80)
+    for serializer, benchmark, mean, stdev in results:
+        print(
+            f"{serializer:<15} {benchmark:<25} {format_time(mean):<20} 
{format_time(stdev):<20}"
+        )
+
+    # Calculate speedup if both serializers were tested
+    if "fory" in selected_serializers and "pickle" in selected_serializers:
+        print("\n" + "=" * 80)
+        print("SPEEDUP (Fory vs Pickle)")
+        print("=" * 80)
+        print(f"{'Benchmark':<25} {'Fory':<20} {'Pickle':<20} {'Speedup':<20}")
+        print("-" * 80)
+
+        for benchmark_name in selected_benchmarks:
+            fory_result = next(
+                (r for r in results if r[0] == "fory" and r[1] == 
benchmark_name), None
+            )
+            pickle_result = next(
+                (r for r in results if r[0] == "pickle" and r[1] == 
benchmark_name),
+                None,
+            )
+
+            if fory_result and pickle_result:
+                fory_mean = fory_result[2]
+                pickle_mean = pickle_result[2]
+                speedup = pickle_mean / fory_mean
+                speedup_str = (
+                    f"{speedup:.2f}x" if speedup >= 1 else f"{1 / 
speedup:.2f}x slower"
+                )
+                print(
+                    f"{benchmark_name:<25} {format_time(fory_mean):<20} 
{format_time(pickle_mean):<20} {speedup_str:<20}"
+                )
 
 
 if __name__ == "__main__":
diff --git a/integration_tests/cpython_benchmark/requirements.txt 
b/integration_tests/cpython_benchmark/requirements.txt
deleted file mode 100644
index 82860069d..000000000
--- a/integration_tests/cpython_benchmark/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pyperf
\ No newline at end of file
diff --git a/python/pyfory/_util.pxd b/python/pyfory/_util.pxd
index 5108a6f65..6938e755c 100644
--- a/python/pyfory/_util.pxd
+++ b/python/pyfory/_util.pxd
@@ -33,6 +33,7 @@ cdef class Buffer:
     copy the data."""
     cdef:
         shared_ptr[CBuffer] c_buffer
+        CBuffer* c_buffer_ptr
         uint8_t* _c_address
         int32_t _c_size
         # hold python buffer reference count
diff --git a/python/pyfory/_util.pyx b/python/pyfory/_util.pyx
index fa4eef19b..32efbac65 100644
--- a/python/pyfory/_util.pyx
+++ b/python/pyfory/_util.pyx
@@ -55,6 +55,7 @@ cdef class Buffer:
             self._c_address = NULL
         self._c_size = length_
         self.c_buffer = make_shared[CBuffer](self._c_address, length_, False)
+        self.c_buffer_ptr = self.c_buffer.get()
         # hold c_address directly to avoid pointer indirect cost.
         self.reader_index = 0
         self.writer_index = 0
@@ -63,8 +64,9 @@ cdef class Buffer:
     cdef Buffer wrap(shared_ptr[CBuffer] c_buffer):
         cdef Buffer buffer = Buffer.__new__(Buffer)
         buffer.c_buffer = c_buffer
-        buffer._c_address = c_buffer.get().data()
-        buffer._c_size = c_buffer.get().size()
+        buffer.c_buffer_ptr = c_buffer.get()
+        buffer._c_address = buffer.c_buffer_ptr.data()
+        buffer._c_size = buffer.c_buffer_ptr.size()
         return buffer
 
     @classmethod
@@ -75,29 +77,29 @@ cdef class Buffer:
         return Buffer.wrap(buf)
 
     cpdef c_bool own_data(self):
-        return self.c_buffer.get().own_data()
+        return self.c_buffer_ptr.own_data()
 
     cpdef inline reserve(self, int32_t new_size):
         assert 0 < new_size < max_buffer_size
-        self.c_buffer.get().Reserve(new_size)
-        self._c_address = self.c_buffer.get().data()
-        self._c_size = self.c_buffer.get().size()
+        self.c_buffer_ptr.Reserve(new_size)
+        self._c_address = self.c_buffer_ptr.data()
+        self._c_size = self.c_buffer_ptr.size()
 
     cpdef inline put_bool(self, uint32_t offset, c_bool v):
         self.check_bound(offset, <int32_t>1)
-        self.c_buffer.get().UnsafePutByte(offset, v)
+        self.c_buffer_ptr.UnsafePutByte(offset, v)
 
     cpdef inline put_uint8(self, uint32_t offset, uint8_t v):
         self.check_bound(offset, <int32_t>1)
-        self.c_buffer.get().UnsafePutByte(offset, v)
+        self.c_buffer_ptr.UnsafePutByte(offset, v)
 
     cpdef inline put_int8(self, uint32_t offset, int8_t v):
         self.check_bound(offset, <int32_t>1)
-        self.c_buffer.get().UnsafePutByte(offset, v)
+        self.c_buffer_ptr.UnsafePutByte(offset, v)
 
     cpdef inline put_int16(self, uint32_t offset, int16_t v):
         self.check_bound(offset, <int32_t>2)
-        self.c_buffer.get().UnsafePut(offset, v)
+        self.c_buffer_ptr.UnsafePut(offset, v)
 
     cpdef inline put_int24(self, uint32_t offset, int32_t v):
         self.check_bound(offset, <int32_t>3)
@@ -108,31 +110,31 @@ cdef class Buffer:
 
     cpdef inline put_int32(self, uint32_t offset, int32_t v):
         self.check_bound(offset, <int32_t>4)
-        self.c_buffer.get().UnsafePut(offset, v)
+        self.c_buffer_ptr.UnsafePut(offset, v)
 
     cpdef inline put_int64(self, uint32_t offset, int64_t v):
         self.check_bound(offset, <int32_t>8)
-        self.c_buffer.get().UnsafePut(offset, v)
+        self.c_buffer_ptr.UnsafePut(offset, v)
 
     cpdef inline put_float(self, uint32_t offset, float v):
         self.check_bound(offset, <int32_t>4)
-        self.c_buffer.get().UnsafePut(offset, v)
+        self.c_buffer_ptr.UnsafePut(offset, v)
 
     cpdef inline put_double(self, uint32_t offset, double v):
         self.check_bound(offset, <int32_t>8)
-        self.c_buffer.get().UnsafePut(offset, v)
+        self.c_buffer_ptr.UnsafePut(offset, v)
 
     cpdef inline c_bool get_bool(self, uint32_t offset):
         self.check_bound(offset, <int32_t>1)
-        return self.c_buffer.get().GetBool(offset)
+        return self.c_buffer_ptr.GetBool(offset)
 
     cpdef inline int8_t get_int8(self, uint32_t offset):
         self.check_bound(offset, <int32_t>1)
-        return self.c_buffer.get().GetInt8(offset)
+        return self.c_buffer_ptr.GetInt8(offset)
 
     cpdef inline int16_t get_int16(self, uint32_t offset):
         self.check_bound(offset, <int32_t>2)
-        return self.c_buffer.get().GetInt16(offset)
+        return self.c_buffer_ptr.GetInt16(offset)
 
     cpdef inline int32_t get_int24(self, uint32_t offset):
         self.check_bound(offset, <int32_t>3)
@@ -143,22 +145,22 @@ cdef class Buffer:
 
     cpdef inline int32_t get_int32(self, uint32_t offset):
         self.check_bound(offset, <int32_t>4)
-        return self.c_buffer.get().GetInt32(offset)
+        return self.c_buffer_ptr.GetInt32(offset)
 
     cpdef inline int64_t get_int64(self, uint32_t offset):
         self.check_bound(offset, <int32_t>8)
-        return self.c_buffer.get().GetInt64(offset)
+        return self.c_buffer_ptr.GetInt64(offset)
 
     cpdef inline float get_float(self, uint32_t offset):
         self.check_bound(offset, <int32_t>4)
-        return self.c_buffer.get().GetFloat(offset)
+        return self.c_buffer_ptr.GetFloat(offset)
 
     cpdef inline double get_double(self, uint32_t offset):
         self.check_bound(offset, <int32_t>8)
-        return self.c_buffer.get().GetDouble(offset)
+        return self.c_buffer_ptr.GetDouble(offset)
 
     cpdef inline check_bound(self, int32_t offset, int32_t length):
-        cdef int32_t size_ = self.c_buffer.get().size()
+        cdef int32_t size_ = self._c_size
         if offset | length | (offset + length) | (size_- (offset + length)) < 
0:
             raise ValueError(f"Address range {offset, offset + length} "
                              f"out of bound {0, size_}")
@@ -180,7 +182,7 @@ cdef class Buffer:
 
     cpdef inline write_int16(self, int16_t value):
         self.grow(<int32_t>2)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>2
 
     cpdef inline write_int24(self, int32_t value):
@@ -193,32 +195,32 @@ cdef class Buffer:
 
     cpdef inline write_int32(self, int32_t value):
         self.grow(<int32_t>4)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>4
 
     cpdef inline write_int64(self, int64_t value):
         self.grow(<int32_t>8)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>8
 
     cpdef inline write_float(self, float value):
         self.grow(<int32_t>4)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>4
 
     cpdef inline write_float32(self, float value):
         self.grow(<int32_t>4)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>4
 
     cpdef inline write_double(self, double value):
         self.grow(<int32_t>8)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>8
 
     cpdef inline write_float64(self, double value):
         self.grow(<int32_t>8)
-        self.c_buffer.get().UnsafePut(self.writer_index, value)
+        self.c_buffer_ptr.UnsafePut(self.writer_index, value)
         self.writer_index += <int32_t>8
 
     cpdef put_buffer(self, uint32_t offset, v, int32_t src_index, int32_t 
length):
@@ -231,7 +233,7 @@ cdef class Buffer:
         self.check_bound(offset, size)
         src_offset = src_index * itemsize
         cdef uint8_t* ptr = get_address(v)
-        self.c_buffer.get().CopyFrom(offset, ptr, src_offset, size)
+        self.c_buffer_ptr.CopyFrom(offset, ptr, src_offset, size)
 
     cpdef inline write_bytes_and_size(self, bytes value):
         cdef const unsigned char[:] data = value
@@ -239,7 +241,7 @@ cdef class Buffer:
         self.write_varuint32(length)
         if length > 0:
             self.grow(length)
-            self.c_buffer.get().CopyFrom(self.writer_index, &data[0], 0, 
length)
+            self.c_buffer_ptr.CopyFrom(self.writer_index, &data[0], 0, length)
             self.writer_index += length
 
     cpdef inline bytes read_bytes_and_size(self):
@@ -253,7 +255,7 @@ cdef class Buffer:
         cdef int32_t length = data.nbytes
         if length > 0:
             self.grow(length)
-            self.c_buffer.get().CopyFrom(self.writer_index, &data[0], 0, 
length)
+            self.c_buffer_ptr.CopyFrom(self.writer_index, &data[0], 0, length)
             self.writer_index += length
 
     cpdef inline bytes read_bytes(self, int32_t length):
@@ -263,7 +265,7 @@ cdef class Buffer:
 
     cpdef inline int64_t read_bytes_as_int64(self, int32_t length):
         cdef int64_t result = 0
-        cdef CStatus status = 
self.c_buffer.get().GetBytesAsInt64(self.reader_index, length,  &result)
+        cdef CStatus status = 
self.c_buffer_ptr.GetBytesAsInt64(self.reader_index, length,  &result)
         if status.code() != StatusCode.OK:
             raise ValueError(status.message())
         self.reader_index += length
@@ -274,13 +276,13 @@ cdef class Buffer:
         cdef int32_t length = data.nbytes
         if length > 0:
             self.grow(length)
-            self.c_buffer.get().CopyFrom(offset, &data[0], 0, length)
+            self.c_buffer_ptr.CopyFrom(offset, &data[0], 0, length)
 
     cpdef inline bytes get_bytes(self, uint32_t offset, uint32_t nbytes):
         if nbytes == 0:
             return b""
         self.check_bound(offset, nbytes)
-        cdef unsigned char* binary_data = self.c_buffer.get().data() + offset
+        cdef unsigned char* binary_data = self._c_address + offset
         return binary_data[:nbytes]
 
     cpdef inline write_buffer(self, value, src_index=0, length_=None):
@@ -301,7 +303,7 @@ cdef class Buffer:
         cdef int32_t length = data.nbytes
         if length > 0:
             self.grow(length)
-            self.c_buffer.get().CopyFrom(self.writer_index, &data[0], 0, 
length)
+            self.c_buffer_ptr.CopyFrom(self.writer_index, &data[0], 0, length)
             self.writer_index += length
 
     cpdef inline grow(self, int32_t needed_size):
@@ -382,7 +384,7 @@ cdef class Buffer:
     cpdef inline bytes readline(self, int32_t size=-1):
         if size != <int32_t>-1:
             raise ValueError(f"Specify size {size} is unsupported")
-        cdef uint8_t* arr = self.c_buffer.get().data()
+        cdef uint8_t* arr = self._c_address
         cdef int32_t target_index = self.reader_index
         cdef uint8_t sep = 10  # '\n'
         cdef int32_t buffer_size = self._c_size
@@ -397,8 +399,7 @@ cdef class Buffer:
 
     cpdef inline write_varuint32(self, int32_t value):
         self.grow(<int8_t>5)
-        cdef int32_t actual_bytes_written = self.c_buffer.get()\
-            .PutVarUint32(self.writer_index, value)
+        cdef int32_t actual_bytes_written = 
self.c_buffer_ptr.PutVarUint32(self.writer_index, value)
         self.writer_index += actual_bytes_written
         return actual_bytes_written
 
@@ -412,8 +413,7 @@ cdef class Buffer:
             int8_t b
             int32_t result
         if self._c_size - self.reader_index > 5:
-            result = self.c_buffer.get().GetVarUint32(
-                self.reader_index, &read_length)
+            result = self.c_buffer_ptr.GetVarUint32(self.reader_index, 
&read_length)
             self.reader_index += read_length
             return result
         else:
@@ -441,7 +441,7 @@ cdef class Buffer:
             uint64_t value = v
             int64_t offset = self.writer_index
         self.grow(<int8_t>9)
-        cdef uint8_t* arr = self.c_buffer.get().data()
+        cdef uint8_t* arr = self._c_address
         if value >> 7 == 0:
             arr[offset] = <int8_t>value
             self.writer_index += <int32_t>1
@@ -496,7 +496,7 @@ cdef class Buffer:
             int64_t b
             int64_t result
             uint32_t position = self.reader_index
-            int8_t * arr = <int8_t *> (self.c_buffer.get().data() + position)
+            int8_t * arr = <int8_t *> (self._c_address + position)
         if self._c_size - self.reader_index > 9:
             b = arr[0]
             result = b & 0x7F
@@ -571,12 +571,12 @@ cdef class Buffer:
             return
         self.grow(length)
         self.check_bound(self.writer_index, length)
-        self.c_buffer.get().CopyFrom(self.writer_index, value, 0, length)
+        self.c_buffer_ptr.CopyFrom(self.writer_index, value, 0, length)
         self.writer_index += length
 
     cdef inline int32_t read_c_buffer(self, uint8_t** buf):
         cdef int32_t length = self.read_varuint32()
-        cdef uint8_t* binary_data = self.c_buffer.get().data()
+        cdef uint8_t* binary_data = self._c_address
         self.check_bound(self.reader_index, length)
         buf[0] = binary_data + self.reader_index
         self.reader_index += length
@@ -604,14 +604,14 @@ cdef class Buffer:
             return
         self.grow(buffer_size)
         self.check_bound(self.writer_index, buffer_size)
-        self.c_buffer.get().CopyFrom(self.writer_index, <const uint8_t 
*>buffer, 0, buffer_size)
+        self.c_buffer_ptr.CopyFrom(self.writer_index, <const uint8_t *>buffer, 
0, buffer_size)
         self.writer_index += buffer_size
 
     cpdef inline str read_string(self):
         cdef uint64_t header = self.read_varuint64()
         cdef uint32_t size = header >> 2
         self.check_bound(self.reader_index, size)
-        cdef const char * buf = <const char *>(self.c_buffer.get().data() + 
self.reader_index)
+        cdef const char * buf = <const char *>(self._c_address + 
self.reader_index)
         self.reader_index += size
         cdef uint32_t encoding = header & <uint32_t>0b11
         if encoding == 0:
@@ -643,7 +643,7 @@ cdef class Buffer:
         else:
             length = self._c_size
         cdef:
-            uint8_t* data = self.c_buffer.get().data() + offset
+            uint8_t* data = self._c_address + offset
         return data[:length]
 
     def to_pybytes(self) -> bytes:
@@ -660,7 +660,7 @@ cdef class Buffer:
         return self.getitem(_normalize_index(key, self._c_size))
 
     cdef getitem(self, int64_t i):
-        return self.c_buffer.get().data()[i]
+        return self._c_address[i]
 
     def hex(self):
         """
@@ -670,13 +670,13 @@ cdef class Buffer:
         -------
         : bytes
         """
-        return self.c_buffer.get().Hex().decode("UTF-8")
+        return self.c_buffer_ptr.Hex().decode("UTF-8")
 
     def __getbuffer__(self, Py_buffer *buffer, int flags):
         cdef Py_ssize_t itemsize = 1
         self.shape[0] = self._c_size
         self.stride[0] = itemsize
-        buffer.buf = <char *>(self.c_buffer.get().data())
+        buffer.buf = <char *>(self._c_address)
         buffer.format = 'B'
         buffer.internal = NULL                  # see References
         buffer.itemsize = itemsize
@@ -785,15 +785,15 @@ cdef Py_ssize_t _normalize_index(Py_ssize_t index,
 
 
 def get_bit(Buffer buffer, uint32_t base_offset, uint32_t index) -> bool:
-    return GetBit(buffer.c_buffer.get().data() + base_offset, index)
+    return GetBit(buffer._c_address + base_offset, index)
 
 
 def set_bit(Buffer buffer, uint32_t base_offset, uint32_t index):
-    return SetBit(buffer.c_buffer.get().data() + base_offset, index)
+    return SetBit(buffer._c_address + base_offset, index)
 
 
 def clear_bit(Buffer buffer, uint32_t base_offset, uint32_t index):
-    return ClearBit(buffer.c_buffer.get().data() + base_offset, index)
+    return ClearBit(buffer._c_address + base_offset, index)
 
 
 def set_bit_to(Buffer buffer,
@@ -801,4 +801,4 @@ def set_bit_to(Buffer buffer,
                uint32_t index,
                c_bool bit_is_set):
     return SetBitTo(
-        buffer.c_buffer.get().data() + base_offset, index, bit_is_set)
+        buffer._c_address + base_offset, index, bit_is_set)
diff --git a/python/pyfory/serialization.pyx b/python/pyfory/serialization.pyx
index 6026ef348..9b46e940a 100644
--- a/python/pyfory/serialization.pyx
+++ b/python/pyfory/serialization.pyx
@@ -378,7 +378,7 @@ cdef class MetaStringResolver:
         self._c_dynamic_id_to_enum_string_vec.push_back(enum_str_ptr)
         return <MetaStringBytes> enum_str_ptr
 
-    def get_metastr_bytes(self, metastr):
+    cpdef inline get_metastr_bytes(self, metastr):
         metastr_bytes = self._metastr_to_metastr_bytes.get(metastr)
         if metastr_bytes is not None:
             return metastr_bytes
@@ -551,6 +551,9 @@ cdef class TypeResolver:
         if type_id > 0 and (self.fory.language == Language.PYTHON or not 
IsNamespacedType(type_id)):
             self._c_registered_id_to_type_info[type_id] = <PyObject *> typeinfo
         self._c_types_info[<uintptr_t> <PyObject *> typeinfo.cls] = <PyObject 
*> typeinfo
+        # Resize if load factor >= 0.5 (using integer arithmetic: 
size/capacity >= 5/10)
+        if self._c_types_info.size() * 10 >= self._c_types_info.bucket_count() 
* 5:
+            self._c_types_info.rehash(self._c_types_info.size() * 2)
         if typeinfo.typename_bytes is not None:
             self._load_bytes_to_typeinfo(type_id, typeinfo.namespace_bytes, 
typeinfo.typename_bytes)
 
@@ -588,16 +591,16 @@ cdef class TypeResolver:
             self._populate_typeinfo(type_info)
             return type_info
 
-    def is_registered_by_name(self, cls):
+    cpdef inline is_registered_by_name(self, cls):
         return self._resolver.is_registered_by_name(cls)
 
-    def is_registered_by_id(self, cls):
+    cpdef inline is_registered_by_id(self, cls):
         return self._resolver.is_registered_by_id(cls)
 
-    def get_registered_name(self, cls):
+    cpdef inline get_registered_name(self, cls):
         return self._resolver.get_registered_name(cls)
 
-    def get_registered_id(self, cls):
+    cpdef inline get_registered_id(self, cls):
         return self._resolver.get_registered_id(cls)
 
     cdef inline TypeInfo _load_bytes_to_typeinfo(
@@ -612,7 +615,7 @@ cdef class TypeResolver:
             ns_metabytes.hashcode, type_metabytes.hashcode)] = typeinfo_ptr
         return typeinfo
 
-    cpdef write_typeinfo(self, Buffer buffer, TypeInfo typeinfo):
+    cpdef inline write_typeinfo(self, Buffer buffer, TypeInfo typeinfo):
         if typeinfo.dynamic_type:
             return
         cdef:
@@ -660,31 +663,31 @@ cdef class TypeResolver:
         typeinfo = <TypeInfo> typeinfo_ptr
         return typeinfo
 
-    def get_typeinfo_by_name(self, namespace, typename):
+    cpdef inline get_typeinfo_by_name(self, namespace, typename):
         return self._resolver.get_typeinfo_by_name(namespace=namespace, 
typename=typename)
 
-    cpdef _set_typeinfo(self, typeinfo):
+    cpdef inline _set_typeinfo(self, typeinfo):
         self._resolver._set_typeinfo(typeinfo)
 
-    def get_meta_compressor(self):
+    cpdef inline get_meta_compressor(self):
         return self._resolver.get_meta_compressor()
 
-    cpdef write_shared_type_meta(self, Buffer buffer, TypeInfo typeinfo):
+    cpdef inline write_shared_type_meta(self, Buffer buffer, TypeInfo 
typeinfo):
         """Write shared type meta information."""
         meta_context = self.serialization_context.meta_context
         meta_context.write_shared_typeinfo(buffer, typeinfo)
 
-    cpdef TypeInfo read_shared_type_meta(self, Buffer buffer):
+    cpdef inline TypeInfo read_shared_type_meta(self, Buffer buffer):
         """Read shared type meta information."""
         meta_context = self.serialization_context.meta_context
         typeinfo = meta_context.read_shared_typeinfo(buffer)
         return typeinfo
 
-    cpdef write_type_defs(self, Buffer buffer):
+    cpdef inline write_type_defs(self, Buffer buffer):
         """Write all type definitions that need to be sent."""
         self._resolver.write_type_defs(buffer)
 
-    cpdef read_type_defs(self, Buffer buffer):
+    cpdef inline read_type_defs(self, Buffer buffer):
         """Read all type definitions from the buffer."""
         self._resolver.read_type_defs(buffer)
 
@@ -824,7 +827,7 @@ cdef class SerializationContext:
             self.meta_context = None
         self.fory = fory
 
-    def add(self, key, obj):
+    cpdef inline add(self, key, obj):
         self.objects[id(key)] = obj
 
     def __contains__(self, key):
@@ -836,17 +839,17 @@ cdef class SerializationContext:
     def get(self, key):
         return self.objects.get(id(key))
 
-    cpdef reset(self):
+    cpdef inline reset(self):
         if len(self.objects) > 0:
             self.objects.clear()
 
-    cpdef reset_write(self):
+    cpdef inline reset_write(self):
         if len(self.objects) > 0:
             self.objects.clear()
         if self.scoped_meta_share_enabled and self.meta_context is not None:
             self.meta_context.reset_write()
 
-    cpdef reset_read(self):
+    cpdef inline reset_read(self):
         if len(self.objects) > 0:
             self.objects.clear()
         if self.scoped_meta_share_enabled and self.meta_context is not None:
@@ -1778,6 +1781,7 @@ cdef class StringSerializer(XlangCompatibleSerializer):
 
 
 cdef _base_date = datetime.date(1970, 1, 1)
+cdef int _base_date_ordinal = _base_date.toordinal()  # Precompute for faster 
date deserialization
 
 
 @cython.final
@@ -1794,7 +1798,7 @@ cdef class DateSerializer(XlangCompatibleSerializer):
 
     cpdef inline read(self, Buffer buffer):
         days = buffer.read_int32()
-        return _base_date + datetime.timedelta(days=days)
+        return datetime.date.fromordinal(_base_date_ordinal + days)
 
 
 @cython.final
@@ -1869,7 +1873,7 @@ cdef class CollectionSerializer(Serializer):
             self.elem_tracking_ref = <int8_t> 
(elem_serializer.need_to_write_ref)
         self.is_py = fory.is_py
 
-    cdef pair[int8_t, int64_t] write_header(self, Buffer buffer, value):
+    cdef inline pair[int8_t, int64_t] write_header(self, Buffer buffer, value):
         cdef int8_t collect_flag = COLL_DEFAULT_FLAG
         elem_type = self.elem_type
         cdef TypeInfo elem_typeinfo = self.elem_typeinfo
diff --git a/python/pyfory/serializer.py b/python/pyfory/serializer.py
index 5eee1ea41..62235b4c0 100644
--- a/python/pyfory/serializer.py
+++ b/python/pyfory/serializer.py
@@ -1247,25 +1247,25 @@ class ReduceSerializer(XlangCompatibleSerializer):
         # Handle different __reduce__ return formats
         if isinstance(reduce_result, str):
             # Case 1: Just a global name (simple case)
-            reduce_data = ("global", reduce_result)
+            reduce_data = (0, reduce_result)
         elif isinstance(reduce_result, tuple):
             if len(reduce_result) == 2:
                 # Case 2: (callable, args)
                 callable_obj, args = reduce_result
-                reduce_data = ("callable", callable_obj, args)
+                reduce_data = (1, callable_obj, args)
             elif len(reduce_result) == 3:
                 # Case 3: (callable, args, state)
                 callable_obj, args, state = reduce_result
-                reduce_data = ("callable", callable_obj, args, state)
+                reduce_data = (1, callable_obj, args, state)
             elif len(reduce_result) == 4:
                 # Case 4: (callable, args, state, listitems)
                 callable_obj, args, state, listitems = reduce_result
-                reduce_data = ("callable", callable_obj, args, state, 
listitems)
+                reduce_data = (1, callable_obj, args, state, listitems)
             elif len(reduce_result) == 5:
                 # Case 5: (callable, args, state, listitems, dictitems)
                 callable_obj, args, state, listitems, dictitems = reduce_result
                 reduce_data = (
-                    "callable",
+                    1,
                     callable_obj,
                     args,
                     state,
@@ -1289,7 +1289,7 @@ class ReduceSerializer(XlangCompatibleSerializer):
         for i in range(reduce_data_num_items):
             reduce_data[i] = fory.read_ref(buffer)
 
-        if reduce_data[0] == "global":
+        if reduce_data[0] == 0:
             # Case 1: Global name
             global_name = reduce_data[1]
             # Import and return the global object
@@ -1306,7 +1306,7 @@ class ReduceSerializer(XlangCompatibleSerializer):
                     return getattr(builtins, global_name)
                 except AttributeError:
                     raise ValueError(f"Cannot resolve global name: 
{global_name}")
-        elif reduce_data[0] == "callable":
+        elif reduce_data[0] == 1:
             # Case 2-5: Callable with args and optional state/items
             callable_obj = reduce_data[1]
             args = reduce_data[2] or ()
@@ -1342,7 +1342,7 @@ class ReduceSerializer(XlangCompatibleSerializer):
                 obj = result
             return obj
         else:
-            raise ValueError(f"Invalid reduce data format: {reduce_data[0]}")
+            raise ValueError(f"Invalid reduce data format flag: 
{reduce_data[0]}")
 
 
 __skip_class_attr_names__ = ("__module__", "__qualname__", "__dict__", 
"__weakref__")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to