This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git


The following commit(s) were added to refs/heads/main by this push:
     new f384e4f32 perf(c++): optimize primitive struct fields read performance 
(#2960)
f384e4f32 is described below

commit f384e4f320d99d14eec21ebd8be515da149149d4
Author: Shawn Yang <[email protected]>
AuthorDate: Tue Dec 2 17:03:29 2025 +0800

    perf(c++): optimize primitive struct fields read performance (#2960)
    
    ## Why?
    
    Improve the read (deserialization) performance of primitive struct
    fields in the C++ implementation by batching fixed-size and varint
    field reads instead of updating `reader_index` per field
    (see #2958 and #2906).
    
    ## What does this PR do?
    
    1. Type-based encoding detection: Added compile-time helpers to
       correctly distinguish signed (varint) vs unsigned (fixed) integers:
       - `field_is_fixed_primitive<Index>()` - bool, int8, uint8, int16,
         uint16, uint32, uint64, float, double
       - `field_is_varint_primitive<Index>()` - int32_t, int, int64_t,
         long long (zigzag varint)
    2. Optimized fixed field reading:
       - Compute field offsets at compile time with
         `compute_fixed_field_offset<T, I>()`
       - Read all fixed fields at absolute offsets without per-field
         reader_index updates
       - Single reader_index update after all fixed fields
    3. Optimized varint field reading:
       - Track offset locally during batch reading
       - Removed overly conservative max-varint-bytes pre-check (varints
         are variable-length)
       - Single reader_index update after all varints
    4. Three-phase deserialization:
       - Phase 1: Batch read leading fixed-size primitives
       - Phase 2: Batch read consecutive varint primitives
       - Phase 3: Read remaining fields normally
    
    
    ## Related issues
    
    #2958
    #2906
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fory/issues/new/choose) describing the
    need to do so and update the document if necessary.
    
    Delete section if not applicable.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    | Datatype | Operation | Fory (ns) | Protobuf (ns) | Faster |
    |----------|-----------|-----------|---------------|--------|
    | Sample | Serialize | 103.9 | 59.2 | Protobuf (1.8x) |
    | Sample | Deserialize | 329.3 | 478.1 | Fory (1.5x) |
    | Struct | Serialize | 10.3 | 20.1 | Fory (1.9x) |
    | Struct | Deserialize | 19.1 | 16.0 | Protobuf (1.2x) |
---
 cpp/fory/serialization/struct_serializer.h | 721 +++++++++++++++++++++++++----
 1 file changed, 619 insertions(+), 102 deletions(-)

diff --git a/cpp/fory/serialization/struct_serializer.h 
b/cpp/fory/serialization/struct_serializer.h
index e62b3ba1a..e4ac67e8b 100644
--- a/cpp/fory/serialization/struct_serializer.h
+++ b/cpp/fory/serialization/struct_serializer.h
@@ -167,21 +167,26 @@ inline constexpr bool is_primitive_type_id(TypeId 
type_id) {
 template <typename T>
 FORY_ALWAYS_INLINE uint32_t put_primitive_at(T value, Buffer &buffer,
                                              uint32_t offset) {
-  if constexpr (std::is_same_v<T, int32_t>) {
+  if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, int>) {
     // varint32 with zigzag encoding
-    uint32_t zigzag = (static_cast<uint32_t>(value) << 1) ^
-                      static_cast<uint32_t>(value >> 31);
+    int32_t val = static_cast<int32_t>(value);
+    uint32_t zigzag =
+        (static_cast<uint32_t>(val) << 1) ^ static_cast<uint32_t>(val >> 31);
     return buffer.PutVarUint32(offset, zigzag);
-  } else if constexpr (std::is_same_v<T, uint32_t>) {
-    buffer.UnsafePut<uint32_t>(offset, value);
+  } else if constexpr (std::is_same_v<T, uint32_t> ||
+                       std::is_same_v<T, unsigned int>) {
+    buffer.UnsafePut<uint32_t>(offset, static_cast<uint32_t>(value));
     return 4;
-  } else if constexpr (std::is_same_v<T, int64_t>) {
+  } else if constexpr (std::is_same_v<T, int64_t> ||
+                       std::is_same_v<T, long long>) {
     // varint64 with zigzag encoding
-    uint64_t zigzag = (static_cast<uint64_t>(value) << 1) ^
-                      static_cast<uint64_t>(value >> 63);
+    int64_t val = static_cast<int64_t>(value);
+    uint64_t zigzag =
+        (static_cast<uint64_t>(val) << 1) ^ static_cast<uint64_t>(val >> 63);
     return buffer.PutVarUint64(offset, zigzag);
-  } else if constexpr (std::is_same_v<T, uint64_t>) {
-    buffer.UnsafePut<uint64_t>(offset, value);
+  } else if constexpr (std::is_same_v<T, uint64_t> ||
+                       std::is_same_v<T, unsigned long long>) {
+    buffer.UnsafePut<uint64_t>(offset, static_cast<uint64_t>(value));
     return 8;
   } else if constexpr (std::is_same_v<T, bool>) {
     buffer.UnsafePutByte(offset, static_cast<uint8_t>(value ? 1 : 0));
@@ -206,6 +211,58 @@ FORY_ALWAYS_INLINE uint32_t put_primitive_at(T value, 
Buffer &buffer,
   }
 }
 
+/// Write a fixed-size primitive at absolute offset. Does NOT return bytes
+/// written (caller uses compile-time size). Caller ensures buffer capacity.
+template <typename T>
+FORY_ALWAYS_INLINE void put_fixed_primitive_at(T value, Buffer &buffer,
+                                               uint32_t offset) {
+  if constexpr (std::is_same_v<T, bool>) {
+    buffer.UnsafePutByte(offset, static_cast<uint8_t>(value ? 1 : 0));
+  } else if constexpr (std::is_same_v<T, int8_t> ||
+                       std::is_same_v<T, uint8_t>) {
+    buffer.UnsafePutByte(offset, static_cast<uint8_t>(value));
+  } else if constexpr (std::is_same_v<T, int16_t> ||
+                       std::is_same_v<T, uint16_t>) {
+    buffer.UnsafePut<T>(offset, value);
+  } else if constexpr (std::is_same_v<T, uint32_t> ||
+                       std::is_same_v<T, unsigned int>) {
+    buffer.UnsafePut<uint32_t>(offset, static_cast<uint32_t>(value));
+  } else if constexpr (std::is_same_v<T, uint64_t> ||
+                       std::is_same_v<T, unsigned long long>) {
+    buffer.UnsafePut<uint64_t>(offset, static_cast<uint64_t>(value));
+  } else if constexpr (std::is_same_v<T, float>) {
+    buffer.UnsafePut<float>(offset, value);
+  } else if constexpr (std::is_same_v<T, double>) {
+    buffer.UnsafePut<double>(offset, value);
+  } else {
+    static_assert(sizeof(T) == 0, "Unsupported fixed-size primitive type");
+  }
+}
+
+/// Write a varint primitive at offset. Returns bytes written.
+/// Caller ensures buffer capacity.
+template <typename T>
+FORY_ALWAYS_INLINE uint32_t put_varint_at(T value, Buffer &buffer,
+                                          uint32_t offset) {
+  if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, int>) {
+    // varint32 with zigzag encoding
+    int32_t val = static_cast<int32_t>(value);
+    uint32_t zigzag =
+        (static_cast<uint32_t>(val) << 1) ^ static_cast<uint32_t>(val >> 31);
+    return buffer.PutVarUint32(offset, zigzag);
+  } else if constexpr (std::is_same_v<T, int64_t> ||
+                       std::is_same_v<T, long long>) {
+    // varint64 with zigzag encoding
+    int64_t val = static_cast<int64_t>(value);
+    uint64_t zigzag =
+        (static_cast<uint64_t>(val) << 1) ^ static_cast<uint64_t>(val >> 63);
+    return buffer.PutVarUint64(offset, zigzag);
+  } else {
+    static_assert(sizeof(T) == 0, "Unsupported varint type");
+    return 0;
+  }
+}
+
 template <size_t... Indices, typename Func>
 void for_each_index(std::index_sequence<Indices...>, Func &&func) {
   (func(std::integral_constant<size_t, Indices>{}), ...);
@@ -270,6 +327,144 @@ template <typename T> struct CompileTimeFieldHelpers {
     }
   }
 
+  /// Check if field at Index uses fixed-size encoding based on C++ type
+  /// Fixed types: bool, int8, uint8, int16, uint16, uint32, uint64, float,
+  /// double Note: TypeId::INT32/INT64 can be either signed (varint) or 
unsigned
+  /// (fixed)
+  template <size_t Index> static constexpr bool field_is_fixed_primitive() {
+    if constexpr (FieldCount == 0) {
+      return false;
+    } else {
+      using PtrT = std::tuple_element_t<Index, FieldPtrs>;
+      using FieldType = meta::RemoveMemberPointerCVRefT<PtrT>;
+      return std::is_same_v<FieldType, bool> ||
+             std::is_same_v<FieldType, int8_t> ||
+             std::is_same_v<FieldType, uint8_t> ||
+             std::is_same_v<FieldType, int16_t> ||
+             std::is_same_v<FieldType, uint16_t> ||
+             std::is_same_v<FieldType, uint32_t> ||
+             std::is_same_v<FieldType, unsigned int> ||
+             std::is_same_v<FieldType, uint64_t> ||
+             std::is_same_v<FieldType, unsigned long long> ||
+             std::is_same_v<FieldType, float> ||
+             std::is_same_v<FieldType, double>;
+    }
+  }
+
+  /// Check if field at Index uses varint encoding based on C++ type
+  /// Varint types: int32, int, int64, long long (signed integers use zigzag)
+  template <size_t Index> static constexpr bool field_is_varint_primitive() {
+    if constexpr (FieldCount == 0) {
+      return false;
+    } else {
+      using PtrT = std::tuple_element_t<Index, FieldPtrs>;
+      using FieldType = meta::RemoveMemberPointerCVRefT<PtrT>;
+      return std::is_same_v<FieldType, int32_t> ||
+             std::is_same_v<FieldType, int> ||
+             std::is_same_v<FieldType, int64_t> ||
+             std::is_same_v<FieldType, long long>;
+    }
+  }
+
+  /// Get fixed size in bytes for a field based on its C++ type
+  template <size_t Index> static constexpr size_t field_fixed_size_bytes() {
+    if constexpr (FieldCount == 0) {
+      return 0;
+    } else {
+      using PtrT = std::tuple_element_t<Index, FieldPtrs>;
+      using FieldType = meta::RemoveMemberPointerCVRefT<PtrT>;
+      if constexpr (std::is_same_v<FieldType, bool> ||
+                    std::is_same_v<FieldType, int8_t> ||
+                    std::is_same_v<FieldType, uint8_t>) {
+        return 1;
+      } else if constexpr (std::is_same_v<FieldType, int16_t> ||
+                           std::is_same_v<FieldType, uint16_t>) {
+        return 2;
+      } else if constexpr (std::is_same_v<FieldType, uint32_t> ||
+                           std::is_same_v<FieldType, unsigned int> ||
+                           std::is_same_v<FieldType, float>) {
+        return 4;
+      } else if constexpr (std::is_same_v<FieldType, uint64_t> ||
+                           std::is_same_v<FieldType, unsigned long long> ||
+                           std::is_same_v<FieldType, double>) {
+        return 8;
+      } else {
+        return 0; // Not a fixed-size primitive
+      }
+    }
+  }
+
+  /// Get max varint size in bytes for a field based on its C++ type
+  template <size_t Index> static constexpr size_t field_max_varint_bytes() {
+    if constexpr (FieldCount == 0) {
+      return 0;
+    } else {
+      using PtrT = std::tuple_element_t<Index, FieldPtrs>;
+      using FieldType = meta::RemoveMemberPointerCVRefT<PtrT>;
+      if constexpr (std::is_same_v<FieldType, int32_t> ||
+                    std::is_same_v<FieldType, int>) {
+        return 5; // int32 varint max
+      } else if constexpr (std::is_same_v<FieldType, int64_t> ||
+                           std::is_same_v<FieldType, long long>) {
+        return 10; // int64 varint max
+      } else {
+        return 0; // Not a varint primitive
+      }
+    }
+  }
+
+  /// Create arrays of field encoding info at compile time
+  template <size_t... Indices>
+  static constexpr std::array<bool, FieldCount>
+  make_field_is_fixed_array(std::index_sequence<Indices...>) {
+    if constexpr (FieldCount == 0) {
+      return {};
+    } else {
+      return {field_is_fixed_primitive<Indices>()...};
+    }
+  }
+
+  template <size_t... Indices>
+  static constexpr std::array<bool, FieldCount>
+  make_field_is_varint_array(std::index_sequence<Indices...>) {
+    if constexpr (FieldCount == 0) {
+      return {};
+    } else {
+      return {field_is_varint_primitive<Indices>()...};
+    }
+  }
+
+  template <size_t... Indices>
+  static constexpr std::array<size_t, FieldCount>
+  make_field_fixed_size_array(std::index_sequence<Indices...>) {
+    if constexpr (FieldCount == 0) {
+      return {};
+    } else {
+      return {field_fixed_size_bytes<Indices>()...};
+    }
+  }
+
+  template <size_t... Indices>
+  static constexpr std::array<size_t, FieldCount>
+  make_field_max_varint_array(std::index_sequence<Indices...>) {
+    if constexpr (FieldCount == 0) {
+      return {};
+    } else {
+      return {field_max_varint_bytes<Indices>()...};
+    }
+  }
+
+  /// Arrays storing encoding info for each field (indexed by original field
+  /// index)
+  static inline constexpr std::array<bool, FieldCount> field_is_fixed =
+      make_field_is_fixed_array(std::make_index_sequence<FieldCount>{});
+  static inline constexpr std::array<bool, FieldCount> field_is_varint =
+      make_field_is_varint_array(std::make_index_sequence<FieldCount>{});
+  static inline constexpr std::array<size_t, FieldCount> field_fixed_sizes =
+      make_field_fixed_size_array(std::make_index_sequence<FieldCount>{});
+  static inline constexpr std::array<size_t, FieldCount> field_max_varints =
+      make_field_max_varint_array(std::make_index_sequence<FieldCount>{});
+
   template <size_t... Indices>
   static constexpr std::array<uint32_t, FieldCount>
   make_type_ids(std::index_sequence<Indices...>) {
@@ -580,6 +775,8 @@ template <typename T> struct CompileTimeFieldHelpers {
       compute_primitive_field_count();
 
   /// Check if a type_id represents a fixed-size primitive (not varint)
+  /// Includes bool, int8, int16, float16, float32, float64
+  /// Note: INT32/INT64 use varint encoding per basic_serializer.h write/read
   static constexpr bool is_fixed_size_primitive(uint32_t tid) {
     switch (static_cast<TypeId>(tid)) {
     case TypeId::BOOL:
@@ -594,7 +791,39 @@ template <typename T> struct CompileTimeFieldHelpers {
     }
   }
 
+  /// Check if a type_id represents a varint primitive (int32/int64 types)
+  /// Per basic_serializer.h, INT32/INT64 use zigzag varint encoding
+  /// VAR_INT32/VAR_INT64/SLI_INT64 also use varint encoding
+  static constexpr bool is_varint_primitive(uint32_t tid) {
+    switch (static_cast<TypeId>(tid)) {
+    case TypeId::INT32:     // int32_t uses zigzag varint per 
basic_serializer.h
+    case TypeId::INT64:     // int64_t uses zigzag varint per 
basic_serializer.h
+    case TypeId::VAR_INT32: // explicit varint type
+    case TypeId::VAR_INT64: // explicit varint type
+    case TypeId::SLI_INT64: // alternative int64 encoding
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  /// Get the max varint size in bytes for a type_id (0 if not varint)
+  static constexpr size_t max_varint_bytes(uint32_t tid) {
+    switch (static_cast<TypeId>(tid)) {
+    case TypeId::INT32:     // int32_t uses zigzag varint
+    case TypeId::VAR_INT32: // explicit varint
+      return 5;             // int32 varint max
+    case TypeId::INT64:     // int64_t uses zigzag varint
+    case TypeId::VAR_INT64: // explicit varint
+    case TypeId::SLI_INT64:
+      return 10; // int64 varint max
+    default:
+      return 0;
+    }
+  }
+
   /// Get the fixed size in bytes for a type_id (0 if not fixed-size)
+  /// Note: INT32/INT64 use varint encoding per basic_serializer.h, not fixed
   static constexpr size_t fixed_size_bytes(uint32_t tid) {
     switch (static_cast<TypeId>(tid)) {
     case TypeId::BOOL:
@@ -613,7 +842,9 @@ template <typename T> struct CompileTimeFieldHelpers {
   }
 
   /// Compute total bytes for leading fixed-size primitive fields only
-  /// (stops at first varint field)
+  /// (stops at first varint or non-primitive field)
+  /// Uses type-based arrays to correctly distinguish signed (varint) vs
+  /// unsigned (fixed)
   static constexpr size_t compute_leading_fixed_size_bytes() {
     if constexpr (FieldCount == 0) {
       return 0;
@@ -621,21 +852,20 @@ template <typename T> struct CompileTimeFieldHelpers {
       size_t total = 0;
       for (size_t i = 0; i < FieldCount; ++i) {
         size_t original_idx = sorted_indices[i];
-        if (!is_primitive_type_id(type_ids[original_idx]) ||
-            nullable_flags[original_idx]) {
-          break; // Stop at non-primitive or nullable
+        if (nullable_flags[original_idx]) {
+          break; // Stop at nullable
         }
-        size_t fs = fixed_size_bytes(type_ids[original_idx]);
-        if (fs == 0) {
-          break; // Stop at first varint
+        if (!field_is_fixed[original_idx]) {
+          break; // Stop at first non-fixed (varint or non-primitive)
         }
-        total += fs;
+        total += field_fixed_sizes[original_idx];
       }
       return total;
     }
   }
 
-  /// Count leading fixed-size primitive fields (stops at first varint)
+  /// Count leading fixed-size primitive fields (stops at first varint or
+  /// non-primitive)
   static constexpr size_t compute_leading_fixed_count() {
     if constexpr (FieldCount == 0) {
       return 0;
@@ -643,12 +873,11 @@ template <typename T> struct CompileTimeFieldHelpers {
       size_t count = 0;
       for (size_t i = 0; i < FieldCount; ++i) {
         size_t original_idx = sorted_indices[i];
-        if (!is_primitive_type_id(type_ids[original_idx]) ||
-            nullable_flags[original_idx]) {
+        if (nullable_flags[original_idx]) {
           break;
         }
-        if (fixed_size_bytes(type_ids[original_idx]) == 0) {
-          break; // Varint encountered
+        if (!field_is_fixed[original_idx]) {
+          break; // Varint or non-primitive encountered
         }
         ++count;
       }
@@ -661,6 +890,45 @@ template <typename T> struct CompileTimeFieldHelpers {
   static inline constexpr size_t leading_fixed_count =
       compute_leading_fixed_count();
 
+  /// Count consecutive varint primitives (int32, int64) after leading fixed
+  /// fields
+  static constexpr size_t compute_varint_count() {
+    if constexpr (FieldCount == 0) {
+      return 0;
+    } else {
+      size_t count = 0;
+      for (size_t i = leading_fixed_count; i < FieldCount; ++i) {
+        size_t original_idx = sorted_indices[i];
+        if (nullable_flags[original_idx]) {
+          break; // Stop at nullable
+        }
+        if (!field_is_varint[original_idx]) {
+          break; // Stop at non-varint (e.g., float, double, non-primitive)
+        }
+        ++count;
+      }
+      return count;
+    }
+  }
+
+  /// Compute max bytes needed for all varint fields
+  static constexpr size_t compute_max_varint_bytes() {
+    if constexpr (FieldCount == 0) {
+      return 0;
+    } else {
+      size_t total = 0;
+      for (size_t i = leading_fixed_count;
+           i < leading_fixed_count + compute_varint_count(); ++i) {
+        size_t original_idx = sorted_indices[i];
+        total += field_max_varints[original_idx];
+      }
+      return total;
+    }
+  }
+
+  static inline constexpr size_t varint_count = compute_varint_count();
+  static inline constexpr size_t max_varint_size = compute_max_varint_bytes();
+
   /// Compute max serialized size for leading primitive fields only.
   /// Used for hybrid fast/slow path buffer pre-reservation.
   static constexpr size_t compute_max_leading_primitive_size() {
@@ -707,34 +975,145 @@ template <typename T> struct CompileTimeFieldHelpers {
       compute_max_leading_primitive_size();
 };
 
+/// Compute the write offset of field at sorted index I within leading fixed
+/// fields. This is the sum of sizes of all fields before index I.
+/// Uses type-based field_fixed_sizes for correct encoding detection.
+template <typename T, size_t I>
+constexpr size_t compute_fixed_field_write_offset() {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  size_t offset = 0;
+  for (size_t i = 0; i < I; ++i) {
+    size_t original_idx = Helpers::sorted_indices[i];
+    offset += Helpers::field_fixed_sizes[original_idx];
+  }
+  return offset;
+}
+
+/// Helper to write a single fixed-size primitive field at compile-time offset.
+/// No lambda overhead - direct function call that will be inlined.
+template <typename T, size_t SortedIdx>
+FORY_ALWAYS_INLINE void write_single_fixed_field(const T &obj, Buffer &buffer,
+                                                 uint32_t base_offset) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  constexpr size_t original_index = Helpers::sorted_indices[SortedIdx];
+  constexpr size_t field_offset =
+      compute_fixed_field_write_offset<T, SortedIdx>();
+  const auto field_info = ForyFieldInfo(obj);
+  const auto field_ptr = std::get<original_index>(decltype(field_info)::Ptrs);
+  using FieldType =
+      typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
+  put_fixed_primitive_at<FieldType>(obj.*field_ptr, buffer,
+                                    base_offset + field_offset);
+}
+
+/// Fast write leading fixed-size primitive fields using compile-time offsets.
+/// Caller must ensure buffer has sufficient capacity.
+/// Optimized: uses compile-time offsets and updates writer_index once at end.
+template <typename T, size_t... Indices>
+FORY_ALWAYS_INLINE void
+write_fixed_primitive_fields(const T &obj, Buffer &buffer,
+                             std::index_sequence<Indices...>) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  const uint32_t base_offset = buffer.writer_index();
+
+  // Write each field using helper function - no lambda overhead
+  (write_single_fixed_field<T, Indices>(obj, buffer, base_offset), ...);
+
+  // Update writer_index once with total fixed bytes (compile-time constant)
+  buffer.WriterIndex(base_offset + Helpers::leading_fixed_size_bytes);
+}
+
+/// Helper to write a single varint primitive field.
+/// No lambda overhead - direct function call that will be inlined.
+template <typename T, size_t SortedPos>
+FORY_ALWAYS_INLINE void write_single_varint_field(const T &obj, Buffer &buffer,
+                                                  uint32_t &offset) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  constexpr size_t original_index = Helpers::sorted_indices[SortedPos];
+  const auto field_info = ForyFieldInfo(obj);
+  const auto field_ptr = std::get<original_index>(decltype(field_info)::Ptrs);
+  using FieldType =
+      typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
+  offset += put_varint_at<FieldType>(obj.*field_ptr, buffer, offset);
+}
+
+/// Fast write consecutive varint primitive fields (int32, int64).
+/// Caller must ensure buffer has sufficient capacity.
+/// Optimized: tracks offset locally and updates writer_index once at the end.
+template <typename T, size_t FixedCount, size_t... Indices>
+FORY_ALWAYS_INLINE void
+write_varint_primitive_fields(const T &obj, Buffer &buffer, uint32_t &offset,
+                              std::index_sequence<Indices...>) {
+  // Write each varint field using helper function - no lambda overhead
+  // Indices are 0, 1, 2, ... so actual sorted position is FixedCount + Indices
+  (write_single_varint_field<T, FixedCount + Indices>(obj, buffer, offset),
+   ...);
+}
+
+/// Helper to write a single remaining primitive field.
+/// No lambda overhead - direct function call that will be inlined.
+template <typename T, size_t SortedPos>
+FORY_ALWAYS_INLINE void
+write_single_remaining_field(const T &obj, Buffer &buffer, uint32_t &offset) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  constexpr size_t original_index = Helpers::sorted_indices[SortedPos];
+  const auto field_info = ForyFieldInfo(obj);
+  const auto field_ptr = std::get<original_index>(decltype(field_info)::Ptrs);
+  using FieldType =
+      typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
+  offset += put_primitive_at<FieldType>(obj.*field_ptr, buffer, offset);
+}
+
+/// Write remaining primitive fields after fixed and varint phases.
+/// StartPos is the first sorted index to process.
+template <typename T, size_t StartPos, size_t... Indices>
+FORY_ALWAYS_INLINE void
+write_remaining_primitive_fields(const T &obj, Buffer &buffer, uint32_t 
&offset,
+                                 std::index_sequence<Indices...>) {
+  // Write each remaining field using helper function - no lambda overhead
+  (write_single_remaining_field<T, StartPos + Indices>(obj, buffer, offset),
+   ...);
+}
+
 /// Fast path writer for primitive-only, non-nullable structs.
 /// Writes all fields directly without Result wrapping.
-/// Optimized: tracks offset locally and updates writer_index once at the end.
+/// Optimized: three-phase approach with single writer_index update at the end.
+/// Phase 1: Fixed-size primitives (compile-time offsets)
+/// Phase 2: Varint primitives (local offset tracking)
+/// Phase 3: Remaining primitives (if any)
 template <typename T, size_t... Indices>
 FORY_ALWAYS_INLINE void
 write_primitive_fields_fast(const T &obj, Buffer &buffer,
                             std::index_sequence<Indices...>) {
   using Helpers = CompileTimeFieldHelpers<T>;
-  const auto field_info = ForyFieldInfo(obj);
-  const auto field_ptrs = decltype(field_info)::Ptrs;
+  constexpr size_t fixed_count = Helpers::leading_fixed_count;
+  constexpr size_t fixed_bytes = Helpers::leading_fixed_size_bytes;
+  constexpr size_t varint_count = Helpers::varint_count;
+  constexpr size_t total_count = sizeof...(Indices);
+
+  // Phase 1: Write leading fixed-size primitives if any
+  if constexpr (fixed_count > 0 && fixed_bytes > 0) {
+    write_fixed_primitive_fields<T>(obj, buffer,
+                                    std::make_index_sequence<fixed_count>{});
+  }
+
+  // Phase 2: Write consecutive varint primitives if any
+  if constexpr (varint_count > 0) {
+    uint32_t offset = buffer.writer_index();
+    write_varint_primitive_fields<T, fixed_count>(
+        obj, buffer, offset, std::make_index_sequence<varint_count>{});
+    buffer.WriterIndex(offset);
+  }
 
-  // Track offset locally - single writer_index update at the end
-  uint32_t offset = buffer.writer_index();
-
-  // Write each field directly in sorted order using fold expression
-  (
-      [&]() {
-        constexpr size_t original_index = Helpers::sorted_indices[Indices];
-        const auto field_ptr = std::get<original_index>(field_ptrs);
-        using FieldType =
-            typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
-        const auto &field_value = obj.*field_ptr;
-        offset += put_primitive_at<FieldType>(field_value, buffer, offset);
-      }(),
-      ...);
-
-  // Single writer_index update for all fields
-  buffer.WriterIndex(offset);
+  // Phase 3: Write remaining primitives (if any) using dedicated helper
+  constexpr size_t fast_count = fixed_count + varint_count;
+  if constexpr (fast_count < total_count) {
+    uint32_t offset = buffer.writer_index();
+    write_remaining_primitive_fields<T, fast_count>(
+        obj, buffer, offset,
+        std::make_index_sequence<total_count - fast_count>{});
+    buffer.WriterIndex(offset);
+  }
 }
 
 template <typename T, size_t Index, typename FieldPtrs>
@@ -1069,6 +1448,30 @@ read_single_field_by_index_compatible(T &obj, 
ReadContext &ctx,
   return Result<void, Error>();
 }
 
+/// Helper to dispatch field reading by field_id in compatible mode.
+/// Uses fold expression with short-circuit to avoid lambda overhead.
+/// Returns the result of reading; sets handled=true if field was matched.
+template <typename T, size_t... Indices>
+FORY_ALWAYS_INLINE Result<void, Error>
+dispatch_compatible_field_read_impl(T &obj, ReadContext &ctx, int16_t field_id,
+                                    bool read_ref_flag, bool &handled,
+                                    std::index_sequence<Indices...>) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  Result<void, Error> result;
+
+  // Short-circuit fold: stops at first match
+  // Each element evaluates to bool; || short-circuits on first true
+  ((static_cast<int16_t>(Indices) == field_id
+        ? (handled = true,
+           result = read_single_field_by_index_compatible<
+               Helpers::sorted_indices[Indices]>(obj, ctx, read_ref_flag),
+           true)
+        : false) ||
+   ...);
+
+  return result;
+}
+
 /// Helper to read a single field at compile-time sorted position
 template <typename T, size_t SortedPosition>
 Result<void, Error> read_field_at_sorted_position(T &obj, ReadContext &ctx) {
@@ -1077,58 +1480,164 @@ Result<void, Error> read_field_at_sorted_position(T 
&obj, ReadContext &ctx) {
   return read_single_field_by_index<original_index>(obj, ctx);
 }
 
-/// Read a fixed-size primitive value directly using UnsafeGet.
-/// Caller must ensure buffer bounds are pre-checked.
+/// Get the fixed size of a primitive type at compile time.
+/// Returns the wire size in bytes for fixed-width primitives, or 0 for any
+/// type that is not fixed-size. int32_t/int and int64_t/long long are
+/// deliberately absent: they are zigzag-varint encoded (see read_varint_at)
+/// and so have no fixed wire size.
+/// NOTE(review): unlike read_fixed_primitive_at, this does not match the
+/// `unsigned int` / `unsigned long long` platform aliases explicitly —
+/// presumably they coincide with uint32_t/uint64_t everywhere this builds;
+/// TODO confirm on platforms where the aliases differ.
+template <typename T> constexpr size_t fixed_primitive_size() {
+  if constexpr (std::is_same_v<T, bool> || std::is_same_v<T, int8_t> ||
+                std::is_same_v<T, uint8_t>) {
+    return 1;
+  } else if constexpr (std::is_same_v<T, int16_t> ||
+                       std::is_same_v<T, uint16_t>) {
+    return 2;
+  } else if constexpr (std::is_same_v<T, uint32_t> ||
+                       std::is_same_v<T, float>) {
+    return 4;
+  } else if constexpr (std::is_same_v<T, uint64_t> ||
+                       std::is_same_v<T, double>) {
+    return 8;
+  } else {
+    return 0; // Not a fixed-size primitive
+  }
+}
+
+/// Compute the offset of field at sorted index I within the leading fixed
+/// fields. This is the sum of sizes of all fields before index I.
+/// Uses type-based field_fixed_sizes for correct encoding detection.
+/// The loop runs entirely at compile time (constexpr), so instantiating this
+/// for every leading fixed field is free at runtime.
+template <typename T, size_t I> constexpr size_t compute_fixed_field_offset() {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  size_t offset = 0;
+  // Accumulate sizes in sorted (i.e. serialization) order, translating each
+  // sorted position back to its original declaration index first.
+  for (size_t i = 0; i < I; ++i) {
+    size_t original_idx = Helpers::sorted_indices[i];
+    offset += Helpers::field_fixed_sizes[original_idx];
+  }
+  return offset;
+}
+
+/// Read a fixed-size primitive value at a given absolute offset using
+/// UnsafeGet. Does NOT update any offset - purely reads at the specified
+/// position. Caller must ensure buffer bounds are pre-checked.
 template <typename T>
-FORY_ALWAYS_INLINE T read_fixed_primitive(Buffer &buffer) {
-  uint32_t idx = buffer.reader_index();
-  T value;
+FORY_ALWAYS_INLINE T read_fixed_primitive_at(Buffer &buffer, uint32_t offset) {
   if constexpr (std::is_same_v<T, bool>) {
-    value = buffer.UnsafeGet<uint8_t>(idx) != 0;
-    buffer.IncreaseReaderIndex(1);
+    return buffer.UnsafeGet<uint8_t>(offset) != 0;
   } else if constexpr (std::is_same_v<T, int8_t>) {
-    value = static_cast<int8_t>(buffer.UnsafeGet<uint8_t>(idx));
-    buffer.IncreaseReaderIndex(1);
+    return static_cast<int8_t>(buffer.UnsafeGet<uint8_t>(offset));
   } else if constexpr (std::is_same_v<T, uint8_t>) {
-    value = buffer.UnsafeGet<uint8_t>(idx);
-    buffer.IncreaseReaderIndex(1);
+    return buffer.UnsafeGet<uint8_t>(offset);
   } else if constexpr (std::is_same_v<T, int16_t>) {
-    value = buffer.UnsafeGet<int16_t>(idx);
-    buffer.IncreaseReaderIndex(2);
+    return buffer.UnsafeGet<int16_t>(offset);
   } else if constexpr (std::is_same_v<T, uint16_t>) {
-    value = buffer.UnsafeGet<uint16_t>(idx);
-    buffer.IncreaseReaderIndex(2);
+    return buffer.UnsafeGet<uint16_t>(offset);
+  } else if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, int>) {
+    // Handle both int32_t and int (different types on some platforms)
+    return static_cast<T>(buffer.UnsafeGet<int32_t>(offset));
+  } else if constexpr (std::is_same_v<T, uint32_t> ||
+                       std::is_same_v<T, unsigned int>) {
+    // Handle both uint32_t and unsigned int (different types on some 
platforms)
+    return static_cast<T>(buffer.UnsafeGet<uint32_t>(offset));
   } else if constexpr (std::is_same_v<T, float>) {
-    value = buffer.UnsafeGet<float>(idx);
-    buffer.IncreaseReaderIndex(4);
+    return buffer.UnsafeGet<float>(offset);
+  } else if constexpr (std::is_same_v<T, uint64_t> ||
+                       std::is_same_v<T, unsigned long long>) {
+    // Handle both uint64_t and unsigned long long (different types on some
+    // platforms)
+    return static_cast<T>(buffer.UnsafeGet<uint64_t>(offset));
+  } else if constexpr (std::is_same_v<T, int64_t> ||
+                       std::is_same_v<T, long long>) {
+    // Handle both int64_t and long long (different types on some platforms)
+    // Note: int64_t/long long uses varint, but if classified as fixed by
+    // TypeId, we read as fixed 8 bytes
+    return static_cast<T>(buffer.UnsafeGet<int64_t>(offset));
   } else if constexpr (std::is_same_v<T, double>) {
-    value = buffer.UnsafeGet<double>(idx);
-    buffer.IncreaseReaderIndex(8);
+    return buffer.UnsafeGet<double>(offset);
   } else {
     static_assert(sizeof(T) == 0, "Unsupported fixed-size primitive type");
+    return T{};
   }
-  return value;
+}
+
+/// Helper to read a single fixed-size primitive field at compile-time offset.
+/// No lambda overhead - direct function call that will be inlined.
+/// `base_offset` is the reader_index snapshot taken by the caller before the
+/// batch; this reads at base_offset + the compile-time field offset and does
+/// NOT advance reader_index (the caller updates it once after all fields).
+template <typename T, size_t SortedIdx>
+FORY_ALWAYS_INLINE void read_single_fixed_field(T &obj, Buffer &buffer,
+                                                uint32_t base_offset) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  // Map sorted position back to the original declaration index, then fetch
+  // the member pointer for that field from the compile-time tuple.
+  constexpr size_t original_index = Helpers::sorted_indices[SortedIdx];
+  constexpr size_t field_offset = compute_fixed_field_offset<T, SortedIdx>();
+  const auto field_info = ForyFieldInfo(obj);
+  const auto field_ptr = std::get<original_index>(decltype(field_info)::Ptrs);
+  using FieldType =
+      typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
+  obj.*field_ptr =
+      read_fixed_primitive_at<FieldType>(buffer, base_offset + field_offset);
+}
 
 /// Fast read leading fixed-size primitive fields using UnsafeGet.
 /// Caller must ensure buffer bounds are pre-checked.
+/// Optimized: uses compile-time offsets and updates reader_index once at end.
 template <typename T, size_t... Indices>
 FORY_ALWAYS_INLINE void
 read_fixed_primitive_fields(T &obj, Buffer &buffer,
                             std::index_sequence<Indices...>) {
   using Helpers = CompileTimeFieldHelpers<T>;
+  const uint32_t base_offset = buffer.reader_index();
+
+  // Read each field using helper function - no lambda overhead
+  (read_single_fixed_field<T, Indices>(obj, buffer, base_offset), ...);
+
+  // Update reader_index once with total fixed bytes (compile-time constant)
+  buffer.ReaderIndex(base_offset + Helpers::leading_fixed_size_bytes);
+}
+
+/// Read a single varint field at a given offset.
+/// Does NOT update reader_index - caller must track offset and update once.
+/// Bounds checking is performed inside GetVarUint32/GetVarUint64 during
+/// reading (varint lengths are variable, so no whole-batch pre-check is
+/// possible); `offset` is advanced by the number of bytes consumed.
+template <typename T>
+FORY_ALWAYS_INLINE T read_varint_at(Buffer &buffer, uint32_t &offset) {
+  uint32_t bytes_read;
+  if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, int>) {
+    // Handle both int32_t and int (different types on some platforms)
+    uint32_t raw = buffer.GetVarUint32(offset, &bytes_read);
+    offset += bytes_read;
+    // Zigzag decode: (raw >> 1) ^ -(raw & 1), written with unsigned
+    // two's-complement negation (~x + 1) to avoid signed-overflow UB.
+    return static_cast<T>((raw >> 1) ^ (~(raw & 1) + 1));
+  } else if constexpr (std::is_same_v<T, int64_t> ||
+                       std::is_same_v<T, long long>) {
+    // Handle both int64_t and long long (different types on some platforms)
+    uint64_t raw = buffer.GetVarUint64(offset, &bytes_read);
+    offset += bytes_read;
+    // Zigzag decode (same unsigned-negation form as the 32-bit branch)
+    return static_cast<T>((raw >> 1) ^ (~(raw & 1) + 1));
+  } else {
+    static_assert(sizeof(T) == 0, "Unsupported varint type");
+    return T{};
+  }
+}
+
+/// Helper to read a single varint primitive field.
+/// No lambda overhead - direct function call that will be inlined.
+template <typename T, size_t SortedPos>
+FORY_ALWAYS_INLINE void read_single_varint_field(T &obj, Buffer &buffer,
+                                                 uint32_t &offset) {
+  using Helpers = CompileTimeFieldHelpers<T>;
+  constexpr size_t original_index = Helpers::sorted_indices[SortedPos];
   const auto field_info = ForyFieldInfo(obj);
-  const auto field_ptrs = decltype(field_info)::Ptrs;
+  const auto field_ptr = std::get<original_index>(decltype(field_info)::Ptrs);
+  using FieldType =
+      typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
+  obj.*field_ptr = read_varint_at<FieldType>(buffer, offset);
+}
 
-  (
-      [&]() {
-        constexpr size_t original_index = Helpers::sorted_indices[Indices];
-        const auto field_ptr = std::get<original_index>(field_ptrs);
-        using FieldType =
-            typename meta::RemoveMemberPointerCVRefT<decltype(field_ptr)>;
-        obj.*field_ptr = read_fixed_primitive<FieldType>(buffer);
-      }(),
-      ...);
+/// Fast read consecutive varint primitive fields (int32, int64).
+/// Bounds checking happens per-field inside GetVarUint32/64 (the old
+/// max-varint-bytes pre-check was removed as overly conservative).
+/// Optimized: tracks offset locally and updates reader_index once at the end.
+/// StartIdx is the sorted index to start reading from; the caller is
+/// responsible for writing `offset` back via buffer.ReaderIndex() afterwards.
+template <typename T, size_t StartIdx, size_t... Is>
+FORY_ALWAYS_INLINE void
+read_varint_primitive_fields(T &obj, Buffer &buffer, uint32_t &offset,
+                             std::index_sequence<Is...>) {
+  // Read each varint field using helper function - no lambda overhead
+  // Is are 0, 1, 2, ... so actual sorted position is StartIdx + Is
+  (read_single_varint_field<T, StartIdx + Is>(obj, buffer, offset), ...);
+}
 }
 
 /// Helper to read remaining fields starting from Offset
@@ -1150,22 +1659,25 @@ Result<void, Error> read_remaining_fields(T &obj, 
ReadContext &ctx) {
 
 /// Read struct fields recursively using index sequence (sorted order - matches
 /// write order)
-/// Optimized: when compatible=false and there are leading fixed-size
-/// primitives, pre-check bounds once and use UnsafeGet for those fields. Note:
-/// varints (int32/int64) cannot be pre-checked since their length is unknown.
+/// Optimized: when compatible=false, use fast paths for:
+/// 1. Leading fixed-size primitives (bool, int8, int16, float, double)
+/// 2. Consecutive varint primitives (int32, int64) after fixed fields
+/// Both paths pre-check bounds and update reader_index once at the end.
 template <typename T, size_t... Indices>
 Result<void, Error> read_struct_fields_impl(T &obj, ReadContext &ctx,
                                             std::index_sequence<Indices...>) {
   using Helpers = CompileTimeFieldHelpers<T>;
   constexpr size_t fixed_count = Helpers::leading_fixed_count;
   constexpr size_t fixed_bytes = Helpers::leading_fixed_size_bytes;
+  constexpr size_t varint_count = Helpers::varint_count;
   constexpr size_t total_count = sizeof...(Indices);
 
-  // FAST PATH: When compatible=false and we have leading fixed-size primitives
-  // (bool, int8, int16, float, double - NOT varints like int32/int64)
-  if constexpr (fixed_count > 0 && fixed_bytes > 0) {
-    if (!ctx.is_compatible()) {
-      Buffer &buffer = ctx.buffer();
+  // FAST PATH: When compatible=false, use optimized batch reading
+  if (!ctx.is_compatible()) {
+    Buffer &buffer = ctx.buffer();
+
+    // Phase 1: Read leading fixed-size primitives if any
+    if constexpr (fixed_count > 0 && fixed_bytes > 0) {
       // Pre-check bounds for all fixed-size fields at once
       if (FORY_PREDICT_FALSE(buffer.reader_index() + fixed_bytes >
                              buffer.size())) {
@@ -1175,17 +1687,32 @@ Result<void, Error> read_struct_fields_impl(T &obj, 
ReadContext &ctx,
       // Fast read fixed-size primitives
       read_fixed_primitive_fields<T>(obj, buffer,
                                      std::make_index_sequence<fixed_count>{});
+    }
 
-      if constexpr (fixed_count < total_count) {
-        // Read remaining fields with normal path
-        return read_remaining_fields<T, fixed_count, total_count>(obj, ctx);
-      } else {
-        return Result<void, Error>();
-      }
+    // Phase 2: Read consecutive varint primitives (int32, int64) if any
+    // Note: varint bounds checking is done per-byte during reading since
+    // varint lengths are variable (actual size << max possible size)
+    if constexpr (varint_count > 0) {
+      // Track offset locally for batch varint reading
+      uint32_t offset = buffer.reader_index();
+      // Fast read varint primitives (bounds checking happens in
+      // GetVarUint32/64)
+      read_varint_primitive_fields<T, fixed_count>(
+          obj, buffer, offset, std::make_index_sequence<varint_count>{});
+      // Update reader_index once after all varints
+      buffer.ReaderIndex(offset);
+    }
+
+    // Phase 3: Read remaining fields (if any) with normal path
+    constexpr size_t fast_count = fixed_count + varint_count;
+    if constexpr (fast_count < total_count) {
+      return read_remaining_fields<T, fast_count, total_count>(obj, ctx);
+    } else {
+      return Result<void, Error>();
     }
   }
 
-  // NORMAL PATH: compatible mode or structs with varints/complex types
+  // NORMAL PATH: compatible mode - all fields need full serialization
   Result<void, Error> result;
   ((result = read_field_at_sorted_position<T, Indices>(obj, ctx),
     result.ok()) &&
@@ -1200,8 +1727,6 @@ Result<void, Error>
 read_struct_fields_compatible(T &obj, ReadContext &ctx,
                               const TypeMeta *remote_type_meta,
                               std::index_sequence<Indices...>) {
-
-  using Helpers = CompileTimeFieldHelpers<T>;
   const auto &remote_fields = remote_type_meta->get_field_infos();
 
   // Iterate through remote fields in their serialization order
@@ -1238,19 +1763,11 @@ read_struct_fields_compatible(T &obj, ReadContext &ctx,
     }
 
     // Dispatch to the correct local field by field_id
+    // Uses fold expression with short-circuit - no lambda overhead
     bool handled = false;
-    Result<void, Error> result;
-
-    detail::for_each_index(
-        std::index_sequence<Indices...>{}, [&](auto index_constant) {
-          constexpr size_t index = decltype(index_constant)::value;
-          if (!handled && static_cast<int16_t>(index) == field_id) {
-            handled = true;
-            constexpr size_t original_index = Helpers::sorted_indices[index];
-            result = read_single_field_by_index_compatible<original_index>(
-                obj, ctx, read_ref_flag);
-          }
-        });
+    Result<void, Error> result = dispatch_compatible_field_read_impl<T>(
+        obj, ctx, field_id, read_ref_flag, handled,
+        std::index_sequence<Indices...>{});
 
     if (!handled) {
       // Shouldn't happen if TypeMeta::assign_field_ids worked correctly


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to