(fory) branch main updated: perf(rust): optimize rust performance by remove copy simd and add more inline hints (#2807)

chaokunyang Wed, 22 Oct 2025 03:01:43 -0700

This is an automated email from the ASF dual-hosted git repository.

chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fory.git



The following commit(s) were added to refs/heads/main by this push:
     new bef19b151 perf(rust): optimize rust performance by remove copy simd 
and add more inline hints (#2807)
bef19b151 is described below

commit bef19b15137f6430eeda5d9f746b5eef1a6ad594
Author: Shawn Yang <[email protected]>
AuthorDate: Wed Oct 22 18:00:31 2025 +0800

    perf(rust): optimize rust performance by remove copy simd and add more 
inline hints (#2807)
    
    ## Why?
    
    <!-- Describe the purpose of this PR. -->
    
    ## What does this PR do?
    
    Optimize Rust performance by removing the hand-written SIMD copy routines
    (plain memory copies are used instead) and adding more `#[inline(always)]` hints.
    
    ## Related issues
    
    <!--
    Is there any related issue? If this PR closes them, you can say
    fix/closes:
    
    - #xxxx0
    - #xxxx1
    - Fixes #xxxx2
    -->
    
    ## Does this PR introduce any user-facing change?
    
    <!--
    If any user-facing interface changes, please [open an
    issue](https://github.com/apache/fory/issues/new/choose) describing the
    need to do so and update the document if necessary.
    
    Delete section if not applicable.
    -->
    
    - [ ] Does this PR introduce any public API change?
    - [ ] Does this PR introduce any binary protocol compatibility change?
    
    ## Benchmark
    
    <!--
    When the PR has an impact on performance (if you don't know whether the
    PR will have an impact on performance, you can submit the PR first, and
    if it will have impact on performance, the code reviewer will explain
    it), be sure to attach a benchmark data here.
    
    Delete section if not applicable.
    -->
---
 rust/fory-core/benches/simd_bench.rs               | 116 +-----
 rust/fory-core/src/buffer.rs                       |  85 ++--
 rust/fory-core/src/fory.rs                         |   1 +
 rust/fory-core/src/meta/string_util.rs             | 427 +++------------------
 rust/fory-core/src/meta/type_meta.rs               |   9 +
 rust/fory-core/src/resolver/context.rs             |   1 +
 rust/fory-core/src/resolver/meta_resolver.rs       |   6 +
 rust/fory-core/src/resolver/metastring_resolver.rs |   3 +
 8 files changed, 107 insertions(+), 541 deletions(-)

diff --git a/rust/fory-core/benches/simd_bench.rs 
b/rust/fory-core/benches/simd_bench.rs
index 3d49aa527..208e80f12 100644
--- a/rust/fory-core/benches/simd_bench.rs
+++ b/rust/fory-core/benches/simd_bench.rs
@@ -21,9 +21,7 @@ use std::arch::x86_64::*;
 
 use fory_core::buffer::{Reader, Writer};
 use fory_core::meta::buffer_rw_string::{
-    read_latin1_simd, read_latin1_standard, read_utf16_simd, 
read_utf16_standard, read_utf8_simd,
-    read_utf8_standard, write_latin1_simd, write_latin1_standard, 
write_utf16_simd,
-    write_utf16_standard, write_utf8_simd, write_utf8_standard,
+    read_latin1_simd, read_latin1_standard, write_latin1_simd, 
write_latin1_standard,
 };
 #[cfg(target_feature = "sse2")]
 use std::arch::x86_64::*;
@@ -109,54 +107,6 @@ fn benchmark_write_latin1(c: &mut Criterion) {
     }
 }
 
-fn benchmark_write_utf8(c: &mut Criterion) {
-    let sizes = [100, 1000, 10000, 100000];
-    for &size in &sizes {
-        let s = "Hello, 世界! 🌍".repeat(size);
-
-        let name_simd = format!("Write UTF-8 SIMD size {}", size);
-        c.bench_function(&name_simd, |b| {
-            b.iter(|| {
-                let mut w = Writer::default();
-                write_utf8_simd(black_box(&mut w), black_box(&s));
-            })
-        });
-
-        let name_scalar = format!("Write UTF-8 Standard size {}", size);
-        c.bench_function(&name_scalar, |b| {
-            b.iter(|| {
-                let mut w = Writer::default();
-                write_utf8_standard(black_box(&mut w), black_box(&s));
-            })
-        });
-    }
-}
-
-fn benchmark_write_utf16(c: &mut Criterion) {
-    let sizes = [100, 1000, 10000, 100000];
-    for &size in &sizes {
-        let s = "Hello, 世界! 🌍".repeat(size);
-
-        let name_simd = format!("Write UTF-16 SIMD size {}", size);
-        c.bench_function(&name_simd, |b| {
-            b.iter(|| {
-                let mut w = Writer::default();
-                let utf16: Vec<u16> = s.encode_utf16().collect();
-                write_utf16_simd(black_box(&mut w), black_box(&utf16));
-            })
-        });
-
-        let name_scalar = format!("Write UTF-16 Standard size {}", size);
-        c.bench_function(&name_scalar, |b| {
-            b.iter(|| {
-                let mut w = Writer::default();
-                let utf16: Vec<u16> = s.encode_utf16().collect();
-                write_utf16_standard(black_box(&mut w), black_box(&utf16));
-            })
-        });
-    }
-}
-
 fn benchmark_read_latin1(c: &mut Criterion) {
     let sizes = [100, 1000, 10000, 100000];
     let ascii_string = "abcdefghijklmnopqrstuvwxyz0123456789";
@@ -185,66 +135,6 @@ fn benchmark_read_latin1(c: &mut Criterion) {
     }
 }
 
-fn benchmark_read_utf8(c: &mut Criterion) {
-    let sizes = [100, 1000, 10000, 100000];
-    let test_string = "Hello, 世界! 🌍";
-
-    for &size in &sizes {
-        let s = test_string.repeat(size / test_string.len() + 1);
-        let mut writer = Writer::default();
-        writer.write_utf8_string(&s);
-        let data = writer.dump();
-
-        let name_simd = format!("Read UTF-8 SIMD size {}", size);
-        c.bench_function(&name_simd, |b| {
-            b.iter(|| {
-                let mut reader = Reader::new(black_box(&data));
-                read_utf8_simd(black_box(&mut reader), 
black_box(s.len())).unwrap();
-            })
-        });
-
-        let name_scalar = format!("Read UTF-8 Standard size {}", size);
-        c.bench_function(&name_scalar, |b| {
-            b.iter(|| {
-                let mut reader = Reader::new(black_box(&data));
-                read_utf8_standard(black_box(&mut reader), 
black_box(s.len())).unwrap();
-            })
-        });
-    }
-}
-
-fn benchmark_read_utf16(c: &mut Criterion) {
-    let sizes = [100, 1000, 10000, 100000];
-    let test_string = "Hello, 世界! 🌍";
-
-    for &size in &sizes {
-        let s = test_string.repeat(size / test_string.len() + 1);
-        let mut data: Vec<u8> = Vec::with_capacity(s.len() * 2);
-        for u in s.encode_utf16() {
-            let lo = (u & 0x00FF) as u8;
-            let hi = (u >> 8) as u8;
-            data.push(lo);
-            data.push(hi);
-        }
-
-        let name_simd = format!("Read UTF-16 SIMD size {}", size);
-        c.bench_function(&name_simd, |b| {
-            b.iter(|| {
-                let mut reader = Reader::new(black_box(&data));
-                read_utf16_simd(black_box(&mut reader), 
black_box(data.len())).unwrap();
-            })
-        });
-
-        let name_scalar = format!("Read UTF-16 Standard size {}", size);
-        c.bench_function(&name_scalar, |b| {
-            b.iter(|| {
-                let mut reader = Reader::new(black_box(&data));
-                read_utf16_standard(black_box(&mut reader), 
black_box(data.len())).unwrap();
-            })
-        });
-    }
-}
-
 fn criterion_benchmark(c: &mut Criterion) {
     let test_str_short = "Hello, World!";
     let test_str_long = "Hello, World! ".repeat(1000);
@@ -275,12 +165,8 @@ fn criterion_benchmark(c: &mut Criterion) {
     });
 
     benchmark_write_latin1(c);
-    benchmark_write_utf8(c);
-    benchmark_write_utf16(c);
 
     benchmark_read_latin1(c);
-    benchmark_read_utf8(c);
-    benchmark_read_utf16(c);
 }
 
 criterion_group!(benches, criterion_benchmark);
diff --git a/rust/fory-core/src/buffer.rs b/rust/fory-core/src/buffer.rs
index d5c00cb91..3ffc9f6ad 100644
--- a/rust/fory-core/src/buffer.rs
+++ b/rust/fory-core/src/buffer.rs
@@ -16,10 +16,7 @@
 // under the License.
 
 use crate::error::Error;
-use crate::meta::buffer_rw_string::{
-    read_latin1_simd, read_utf16_simd, read_utf8_simd, write_latin1_simd, 
write_utf16_simd,
-    write_utf8_simd,
-};
+use crate::meta::buffer_rw_string::{read_latin1_simd, write_latin1_simd};
 use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
 use std::slice;
 
@@ -355,34 +352,21 @@ impl Writer {
     pub fn write_utf8_string(&mut self, s: &str) {
         let bytes = s.as_bytes();
         let len = bytes.len();
-
-        if len < SIMD_THRESHOLD {
-            // Fast path for small strings - direct copy avoids SIMD overhead
-            // For small strings, the branch cost + simple copy is faster than 
SIMD setup
-            self.bf.reserve(len);
-            self.bf.extend_from_slice(bytes);
-        } else {
-            // Use SIMD for larger strings where the overhead is amortized
-            write_utf8_simd(self, s);
-        }
+        self.bf.reserve(len);
+        self.bf.extend_from_slice(bytes);
     }
 
     #[inline(always)]
     pub fn write_utf16_bytes(&mut self, bytes: &[u16]) {
         let total_bytes = bytes.len() * 2;
-        if total_bytes < SIMD_THRESHOLD {
-            // Fast path for small UTF-16 data - direct copy
-            let old_len = self.bf.len();
-            self.bf.reserve(total_bytes);
-            unsafe {
-                let dest = self.bf.as_mut_ptr().add(old_len);
-                let src = bytes.as_ptr() as *const u8;
-                std::ptr::copy_nonoverlapping(src, dest, total_bytes);
-                self.bf.set_len(old_len + total_bytes);
-            }
-            return;
+        let old_len = self.bf.len();
+        self.bf.reserve(total_bytes);
+        unsafe {
+            let dest = self.bf.as_mut_ptr().add(old_len);
+            let src = bytes.as_ptr() as *const u8;
+            std::ptr::copy_nonoverlapping(src, dest, total_bytes);
+            self.bf.set_len(old_len + total_bytes);
         }
-        write_utf16_simd(self, bytes);
     }
 }
 
@@ -712,43 +696,32 @@ impl Reader {
     #[inline(always)]
     pub fn read_utf8_string(&mut self, len: usize) -> Result<String, Error> {
         self.check_bound(len)?;
-
-        if len < SIMD_THRESHOLD {
-            // Fast path for small strings - direct copy avoids SIMD overhead
-            // SAFETY: bounds already checked, assuming valid UTF-8 (caller's 
responsibility)
-            unsafe {
-                let mut vec = Vec::with_capacity(len);
-                let src = self.bf.add(self.cursor);
-                let dst = vec.as_mut_ptr();
-                // Use fastest possible copy - copy_nonoverlapping compiles to 
memcpy
-                std::ptr::copy_nonoverlapping(src, dst, len);
-                vec.set_len(len);
-                self.move_next(len);
-                // SAFETY: Assuming valid UTF-8 bytes (responsibility of 
serialization protocol)
-                Ok(String::from_utf8_unchecked(vec))
-            }
-        } else {
-            // Use SIMD for larger strings where the overhead is amortized
-            read_utf8_simd(self, len)
+        // don't use simd for memory copy, copy_non_overlapping is faster
+        unsafe {
+            let mut vec = Vec::with_capacity(len);
+            let src = self.bf.add(self.cursor);
+            let dst = vec.as_mut_ptr();
+            // Use fastest possible copy - copy_nonoverlapping compiles to 
memcpy
+            std::ptr::copy_nonoverlapping(src, dst, len);
+            vec.set_len(len);
+            self.move_next(len);
+            // SAFETY: Assuming valid UTF-8 bytes (responsibility of 
serialization protocol)
+            Ok(String::from_utf8_unchecked(vec))
         }
     }
 
     #[inline(always)]
     pub fn read_utf16_string(&mut self, len: usize) -> Result<String, Error> {
         self.check_bound(len)?;
-        if len < SIMD_THRESHOLD {
-            // Fast path for small UTF-16 strings - direct copy
-            unsafe {
-                let slice = 
std::slice::from_raw_parts(self.bf.add(self.cursor), len);
-                let units: Vec<u16> = slice
-                    .chunks_exact(2)
-                    .map(|c| u16::from_le_bytes([c[0], c[1]]))
-                    .collect();
-                self.move_next(len);
-                return Ok(String::from_utf16_lossy(&units));
-            }
+        unsafe {
+            let slice = std::slice::from_raw_parts(self.bf.add(self.cursor), 
len);
+            let units: Vec<u16> = slice
+                .chunks_exact(2)
+                .map(|c| u16::from_le_bytes([c[0], c[1]]))
+                .collect();
+            self.move_next(len);
+            Ok(String::from_utf16_lossy(&units))
         }
-        read_utf16_simd(self, len)
     }
 
     #[inline(always)]
diff --git a/rust/fory-core/src/fory.rs b/rust/fory-core/src/fory.rs
index b00d0e07a..1f664a18b 100644
--- a/rust/fory-core/src/fory.rs
+++ b/rust/fory-core/src/fory.rs
@@ -364,6 +364,7 @@ impl Fory {
         }
     }
 
+    #[inline(always)]
     fn read_head(&self, reader: &mut Reader) -> Result<bool, Error> {
         if self.xlang {
             let magic_numer = reader.read_u16()?;
diff --git a/rust/fory-core/src/meta/string_util.rs 
b/rust/fory-core/src/meta/string_util.rs
index 8eda15d57..d77124b18 100644
--- a/rust/fory-core/src/meta/string_util.rs
+++ b/rust/fory-core/src/meta/string_util.rs
@@ -522,17 +522,6 @@ pub mod buffer_rw_string {
     ))]
     use std::arch::x86_64::*;
 
-    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-    const SIMD_CHUNK_SIZE: usize = 64;
-    #[cfg(all(
-        any(target_arch = "x86", target_arch = "x86_64"),
-        target_feature = "sse2",
-        not(target_feature = "avx2")
-    ))]
-    const SIMD_CHUNK_SIZE: usize = 32;
-    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-    const SIMD_CHUNK_SIZE: usize = 32;
-
     use crate::buffer::{Reader, Writer};
     use crate::error::Error;
 
@@ -548,21 +537,39 @@ pub mod buffer_rw_string {
     #[inline]
     pub fn write_utf8_standard(writer: &mut Writer, s: &str) {
         let bytes = s.as_bytes();
-        for &b in bytes {
-            writer.write_u8(b);
-        }
+        let len = bytes.len();
+        writer.reserve(len);
+        writer.bf.extend_from_slice(bytes);
     }
 
     #[inline]
     pub fn write_utf16_standard(writer: &mut Writer, utf16: &[u16]) {
-        for unit in utf16 {
-            #[cfg(target_endian = "little")]
-            {
-                writer.write_u16(*unit);
+        #[cfg(target_endian = "little")]
+        {
+            let total_bytes = utf16.len() * 2;
+            let old_len = writer.bf.len();
+            writer.bf.reserve(total_bytes);
+            unsafe {
+                let dest = writer.bf.as_mut_ptr().add(old_len);
+                let src = utf16.as_ptr() as *const u8;
+                std::ptr::copy_nonoverlapping(src, dest, total_bytes);
+                writer.bf.set_len(old_len + total_bytes);
             }
-            #[cfg(target_endian = "big")]
-            {
-                unimplemented!()
+        }
+        #[cfg(target_endian = "big")]
+        {
+            let total_bytes = utf16.len() * 2;
+            let old_len = writer.bf.len();
+            writer.bf.reserve(total_bytes);
+            unsafe {
+                let dest = writer.bf.as_mut_ptr().add(old_len);
+                // Need to swap bytes for each u16 to little-endian
+                for (i, &unit) in utf16.iter().enumerate() {
+                    let swapped = unit.swap_bytes();
+                    let ptr = dest.add(i * 2) as *mut u16;
+                    std::ptr::write_unaligned(ptr, swapped);
+                }
+                writer.bf.set_len(old_len + total_bytes);
             }
         }
     }
@@ -577,109 +584,33 @@ pub mod buffer_rw_string {
 
     #[inline]
     pub fn read_utf8_standard(reader: &mut Reader, len: usize) -> 
Result<String, Error> {
-        let slice = unsafe { 
std::slice::from_raw_parts(reader.bf.add(reader.cursor), len) };
-        let result = String::from_utf8_lossy(slice).to_string();
-        reader.move_next(len);
-        Ok(result)
+        unsafe {
+            let mut vec = Vec::with_capacity(len);
+            let src = reader.bf.add(reader.cursor);
+            let dst = vec.as_mut_ptr();
+            // Use fastest possible copy - copy_nonoverlapping compiles to 
memcpy
+            std::ptr::copy_nonoverlapping(src, dst, len);
+            vec.set_len(len);
+            reader.move_next(len);
+            // Use from_utf8_lossy for safety - handles invalid UTF-8 
gracefully
+            // If you're certain the data is valid UTF-8, use 
from_utf8_unchecked for more performance
+            Ok(String::from_utf8_lossy(&vec).into_owned())
+        }
     }
 
     #[inline]
     pub fn read_utf16_standard(reader: &mut Reader, len: usize) -> 
Result<String, Error> {
-        assert!(len % 2 == 0, "UTF-16 length must be even");
-        let slice = unsafe { 
std::slice::from_raw_parts(reader.bf.add(reader.cursor), len) };
-        let units: Vec<u16> = slice
-            .chunks(2)
-            // little endian
-            .map(|c| (c[0] as u16) | ((c[1] as u16) << 8))
-            .collect();
-        let result = String::from_utf16(&units)
-            // lossy
-            .unwrap_or_else(|_| String::from("�"));
-        reader.move_next(len);
-        Ok(result)
-    }
-
-    #[inline]
-    fn write_bytes_simd(writer: &mut Writer, bytes: &[u8]) {
-        let len = bytes.len();
-        if len == 0 {
-            return;
+        if len % 2 != 0 {
+            return Err(Error::encoding_error("UTF-16 length must be even"));
         }
-        let mut i = 0usize;
-        writer.bf.reserve(len);
-
-        #[cfg(any(
-            all(target_arch = "x86_64", target_feature = "avx2"),
-            all(target_arch = "x86_64", target_feature = "sse2"),
-            all(target_arch = "aarch64", target_feature = "neon")
-        ))]
         unsafe {
-            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-            {
-                const CHUNK: usize = 64;
-                while i + CHUNK <= len {
-                    let ptr = bytes.as_ptr().add(i);
-                    let chunk1 = _mm256_loadu_si256(ptr as *const __m256i);
-                    let chunk2 = _mm256_loadu_si256(ptr.add(32) as *const 
__m256i);
-
-                    let current_len = writer.bf.len();
-                    writer.bf.set_len(current_len + CHUNK);
-                    let dest_ptr = writer.bf.as_mut_ptr().add(current_len);
-
-                    _mm256_storeu_si256(dest_ptr as *mut __m256i, chunk1);
-                    _mm256_storeu_si256(dest_ptr.add(32) as *mut __m256i, 
chunk2);
-                    i += CHUNK;
-                }
-            }
-
-            #[cfg(all(
-                target_arch = "x86_64",
-                not(target_feature = "avx2"),
-                target_feature = "sse2"
-            ))]
-            {
-                const CHUNK: usize = 32;
-                while i + CHUNK <= len {
-                    let ptr = bytes.as_ptr().add(i);
-                    let chunk1 = _mm_loadu_si128(ptr as *const __m128i);
-                    let chunk2 = _mm_loadu_si128(ptr.add(16) as *const 
__m128i);
-
-                    let current_len = writer.bf.len();
-                    writer.bf.set_len(current_len + CHUNK);
-                    let dest_ptr = writer.bf.as_mut_ptr().add(current_len);
-
-                    _mm_storeu_si128(dest_ptr as *mut __m128i, chunk1);
-                    _mm_storeu_si128(dest_ptr.add(16) as *mut __m128i, chunk2);
-                    i += CHUNK;
-                }
-            }
-
-            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-            {
-                const CHUNK: usize = 32;
-                while i + CHUNK <= len {
-                    let ptr = bytes.as_ptr().add(i);
-                    let chunk1 = vld1q_u8(ptr);
-                    let chunk2 = vld1q_u8(ptr.add(16));
-
-                    let current_len = writer.bf.len();
-                    writer.bf.set_len(current_len + CHUNK);
-                    let dest_ptr = writer.bf.as_mut_ptr().add(current_len);
-
-                    vst1q_u8(dest_ptr, chunk1);
-                    vst1q_u8(dest_ptr.add(16), chunk2);
-                    i += CHUNK;
-                }
-            }
-        }
-
-        const MEDIUM_CHUNK: usize = 16;
-        while i + MEDIUM_CHUNK <= len {
-            writer.bf.extend_from_slice(&bytes[i..i + MEDIUM_CHUNK]);
-            i += MEDIUM_CHUNK;
-        }
-        if i < len {
-            writer.bf.extend_from_slice(&bytes[i..]);
+            let slice = 
std::slice::from_raw_parts(reader.bf.add(reader.cursor), len);
+            let units: Vec<u16> = slice
+                .chunks_exact(2)
+                .map(|c| u16::from_le_bytes([c[0], c[1]]))
+                .collect();
+            reader.move_next(len);
+            Ok(String::from_utf16_lossy(&units))
         }
     }
 
@@ -745,7 +676,9 @@ pub mod buffer_rw_string {
         // Check if all ASCII using SIMD
         if is_ascii_bytes(bytes) {
             // Zero-copy fast path: direct write
-            write_bytes_simd(writer, bytes);
+            let len = bytes.len();
+            writer.bf.reserve(len);
+            writer.bf.extend_from_slice(bytes);
         } else {
             // Non-ASCII: Must iterate chars to extract Latin1 byte values
             // Example: 'À' in Rust String is UTF-8 [0xC3, 0x80] but Latin1 is 
[0xC0]
@@ -755,77 +688,9 @@ pub mod buffer_rw_string {
                 assert!(v <= 0xFF, "Non-Latin1 character found");
                 buf.push(v as u8);
             }
-            write_bytes_simd(writer, &buf);
-        }
-    }
-
-    #[inline(always)]
-    pub fn write_utf8_simd(writer: &mut Writer, s: &str) {
-        let bytes = s.as_bytes();
-        write_bytes_simd(writer, bytes);
-    }
-
-    pub fn write_utf16_simd(writer: &mut Writer, utf16: &[u16]) {
-        if utf16.is_empty() {
-            return;
-        }
-
-        #[cfg(target_endian = "big")]
-        {
-            unimplemented!("Big-endian UTF-16 writing is not implemented");
-        }
-
-        #[cfg(target_endian = "little")]
-        unsafe {
-            let total_bytes = utf16.len() * 2;
-            let old_len = writer.bf.len();
-            writer.bf.reserve(total_bytes);
-            let dest = writer.bf.as_mut_ptr().add(old_len);
-            let src = utf16.as_ptr() as *const u8;
-
-            let mut i = 0usize;
-
-            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-            {
-                const CHUNK: usize = 32;
-                while i + CHUNK <= total_bytes {
-                    let chunk = _mm256_loadu_si256(src.add(i) as *const 
__m256i);
-                    _mm256_storeu_si256(dest.add(i) as *mut __m256i, chunk);
-                    i += CHUNK;
-                }
-            }
-
-            #[cfg(all(
-                any(target_arch = "x86", target_arch = "x86_64"),
-                target_feature = "sse2",
-                not(target_feature = "avx2")
-            ))]
-            {
-                const CHUNK: usize = 16;
-                while i + CHUNK <= total_bytes {
-                    let chunk = _mm_loadu_si128(src.add(i) as *const __m128i);
-                    _mm_storeu_si128(dest.add(i) as *mut __m128i, chunk);
-                    i += CHUNK;
-                }
-            }
-
-            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-            {
-                const CHUNK: usize = 16;
-                while i + CHUNK <= total_bytes {
-                    let chunk = vld1q_u8(src.add(i));
-                    vst1q_u8(dest.add(i), chunk);
-                    i += CHUNK;
-                }
-            }
-
-            // fallback for remaining bytes
-            if i < total_bytes {
-                std::ptr::copy_nonoverlapping(src.add(i), dest.add(i), 
total_bytes - i);
-            }
-
-            // set length only after all writes
-            writer.bf.set_len(old_len + total_bytes);
+            let len = buf.len();
+            writer.bf.reserve(len);
+            writer.bf.extend_from_slice(&buf);
         }
     }
 
@@ -935,172 +800,6 @@ pub mod buffer_rw_string {
         Ok(unsafe { String::from_utf8_unchecked(out) })
     }
 
-    #[inline]
-    pub fn read_utf8_simd(reader: &mut Reader, len: usize) -> Result<String, 
Error> {
-        if len == 0 {
-            return Ok(String::new());
-        }
-        let src = unsafe { 
std::slice::from_raw_parts(reader.bf.add(reader.cursor), len) };
-
-        // CRITICAL OPTIMIZATION: Allocate Vec once, SIMD copy directly, 
single String construction
-        // Eliminates multiple push_str copies
-        let mut vec = Vec::with_capacity(len);
-
-        unsafe {
-            let dst: *mut u8 = vec.as_mut_ptr();
-            let mut i = 0usize;
-
-            // ---- AVX2 path: 32-byte chunks ----
-            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-            {
-                const CHUNK: usize = 32;
-                while i + CHUNK <= len {
-                    let chunk = _mm256_loadu_si256(src.as_ptr().add(i) as 
*const __m256i);
-                    _mm256_storeu_si256(dst.add(i) as *mut __m256i, chunk);
-                    i += CHUNK;
-                }
-            }
-
-            // ---- SSE2 path: 16-byte chunks ----
-            #[cfg(all(
-                any(target_arch = "x86", target_arch = "x86_64"),
-                target_feature = "sse2",
-                not(target_feature = "avx2")
-            ))]
-            {
-                const CHUNK: usize = 16;
-                while i + CHUNK <= len {
-                    let chunk = _mm_loadu_si128(src.as_ptr().add(i) as *const 
__m128i);
-                    _mm_storeu_si128(dst.add(i) as *mut __m128i, chunk);
-                    i += CHUNK;
-                }
-            }
-
-            // ---- NEON path: 16-byte chunks ----
-            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-            {
-                const CHUNK: usize = 16;
-                while i + CHUNK <= len {
-                    let chunk = vld1q_u8(src.as_ptr().add(i));
-                    vst1q_u8(dst.add(i), chunk);
-                    i += CHUNK;
-                }
-            }
-
-            // ---- Copy remaining bytes ----
-            if i < len {
-                std::ptr::copy_nonoverlapping(src.as_ptr().add(i), dst.add(i), 
len - i);
-            }
-
-            vec.set_len(len);
-        }
-
-        reader.move_next(len);
-        // Single String construction - no intermediate copies!
-        Ok(unsafe { String::from_utf8_unchecked(vec) })
-    }
-
-    #[inline]
-    pub fn read_utf16_simd(reader: &mut Reader, len: usize) -> Result<String, 
Error> {
-        assert_eq!(len % 2, 0, "UTF-16 length must be even");
-        unsafe fn simd_impl(bytes: &[u8]) -> String {
-            let len = bytes.len();
-            let unit_len = len / 2;
-            let mut units: Vec<u16> = vec![0u16; unit_len];
-
-            let dest_u8 = units.as_mut_ptr() as *mut u8;
-            let src_u8 = bytes.as_ptr();
-            let mut i = 0usize;
-
-            #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
-            {
-                while i + SIMD_CHUNK_SIZE <= len {
-                    let c1 = _mm256_loadu_si256(src_u8.add(i) as *const 
__m256i);
-                    let c2 = _mm256_loadu_si256(src_u8.add(i + 32) as *const 
__m256i);
-                    _mm256_storeu_si256(dest_u8.add(i) as *mut __m256i, c1);
-                    _mm256_storeu_si256(dest_u8.add(i + 32) as *mut __m256i, 
c2);
-                    i += SIMD_CHUNK_SIZE;
-                }
-                while i + 32 <= len {
-                    let c = _mm256_loadu_si256(src_u8.add(i) as *const 
__m256i);
-                    _mm256_storeu_si256(dest_u8.add(i) as *mut __m256i, c);
-                    i += 32;
-                }
-            }
-
-            #[cfg(all(
-                any(target_arch = "x86", target_arch = "x86_64"),
-                target_feature = "sse2",
-                not(target_feature = "avx2")
-            ))]
-            {
-                while i + SIMD_CHUNK_SIZE <= len {
-                    let c1 = _mm_loadu_si128(src_u8.add(i) as *const __m128i);
-                    let c2 = _mm_loadu_si128(src_u8.add(i + 16) as *const 
__m128i);
-                    _mm_storeu_si128(dest_u8.add(i) as *mut __m128i, c1);
-                    _mm_storeu_si128(dest_u8.add(i + 16) as *mut __m128i, c2);
-                    i += SIMD_CHUNK_SIZE;
-                }
-                while i + 16 <= len {
-                    let c = _mm_loadu_si128(src_u8.add(i) as *const __m128i);
-                    _mm_storeu_si128(dest_u8.add(i) as *mut __m128i, c);
-                    i += 16;
-                }
-            }
-
-            #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-            {
-                while i + SIMD_CHUNK_SIZE <= len {
-                    let c1 = vld1q_u8(src_u8.add(i));
-                    let c2 = vld1q_u8(src_u8.add(i + 16));
-                    vst1q_u8(dest_u8.add(i), c1);
-                    vst1q_u8(dest_u8.add(i + 16), c2);
-                    i += SIMD_CHUNK_SIZE;
-                }
-                while i + 16 <= len {
-                    let c = vld1q_u8(src_u8.add(i));
-                    vst1q_u8(dest_u8.add(i), c);
-                    i += 16;
-                }
-            }
-
-            if i < len {
-                std::ptr::copy_nonoverlapping(src_u8.add(i), dest_u8.add(i), 
len - i);
-            }
-
-            String::from_utf16(&units).unwrap_or_else(|_| String::new())
-        }
-
-        let slice = unsafe { 
std::slice::from_raw_parts(reader.bf.add(reader.cursor), len) };
-        #[cfg(target_arch = "x86_64")]
-        {
-            if std::arch::is_x86_feature_detected!("avx2") {
-                let s = unsafe { simd_impl(slice) };
-                reader.move_next(len);
-                return Ok(s);
-            }
-        }
-        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-        {
-            if std::arch::is_x86_feature_detected!("sse2") {
-                let s = unsafe { simd_impl(slice) };
-                reader.move_next(len);
-                return Ok(s);
-            }
-        }
-        #[cfg(target_arch = "aarch64")]
-        {
-            if std::arch::is_aarch64_feature_detected!("neon") {
-                let s = unsafe { simd_impl(slice) };
-                reader.move_next(len);
-                return Ok(s);
-            }
-        }
-
-        // ---- fallback ----
-        read_utf16_standard(reader, len)
-    }
-
     #[cfg(test)]
     mod tests {
         use super::*;
@@ -1154,14 +853,6 @@ pub mod buffer_rw_string {
                 write_utf8_standard(&mut writer, s);
                 let bytes = &*writer.dump();
                 let mut reader = Reader::new(bytes);
-                assert_eq!(read_utf8_simd(&mut reader, bytes_len).unwrap(), s);
-                assert_eq!(read_utf8_simd(&mut reader, bytes_len).unwrap(), s);
-
-                let mut writer = Writer::default();
-                write_utf8_simd(&mut writer, s);
-                write_utf8_simd(&mut writer, s);
-                let bytes = &*writer.dump();
-                let mut reader = Reader::new(bytes);
                 assert_eq!(read_utf8_standard(&mut reader, 
bytes_len).unwrap(), s);
                 assert_eq!(read_utf8_standard(&mut reader, 
bytes_len).unwrap(), s);
             }
@@ -1183,14 +874,10 @@ pub mod buffer_rw_string {
                 let mut writer = Writer::default();
                 write_utf16_standard(&mut writer, &utf16);
                 write_utf16_standard(&mut writer, &utf16);
-                let bytes = &*writer.dump();
-                let mut reader = Reader::new(bytes);
-                assert_eq!(read_utf16_simd(&mut reader, bytes_len).unwrap(), s);
-                assert_eq!(read_utf16_simd(&mut reader, bytes_len).unwrap(), s);
 
                 let mut writer = Writer::default();
-                write_utf16_simd(&mut writer, &utf16);
-                write_utf16_simd(&mut writer, &utf16);
+                write_utf16_standard(&mut writer, &utf16);
+                write_utf16_standard(&mut writer, &utf16);
                 let bytes = &*writer.dump();
                 let mut reader = Reader::new(bytes);
                 assert_eq!(read_utf16_standard(&mut reader, bytes_len).unwrap(), s);
diff --git a/rust/fory-core/src/meta/type_meta.rs b/rust/fory-core/src/meta/type_meta.rs
index 3a2ecfa68..d9821c870 100644
--- a/rust/fory-core/src/meta/type_meta.rs
+++ b/rust/fory-core/src/meta/type_meta.rs
@@ -545,25 +545,32 @@ pub struct TypeMeta {
 }
 
 impl TypeMeta {
+    #[inline(always)]
     pub fn get_field_infos(&self) -> &Vec<FieldInfo> {
         self.layer.get_field_infos()
     }
 
+    #[inline(always)]
     pub fn get_type_id(&self) -> u32 {
         self.layer.get_type_id()
     }
 
+    #[inline(always)]
     pub fn get_hash(&self) -> i64 {
         self.hash
     }
+
+    #[inline(always)]
     pub fn get_type_name(&self) -> Rc<MetaString> {
         self.layer.get_type_name()
     }
 
+    #[inline(always)]
     pub fn get_namespace(&self) -> Rc<MetaString> {
         self.layer.get_namespace()
     }
 
+    #[inline(always)]
     pub fn empty() -> TypeMeta {
         TypeMeta {
             hash: 0,
@@ -632,6 +639,7 @@ impl TypeMeta {
         })
     }
 
+    #[inline(always)]
     pub fn skip_bytes(reader: &mut Reader, header: i64) -> Result<(), Error> {
         let mut meta_size = header & META_SIZE_MASK;
         if meta_size == META_SIZE_MASK {
@@ -641,6 +649,7 @@ impl TypeMeta {
     }
 
     /// Check class version consistency, similar to Java's checkClassVersion
+    #[inline(always)]
     pub fn check_struct_version(
         read_version: i32,
         local_version: i32,
diff --git a/rust/fory-core/src/resolver/context.rs b/rust/fory-core/src/resolver/context.rs
index 48e91af28..b85d46a0f 100644
--- a/rust/fory-core/src/resolver/context.rs
+++ b/rust/fory-core/src/resolver/context.rs
@@ -371,6 +371,7 @@ impl ReadContext {
         self.type_resolver.get_type_info(type_id)
     }
 
+    #[inline(always)]
     pub fn read_meta_string(&mut self) -> Result<&MetaString, Error> {
         self.meta_string_resolver.read_meta_string(&mut self.reader)
     }
diff --git a/rust/fory-core/src/resolver/meta_resolver.rs b/rust/fory-core/src/resolver/meta_resolver.rs
index 9504c6388..7836aa30c 100644
--- a/rust/fory-core/src/resolver/meta_resolver.rs
+++ b/rust/fory-core/src/resolver/meta_resolver.rs
@@ -33,6 +33,7 @@ const MAX_PARSED_NUM_TYPE_DEFS: usize = 8192;
 
 #[allow(dead_code)]
 impl MetaWriterResolver {
+    #[inline(always)]
     pub fn push(
         &mut self,
         type_id: std::any::TypeId,
@@ -50,6 +51,7 @@ impl MetaWriterResolver {
         }
     }
 
+    #[inline(always)]
     pub fn to_bytes(&self, writer: &mut Writer) {
         writer.write_varuint32(self.type_defs.len() as u32);
         for item in &self.type_defs {
@@ -57,10 +59,12 @@ impl MetaWriterResolver {
         }
     }
 
+    #[inline(always)]
     pub fn empty(&mut self) -> bool {
         self.type_defs.is_empty()
     }
 
+    #[inline(always)]
     pub fn reset(&mut self) {
         self.type_defs.clear();
         self.type_id_index_map.clear();
@@ -74,6 +78,7 @@ pub struct MetaReaderResolver {
 }
 
 impl MetaReaderResolver {
+    #[inline(always)]
     pub fn get(&self, index: usize) -> Option<&Rc<TypeInfo>> {
         self.reading_type_infos.get(index)
     }
@@ -140,6 +145,7 @@ impl MetaReaderResolver {
         Ok(reader.get_cursor())
     }
 
+    #[inline(always)]
     pub fn reset(&mut self) {
         self.reading_type_infos.clear();
     }
diff --git a/rust/fory-core/src/resolver/metastring_resolver.rs b/rust/fory-core/src/resolver/metastring_resolver.rs
index 64ca15ad2..ad5d22349 100644
--- a/rust/fory-core/src/resolver/metastring_resolver.rs
+++ b/rust/fory-core/src/resolver/metastring_resolver.rs
@@ -333,6 +333,7 @@ impl MetaStringReaderResolver {
         Ok(mb_ref)
     }
 
+    #[inline(always)]
     fn read_bytes_as_u64(reader: &mut Reader, len: usize) -> Result<u64, Error> {
         let mut v = 0;
         let slice = reader.read_bytes(len)?;
@@ -342,6 +343,7 @@ impl MetaStringReaderResolver {
         Ok(v)
     }
 
+    #[inline(always)]
     pub fn reset(&mut self) {
         if self.dynamic_read_id != 0 {
             for i in 0..self.dynamic_read_id {
@@ -351,6 +353,7 @@ impl MetaStringReaderResolver {
         }
     }
 
+    #[inline(always)]
     pub fn read_meta_string(&mut self, reader: &mut Reader) -> Result<&MetaString, Error> {
         let ptr = {
             let mb_ref = self.read_meta_string_bytes(reader)?;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(fory) branch main updated: perf(rust): optimize rust performance by remove copy simd and add more inline hints (#2807)

Reply via email to