chaokunyang commented on PR #2585:
URL: https://github.com/apache/fory/pull/2585#issuecomment-3280714800

   String serialization in Rust needs to support Latin-1 encoding. You can take
the following code as an example of a SIMD Latin-1 check:
   
   ```rust
   
   /// Checks if a UTF-8 string can be losslessly encoded as Latin-1 using SIMD 
if available.
   ///
   /// This is true if all characters in the string have a Unicode codepoint <= 
255.
   /// This translates to a byte-level check: the string must not contain any 
byte >= 0xC4.
   pub fn can_be_latin1(s: &str) -> bool {
       let bytes = s.as_bytes();
   
       // Runtime feature detection to select the best implementation.
       // The functions are guarded by `#[target_feature]`, so the compiler
       // generates optimized code for each case. The function pointers are 
resolved
       // at runtime on the first call.
       #[cfg(target_arch = "x86_64")]
       {
           if is_x86_feature_detected!("avx2") {
               return unsafe { can_be_latin1_avx2(bytes) };
           }
           if is_x86_feature_detected!("sse2") {
               return unsafe { can_be_latin1_sse2(bytes) };
           }
       }
   
       // Fallback for non-x86_64 architectures or if no SIMD features are 
available.
       can_be_latin1_scalar(bytes)
   }
   
   /// Scalar fallback implementation: inspects the input byte by byte.
   ///
   /// Returns `true` when no byte of `bytes` is >= 0xC4 (which includes
   /// the empty slice).
   #[inline]
   fn can_be_latin1_scalar(bytes: &[u8]) -> bool {
       // A plain iterator check keeps this readable and leaves the
       // optimisation to the compiler.
       bytes.iter().all(|&b| b < 0xC4)
   }
   
   /// Implementation using SSE2 intrinsics (16-byte vectors).
   ///
   /// Returns `true` when no byte of `bytes` is >= 0xC4.
   ///
   /// # Safety
   ///
   /// The running CPU must support SSE2.
   #[cfg(target_arch = "x86_64")]
   #[target_feature(enable = "sse2")]
   unsafe fn can_be_latin1_sse2(bytes: &[u8]) -> bool {
       use std::arch::x86_64::{
           __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_max_epu8,
           _mm_movemask_epi8, _mm_set1_epi8,
       };

       const CHUNK_SIZE: usize = 16;
       // Smallest byte value that rules out a Latin-1 encoding.
       const FIRST_INVALID: u8 = 0xC4;

       // Every lane holds the threshold; the `as i8` cast only
       // reinterprets the bit pattern for the intrinsic's signature.
       let threshold = _mm_set1_epi8(FIRST_INVALID as i8);

       let mut chunks = bytes.chunks_exact(CHUNK_SIZE);
       for chunk in &mut chunks {
           // SAFETY: `chunk` is exactly 16 readable bytes, and `loadu`
           // has no alignment requirement.
           let data =
               unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };

           // The comparison must be UNSIGNED. A signed `_mm_cmpgt_epi8`
           // against 0xC3 (-61 as i8) would report every ASCII byte
           // (0..=127) as "greater" and reject plain ASCII input.
           // `max_epu8(b, T) == b` holds exactly when `b >= T` as
           // unsigned bytes, so matching lanes become 0xFF.
           let too_big = _mm_cmpeq_epi8(_mm_max_epu8(data, threshold), data);

           // `movemask` gathers the top bit of each lane; any set bit
           // means an offending byte was found.
           if _mm_movemask_epi8(too_big) != 0 {
               return false;
           }
       }

       // Fewer than 16 bytes remain: check them one at a time.
       chunks.remainder().iter().all(|&b| b < FIRST_INVALID)
   }
   
   /// Implementation using AVX2 intrinsics (32-byte vectors).
   ///
   /// Returns `true` when no byte of `bytes` is >= 0xC4.
   ///
   /// # Safety
   ///
   /// The running CPU must support AVX2.
   #[cfg(target_arch = "x86_64")]
   #[target_feature(enable = "avx2")]
   unsafe fn can_be_latin1_avx2(bytes: &[u8]) -> bool {
       use std::arch::x86_64::{
           __m256i, _mm256_cmpeq_epi8, _mm256_loadu_si256, _mm256_max_epu8,
           _mm256_movemask_epi8, _mm256_set1_epi8,
       };

       const CHUNK_SIZE: usize = 32;
       // Smallest byte value that rules out a Latin-1 encoding.
       const FIRST_INVALID: u8 = 0xC4;

       // Every lane holds the threshold; the `as i8` cast only
       // reinterprets the bit pattern for the intrinsic's signature.
       let threshold = _mm256_set1_epi8(FIRST_INVALID as i8);

       let mut chunks = bytes.chunks_exact(CHUNK_SIZE);
       for chunk in &mut chunks {
           // SAFETY: `chunk` is exactly 32 readable bytes, and `loadu`
           // has no alignment requirement.
           let data = unsafe {
               _mm256_loadu_si256(chunk.as_ptr() as *const __m256i)
           };

           // Unsigned `b >= T` test: `max_epu8(b, T) == b`. The signed
           // `_mm256_cmpgt_epi8` is wrong here because 0xC3 is -61 as
           // i8, which makes every ASCII byte compare as "greater".
           let too_big =
               _mm256_cmpeq_epi8(_mm256_max_epu8(data, threshold), data);

           // A non-zero mask means at least one lane matched.
           if _mm256_movemask_epi8(too_big) != 0 {
               return false;
           }
       }

       // Fewer than 32 bytes remain: check them one at a time so this
       // routine does not depend on any other implementation.
       chunks.remainder().iter().all(|&b| b < FIRST_INVALID)
   }
   
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to