chaokunyang commented on code in PR #2585:
URL: https://github.com/apache/fory/pull/2585#discussion_r2338917131


##########
rust/fory-core/src/serializer/string.rs:
##########
@@ -23,19 +24,104 @@ use crate::serializer::Serializer;
 use crate::types::{ForyGeneralList, TypeId};
 use std::mem;
 
+enum StrEncoding {
+    Latin1 = 0,
+    Utf16 = 1,
+    Utf8 = 2,
+}
+
+fn best_coder(s: &str) -> StrEncoding {
+    let chars: Vec<char> = s.chars().collect();
+    let num_chars = chars.len();
+    if num_chars == 0 {
+        return StrEncoding::Latin1;
+    }
+
+    let sample_num = num_chars.min(64);
+    let vectorized_len = sample_num / 4;
+    let vectorized_chars = vectorized_len * 4;
+
+    let mut ascii_count = 0;
+    let mut latin1_count = 0;
+
+    for i in 0..vectorized_len {
+        let base = i * 4;
+        for j in 0..4 {
+            let c = chars[base + j] as u32;
+            if c <= 0x7F {
+                ascii_count += 1;
+                latin1_count += 1;
+            } else if c <= 0xFF {
+                latin1_count += 1;
+            }
+        }
+    }
+
+    for &c in chars.iter().take(sample_num).skip(vectorized_chars) {
+        let c = c as u32;
+        if c <= 0x7F {
+            ascii_count += 1;
+            latin1_count += 1;
+        } else if c <= 0xFF {
+            latin1_count += 1;
+        }
+    }
+
+    if latin1_count == num_chars || latin1_count == sample_num {
+        StrEncoding::Latin1
+    } else if (ascii_count as f64) >= sample_num as f64 * 0.5 {
+        StrEncoding::Utf8
+    } else {
+        StrEncoding::Utf16
+    }
+}
+
 impl Serializer for String {
     fn reserved_space() -> usize {
         mem::size_of::<i32>()
     }
 
     fn write(&self, context: &mut WriteContext) {
-        context.writer.var_int32(self.len() as i32);
-        context.writer.bytes(self.as_bytes());
+        let encoding = best_coder(self);

Review Comment:
   For write, we could keep using UTF-8 instead. This would minimize the string
encoding cost, and other languages support decoding UTF-8.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to