chaokunyang commented on code in PR #1997:
URL: https://github.com/apache/fury/pull/1997#discussion_r1905727830


##########
cpp/fury/util/string_util.h:
##########
@@ -19,14 +19,86 @@
 
 #pragma once
 
+#include <cstdint>
 #include <string>
+// AVX is not included here since some older Intel CPUs don't support AVX2,
+// but the wheel built for AVX2 is the same as the one built for SSE2.
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define USE_NEON_SIMD
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+#define USE_SSE2_SIMD
+#endif
 
 namespace fury {
 
 bool isLatin(const std::string &str);
 
+// Scalar scan over `size` 16-bit code units starting at `data`.
+// Returns true as soon as one unit falls in the surrogate range
+// [0xD800, 0xDFFF]; returns false otherwise (including when `size` is 0).
+// Note: a single surrogate unit is enough to return true; the function does
+// not check that a high surrogate is actually followed by a low surrogate.
+inline bool hasSurrogatePairFallback(const uint16_t *data, size_t size) {
+  for (size_t i = 0; i < size; ++i) {
+    auto c = data[i];
+    // 0xD800-0xDBFF are high surrogates, 0xDC00-0xDFFF are low surrogates.
+    if (c >= 0xD800 && c <= 0xDFFF) {
+      return true;
+    }
+  }
+  return false;
+}
+
+#ifdef USE_NEON_SIMD
+// NEON variant of the surrogate scan: checks 8 code units per iteration and
+// passes the remaining (fewer than 8) units to hasSurrogatePairFallback.
+// Returns true if any unit lies in [0xD800, 0xDFFF].
+// NOTE(review): vmaxvq_u16 is an AArch64 intrinsic, while USE_NEON_SIMD is
+// also defined for __ARM_NEON__ - confirm 32-bit ARM is not a build target.
+inline bool hasSurrogatePairNEON(const uint16_t *data, size_t length) {
+  size_t i = 0;
+  uint16x8_t lower_bound = vdupq_n_u16(0xD800);
+  uint16x8_t higher_bound = vdupq_n_u16(0xDFFF);
+  // `i + 7 < length` ensures each 8-lane load stays within the buffer.
+  for (; i + 7 < length; i += 8) {
+    uint16x8_t chunk = vld1q_u16(data + i);
+    // Unsigned compares: a lane becomes all-ones where the condition holds.
+    uint16x8_t mask1 = vcgeq_u16(chunk, lower_bound);
+    uint16x8_t mask2 = vcleq_u16(chunk, higher_bound);
+    // A non-zero horizontal maximum means some lane satisfied both bounds.
+    if (vmaxvq_u16(mask1 & mask2)) {
+      return true; // Detected a surrogate (high or low)
+    }
+  }
+  return hasSurrogatePairFallback(data + i, length - i);
+}
+#endif
+
+#ifdef USE_SSE2_SIMD
+// SSE2 variant of the surrogate scan: checks 8 code units per iteration and
+// passes the remaining (fewer than 8) units to hasSurrogatePairFallback.
+// Returns true if any unit lies in [0xD800, 0xDFFF].
+inline bool hasSurrogatePairSSE2(const uint16_t *data, size_t length) {
+  size_t i = 0;
+  // _mm_cmpgt_epi16 compares lanes as SIGNED 16-bit values. Exclusive bounds
+  // 0xd7ff (-10241) and 0xe000 (-8192) still work: the surrogates
+  // 0xD800..0xDFFF are exactly the signed values -10240..-8193 in between.
+  __m128i lower_bound = _mm_set1_epi16(0xd7ff);
+  __m128i higher_bound = _mm_set1_epi16(0xe000);
+  // `i + 7 < length` ensures each 8-lane load stays within the buffer.
+  for (; i + 7 < length; i += 8) {
+    // Unaligned load, so `data + i` needs no particular alignment.
+    __m128i chunk =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + i));
+    __m128i cmp1 = _mm_cmpgt_epi16(chunk, lower_bound);
+    __m128i cmp2 = _mm_cmpgt_epi16(higher_bound, chunk);
+    // movemask gathers the top bit of every byte; non-zero means at least
+    // one lane passed both comparisons.
+    if (_mm_movemask_epi8(_mm_and_si128(cmp1, cmp2)) != 0) {
+      return true; // Detected a surrogate
+    }
+  }
+  return hasSurrogatePairFallback(data + i, length - i);
+}
+#endif
+
+inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
+#if defined(USE_NEON_SIMD)

Review Comment:
   done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to