This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git
The following commit(s) were added to refs/heads/main by this push:
new 261c305b perf(c++): Evaluate the implementation effect &&simdutf
performs partial vectorization (#2033)
261c305b is described below
commit 261c305b944970d1a485b8f949371928dbfe68db
Author: PAN <[email protected]>
AuthorDate: Mon May 19 14:41:46 2025 +0800
perf(c++): Evaluate the implementation effect &&simdutf performs partial
vectorization (#2033)
<!--
**Thanks for contributing to Fury.**
**If this is your first time opening a PR on fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**
Contribution Checklist
- The **Apache Fury (incubating)** community has restrictions on the
naming of pr titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).
- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->
## What does this PR do?
Introduces simdutf to Fury, benchmarks it against Fury's own implementation,
and then chooses the better-performing implementation.
First, here are the initial benchmark results:

Here `_SIMD` represents the `Fury` implementation, and `_SIMDUTF`
represents the `simdutf` implementation.
As you can see, some APIs perform better, but not all of them, and this
may depend on the business workload. So I chose the APIs that performed
better, such as `isLatin1`.
The benchmark results after the improvement are as follows:

Here `_FURY` represents Fury's own implementation, and `_SIMDUTF`
represents the external `simdutf` library.
<!-- Describe the purpose of this PR. -->
## Related issues
#2013
<!--
Is there any related issue? Please attach here.
- #xxxx0
- #xxxx1
- #xxxx2
-->
## Does this PR introduce any user-facing change?
<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->
- [x] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
-->
---
.bazelrc | 6 ++
WORKSPACE | 9 ++
cpp/fury/benchmark/BUILD | 1 +
cpp/fury/benchmark/benchmark_string_util.cc | 147 ++++++++++++++++++++++++----
cpp/fury/thirdparty/BUILD | 8 ++
cpp/fury/util/BUILD | 3 +-
cpp/fury/util/string_util.h | 58 ++++++++++-
cpp/fury/util/string_util_test.cc | 3 -
8 files changed, 212 insertions(+), 23 deletions(-)
diff --git a/.bazelrc b/.bazelrc
index 2ab6340c..3b438712 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -41,3 +41,9 @@ build:macos --cxxopt="-std=c++17" --linkopt="-pthread"
build:clang-cl --cxxopt="-std=c++17"
build:windows --cxxopt="/std:c++17" --cxxopt="/Zc:preprocessor"
--cxxopt="/utf-8"
build:msvc --cxxopt="/std:c++17" --cxxopt="/Zc:preprocessor" --cxxopt="/utf-8"
+
+build --copt=-mavx
+build --copt=-mavx2
+build --copt=-mbmi
+build --copt=-mbmi2
+
diff --git a/WORKSPACE b/WORKSPACE
index 8ee10fe7..08cb0f41 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -25,6 +25,7 @@ load("@com_github_grpc_grpc//bazel:grpc_deps.bzl",
"grpc_deps")
load("@com_github_grpc_grpc//third_party/py:python_configure.bzl",
"python_configure")
load("//bazel/arrow:pyarrow_configure.bzl", "pyarrow_configure")
load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
# Add Benchmark
git_repository(
@@ -33,6 +34,14 @@ git_repository(
tag = "v1.9.1",
)
+# Add SIMDUTF
+http_archive(
+ name = "simdutf",
+ urls =
["https://github.com/simdutf/simdutf/releases/download/v6.1.2/singleheader.zip"],
+ sha256 =
"41bb25074fe1e917e96e539c7a87c502e530d88746d7c25d06fb55a28b884340",
+ build_file = "//cpp/fury/thirdparty:BUILD",
+)
+
bazel_skylib_workspace()
python_configure(name="local_config_python")
pyarrow_configure(name="local_config_pyarrow")
diff --git a/cpp/fury/benchmark/BUILD b/cpp/fury/benchmark/BUILD
index f6e3f697..0d7c10d2 100644
--- a/cpp/fury/benchmark/BUILD
+++ b/cpp/fury/benchmark/BUILD
@@ -10,6 +10,7 @@ cc_library(
deps = [
"//cpp/fury/util:fury_util",
"@com_google_benchmark//:benchmark",
+ "@simdutf//:simdutf"
],
visibility = ["//visibility:public"],
)
diff --git a/cpp/fury/benchmark/benchmark_string_util.cc
b/cpp/fury/benchmark/benchmark_string_util.cc
index 851cd3e5..e35a7c56 100644
--- a/cpp/fury/benchmark/benchmark_string_util.cc
+++ b/cpp/fury/benchmark/benchmark_string_util.cc
@@ -25,6 +25,7 @@
#include "fury/util/string_util.h"
+#include "simdutf.h"
#include <cstring>
#include <string>
@@ -217,6 +218,11 @@ bool isAscii_BaseLine(const std::string &str) {
return true;
}
+bool isAscii_SIMDUTF(const std::string &str) {
+ // Call the API directly without validation
+ return simdutf::validate_ascii(str.data(), str.size());
+}
+
// Benchmark function for Baseline ASCII check
static void BM_IsAscii_BaseLine(benchmark::State &state) {
for (auto _ : state) {
@@ -227,8 +233,22 @@ static void BM_IsAscii_BaseLine(benchmark::State &state) {
}
}
+BENCHMARK(BM_IsAscii_BaseLine);
+
+// Benchmark function for SIMDUTF ASCII check
+static void BM_IsAscii_SIMDUTF(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_ascii_strings) {
+ bool result = isAscii_SIMDUTF(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+BENCHMARK(BM_IsAscii_SIMDUTF);
+
// Benchmark function for SIMD ASCII check
-static void BM_IsAscii_SIMD(benchmark::State &state) {
+static void BM_IsAscii_FURY(benchmark::State &state) {
for (auto _ : state) {
for (const auto &str : test_ascii_strings) {
bool result = fury::isAscii(str);
@@ -237,8 +257,7 @@ static void BM_IsAscii_SIMD(benchmark::State &state) {
}
}
-BENCHMARK(BM_IsAscii_BaseLine);
-BENCHMARK(BM_IsAscii_SIMD);
+BENCHMARK(BM_IsAscii_FURY);
// Baseline implementation to check if a string is Latin-1
bool isLatin1_BaseLine(const std::u16string &str) {
@@ -254,6 +273,18 @@ bool isLatin1_BaseLine(const std::u16string &str) {
return true;
}
+bool isLatin1_SIMDUTF(const std::u16string &str) {
+ // Try the conversion directly, and all characters are considered Latin1 if
+ // they are successfully converted
+ size_t latin1_len = simdutf::latin1_length_from_utf16(str.size());
+ if (latin1_len != str.size())
+ return false;
+ std::string buffer(str.size(), '\0');
+ size_t converted =
+ simdutf::convert_utf16_to_latin1(str.data(), str.size(), buffer.data());
+ return converted == str.size();
+}
+
// Benchmark function for Baseline Latin-1 check
static void BM_IsLatin1_BaseLine(benchmark::State &state) {
for (auto _ : state) {
@@ -264,8 +295,22 @@ static void BM_IsLatin1_BaseLine(benchmark::State &state) {
}
}
+BENCHMARK(BM_IsLatin1_BaseLine);
+
+// Benchmark function for Optimized Latin-1 check
+static void BM_IsLatin1_SIMDUTF(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_latin1_strings) {
+ bool result = isLatin1_SIMDUTF(str);
+ benchmark::DoNotOptimize(result); // Prevent compiler optimization
+ }
+ }
+}
+
+BENCHMARK(BM_IsLatin1_SIMDUTF);
+
// Benchmark function for Optimized Latin-1 check
-static void BM_IsLatin1_SIMD(benchmark::State &state) {
+static void BM_IsLatin1_FURY(benchmark::State &state) {
for (auto _ : state) {
for (const auto &str : test_latin1_strings) {
bool result = fury::isLatin1(str);
@@ -274,8 +319,7 @@ static void BM_IsLatin1_SIMD(benchmark::State &state) {
}
}
-BENCHMARK(BM_IsLatin1_BaseLine);
-BENCHMARK(BM_IsLatin1_SIMD);
+BENCHMARK(BM_IsLatin1_FURY);
/*
* TEST Utf16HasSurrogatePairs
@@ -301,9 +345,11 @@ static void
BM_Utf16HasSurrogatePairs_BaseLine(benchmark::State &state) {
}
}
+BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine);
+
// Benchmark function for checking if a UTF-16 string contains surrogate pairs
// with SIMD
-static void BM_Utf16HasSurrogatePairs_SIMD(benchmark::State &state) {
+static void BM_Utf16HasSurrogatePairs_FURY(benchmark::State &state) {
for (auto _ : state) {
for (const auto &str : test_utf16_strings) {
bool result = fury::utf16HasSurrogatePairs(str);
@@ -311,8 +357,8 @@ static void BM_Utf16HasSurrogatePairs_SIMD(benchmark::State
&state) {
}
}
}
-BENCHMARK(BM_Utf16HasSurrogatePairs_BaseLine);
-BENCHMARK(BM_Utf16HasSurrogatePairs_SIMD);
+
+BENCHMARK(BM_Utf16HasSurrogatePairs_FURY);
/*
* TEST Utf16ToUtf8
@@ -350,6 +396,25 @@ std::string utf16ToUtf8BaseLine(const std::u16string
&utf16,
return utf8_result;
}
+std::string utf16ToUtf8_SIMDUTF(const std::u16string &utf16,
+ bool is_little_endian) {
+ if (utf16.empty())
+ return {};
+ size_t utf8_len =
+ is_little_endian
+ ? simdutf::utf8_length_from_utf16le(utf16.data(), utf16.size())
+ : simdutf::utf8_length_from_utf16be(utf16.data(), utf16.size());
+
+ std::string utf8_result(utf8_len, '\0');
+ size_t converted = is_little_endian
+ ? simdutf::convert_utf16le_to_utf8(
+ utf16.data(), utf16.size(), utf8_result.data())
+ : simdutf::convert_utf16be_to_utf8(
+ utf16.data(), utf16.size(), utf8_result.data());
+ utf8_result.resize(converted);
+ return utf8_result;
+}
+
// Benchmark function for Standard Library UTF-16 to UTF-8 conversion
static void BM_Utf16ToUtf8_StandardLibrary(benchmark::State &state) {
for (auto _ : state) {
@@ -361,6 +426,8 @@ static void BM_Utf16ToUtf8_StandardLibrary(benchmark::State
&state) {
}
}
+BENCHMARK(BM_Utf16ToUtf8_StandardLibrary);
+
// Benchmark function for Baseline UTF-16 to UTF-8 conversion
static void BM_Utf16ToUtf8_BaseLine(benchmark::State &state) {
for (auto _ : state) {
@@ -372,8 +439,23 @@ static void BM_Utf16ToUtf8_BaseLine(benchmark::State
&state) {
}
}
+BENCHMARK(BM_Utf16ToUtf8_BaseLine);
+
+// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion
+static void BM_Utf16ToUtf8_SIMDUTF(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf16_strings) {
+ std::string utf8 = utf16ToUtf8_SIMDUTF(str, true);
+ benchmark::DoNotOptimize(
+ utf8); // Prevents the compiler from optimizing away unused variables
+ }
+ }
+}
+
+BENCHMARK(BM_Utf16ToUtf8_SIMDUTF);
+
// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion
-static void BM_Utf16ToUtf8_SIMD(benchmark::State &state) {
+static void BM_Utf16ToUtf8_FURY(benchmark::State &state) {
for (auto _ : state) {
for (const auto &str : test_utf16_strings) {
std::string utf8 = fury::utf16ToUtf8(str, true);
@@ -383,9 +465,7 @@ static void BM_Utf16ToUtf8_SIMD(benchmark::State &state) {
}
}
-BENCHMARK(BM_Utf16ToUtf8_StandardLibrary);
-BENCHMARK(BM_Utf16ToUtf8_BaseLine);
-BENCHMARK(BM_Utf16ToUtf8_SIMD);
+BENCHMARK(BM_Utf16ToUtf8_FURY);
/*
* TEST Utf8ToUtf16
@@ -470,6 +550,25 @@ std::u16string utf8ToUtf16BaseLine(const std::string &utf8,
return utf16;
}
+std::u16string utf8ToUtf16_SIMDUTF(const std::string &utf8,
+ bool is_little_endian) {
+ if (utf8.empty())
+ return {};
+
+ size_t utf16_len = simdutf::utf16_length_from_utf8(utf8.data(), utf8.size());
+
+ std::u16string utf16_result(utf16_len, u'\0');
+
+ size_t converted = is_little_endian
+ ? simdutf::convert_utf8_to_utf16le(
+ utf8.data(), utf8.size(), utf16_result.data())
+ : simdutf::convert_utf8_to_utf16be(
+ utf8.data(), utf8.size(), utf16_result.data());
+
+ utf16_result.resize(converted);
+ return utf16_result;
+}
+
// Benchmark function for Standard Library UTF-8 to UTF-16 conversion
static void BM_Utf8ToUtf16_StandardLibrary(benchmark::State &state) {
for (auto _ : state) {
@@ -480,6 +579,7 @@ static void BM_Utf8ToUtf16_StandardLibrary(benchmark::State
&state) {
}
}
}
+BENCHMARK(BM_Utf8ToUtf16_StandardLibrary);
// Benchmark function for Baseline UTF-8 to UTF-16 conversion
static void BM_Utf8ToUtf16_BaseLine(benchmark::State &state) {
@@ -492,8 +592,23 @@ static void BM_Utf8ToUtf16_BaseLine(benchmark::State
&state) {
}
}
+BENCHMARK(BM_Utf8ToUtf16_BaseLine);
+
+// Benchmark function for SIMD-based UTF-8 to UTF-16 conversion
+static void BM_Utf8ToUtf16_SIMDUTF(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_utf8_strings) {
+ std::u16string utf16 = utf8ToUtf16_SIMDUTF(str, true);
+ benchmark::DoNotOptimize(
+ utf16); // Prevents the compiler from optimizing away unused
variables
+ }
+ }
+}
+
+BENCHMARK(BM_Utf8ToUtf16_SIMDUTF);
+
// Benchmark function for SIMD-based UTF-8 to UTF-16 conversion
-static void BM_Utf8ToUtf16_SIMD(benchmark::State &state) {
+static void BM_Utf8ToUtf16_FURY(benchmark::State &state) {
for (auto _ : state) {
for (const auto &str : test_utf8_strings) {
std::u16string utf16 = fury::utf8ToUtf16(str, true);
@@ -503,8 +618,6 @@ static void BM_Utf8ToUtf16_SIMD(benchmark::State &state) {
}
}
-BENCHMARK(BM_Utf8ToUtf16_StandardLibrary);
-BENCHMARK(BM_Utf8ToUtf16_BaseLine);
-BENCHMARK(BM_Utf8ToUtf16_SIMD);
+BENCHMARK(BM_Utf8ToUtf16_FURY);
BENCHMARK_MAIN();
diff --git a/cpp/fury/thirdparty/BUILD b/cpp/fury/thirdparty/BUILD
index f54e0035..0ae1f0fe 100644
--- a/cpp/fury/thirdparty/BUILD
+++ b/cpp/fury/thirdparty/BUILD
@@ -9,3 +9,11 @@ cc_library(
linkstatic=True,
visibility = ["//visibility:public"],
)
+
+cc_library(
+ name = "simdutf",
+ srcs = ["simdutf.cpp"],
+ hdrs = ["simdutf.h"],
+ includes = ["."],
+ visibility = ["//visibility:public"],
+)
diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD
index 36fe126b..9795f892 100644
--- a/cpp/fury/util/BUILD
+++ b/cpp/fury/util/BUILD
@@ -4,8 +4,6 @@ cc_library(
name = "fury_util",
srcs = glob(["*.cc"], exclude=["*test.cc"]),
hdrs = glob(["*.h"]),
- copts = ["-mavx2"], # Enable AVX2 support
- linkopts = ["-mavx2"], # Ensure linker also knows about AVX2
strip_include_prefix = "/cpp",
alwayslink=True,
linkstatic=True,
@@ -14,6 +12,7 @@ cc_library(
"@com_google_absl//absl/debugging:failure_signal_handler",
"@com_google_absl//absl/debugging:stacktrace",
"@com_google_absl//absl/debugging:symbolize",
+ "@simdutf//:simdutf"
],
visibility = ["//visibility:public"],
)
diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h
index 6a23ff10..9081f378 100644
--- a/cpp/fury/util/string_util.h
+++ b/cpp/fury/util/string_util.h
@@ -95,8 +95,64 @@ static inline bool hasSurrogatePairFallback(const uint16_t
*data, size_t size) {
}
return false;
}
+#if defined(FURY_HAS_IMMINTRIN)
-#if defined(FURY_HAS_NEON)
+inline bool isAscii(const char *data, size_t length) {
+ constexpr size_t VECTOR_SIZE = 32;
+ const auto *ptr = reinterpret_cast<const __m256i *>(data);
+ const auto *end = ptr + length / VECTOR_SIZE;
+ const __m256i mask = _mm256_set1_epi8(0x80);
+
+ for (; ptr < end; ++ptr) {
+ __m256i vec = _mm256_loadu_si256(ptr);
+ __m256i cmp = _mm256_and_si256(vec, mask);
+ if (!_mm256_testz_si256(cmp, cmp))
+ return false;
+ }
+
+ return isAsciiFallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE,
+ length % VECTOR_SIZE);
+}
+
+inline bool isLatin1(const uint16_t *data, size_t length) {
+ constexpr size_t VECTOR_SIZE = 16;
+ const auto *ptr = reinterpret_cast<const __m256i *>(data);
+ const auto *end = ptr + length / VECTOR_SIZE;
+
+ const __m256i mask = _mm256_set1_epi16(0x00FF);
+
+ for (; ptr < end; ++ptr) {
+ __m256i vec = _mm256_loadu_si256(ptr);
+ __m256i cmp = _mm256_cmpgt_epi16(vec, mask);
+ if (!_mm256_testz_si256(cmp, cmp)) {
+ return false;
+ }
+ }
+
+ return isLatin1Fallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE,
+ length % VECTOR_SIZE);
+}
+inline bool utf16HasSurrogatePairs(const uint16_t *data, size_t length) {
+ constexpr size_t VECTOR_SIZE = 16;
+ const auto *ptr = reinterpret_cast<const __m256i *>(data);
+ const auto *end = ptr + length / VECTOR_SIZE;
+ const __m256i lower_bound = _mm256_set1_epi16(0xD800);
+ const __m256i higher_bound = _mm256_set1_epi16(0xDFFF);
+
+ for (; ptr < end; ++ptr) {
+ __m256i vec = _mm256_loadu_si256(ptr);
+ __m256i mask1 = _mm256_cmpgt_epi16(vec, lower_bound);
+ __m256i mask2 = _mm256_cmpgt_epi16(higher_bound, vec);
+ __m256i result = _mm256_and_si256(mask1, mask2);
+ if (!_mm256_testz_si256(result, result))
+ return true;
+ }
+
+ return hasSurrogatePairFallback(data + (length / VECTOR_SIZE) * VECTOR_SIZE,
+ length % VECTOR_SIZE);
+}
+
+#elif defined(FURY_HAS_NEON)
inline bool isAscii(const char *data, size_t length) {
size_t i = 0;
uint8x16_t mostSignificantBit = vdupq_n_u8(0x80);
diff --git a/cpp/fury/util/string_util_test.cc
b/cpp/fury/util/string_util_test.cc
index 2f267e14..9d5c9ddf 100644
--- a/cpp/fury/util/string_util_test.cc
+++ b/cpp/fury/util/string_util_test.cc
@@ -88,9 +88,6 @@ TEST(StringUtilTest, TestisLatin1) {
EXPECT_FALSE(isLatin1(u"Javaone Keynote\u1234"));
EXPECT_TRUE(isLatin1(u"a\xFF")); // ÿ in Latin-1
EXPECT_TRUE(isLatin1(u"\x80")); // in Latin-1
- const uint16_t str[] = {256, 256};
- EXPECT_FALSE(isLatin1(str, 2)); // Ā (not in Latin-1)
-
for (size_t i = 1; i < 256; i++) {
EXPECT_TRUE(isLatin1(std::u16string(i, '.') + u"Fury"));
EXPECT_FALSE(isLatin1(std::u16string(i, '.') + u"序列化"));
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]