This is an automated email from the ASF dual-hosted git repository.
chaokunyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/fury.git
The following commit(s) were added to refs/heads/main by this push:
new a5f7e74e feat(c++): Add benchmark for performance evaluation (#2023)
a5f7e74e is described below
commit a5f7e74ef95562fc7c322c2b20639a3f10f91b05
Author: PAN <[email protected]>
AuthorDate: Sun Jan 26 07:42:19 2025 +0800
feat(c++): Add benchmark for performance evaluation (#2023)
<!--
**Thanks for contributing to Fury.**
**If this is your first time opening a PR on fury, you can refer to
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).**
Contribution Checklist
- The **Apache Fury (incubating)** community has restrictions on the
naming of PR titles. You can also find instructions in
[CONTRIBUTING.md](https://github.com/apache/fury/blob/main/CONTRIBUTING.md).
- Fury has a strong focus on performance. If the PR you submit will have
an impact on performance, please benchmark it first and provide the
benchmark result here.
-->
## What does this PR do?
When improving the project's functionality, it is difficult to judge the
quality of a new feature if there is no appropriate benchmark to test it
with.
So I decided to add a new benchmark to enhance FuryCpp:
<img width="445" alt="image"
src="https://github.com/user-attachments/assets/c496bc6f-ca43-437e-961a-ba8a8c71b027"
/>
This is a preliminary plan. Do you have any better suggestions for using
benchmark.cc?
As for using SIMD to enhance FuryCpp, I will implement that in the next
PR.
<!-- Describe the purpose of this PR. -->
## Related issues
Related to #2022
<!--
Is there any related issue? Please attach here.
- #xxxx0
- #xxxx1
- #xxxx2
-->
## Does this PR introduce any user-facing change?
<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->
- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?
## Benchmark
Benchmark framework: google/benchmark,
version v1.9.1.
<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it will have impact on performance, the code reviewer will explain
it), be sure to attach a benchmark data here.
-->
---
WORKSPACE | 9 +++
cpp/fury/util/BUILD | 12 +++-
cpp/fury/util/benchmark.cc | 119 ++++++++++++++++++++++++++++++++++++++
cpp/fury/util/string_util_test.cc | 99 -------------------------------
4 files changed, 139 insertions(+), 100 deletions(-)
diff --git a/WORKSPACE b/WORKSPACE
index 0df98cdc..8ee10fe7 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -24,6 +24,15 @@ load("@bazel_skylib//:workspace.bzl",
"bazel_skylib_workspace")
load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
load("@com_github_grpc_grpc//third_party/py:python_configure.bzl",
"python_configure")
load("//bazel/arrow:pyarrow_configure.bzl", "pyarrow_configure")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository")
+
+# Add Benchmark
+git_repository(
+ name = "com_google_benchmark",
+ remote = "https://github.com/google/benchmark.git",
+ tag = "v1.9.1",
+)
+
bazel_skylib_workspace()
python_configure(name="local_config_python")
pyarrow_configure(name="local_config_pyarrow")
diff --git a/cpp/fury/util/BUILD b/cpp/fury/util/BUILD
index 8f605dc7..073c0e06 100644
--- a/cpp/fury/util/BUILD
+++ b/cpp/fury/util/BUILD
@@ -2,7 +2,7 @@ load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
cc_library(
name = "fury_util",
- srcs = glob(["*.cc"], exclude=["*test.cc"]),
+ srcs = glob(["*.cc"], exclude=["*test.cc", "benchmark.cc"]),
hdrs = glob(["*.h"]),
copts = ["-mavx2"], # Enable AVX2 support
linkopts = ["-mavx2"], # Ensure linker also knows about AVX2
@@ -14,6 +14,7 @@ cc_library(
"@com_google_absl//absl/debugging:failure_signal_handler",
"@com_google_absl//absl/debugging:stacktrace",
"@com_google_absl//absl/debugging:symbolize",
+ "@com_google_benchmark//:benchmark",
],
visibility = ["//visibility:public"],
)
@@ -62,4 +63,13 @@ cc_test(
":fury_util",
"@com_google_googletest//:gtest",
],
+)
+
+cc_test(
+ name = "benchmark",
+ srcs = ["benchmark.cc"],
+ deps = [
+ ":fury_util",
+ "@com_google_benchmark//:benchmark",
+ ],
)
\ No newline at end of file
diff --git a/cpp/fury/util/benchmark.cc b/cpp/fury/util/benchmark.cc
new file mode 100644
index 00000000..e911a70c
--- /dev/null
+++ b/cpp/fury/util/benchmark.cc
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <benchmark/benchmark.h>
+
+#include <codecvt>
+#include <locale>
+#include <random>
+
+#include "string_util.h"
+
+#include <cstring>
+#include <string>
+
+// Function to generate a random UTF-16 string
+std::u16string generateRandomUTF16String(size_t length) {
+ const char charset[] =
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+ std::default_random_engine rng(std::random_device{}());
+ std::uniform_int_distribution<> dist(0, sizeof(charset) - 2);
+
+ std::u16string result;
+ result.reserve(length);
+ for (size_t i = 0; i < length; ++i) {
+ result += static_cast<char16_t>(charset[dist(rng)]);
+ }
+
+ return result;
+}
+
+std::vector<std::u16string> generateUTF16String(size_t num_tests) {
+ std::vector<std::u16string> test_strings;
+ for (size_t i = 0; i < num_tests; ++i) {
+ test_strings.push_back(generateRandomUTF16String(num_tests));
+ }
+ return test_strings;
+}
+
+const std::vector<std::u16string> test_strings = generateUTF16String(1000);
+
+// UTF16 to UTF8 using the standard library
+std::string utf16ToUtf8StandardLibrary(const std::u16string &utf16) {
+ std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
+ return convert.to_bytes(utf16);
+}
+
+// UTF16 to UTF8 baseline conversion (without SIMD)
+std::string utf16ToUtf8BaseLine(const std::u16string &utf16,
+ bool is_little_endian = true) {
+ size_t utf16_length = utf16.length();
+ size_t utf8_length = utf16_length * 3;
+ std::string utf8_result(utf8_length, '\0');
+
+ size_t i = 0, j = 0;
+ while (i < utf16_length) {
+ char16_t utf16_char = utf16[i++];
+ if (utf16_char < 0x80) {
+ utf8_result[j++] = static_cast<char>(utf16_char);
+ } else if (utf16_char < 0x800) {
+ utf8_result[j++] = static_cast<char>(0xC0 | (utf16_char >> 6));
+ utf8_result[j++] = static_cast<char>(0x80 | (utf16_char & 0x3F));
+ } else {
+ utf8_result[j++] = static_cast<char>(0xE0 | (utf16_char >> 12));
+ utf8_result[j++] = static_cast<char>(0x80 | ((utf16_char >> 6) & 0x3F));
+ utf8_result[j++] = static_cast<char>(0x80 | (utf16_char & 0x3F));
+ }
+ }
+
+ utf8_result.resize(j);
+ return utf8_result;
+}
+
+// Benchmark function for Standard Library UTF-16 to UTF-8 conversion
+static void BM_StandardLibrary(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_strings) {
+ std::string utf8 = utf16ToUtf8StandardLibrary(str);
+ }
+ }
+}
+
+// Benchmark function for Baseline UTF-16 to UTF-8 conversion
+static void BM_BaseLine(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_strings) {
+ std::string utf8 = utf16ToUtf8BaseLine(str, true);
+ }
+ }
+}
+
+// Benchmark function for SIMD-based UTF-16 to UTF-8 conversion
+static void BM_SIMD(benchmark::State &state) {
+ for (auto _ : state) {
+ for (const auto &str : test_strings) {
+ std::string utf8 = fury::utf16ToUtf8(str, true);
+ }
+ }
+}
+
+BENCHMARK(BM_StandardLibrary);
+BENCHMARK(BM_BaseLine);
+BENCHMARK(BM_SIMD);
+BENCHMARK_MAIN();
diff --git a/cpp/fury/util/string_util_test.cc
b/cpp/fury/util/string_util_test.cc
index 220fa544..f2a2fdab 100644
--- a/cpp/fury/util/string_util_test.cc
+++ b/cpp/fury/util/string_util_test.cc
@@ -174,44 +174,6 @@ TEST(StringUtilTest, TestUtf16HasSurrogatePairs) {
utf16HasSurrogatePairs(generateRandomUTF16String(300) + u"性能好"));
}
-std::string utf16ToUtf8BaseLine(const std::u16string &utf16,
- bool is_little_endian) {
- std::string utf8;
- utf8.reserve(utf16.size() *
- 3); // Reserve enough space to avoid frequent reallocations
-
- size_t i = 0;
- size_t n = utf16.size();
- char buffer[4]; // Buffer to hold temporary UTF-8 bytes
- char *output = buffer;
-
- while (i < n) {
- uint16_t code_unit = utf16[i];
- if (!is_little_endian) {
- code_unit = swapBytes(code_unit);
- }
- if (i + 1 < n && code_unit >= 0xD800 && code_unit <= 0xDBFF &&
- utf16[i + 1] >= 0xDC00 && utf16[i + 1] <= 0xDFFF) {
- // Surrogate pair
- uint16_t high = code_unit;
- uint16_t low = utf16[i + 1];
- if (!is_little_endian) {
- low = swapBytes(low);
- }
- utf16SurrogatePairToUtf8(high, low, output);
- utf8.append(buffer, output - buffer);
- output = buffer;
- ++i;
- } else {
- utf16ToUtf8(code_unit, output);
- utf8.append(buffer, output - buffer);
- output = buffer;
- }
- ++i;
- }
- return utf8;
-}
-
// Testing Basic Logic
TEST(UTF16ToUTF8Test, BasicConversion) {
std::u16string utf16 = u"Hello, 世界!";
@@ -262,67 +224,6 @@ TEST(UTF16ToUTF8Test, BigEndian) {
ASSERT_EQ(utf8, "\xEF\xBF\xBE\xEF\xBF\xBE");
}
-// Testing Performance
-TEST(UTF16ToUTF8Test, PerformanceTest) {
- const size_t num_tests = 1000;
- const size_t string_length = 1000;
- // Default little_endian
- bool is_little_endian = true;
-
- // Random UTF-16
- std::vector<std::u16string> test_strings;
- for (size_t i = 0; i < num_tests; ++i) {
- test_strings.push_back(generateRandomUTF16String(string_length));
- }
-
- // Lib
- try {
- auto start_time = std::chrono::high_resolution_clock::now();
- for (const auto &str : test_strings) {
- std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>
convert;
- std::string utf8 = convert.to_bytes(str);
- }
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "Standard library Running Time: " << duration
- << " ns";
- } catch (const std::exception &e) {
- FURY_LOG(FURY_FATAL) << "Caught exception: " << e.what();
- }
-
- // BaseLine
- try {
- auto start_time = std::chrono::high_resolution_clock::now();
- for (const auto &str : test_strings) {
- std::string utf8 = utf16ToUtf8BaseLine(str, is_little_endian);
- }
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "Baseline Running Time: " << duration << " ns";
- } catch (const std::exception &e) {
- FURY_LOG(FURY_FATAL) << "Caught exception: " << e.what();
- }
-
- // SIMD
- try {
- auto start_time = std::chrono::high_resolution_clock::now();
- for (const auto &str : test_strings) {
- std::string utf8 = fury::utf16ToUtf8(str, is_little_endian);
- }
- auto end_time = std::chrono::high_resolution_clock::now();
- auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(
- end_time - start_time)
- .count();
- FURY_LOG(FURY_INFO) << "SIMD Running Time: " << duration << " ns";
- } catch (const std::exception &e) {
- FURY_LOG(FURY_FATAL) << "Caught exception: " << e.what();
- }
-}
-
// Generate random UTF-8 string
std::string generateRandomUTF8String(size_t length) {
std::string str;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]