This is an automated email from the ASF dual-hosted git repository.
jbarrett pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode-native.git
The following commit(s) were added to refs/heads/develop by this push:
new f040049 GEODE-6218: Improves UTF-8 hashing function performance.
(#424)
f040049 is described below
commit f040049fc4107cd25eeb8f663c53080ce914bfb1
Author: Jacob Barrett <[email protected]>
AuthorDate: Wed Dec 19 13:05:14 2018 -0800
GEODE-6218: Improves UTF-8 hashing function performance. (#424)
* Update benchmark to measure different Unicode character widths.
---
cppcache/benchmark/GeodeHashBM.cpp | 51 ++++++++++++++++++++------
cppcache/include/geode/internal/functional.hpp | 43 +++++++++++++++++++++-
cppcache/src/util/functional.cpp | 39 --------------------
cppcache/test/util/functionalTests.cpp | 8 ++++
4 files changed, 89 insertions(+), 52 deletions(-)
diff --git a/cppcache/benchmark/GeodeHashBM.cpp
b/cppcache/benchmark/GeodeHashBM.cpp
index 9bb5cb0..2aa8c9d 100644
--- a/cppcache/benchmark/GeodeHashBM.cpp
+++ b/cppcache/benchmark/GeodeHashBM.cpp
@@ -22,24 +22,51 @@
#include "util/string.hpp"
using apache::geode::client::to_utf16;
+using apache::geode::client::to_utf8;
using apache::geode::client::internal::geode_hash;
-class GeodeHashBM : public benchmark::Fixture {};
+template <class ToString, class FromString>  // primary template: only declared in this hunk; explicit specializations provide the conversions
+ToString convert(const FromString& from);  // returns `from` converted to the ToString string type
-BENCHMARK_DEFINE_F(GeodeHashBM, std_string)(benchmark::State& state) {
- std::string x(state.range(0), 'x');
- for (auto _ : state) {
- int hashcode;
- benchmark::DoNotOptimize(hashcode = geode_hash<std::string>{}(x));
- }
+template <>  // specialization: std::u32string -> std::string
+std::string convert(const std::u32string& from) {
+ return to_utf8(from);  // delegates to apache::geode::client::to_utf8
+}
+
+template <>  // specialization: std::u32string -> std::u16string
+std::u16string convert(const std::u32string& from) {
+ return to_utf16(from);  // delegates to apache::geode::client::to_utf16
}
-BENCHMARK_REGISTER_F(GeodeHashBM, std_string)->Range(8, 8 << 10);
-BENCHMARK_DEFINE_F(GeodeHashBM, std_u16string)(benchmark::State& state) {
- std::u16string x(state.range(0), u'x');
+template <class String, char32_t UnicodeChar>  // String: string type to hash; UnicodeChar: code point repeated to build the input
+void GeodeHashBM(benchmark::State& state) {
+ const std::u32string u32String(state.range(0), UnicodeChar);  // state.range(0) copies of UnicodeChar
+ const String string = convert<String>(u32String);  // converted once, before the benchmark loop starts
+
for (auto _ : state) {
int hashcode;
- benchmark::DoNotOptimize(hashcode = geode_hash<std::u16string>{}(x));
+ benchmark::DoNotOptimize(hashcode = geode_hash<String>{}(string));  // DoNotOptimize keeps the hash call from being optimized away
}
}
-BENCHMARK_REGISTER_F(GeodeHashBM, std_u16string)->Range(8, 8 << 10);
+
+constexpr char32_t LATIN_CAPITAL_LETTER_C = U'\U00000043';  // U+0043: 1 byte in UTF-8, 1 UTF-16 code unit
+constexpr char32_t INVERTED_EXCLAMATION_MARK = U'\U000000A1';  // U+00A1: 2 bytes in UTF-8, 1 UTF-16 code unit
+constexpr char32_t SAMARITAN_PUNCTUATION_ZIQAA = U'\U00000838';  // U+0838: 3 bytes in UTF-8, 1 UTF-16 code unit
+constexpr char32_t LINEAR_B_SYLLABLE_B008_A = U'\U00010000';  // U+10000: 4 bytes in UTF-8, surrogate pair in UTF-16
+
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, LATIN_CAPITAL_LETTER_C)  // one registration per (string type, code point) pair
+ ->Range(8, 8 << 10);  // state.range(0) spans 8 .. 8192 characters
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, LATIN_CAPITAL_LETTER_C)
+ ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, INVERTED_EXCLAMATION_MARK)
+ ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, INVERTED_EXCLAMATION_MARK)
+ ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, SAMARITAN_PUNCTUATION_ZIQAA)
+ ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, SAMARITAN_PUNCTUATION_ZIQAA)
+ ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::string, LINEAR_B_SYLLABLE_B008_A)
+ ->Range(8, 8 << 10);
+BENCHMARK_TEMPLATE(GeodeHashBM, std::u16string, LINEAR_B_SYLLABLE_B008_A)
+ ->Range(8, 8 << 10);
diff --git a/cppcache/include/geode/internal/functional.hpp
b/cppcache/include/geode/internal/functional.hpp
index 7cb4377..6fde1d6 100644
--- a/cppcache/include/geode/internal/functional.hpp
+++ b/cppcache/include/geode/internal/functional.hpp
@@ -104,7 +104,48 @@ struct geode_hash<std::u16string> {
*/
template <>
struct geode_hash<std::string> {
- int32_t operator()(const std::string& val);
+ inline int32_t operator()(const std::string& val) {  // decodes val as UTF-8 and folds each resulting UTF-16 code unit into hash = 31 * hash + unit
+ int32_t hash = 0;
+
+ for (auto&& it = val.cbegin(); it < val.cend(); it++) {  // one decoded code point per iteration; `it` is also advanced inside the body
+ auto cp = static_cast<uint32_t>(0xff & *it);  // lead byte, masked so a negative char does not sign-extend
+ if (cp < 0x80) {
+ // 1 byte: cp is already the code point
+ } else if ((cp >> 5) == 0x6) {
+ // 2 bytes: lead byte matches 110xxxxx
+ ++it;  // NOTE(review): no check against cend() before the dereference below; a truncated sequence would read past the end - confirm inputs are always well-formed
+ cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);  // 5 payload bits from the lead byte, 6 from the continuation byte
+ } else if ((cp >> 4) == 0xe) {
+ // 3 bytes: lead byte matches 1110xxxx
+ ++it;
+ cp = ((cp << 12) & 0xffff) + (((0xff & *it) << 6) & 0xfff);  // 4 bits from the lead byte, 6 from the first continuation byte
+ ++it;
+ cp += (*it) & 0x3f;  // low 6 bits from the second continuation byte
+ } else if ((cp >> 3) == 0x1e) {
+ // 4 bytes: lead byte matches 11110xxx
+ ++it;
+ cp = ((cp << 18) & 0x1fffff) + (((0xff & *it) << 12) & 0x3ffff);  // 3 bits from the lead byte, 6 from the first continuation byte
+ ++it;
+ cp += ((0xff & *it) << 6) & 0xfff;  // 6 bits from the second continuation byte
+ ++it;
+ cp += (*it) & 0x3f;  // low 6 bits from the third continuation byte
+ } else {
+ // TODO throw exception (for now an invalid lead byte keeps its raw byte value in cp and is hashed below)
+ }
+
+ if (cp > 0xffff) {
+ // surrogate pair: code points above U+FFFF contribute two UTF-16 code units
+ hash = 31 * hash +
+ static_cast<uint16_t>((cp >> 10) + (0xD800 - (0x10000 >> 10)));  // high (lead) surrogate
+ hash = 31 * hash + static_cast<uint16_t>((cp & 0x3ff) + 0xdc00u);  // low (trail) surrogate
+ } else {
+ // single code unit
+ hash = 31 * hash + cp;  // NOTE(review): 31 * hash is signed 32-bit arithmetic; overflow is undefined behavior in C++ - presumably meant to wrap like Java's String.hashCode, confirm
+ }
+ }
+
+ return hash;
+ }
};
} // namespace internal
diff --git a/cppcache/src/util/functional.cpp b/cppcache/src/util/functional.cpp
deleted file mode 100644
index c526eaf..0000000
--- a/cppcache/src/util/functional.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <codecvt>
-#include <locale>
-#include <string>
-
-#include <geode/internal/functional.hpp>
-
-#include "string.hpp"
-
-namespace apache {
-namespace geode {
-namespace client {
-namespace internal {
-
-int32_t geode_hash<std::string>::operator()(const std::string& val) {
- // TODO string optimize without conversion to UTF-16
- return geode_hash<std::u16string>{}(to_utf16(val));
-}
-
-} // namespace internal
-} // namespace client
-} // namespace geode
-} // namespace apache
diff --git a/cppcache/test/util/functionalTests.cpp
b/cppcache/test/util/functionalTests.cpp
index f4e0426..9c18bae 100644
--- a/cppcache/test/util/functionalTests.cpp
+++ b/cppcache/test/util/functionalTests.cpp
@@ -32,4 +32,12 @@ TEST(string, geode_hash) {
EXPECT_EQ(48, hash("0"));
EXPECT_EQ(57, hash("9"));
EXPECT_EQ(1077910243, hash("supercalifragilisticexpialidocious"));
+
+ EXPECT_EQ(1544552287, hash("You had me at meat tornad\u00F6!\U000F0000"));
+
+ auto str = std::string("You had me at");
+ str.push_back(0);
+ str.append("meat tornad\u00F6!\U000F0000");
+
+ EXPECT_EQ(701776767, hash(str));
}