This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new f89879d8c ORC-1950: [C++] Replace std::unordered_map with google dense_hash_map in SortedStringDictionary and remove reorder to improve write performance of dict-encoding columns
f89879d8c is described below
commit f89879d8c9a5ca4515af20dfa41c7830eee2204d
Author: taiyang-li <[email protected]>
AuthorDate: Tue Jul 15 09:13:22 2025 -0700
ORC-1950: [C++] Replace std::unordered_map with google dense_hash_map in SortedStringDictionary and remove reorder to improve write performance of dict-encoding columns
### What changes were proposed in this pull request?
Replace std::unordered_map with google dense_hash_map in SortedStringDictionary and remove the reorder step to improve write performance of dictionary-encoded columns.
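A minimal sketch of the pattern adopted here (not part of the patch; standard headers plus sparsehash-c11, assumed to be on the include path): owned strings live at stable heap addresses behind std::unique_ptr, and a google::dense_hash_map keyed by std::string_view into those copies deduplicates on insert. Note that dense_hash_map requires set_empty_key() before first use, and the reserved key must never be inserted as a real key:

```cpp
#include <sparsehash/dense_hash_map>

#include <memory>
#include <string>
#include <string_view>
#include <vector>

int main() {
  // dense_hash_map needs a reserved "empty" key before any insert/find.
  google::dense_hash_map<std::string_view, size_t> keyToIndex;
  keyToIndex.set_empty_key(std::string_view{});

  // unique_ptr keeps each string's address stable when the vector
  // reallocates, so the string_view keys stay valid.
  std::vector<std::unique_ptr<std::string>> dict;

  auto insert = [&](const char* str, size_t len) -> size_t {
    auto it = keyToIndex.find(std::string_view{str, len});
    if (it != keyToIndex.end()) {
      return it->second;  // duplicate: return its insertion-order index
    }
    size_t index = dict.size();
    dict.emplace_back(std::make_unique<std::string>(str, len));
    const auto& owned = *dict.back();
    keyToIndex.emplace(std::string_view{owned.data(), owned.size()}, index);
    return index;
  };

  insert("a", 1);  // -> 0
  insert("b", 1);  // -> 1
  insert("a", 1);  // -> 0 (deduplicated)
  return 0;
}
```

This is also why DictEntry now stores std::unique_ptr<std::string> rather than a raw pointer/length pair: the map's string_view keys point into those heap copies.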
### Why are the changes needed?
For performance improvement.
### How was this patch tested?
Pass the CIs.
I tested it in ClickHouse. Here are the results:
```sql
Query: select concat('gluten ', cast(rand()%1000 as String)) from numbers(10000000) into outfile 'dict.orc' truncate;
-- Enable dictionary encoding before optimization
set max_threads = 1;
set output_format_orc_dictionary_key_size_threshold = 1;
10000000 rows in set. Elapsed: 0.760 sec. Processed 10.00 million rows, 80.00 MB (13.16 million rows/s., 105.31 MB/s.)
10000000 rows in set. Elapsed: 0.796 sec. Processed 10.00 million rows, 80.00 MB (12.56 million rows/s., 100.51 MB/s.)
10000000 rows in set. Elapsed: 0.803 sec. Processed 10.00 million rows, 80.00 MB (12.45 million rows/s., 99.63 MB/s.)

-- Enable dictionary encoding after optimization
set max_threads = 1;
set output_format_orc_dictionary_key_size_threshold = 1;
10000000 rows in set. Elapsed: 0.645 sec. Processed 10.00 million rows, 80.00 MB (15.50 million rows/s., 124.00 MB/s.)
10000000 rows in set. Elapsed: 0.622 sec. Processed 10.00 million rows, 80.00 MB (16.08 million rows/s., 128.64 MB/s.)
10000000 rows in set. Elapsed: 0.631 sec. Processed 10.00 million rows, 80.00 MB (15.85 million rows/s., 126.83 MB/s.)

-- Disable dictionary encoding
set max_threads = 1;
set output_format_orc_dictionary_key_size_threshold = 0;
10000000 rows in set. Elapsed: 0.707 sec. Processed 10.00 million rows, 80.00 MB (14.14 million rows/s., 113.15 MB/s.)
10000000 rows in set. Elapsed: 0.671 sec. Processed 10.00 million rows, 80.00 MB (14.90 million rows/s., 119.17 MB/s.)
10000000 rows in set. Elapsed: 0.727 sec. Processed 10.00 million rows, 80.00 MB (13.75 million rows/s., 109.98 MB/s.)
```
We can conclude that:
- Dictionary-encoded writing speeds up by 1.24x.
- Dictionary-encoded writing is now 1.11x faster than plain-encoded writing.
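For reference, these ratios follow from the average elapsed times above: dictionary-encoded writes average 0.786 sec before and 0.633 sec after the optimization (0.786 / 0.633 ≈ 1.24x), while plain-encoded writes average 0.702 sec (0.702 / 0.633 ≈ 1.11x).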
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #2321 from taiyang-li/opt_orc_dict.
Authored-by: taiyang-li <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/meson.build | 1 +
c++/src/CMakeLists.txt | 1 +
c++/src/ColumnWriter.cc | 113 ++++++---------------
c++/src/meson.build | 1 +
c++/test/CMakeLists.txt | 1 +
c++/test/meson.build | 1 +
cmake_modules/ThirdpartyToolchain.cmake | 40 ++++++++
c++/meson.build => subprojects/sparsehash-c11.wrap | 33 +++---
8 files changed, 90 insertions(+), 101 deletions(-)
diff --git a/c++/meson.build b/c++/meson.build
index de44b1a88..216d7e563 100644
--- a/c++/meson.build
+++ b/c++/meson.build
@@ -21,6 +21,7 @@ lz4_dep = dependency('liblz4')
snappy_dep = dependency('snappy')
zlib_dep = dependency('zlib')
zstd_dep = dependency('libzstd')
+sparsehash_c11_dep = dependency('sparsehash-c11')
# optional dependencies (should be set later in configuration)
gtest_dep = disabler()
diff --git a/c++/src/CMakeLists.txt b/c++/src/CMakeLists.txt
index 09a0b148e..b8a168307 100644
--- a/c++/src/CMakeLists.txt
+++ b/c++/src/CMakeLists.txt
@@ -212,6 +212,7 @@ target_link_libraries (orc
$<BUILD_INTERFACE:orc::snappy>
$<BUILD_INTERFACE:orc::lz4>
$<BUILD_INTERFACE:orc::zstd>
+ $<BUILD_INTERFACE:orc::sparsehash>
$<BUILD_INTERFACE:${LIBHDFSPP_LIBRARIES}>
)
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index c99890b88..915277ef4 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -29,6 +29,8 @@
#include "Timezone.hh"
#include "Utils.hh"
+#include <sparsehash/dense_hash_map>
+
namespace orc {
StreamsFactory::~StreamsFactory() {
// PASS
@@ -932,19 +934,15 @@ namespace orc {
class SortedStringDictionary {
public:
struct DictEntry {
- DictEntry(const char* str, size_t len) : data(str), length(len) {}
- const char* data;
- size_t length;
- };
+ DictEntry(const char* str, size_t len) : data(std::make_unique<std::string>(str, len)) {}
- struct DictEntryWithIndex {
- DictEntryWithIndex(const char* str, size_t len, size_t index)
- : entry(str, len), index(index) {}
- DictEntry entry;
- size_t index;
+ std::unique_ptr<std::string> data;
};
- SortedStringDictionary() : totalLength_(0) {}
+ SortedStringDictionary() : totalLength_(0) {
+ /// Need to set empty key otherwise dense_hash_map will not work correctly
+ keyToIndex_.set_empty_key(std::string_view{});
+ }
// insert a new string into dictionary, return its insertion order
size_t insert(const char* str, size_t len);
@@ -952,11 +950,8 @@ namespace orc {
// write dictionary data & length to output buffer
void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder) const;
- // reorder input index buffer from insertion order to dictionary order
- void reorder(std::vector<int64_t>& idxBuffer) const;
-
// get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+ const std::vector<DictEntry>& getEntriesInInsertionOrder() const;
// return count of entries
size_t size() const;
@@ -967,20 +962,11 @@ namespace orc {
void clear();
private:
- struct LessThan {
- bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& r) {
- const auto& left = l.entry;
- const auto& right = r.entry;
- int ret = memcmp(left.data, right.data, std::min(left.length, right.length));
- if (ret != 0) {
- return ret < 0;
- }
- return left.length < right.length;
- }
- };
+ // store dictionary entries in insertion order
+ mutable std::vector<DictEntry> flatDict_;
- mutable std::vector<DictEntryWithIndex> flatDict_;
- std::unordered_map<std::string, size_t> keyToIndex_;
+ // map from string to its insertion order index
+ google::dense_hash_map<std::string_view, size_t> keyToIndex_;
uint64_t totalLength_;
// use friend class here to avoid being bothered by const function calls
@@ -994,61 +980,33 @@ namespace orc {
// insert a new string into dictionary, return its insertion order
size_t SortedStringDictionary::insert(const char* str, size_t len) {
size_t index = flatDict_.size();
- auto ret = keyToIndex_.emplace(std::string(str, len), index);
- if (ret.second) {
- flatDict_.emplace_back(ret.first->first.data(), ret.first->first.size(), index);
+
+ auto it = keyToIndex_.find(std::string_view{str, len});
+ if (it != keyToIndex_.end()) {
+ return it->second;
+ } else {
+ flatDict_.emplace_back(str, len);
totalLength_ += len;
+
+ const auto& lastEntry = flatDict_.back();
+ keyToIndex_.emplace(std::string_view{lastEntry.data->data(), lastEntry.data->size()}, index);
+ return index;
}
- return ret.first->second;
}
// write dictionary data & length to output buffer
void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
RleEncoder* lengthEncoder) const {
- std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
-
- for (const auto& entryWithIndex : flatDict_) {
- const auto& entry = entryWithIndex.entry;
- dataStream->write(entry.data, entry.length);
- lengthEncoder->write(static_cast<int64_t>(entry.length));
- }
- }
-
- /**
- * Reorder input index buffer from insertion order to dictionary order
- *
- * We require this function because string values are buffered by indexes
- * in their insertion order. Until the entire dictionary is complete can
- * we get their sorted indexes in the dictionary in that ORC specification
- * demands dictionary should be ordered. Therefore this function transforms
- * the indexes from insertion order to dictionary value order for final
- * output.
- */
- void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
- // iterate the dictionary to get mapping from insertion order to value order
- std::vector<size_t> mapping(flatDict_.size());
- for (size_t i = 0; i < flatDict_.size(); ++i) {
- mapping[flatDict_[i].index] = i;
- }
-
- // do the transformation
- for (size_t i = 0; i != idxBuffer.size(); ++i) {
- idxBuffer[i] = static_cast<int64_t>(mapping[static_cast<size_t>(idxBuffer[i])]);
+ for (const auto& entry : flatDict_) {
+ dataStream->write(entry.data->data(), entry.data->size());
+ lengthEncoder->write(static_cast<int64_t>(entry.data->size()));
}
}
// get dict entries in insertion order
- void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry*>& entries) const {
- std::sort(flatDict_.begin(), flatDict_.end(),
- [](const DictEntryWithIndex& left, const DictEntryWithIndex& right) {
- return left.index < right.index;
- });
-
- entries.resize(flatDict_.size());
- for (size_t i = 0; i < flatDict_.size(); ++i) {
- entries[i] = &(flatDict_[i].entry);
- }
+ const std::vector<SortedStringDictionary::DictEntry>&
+ SortedStringDictionary::getEntriesInInsertionOrder() const {
+ return flatDict_;
}
// return count of entries
@@ -1366,9 +1324,6 @@ namespace orc {
// flush dictionary data & length streams
dictionary.flush(dictStream.get(), dictLengthEncoder.get());
- // convert index from insertion order to dictionary order
- dictionary.reorder(dictionary.idxInDictBuffer_);
-
// write data sequences
int64_t* data = dictionary.idxInDictBuffer_.data();
if (enableIndex) {
@@ -1412,16 +1367,14 @@ namespace orc {
}
// get dictionary entries in insertion order
- std::vector<const SortedStringDictionary::DictEntry*> entries;
- dictionary.getEntriesInInsertionOrder(entries);
+ const auto& entries = dictionary.getEntriesInInsertionOrder();
// store each length of the data into a vector
- const SortedStringDictionary::DictEntry* dictEntry = nullptr;
for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
- dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
- directDataStream->write(dictEntry->data, dictEntry->length);
- directLengthEncoder->write(static_cast<int64_t>(dictEntry->length));
+ const auto& dictEntry = entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
+ directDataStream->write(dictEntry.data->data(), dictEntry.data->size());
+ directLengthEncoder->write(static_cast<int64_t>(dictEntry.data->size()));
}
deleteDictStreams();
diff --git a/c++/src/meson.build b/c++/src/meson.build
index 0794dec84..44a98500f 100644
--- a/c++/src/meson.build
+++ b/c++/src/meson.build
@@ -188,6 +188,7 @@ orc_lib = library(
lz4_dep,
zstd_dep,
threads_dep,
+ sparsehash_c11_dep,
],
include_directories: incdir,
install: true,
diff --git a/c++/test/CMakeLists.txt b/c++/test/CMakeLists.txt
index 3261fedde..b0ee48f38 100644
--- a/c++/test/CMakeLists.txt
+++ b/c++/test/CMakeLists.txt
@@ -77,6 +77,7 @@ target_link_libraries (orc-test
orc::zlib
orc::gtest
orc::gmock
+ orc::sparsehash
orc-test-include
)
diff --git a/c++/test/meson.build b/c++/test/meson.build
index a8d30a6b9..75dcbb094 100644
--- a/c++/test/meson.build
+++ b/c++/test/meson.build
@@ -72,6 +72,7 @@ orc_test = executable(
zlib_dep,
gtest_dep,
gmock_dep,
+ sparsehash_c11_dep,
],
)
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 1cb568701..c494710ba 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -26,6 +26,7 @@ set(ZLIB_VERSION "1.3.1")
set(GTEST_VERSION "1.12.1")
set(PROTOBUF_VERSION "3.5.1")
set(ZSTD_VERSION "1.5.7")
+set(SPARSEHASH_VERSION "2.11.1")
option(ORC_PREFER_STATIC_PROTOBUF "Prefer static protobuf library, if available" ON)
option(ORC_PREFER_STATIC_SNAPPY "Prefer static snappy library, if available" ON)
@@ -580,6 +581,45 @@ if (NOT (ORC_PACKAGE_KIND STREQUAL "conan" OR ORC_PACKAGE_KIND STREQUAL "vcpkg")
add_library (orc::protoc ALIAS orc_protoc)
endif ()
+# ----------------------------------------------------------------------
+# SPARSEHASH
+
+set(SPARSEHASH_HOME "${THIRDPARTY_DIR}/sparsehash_ep-install")
+set(SPARSEHASH_INCLUDE_DIR "${SPARSEHASH_HOME}/include/google")
+set(SPARSEHASH_CMAKE_ARGS
+ -DCMAKE_INSTALL_PREFIX=${SPARSEHASH_HOME}
+ -DBUILD_SHARED_LIBS=OFF
+ -DCMAKE_INSTALL_LIBDIR=lib
+ -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+)
+if (BUILD_POSITION_INDEPENDENT_LIB)
+ set(SPARSEHASH_CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS} -DCMAKE_POSITION_INDEPENDENT_CODE=ON)
+endif ()
+
+if (CMAKE_VERSION VERSION_GREATER "3.7")
+ set(SPARSEHASH_CONFIGURE SOURCE_SUBDIR "" CMAKE_ARGS ${SPARSEHASH_CMAKE_ARGS})
+else()
+ set(SPARSEHASH_CONFIGURE CONFIGURE_COMMAND "${THIRDPARTY_CONFIGURE_COMMAND}" ${SPARSEHASH_CMAKE_ARGS}
+ "${CMAKE_CURRENT_BINARY_DIR}/sparsehash_ep-prefix/src/sparsehash_ep/")
+endif()
+
+ExternalProject_Add(sparsehash_ep
+ URL "https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v${SPARSEHASH_VERSION}.tar.gz"
+ ${SPARSEHASH_CONFIGURE}
+ ${THIRDPARTY_LOG_OPTIONS})
+
+# sparsehash-c11 is header-only, create interface library
+add_library(orc_sparsehash INTERFACE)
+target_include_directories(orc_sparsehash INTERFACE
+ $<BUILD_INTERFACE:${SPARSEHASH_INCLUDE_DIR}>
+ $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+add_dependencies(orc_sparsehash sparsehash_ep)
+
+list (APPEND ORC_VENDOR_DEPENDENCIES "orc::vendored_sparsehash")
+list (APPEND ORC_INSTALL_INTERFACE_TARGETS "$<INSTALL_INTERFACE:orc::vendored_sparsehash>")
+
+add_library (orc::sparsehash ALIAS orc_sparsehash)
+
# ----------------------------------------------------------------------
# LIBHDFSPP
diff --git a/c++/meson.build b/subprojects/sparsehash-c11.wrap
similarity index 53%
copy from c++/meson.build
copy to subprojects/sparsehash-c11.wrap
index de44b1a88..4177861ce 100644
--- a/c++/meson.build
+++ b/subprojects/sparsehash-c11.wrap
@@ -15,25 +15,16 @@
# specific language governing permissions and limitations
# under the License.
-# required dependencies
-protobuf_dep = dependency('protobuf', fallback: ['protobuf', 'protobuf_dep'])
-lz4_dep = dependency('liblz4')
-snappy_dep = dependency('snappy')
-zlib_dep = dependency('zlib')
-zstd_dep = dependency('libzstd')
+[wrap-file]
+directory = sparsehash-c11-2.11.1
+source_url = https://github.com/sparsehash/sparsehash-c11/archive/refs/tags/v2.11.1.tar.gz
+source_filename = sparsehash-c11-2.11.1.tar.gz
+source_hash = d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755
+patch_filename = sparsehash-c11_2.11.1-1_patch.zip
+patch_url = https://wrapdb.mesonbuild.com/v2/sparsehash-c11_2.11.1-1/get_patch
+patch_hash = d04ddea94db2a0a7ad059c26186e3f6f37b02ddfba8fea11352767becb3d0cfa
+source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/sparsehash-c11_2.11.1-1/sparsehash-c11-2.11.1.tar.gz
+wrapdb_version = 2.11.1-1
-# optional dependencies (should be set later in configuration)
-gtest_dep = disabler()
-gmock_dep = disabler()
-
-subdir('include/orc')
-subdir('src')
-
-if get_option('tests').enabled()
- gtest_dep = dependency('gtest')
- gmock_dep = dependency('gmock')
- subdir('test')
-endif
-
-pkg = import('pkgconfig')
-pkg.generate(orc_lib)
+[provide]
+sparsehash-c11 = sparsehash_c11_dep
\ No newline at end of file