This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new d3fde0849 ORC-2036: [C++] Optimize SortedStringDictionary performance
d3fde0849 is described below
commit d3fde08494f4f50ba83cc5da2d086b4a9e4b59b5
Author: Zehua Zou <[email protected]>
AuthorDate: Tue Dec 16 10:50:13 2025 +0800
ORC-2036: [C++] Optimize SortedStringDictionary performance
### What changes were proposed in this pull request?
1. There are lots of `std::unique_ptr<std::string>` in
`SortedStringDictionary`. This PR uses `google::protobuf::Arena` to store these
continuously.
2. Add a new method `SortedStringDictionary::reserve` to avoid frequent
expansion.
3. Delay the construction of `std::vector<DictEntryWithIndex> flatDict_` in
`flush` method to reducing vector expansion.
### Why are the changes needed?
It optimizes the performance of `SortedStringDictionary`.
### How was this patch tested?
Ran existing tests. Tested performance in my private environment.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #2461 from HuaHuaY/fix_orc_2036.
Lead-authored-by: Zehua Zou <[email protected]>
Co-authored-by: Zehua Zou <[email protected]>
Signed-off-by: Gang Wu <[email protected]>
---
.github/workflows/build_and_test.yml | 2 +-
c++/src/ColumnWriter.cc | 23 ++++++----
c++/src/Dictionary.cc | 83 +++++++++++++++++++++++-------------
c++/src/Dictionary.hh | 50 ++++++++--------------
4 files changed, 88 insertions(+), 70 deletions(-)
diff --git a/.github/workflows/build_and_test.yml
b/.github/workflows/build_and_test.yml
index c80e00548..4ca53f409 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -227,7 +227,7 @@ jobs:
mkdir build && cd build
cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_JAVA=OFF
cmake --build .
- - uses:
cpp-linter/cpp-linter-action@f91c446a32ae3eb9f98fef8c9ed4c7cb613a4f8a
+ - uses:
cpp-linter/cpp-linter-action@0f6d1b8d7e38b584cbee606eb23d850c217d54f8
id: linter
continue-on-error: true
env:
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index b9aac1a12..9cdbae070 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1038,6 +1038,8 @@ namespace orc {
if (!useDictionary) {
directLengthEncoder->add(length, numValues, notNull);
+ } else if (dictionary.capacity() < numValues) {
+ dictionary.reserve(dictionary.size() + numValues);
}
uint64_t count = 0;
@@ -1222,11 +1224,9 @@ namespace orc {
}
if (useDictionary) {
- // flush dictionary data & length streams
- dictionary.flush(dictStream.get(), dictLengthEncoder.get());
-
+ // flush dictionary data & length streams and
// convert index from insertion order to dictionary order
- dictionary.reorder(dictionary.idxInDictBuffer_);
+ dictionary.flush(dictStream.get(), dictLengthEncoder.get(),
dictionary.idxInDictBuffer_);
// write data sequences
int64_t* data = dictionary.idxInDictBuffer_.data();
@@ -1271,15 +1271,14 @@ namespace orc {
}
// get dictionary entries in insertion order
- std::vector<const SortedStringDictionary::DictEntry*> entries;
- dictionary.getEntriesInInsertionOrder(entries);
+ auto entries = dictionary.getEntriesInInsertionOrder();
// store each length of the data into a vector
for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
// write one row data in direct encoding
const auto& dictEntry =
entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
- directDataStream->write(dictEntry->data->data(),
dictEntry->data->size());
-
directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size()));
+ directDataStream->write(dictEntry.data(), dictEntry.size());
+ directLengthEncoder->write(static_cast<int64_t>(dictEntry.size()));
}
deleteDictStreams();
@@ -1322,6 +1321,10 @@ namespace orc {
int64_t* length = charsBatch->length.data() + offset;
const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() +
offset : nullptr;
+ if (useDictionary && dictionary.capacity() < numValues) {
+ dictionary.reserve(dictionary.size() + numValues);
+ }
+
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
@@ -1400,6 +1403,10 @@ namespace orc {
int64_t* length = charsBatch->length.data() + offset;
const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() +
offset : nullptr;
+ if (useDictionary && dictionary.capacity() < numValues) {
+ dictionary.reserve(dictionary.size() + numValues);
+ }
+
uint64_t count = 0;
for (uint64_t i = 0; i < numValues; ++i) {
if (!notNull || notNull[i]) {
diff --git a/c++/src/Dictionary.cc b/c++/src/Dictionary.cc
index 9eb60bb5b..97611e65b 100644
--- a/c++/src/Dictionary.cc
+++ b/c++/src/Dictionary.cc
@@ -18,38 +18,46 @@
#include "Dictionary.hh"
+#include <google/protobuf/arena.h>
+#include <memory>
+#include <utility>
+
+using google::protobuf::Arena;
+
namespace orc {
+ SortedStringDictionary::SortedStringDictionary()
+ : totalLength_(0), arena_(std::make_unique<Arena>()) {
+#ifdef BUILD_SPARSEHASH
+ /// Need to set empty key otherwise dense_hash_map will not work correctly
+ keyToIndex_.set_empty_key(std::string_view{});
+#endif
+ }
+
+ SortedStringDictionary::~SortedStringDictionary() = default;
// insert a new string into dictionary, return its insertion order
size_t SortedStringDictionary::insert(const char* str, size_t len) {
- size_t index = flatDict_.size();
+ size_t index = keyToIndex_.size();
auto it = keyToIndex_.find(std::string_view{str, len});
if (it != keyToIndex_.end()) {
return it->second;
} else {
- flatDict_.emplace_back(str, len, index);
+ auto s = Arena::Create<std::string>(arena_.get(), str, len);
+ keyToIndex_.emplace(std::string_view{s->data(), s->length()}, index);
totalLength_ += len;
-
- const auto& lastEntry = flatDict_.back().entry;
- keyToIndex_.emplace(std::string_view{lastEntry.data->data(),
lastEntry.data->size()}, index);
return index;
}
}
- // write dictionary data & length to output buffer
- void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
- RleEncoder* lengthEncoder) const {
- std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
-
- for (const auto& entryWithIndex : flatDict_) {
- dataStream->write(entryWithIndex.entry.data->data(),
entryWithIndex.entry.data->size());
-
lengthEncoder->write(static_cast<int64_t>(entryWithIndex.entry.data->size()));
- }
+ // reserve space for dictionary entries
+ void SortedStringDictionary::reserve(size_t size) {
+ keyToIndex_.reserve(size);
}
/**
- * Reorder input index buffer from insertion order to dictionary order
+ * Write dictionary data & length to output buffer and
+ * reorder input index buffer from insertion order to dictionary order
*
* We require this function because string values are buffered by indexes
* in their insertion order. Until the entire dictionary is complete can
@@ -58,11 +66,24 @@ namespace orc {
* the indexes from insertion order to dictionary value order for final
* output.
*/
- void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
- // iterate the dictionary to get mapping from insertion order to value
order
- std::vector<size_t> mapping(flatDict_.size());
- for (size_t i = 0; i < flatDict_.size(); ++i) {
- mapping[flatDict_[i].index] = i;
+ void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+ RleEncoder* lengthEncoder,
+ std::vector<int64_t>& idxBuffer) const {
+ std::vector<std::pair<std::string_view, size_t>> flatDict;
+ flatDict.reserve(keyToIndex_.size());
+ for (auto [key, index] : keyToIndex_) {
+ flatDict.emplace_back(key, index);
+ }
+ std::sort(flatDict.begin(), flatDict.end());
+
+ for (const auto& [entry, _] : flatDict) {
+ dataStream->write(entry.data(), entry.size());
+ lengthEncoder->write(static_cast<int64_t>(entry.size()));
+ }
+
+ std::vector<size_t> mapping(flatDict.size());
+ for (size_t i = 0; i < flatDict.size(); ++i) {
+ mapping[flatDict[i].second] = i;
}
// do the transformation
@@ -72,18 +93,22 @@ namespace orc {
}
// get dict entries in insertion order
- void SortedStringDictionary::getEntriesInInsertionOrder(
- std::vector<const DictEntry*>& entries) const {
- /// flatDict_ is sorted in insertion order before
[[SortedStringDictionary::flush]] is invoked.
- entries.resize(flatDict_.size());
- for (size_t i = 0; i < flatDict_.size(); ++i) {
- entries[i] = &(flatDict_[i].entry);
+ std::vector<std::string_view>
SortedStringDictionary::getEntriesInInsertionOrder() const {
+ std::vector<std::string_view> entries(keyToIndex_.size());
+ for (auto [key, index] : keyToIndex_) {
+ entries[index] = key;
}
+ return entries;
}
// return count of entries
size_t SortedStringDictionary::size() const {
- return flatDict_.size();
+ return keyToIndex_.size();
+ }
+
+ // return capacity of dictionary
+ size_t SortedStringDictionary::capacity() const {
+ return keyToIndex_.bucket_count() * keyToIndex_.max_load_factor();
}
// return total length of strings in the dictioanry
@@ -94,6 +119,6 @@ namespace orc {
void SortedStringDictionary::clear() {
totalLength_ = 0;
keyToIndex_.clear();
- flatDict_.clear();
+ arena_->Reset();
}
-} // namespace orc
\ No newline at end of file
+} // namespace orc
diff --git a/c++/src/Dictionary.hh b/c++/src/Dictionary.hh
index dca15b115..99868435f 100644
--- a/c++/src/Dictionary.hh
+++ b/c++/src/Dictionary.hh
@@ -18,7 +18,7 @@
#include <cstddef>
#include <memory>
-#include <string>
+#include <string_view>
#ifdef BUILD_SPARSEHASH
#include <sparsehash/dense_hash_map>
@@ -28,62 +28,46 @@
#include "RLE.hh"
+namespace google::protobuf {
+ class Arena;
+}
+
namespace orc {
/**
* Implementation of increasing sorted string dictionary
*/
class SortedStringDictionary {
public:
- struct DictEntry {
- DictEntry(const char* str, size_t len) :
data(std::make_unique<std::string>(str, len)) {}
-
- std::unique_ptr<std::string> data;
- };
-
- struct DictEntryWithIndex {
- DictEntryWithIndex(const char* str, size_t len, size_t index)
- : entry(str, len), index(index) {}
+ SortedStringDictionary();
- DictEntry entry;
- size_t index;
- };
-
- SortedStringDictionary() : totalLength_(0) {
-#ifdef BUILD_SPARSEHASH
- /// Need to set empty key otherwise dense_hash_map will not work
correctly
- keyToIndex_.set_empty_key(std::string_view{});
-#endif
- }
+ ~SortedStringDictionary();
// insert a new string into dictionary, return its insertion order
size_t insert(const char* str, size_t len);
- // write dictionary data & length to output buffer
- void flush(AppendOnlyBufferedStream* dataStream, RleEncoder*
lengthEncoder) const;
+ // reserve space for dictionary entries
+ void reserve(size_t size);
+ // write dictionary data & length to output buffer and
// reorder input index buffer from insertion order to dictionary order
- void reorder(std::vector<int64_t>& idxBuffer) const;
+ void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder,
+ std::vector<int64_t>& idxBuffer) const;
// get dict entries in insertion order
- void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+ std::vector<std::string_view> getEntriesInInsertionOrder() const;
// return count of entries
size_t size() const;
+ // return capacity of dictionary
+ size_t capacity() const;
+
// return total length of strings in the dictioanry
uint64_t length() const;
void clear();
private:
- struct LessThan {
- bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex&
r) {
- return *l.entry.data < *r.entry.data; // use std::string's operator<
- }
- };
- // store dictionary entries in insertion order
- mutable std::vector<DictEntryWithIndex> flatDict_;
-
#ifdef BUILD_SPARSEHASH
// map from string to its insertion order index
google::dense_hash_map<std::string_view, size_t> keyToIndex_;
@@ -99,6 +83,8 @@ namespace orc {
friend class VarCharColumnWriter;
// store indexes of insertion order in the dictionary for not-null rows
std::vector<int64_t> idxInDictBuffer_;
+
+ std::unique_ptr<google::protobuf::Arena> arena_;
};
} // namespace orc