(orc) branch main updated: ORC-2036: [C++] Optimize SortedStringDictionary performance

gangwu Mon, 15 Dec 2025 18:50:32 -0800

This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git



The following commit(s) were added to refs/heads/main by this push:
     new d3fde0849 ORC-2036: [C++] Optimize SortedStringDictionary performance
d3fde0849 is described below

commit d3fde08494f4f50ba83cc5da2d086b4a9e4b59b5
Author: Zehua Zou <[email protected]>
AuthorDate: Tue Dec 16 10:50:13 2025 +0800

    ORC-2036: [C++] Optimize SortedStringDictionary performance
    
    ### What changes were proposed in this pull request?
    1. There are lots of `std::unique_ptr<std::string>` in 
`SortedStringDictionary`. This PR uses `google::protobuf::Arena` to store these 
continuously.
    2. Add a new method `SortedStringDictionary::reserve` to avoid frequent 
expansion.
    3. Delay the construction of `std::vector<DictEntryWithIndex> flatDict_` in 
`flush` method to reducing vector expansion.
    
    ### Why are the changes needed?
    It optimizes the performance of `SortedStringDictionary`.
    
    ### How was this patch tested?
    Ran existing tests. Tested performance in my private environment.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #2461 from HuaHuaY/fix_orc_2036.
    
    Lead-authored-by: Zehua Zou <[email protected]>
    Co-authored-by: Zehua Zou <[email protected]>
    Signed-off-by: Gang Wu <[email protected]>
---
 .github/workflows/build_and_test.yml |  2 +-
 c++/src/ColumnWriter.cc              | 23 ++++++----
 c++/src/Dictionary.cc                | 83 +++++++++++++++++++++++-------------
 c++/src/Dictionary.hh                | 50 ++++++++--------------
 4 files changed, 88 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml
index c80e00548..4ca53f409 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -227,7 +227,7 @@ jobs:
           mkdir build && cd build
           cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DBUILD_JAVA=OFF
           cmake --build .
-      - uses: 
cpp-linter/cpp-linter-action@f91c446a32ae3eb9f98fef8c9ed4c7cb613a4f8a
+      - uses: 
cpp-linter/cpp-linter-action@0f6d1b8d7e38b584cbee606eb23d850c217d54f8
         id: linter
         continue-on-error: true
         env:
diff --git a/c++/src/ColumnWriter.cc b/c++/src/ColumnWriter.cc
index b9aac1a12..9cdbae070 100644
--- a/c++/src/ColumnWriter.cc
+++ b/c++/src/ColumnWriter.cc
@@ -1038,6 +1038,8 @@ namespace orc {
 
     if (!useDictionary) {
       directLengthEncoder->add(length, numValues, notNull);
+    } else if (dictionary.capacity() < numValues) {
+      dictionary.reserve(dictionary.size() + numValues);
     }
 
     uint64_t count = 0;
@@ -1222,11 +1224,9 @@ namespace orc {
     }
 
     if (useDictionary) {
-      // flush dictionary data & length streams
-      dictionary.flush(dictStream.get(), dictLengthEncoder.get());
-
+      // flush dictionary data & length streams and
       // convert index from insertion order to dictionary order
-      dictionary.reorder(dictionary.idxInDictBuffer_);
+      dictionary.flush(dictStream.get(), dictLengthEncoder.get(), 
dictionary.idxInDictBuffer_);
 
       // write data sequences
       int64_t* data = dictionary.idxInDictBuffer_.data();
@@ -1271,15 +1271,14 @@ namespace orc {
     }
 
     // get dictionary entries in insertion order
-    std::vector<const SortedStringDictionary::DictEntry*> entries;
-    dictionary.getEntriesInInsertionOrder(entries);
+    auto entries = dictionary.getEntriesInInsertionOrder();
 
     // store each length of the data into a vector
     for (uint64_t i = 0; i != dictionary.idxInDictBuffer_.size(); ++i) {
       // write one row data in direct encoding
       const auto& dictEntry = 
entries[static_cast<size_t>(dictionary.idxInDictBuffer_[i])];
-      directDataStream->write(dictEntry->data->data(), 
dictEntry->data->size());
-      
directLengthEncoder->write(static_cast<int64_t>(dictEntry->data->size()));
+      directDataStream->write(dictEntry.data(), dictEntry.size());
+      directLengthEncoder->write(static_cast<int64_t>(dictEntry.size()));
     }
 
     deleteDictStreams();
@@ -1322,6 +1321,10 @@ namespace orc {
     int64_t* length = charsBatch->length.data() + offset;
     const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + 
offset : nullptr;
 
+    if (useDictionary && dictionary.capacity() < numValues) {
+      dictionary.reserve(dictionary.size() + numValues);
+    }
+
     uint64_t count = 0;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
@@ -1400,6 +1403,10 @@ namespace orc {
     int64_t* length = charsBatch->length.data() + offset;
     const char* notNull = charsBatch->hasNulls ? charsBatch->notNull.data() + 
offset : nullptr;
 
+    if (useDictionary && dictionary.capacity() < numValues) {
+      dictionary.reserve(dictionary.size() + numValues);
+    }
+
     uint64_t count = 0;
     for (uint64_t i = 0; i < numValues; ++i) {
       if (!notNull || notNull[i]) {
diff --git a/c++/src/Dictionary.cc b/c++/src/Dictionary.cc
index 9eb60bb5b..97611e65b 100644
--- a/c++/src/Dictionary.cc
+++ b/c++/src/Dictionary.cc
@@ -18,38 +18,46 @@
 
 #include "Dictionary.hh"
 
+#include <google/protobuf/arena.h>
+#include <memory>
+#include <utility>
+
+using google::protobuf::Arena;
+
 namespace orc {
+  SortedStringDictionary::SortedStringDictionary()
+      : totalLength_(0), arena_(std::make_unique<Arena>()) {
+#ifdef BUILD_SPARSEHASH
+    /// Need to set empty key otherwise dense_hash_map will not work correctly
+    keyToIndex_.set_empty_key(std::string_view{});
+#endif
+  }
+
+  SortedStringDictionary::~SortedStringDictionary() = default;
 
   // insert a new string into dictionary, return its insertion order
   size_t SortedStringDictionary::insert(const char* str, size_t len) {
-    size_t index = flatDict_.size();
+    size_t index = keyToIndex_.size();
 
     auto it = keyToIndex_.find(std::string_view{str, len});
     if (it != keyToIndex_.end()) {
       return it->second;
     } else {
-      flatDict_.emplace_back(str, len, index);
+      auto s = Arena::Create<std::string>(arena_.get(), str, len);
+      keyToIndex_.emplace(std::string_view{s->data(), s->length()}, index);
       totalLength_ += len;
-
-      const auto& lastEntry = flatDict_.back().entry;
-      keyToIndex_.emplace(std::string_view{lastEntry.data->data(), 
lastEntry.data->size()}, index);
       return index;
     }
   }
 
-  // write dictionary data & length to output buffer
-  void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
-                                     RleEncoder* lengthEncoder) const {
-    std::sort(flatDict_.begin(), flatDict_.end(), LessThan());
-
-    for (const auto& entryWithIndex : flatDict_) {
-      dataStream->write(entryWithIndex.entry.data->data(), 
entryWithIndex.entry.data->size());
-      
lengthEncoder->write(static_cast<int64_t>(entryWithIndex.entry.data->size()));
-    }
+  // reserve space for dictionary entries
+  void SortedStringDictionary::reserve(size_t size) {
+    keyToIndex_.reserve(size);
   }
 
   /**
-   * Reorder input index buffer from insertion order to dictionary order
+   * Write dictionary data & length to output buffer and
+   * reorder input index buffer from insertion order to dictionary order
    *
    * We require this function because string values are buffered by indexes
    * in their insertion order. Until the entire dictionary is complete can
@@ -58,11 +66,24 @@ namespace orc {
    * the indexes from insertion order to dictionary value order for final
    * output.
    */
-  void SortedStringDictionary::reorder(std::vector<int64_t>& idxBuffer) const {
-    // iterate the dictionary to get mapping from insertion order to value 
order
-    std::vector<size_t> mapping(flatDict_.size());
-    for (size_t i = 0; i < flatDict_.size(); ++i) {
-      mapping[flatDict_[i].index] = i;
+  void SortedStringDictionary::flush(AppendOnlyBufferedStream* dataStream,
+                                     RleEncoder* lengthEncoder,
+                                     std::vector<int64_t>& idxBuffer) const {
+    std::vector<std::pair<std::string_view, size_t>> flatDict;
+    flatDict.reserve(keyToIndex_.size());
+    for (auto [key, index] : keyToIndex_) {
+      flatDict.emplace_back(key, index);
+    }
+    std::sort(flatDict.begin(), flatDict.end());
+
+    for (const auto& [entry, _] : flatDict) {
+      dataStream->write(entry.data(), entry.size());
+      lengthEncoder->write(static_cast<int64_t>(entry.size()));
+    }
+
+    std::vector<size_t> mapping(flatDict.size());
+    for (size_t i = 0; i < flatDict.size(); ++i) {
+      mapping[flatDict[i].second] = i;
     }
 
     // do the transformation
@@ -72,18 +93,22 @@ namespace orc {
   }
 
   // get dict entries in insertion order
-  void SortedStringDictionary::getEntriesInInsertionOrder(
-      std::vector<const DictEntry*>& entries) const {
-    /// flatDict_ is sorted in insertion order before 
[[SortedStringDictionary::flush]] is invoked.
-    entries.resize(flatDict_.size());
-    for (size_t i = 0; i < flatDict_.size(); ++i) {
-      entries[i] = &(flatDict_[i].entry);
+  std::vector<std::string_view> 
SortedStringDictionary::getEntriesInInsertionOrder() const {
+    std::vector<std::string_view> entries(keyToIndex_.size());
+    for (auto [key, index] : keyToIndex_) {
+      entries[index] = key;
     }
+    return entries;
   }
 
   // return count of entries
   size_t SortedStringDictionary::size() const {
-    return flatDict_.size();
+    return keyToIndex_.size();
+  }
+
+  // return capacity of dictionary
+  size_t SortedStringDictionary::capacity() const {
+    return keyToIndex_.bucket_count() * keyToIndex_.max_load_factor();
   }
 
   // return total length of strings in the dictioanry
@@ -94,6 +119,6 @@ namespace orc {
   void SortedStringDictionary::clear() {
     totalLength_ = 0;
     keyToIndex_.clear();
-    flatDict_.clear();
+    arena_->Reset();
   }
-}  // namespace orc
\ No newline at end of file
+}  // namespace orc
diff --git a/c++/src/Dictionary.hh b/c++/src/Dictionary.hh
index dca15b115..99868435f 100644
--- a/c++/src/Dictionary.hh
+++ b/c++/src/Dictionary.hh
@@ -18,7 +18,7 @@
 
 #include <cstddef>
 #include <memory>
-#include <string>
+#include <string_view>
 
 #ifdef BUILD_SPARSEHASH
 #include <sparsehash/dense_hash_map>
@@ -28,62 +28,46 @@
 
 #include "RLE.hh"
 
+namespace google::protobuf {
+  class Arena;
+}
+
 namespace orc {
   /**
    * Implementation of increasing sorted string dictionary
    */
   class SortedStringDictionary {
    public:
-    struct DictEntry {
-      DictEntry(const char* str, size_t len) : 
data(std::make_unique<std::string>(str, len)) {}
-
-      std::unique_ptr<std::string> data;
-    };
-
-    struct DictEntryWithIndex {
-      DictEntryWithIndex(const char* str, size_t len, size_t index)
-          : entry(str, len), index(index) {}
+    SortedStringDictionary();
 
-      DictEntry entry;
-      size_t index;
-    };
-
-    SortedStringDictionary() : totalLength_(0) {
-#ifdef BUILD_SPARSEHASH
-      /// Need to set empty key otherwise dense_hash_map will not work 
correctly
-      keyToIndex_.set_empty_key(std::string_view{});
-#endif
-    }
+    ~SortedStringDictionary();
 
     // insert a new string into dictionary, return its insertion order
     size_t insert(const char* str, size_t len);
 
-    // write dictionary data & length to output buffer
-    void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* 
lengthEncoder) const;
+    // reserve space for dictionary entries
+    void reserve(size_t size);
 
+    // write dictionary data & length to output buffer and
     // reorder input index buffer from insertion order to dictionary order
-    void reorder(std::vector<int64_t>& idxBuffer) const;
+    void flush(AppendOnlyBufferedStream* dataStream, RleEncoder* lengthEncoder,
+               std::vector<int64_t>& idxBuffer) const;
 
     // get dict entries in insertion order
-    void getEntriesInInsertionOrder(std::vector<const DictEntry*>&) const;
+    std::vector<std::string_view> getEntriesInInsertionOrder() const;
 
     // return count of entries
     size_t size() const;
 
+    // return capacity of dictionary
+    size_t capacity() const;
+
     // return total length of strings in the dictioanry
     uint64_t length() const;
 
     void clear();
 
    private:
-    struct LessThan {
-      bool operator()(const DictEntryWithIndex& l, const DictEntryWithIndex& 
r) {
-        return *l.entry.data < *r.entry.data;  // use std::string's operator<
-      }
-    };
-    // store dictionary entries in insertion order
-    mutable std::vector<DictEntryWithIndex> flatDict_;
-
 #ifdef BUILD_SPARSEHASH
     // map from string to its insertion order index
     google::dense_hash_map<std::string_view, size_t> keyToIndex_;
@@ -99,6 +83,8 @@ namespace orc {
     friend class VarCharColumnWriter;
     // store indexes of insertion order in the dictionary for not-null rows
     std::vector<int64_t> idxInDictBuffer_;
+
+    std::unique_ptr<google::protobuf::Arena> arena_;
   };
 
 }  // namespace orc

(orc) branch main updated: ORC-2036: [C++] Optimize SortedStringDictionary performance

Reply via email to