This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 39b508e04 ORC-1580: Change default DataBuffer constructor to use 
reserve instead of resize
39b508e04 is described below

commit 39b508e0448ae09f8491a0f36fb0668131b907f1
Author: Xinyu Zeng <[email protected]>
AuthorDate: Wed Jan 17 08:41:26 2024 -0800

    ORC-1580: Change default DataBuffer constructor to use reserve instead of 
resize
    
    ### What changes were proposed in this pull request?
    
    See https://github.com/apache/orc/issues/1430 - default DataBuffer 
constructor now uses `reserve`.
    
    I also add a `zeroOut` function to manually memset the buffer. For example, 
`MapColumnWrite::add` may use uninitialized `offsets` value (at `uint64_t 
elemOffset = static_cast<uint64_t>(offsets[0]);`). i.e., not memset the 
`offsets` buffer will result in segfault because the code implicitly assume the 
value should be 0. Therefore I still `zeroOut` the buffer for Map, List and 
Union, even though List and Union pass the unit tests without memset.
    
    ### Why are the changes needed?
    
    Reduce overhead in `ColumnVectorBatch` construction and 
compression/decompression buffer construction
    
    ### How was this patch tested?
    
    Correctness via `make test-out`. I further use `orc-scan` utility, setting 
`batchSize` to 1 million and then scan a file with 1 million rows and 20 
columns. After this PR, the time reduces from 2.9s to 2.2s. The 
compression/decompression buffer overhead reported in #1430 can be verified by 
the reporter if necessary.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #1739 from XinyuZeng/gh-1430.
    
    Authored-by: Xinyu Zeng <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
    (cherry picked from commit 790a3e5efe0dad88596354efb1a8026313c5cef2)
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 c++/include/orc/MemoryPool.hh | 10 +++++++---
 c++/src/MemoryPool.cc         | 16 +++++++++++++++-
 c++/src/Vector.cc             |  8 +++++---
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/c++/include/orc/MemoryPool.hh b/c++/include/orc/MemoryPool.hh
index 43e379442..6d999d3aa 100644
--- a/c++/include/orc/MemoryPool.hh
+++ b/c++/include/orc/MemoryPool.hh
@@ -19,11 +19,9 @@
 #ifndef MEMORYPOOL_HH_
 #define MEMORYPOOL_HH_
 
+#include <memory>
 #include "orc/Int128.hh"
 #include "orc/orc-config.hh"
-
-#include <memory>
-
 namespace orc {
 
   class MemoryPool {
@@ -82,6 +80,7 @@ namespace orc {
 
     void reserve(uint64_t _size);
     void resize(uint64_t _size);
+    void zeroOut();
   };
 
   // Specializations for char
@@ -164,6 +163,11 @@ namespace orc {
   template <>
   void DataBuffer<unsigned char>::resize(uint64_t newSize);
 
+  // Specializations for Int128
+
+  template <>
+  void DataBuffer<Int128>::zeroOut();
+
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wweak-template-vtables"
diff --git a/c++/src/MemoryPool.cc b/c++/src/MemoryPool.cc
index bd2a30892..8c8837aa6 100644
--- a/c++/src/MemoryPool.cc
+++ b/c++/src/MemoryPool.cc
@@ -54,7 +54,8 @@ namespace orc {
   template <class T>
   DataBuffer<T>::DataBuffer(MemoryPool& pool, uint64_t newSize)
       : memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) {
-    resize(newSize);
+    reserve(newSize);
+    currentSize = newSize;
   }
 
   template <class T>
@@ -108,6 +109,19 @@ namespace orc {
     }
   }
 
+  template <class T>
+  void DataBuffer<T>::zeroOut() {
+    memset(buf, 0, sizeof(T) * currentCapacity);
+  }
+
+  // Specializations for Int128
+  template <>
+  void DataBuffer<Int128>::zeroOut() {
+    for (uint64_t i = 0; i < currentCapacity; ++i) {
+      new (buf + i) Int128();
+    }
+  }
+
   // Specializations for char
 
   template <>
diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc
index ef16b3180..b9e285458 100644
--- a/c++/src/Vector.cc
+++ b/c++/src/Vector.cc
@@ -20,6 +20,7 @@
 
 #include "Adaptor.hh"
 #include "orc/Exceptions.hh"
+#include "orc/MemoryPool.hh"
 
 #include <cstdlib>
 #include <iostream>
@@ -175,7 +176,7 @@ namespace orc {
 
   ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool)
       : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
-    // PASS
+    offsets.zeroOut();
   }
 
   ListVectorBatch::~ListVectorBatch() {
@@ -212,7 +213,7 @@ namespace orc {
 
   MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool)
       : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
-    // PASS
+    offsets.zeroOut();
   }
 
   MapVectorBatch::~MapVectorBatch() {
@@ -252,7 +253,8 @@ namespace orc {
 
   UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool)
       : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) {
-    // PASS
+    tags.zeroOut();
+    offsets.zeroOut();
   }
 
   UnionVectorBatch::~UnionVectorBatch() {

Reply via email to