This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 39b508e04 ORC-1580: Change default DataBuffer constructor to use
reserve instead of resize
39b508e04 is described below
commit 39b508e0448ae09f8491a0f36fb0668131b907f1
Author: Xinyu Zeng <[email protected]>
AuthorDate: Wed Jan 17 08:41:26 2024 -0800
ORC-1580: Change default DataBuffer constructor to use reserve instead of
resize
### What changes were proposed in this pull request?
See https://github.com/apache/orc/issues/1430 - default DataBuffer
constructor now uses `reserve`.
I also add a `zeroOut` function to manually memset the buffer. For example,
`MapColumnWrite::add` may use uninitialized `offsets` value (at `uint64_t
elemOffset = static_cast<uint64_t>(offsets[0]);`). i.e., not memset the
`offsets` buffer will result in segfault because the code implicitly assume the
value should be 0. Therefore I still `zeroOut` the buffer for Map, List and
Union, even though List and Union pass the unit tests without memset.
### Why are the changes needed?
Reduce overhead in `ColumnVectorBatch` construction and
compression/decompression buffer construction
### How was this patch tested?
Correctness via `make test-out`. I further use `orc-scan` utility, setting
`batchSize` to 1 million and then scan a file with 1 million rows and 20
columns. After this PR, the time reduces from 2.9s to 2.2s. The
compression/decompression buffer overhead reported in #1430 can be verified by
the reporter if necessary.
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #1739 from XinyuZeng/gh-1430.
Authored-by: Xinyu Zeng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 790a3e5efe0dad88596354efb1a8026313c5cef2)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/include/orc/MemoryPool.hh | 10 +++++++---
c++/src/MemoryPool.cc | 16 +++++++++++++++-
c++/src/Vector.cc | 8 +++++---
3 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/c++/include/orc/MemoryPool.hh b/c++/include/orc/MemoryPool.hh
index 43e379442..6d999d3aa 100644
--- a/c++/include/orc/MemoryPool.hh
+++ b/c++/include/orc/MemoryPool.hh
@@ -19,11 +19,9 @@
#ifndef MEMORYPOOL_HH_
#define MEMORYPOOL_HH_
+#include <memory>
#include "orc/Int128.hh"
#include "orc/orc-config.hh"
-
-#include <memory>
-
namespace orc {
class MemoryPool {
@@ -82,6 +80,7 @@ namespace orc {
void reserve(uint64_t _size);
void resize(uint64_t _size);
+ void zeroOut();
};
// Specializations for char
@@ -164,6 +163,11 @@ namespace orc {
template <>
void DataBuffer<unsigned char>::resize(uint64_t newSize);
+ // Specializations for Int128
+
+ template <>
+ void DataBuffer<Int128>::zeroOut();
+
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wweak-template-vtables"
diff --git a/c++/src/MemoryPool.cc b/c++/src/MemoryPool.cc
index bd2a30892..8c8837aa6 100644
--- a/c++/src/MemoryPool.cc
+++ b/c++/src/MemoryPool.cc
@@ -54,7 +54,8 @@ namespace orc {
template <class T>
DataBuffer<T>::DataBuffer(MemoryPool& pool, uint64_t newSize)
: memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) {
- resize(newSize);
+ reserve(newSize);
+ currentSize = newSize;
}
template <class T>
@@ -108,6 +109,19 @@ namespace orc {
}
}
+ template <class T>
+ void DataBuffer<T>::zeroOut() {
+ memset(buf, 0, sizeof(T) * currentCapacity);
+ }
+
+ // Specializations for Int128
+ template <>
+ void DataBuffer<Int128>::zeroOut() {
+ for (uint64_t i = 0; i < currentCapacity; ++i) {
+ new (buf + i) Int128();
+ }
+ }
+
// Specializations for char
template <>
diff --git a/c++/src/Vector.cc b/c++/src/Vector.cc
index ef16b3180..b9e285458 100644
--- a/c++/src/Vector.cc
+++ b/c++/src/Vector.cc
@@ -20,6 +20,7 @@
#include "Adaptor.hh"
#include "orc/Exceptions.hh"
+#include "orc/MemoryPool.hh"
#include <cstdlib>
#include <iostream>
@@ -175,7 +176,7 @@ namespace orc {
ListVectorBatch::ListVectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
- // PASS
+ offsets.zeroOut();
}
ListVectorBatch::~ListVectorBatch() {
@@ -212,7 +213,7 @@ namespace orc {
MapVectorBatch::MapVectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
- // PASS
+ offsets.zeroOut();
}
MapVectorBatch::~MapVectorBatch() {
@@ -252,7 +253,8 @@ namespace orc {
UnionVectorBatch::UnionVectorBatch(uint64_t cap, MemoryPool& pool)
: ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) {
- // PASS
+ tags.zeroOut();
+ offsets.zeroOut();
}
UnionVectorBatch::~UnionVectorBatch() {