This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d15cb45 ARROW-4031: [C++] Refactor bitmap building
d15cb45 is described below
commit d15cb454402204ba3b2825afb71853b35a81a886
Author: Benjamin Kietzman <[email protected]>
AuthorDate: Wed Jan 23 13:47:54 2019 -0600
ARROW-4031: [C++] Refactor bitmap building
Haven't yet used `TypedBufferBuilder<bool>` for `BooleanBuilder`'s bitmap.
Coming soon
Author: Benjamin Kietzman <[email protected]>
Author: Wes McKinney <[email protected]>
Closes #3336 from bkietz/ARROW-4031-refactor-bitmap-building and squashes
the following commits:
7e28c2e75 <Wes McKinney> Merge remote-tracking branch 'apache/master' into
ARROW-4031-refactor-bitmap-building
9479ec148 <Wes McKinney> Fix another doxygen warning
e16a8e2e0 <Wes McKinney> Fix doxygen warning
567a53158 <Wes McKinney> Revert "Remove memset statements from
BufferBuilder"
d48c97381 <Wes McKinney> Add cstring include in bit-util.h
535f68af2 <Wes McKinney> Do not call ZeroPadding if buffer is null
79222c172 <Wes McKinney> Remove memset statements from BufferBuilder
2d3b33abd <Wes McKinney> Revert arrow/buffer.h
0c239c8bc <Wes McKinney> Inline ArrayBuilder::Reserve for better performance
79dfc347f <Wes McKinney> tweak
2a0e26fb4 <Benjamin Kietzman> fixing format issues
89ebde63c <Benjamin Kietzman> correct SetBitsTo; failed to set last byte
for longer spans
0b0a4fe34 <Benjamin Kietzman> port TestSetBitsTo from begin,end to
offset,length
8ee985d4d <Benjamin Kietzman> use NULLPTR macro instead of nullptr
39a6d977d <Benjamin Kietzman> adding explicit cast to uint8_t
c293a3183 <Benjamin Kietzman> fix clang-format issue
0255ab947 <Benjamin Kietzman> don't use a growth factor if we don't need to
resize
7fea6cfa2 <Benjamin Kietzman> add using UnsafeAppendToBitmap
46551ea31 <Benjamin Kietzman> remove erroneous size_ reference
360a6b2bf <Benjamin Kietzman> fix erroneous void return type
c0cbfd5cf <Benjamin Kietzman> Address some PR review comments
65f25205c <Benjamin Kietzman> Merge branch 'master' into
ARROW-4031-refactor-bitmap-building
ba35e2bc7 <Benjamin Kietzman> Fixing benchmarks for windows compilation
12478cc41 <Benjamin Kietzman> Merge branch 'master' into
ARROW-4031-refactor-bitmap-building
028e761ed <Benjamin Kietzman> ArrayBuilder now uses
TypedBufferBuilder<bool> for null bitmaps
fb3e75950 <Benjamin Kietzman> adding specialization
TypedBufferBuilder<bool> for building bitmaps
2553f4172 <Benjamin Kietzman> rewriting arithmetic TypedBufferBuilder to
composition from inheritance
---
cpp/build-support/lint_cpp_cli.py | 66 ++++----
cpp/src/arrow/array/builder_adaptive.cc | 14 +-
cpp/src/arrow/array/builder_base.cc | 85 +---------
cpp/src/arrow/array/builder_base.h | 81 +++------
cpp/src/arrow/array/builder_binary.cc | 44 ++---
cpp/src/arrow/array/builder_decimal.cc | 4 +-
cpp/src/arrow/array/builder_nested.cc | 12 +-
cpp/src/arrow/array/builder_primitive.cc | 15 +-
cpp/src/arrow/array/builder_primitive.h | 33 ++--
cpp/src/arrow/buffer-builder.h | 273 +++++++++++++++++++++++--------
cpp/src/arrow/buffer-test.cc | 76 +++++++++
cpp/src/arrow/builder-benchmark.cc | 17 +-
cpp/src/arrow/type_traits.h | 34 ++--
cpp/src/arrow/util/bit-util-test.cc | 37 +++++
cpp/src/arrow/util/bit-util.h | 72 ++++++--
cpp/src/arrow/util/hashing-benchmark.cc | 4 +-
16 files changed, 545 insertions(+), 322 deletions(-)
diff --git a/cpp/build-support/lint_cpp_cli.py
b/cpp/build-support/lint_cpp_cli.py
index c8b25df..ab2de59 100644
--- a/cpp/build-support/lint_cpp_cli.py
+++ b/cpp/build-support/lint_cpp_cli.py
@@ -19,8 +19,6 @@
import argparse
import re
import os
-import sys
-import traceback
parser = argparse.ArgumentParser(
description="Check for illegal headers for C++/CLI applications")
@@ -34,6 +32,10 @@ _NULLPTR_REGEX = re.compile(r'.*\bnullptr\b.*')
_RETURN_NOT_OK_REGEX = re.compile(r'.*\sRETURN_NOT_OK.*')
+def _paths(paths):
+ return [p.strip().replace('/', os.path.sep) for p in paths.splitlines()]
+
+
def _strip_comments(line):
m = _STRIP_COMMENT_REGEX.match(line)
if not m:
@@ -48,11 +50,11 @@ def lint_file(path):
(lambda x: '<mutex>' in x, 'Uses <mutex>', []),
(lambda x: re.match(_NULLPTR_REGEX, x), 'Uses nullptr', []),
(lambda x: re.match(_RETURN_NOT_OK_REGEX, x),
- 'Use ARROW_RETURN_NOT_OK in header files',
- ['arrow/status.h',
- 'test',
- 'arrow/util/hash.h',
- 'arrow/python/util'])
+ 'Use ARROW_RETURN_NOT_OK in header files', _paths('''\
+ arrow/status.h
+ test
+ arrow/util/hash.h
+ arrow/python/util'''))
]
with open(path) as f:
@@ -63,25 +65,23 @@ def lint_file(path):
continue
if rule(stripped_line):
- raise Exception('File {0} failed C++/CLI lint check: {1}\n'
- 'Line {2}: {3}'
- .format(path, why, i + 1, line))
-
-
-EXCLUSIONS = [
- 'arrow/python/iterators.h',
- 'arrow/util/hashing.h',
- 'arrow/util/macros.h',
- 'arrow/util/parallel.h',
- 'arrow/vendored',
- 'arrow/visitor_inline.h',
- 'gandiva/cache.h',
- 'gandiva/jni',
- 'test',
- 'internal'
-]
-
-try:
+ yield path, why, i, line
+
+
+EXCLUSIONS = _paths('''\
+ arrow/python/iterators.h
+ arrow/util/hashing.h
+ arrow/util/macros.h
+ arrow/util/parallel.h
+ arrow/vendored
+ arrow/visitor_inline.h
+ gandiva/cache.h
+ gandiva/jni
+ test
+ internal''')
+
+
+def lint_files():
for dirpath, _, filenames in os.walk(arguments.source_path):
for filename in filenames:
full_path = os.path.join(dirpath, filename)
@@ -97,7 +97,13 @@ try:
# Only run on header files
if filename.endswith('.h'):
- lint_file(full_path)
-except Exception:
- traceback.print_exc()
- sys.exit(1)
+ yield from lint_file(full_path)
+
+
+if __name__ == '__main__':
+ failures = list(lint_files())
+ for path, why, i, line in failures:
+ print('File {0} failed C++/CLI lint check: {1}\n'
+ 'Line {2}: {3}'.format(path, why, i + 1, line))
+ if failures:
+ exit(1)
diff --git a/cpp/src/arrow/array/builder_adaptive.cc
b/cpp/src/arrow/array/builder_adaptive.cc
index 599e9e1..e96c9a2 100644
--- a/cpp/src/arrow/array/builder_adaptive.cc
+++ b/cpp/src/arrow/array/builder_adaptive.cc
@@ -90,12 +90,13 @@ Status
AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
}
- RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_),
null_bitmap_.get()));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
- *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_},
null_count_);
+ *out = ArrayData::Make(output_type, length_, {null_bitmap, data_},
null_count_);
- data_ = null_bitmap_ = nullptr;
+ data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
@@ -275,12 +276,13 @@ Status
AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
}
- RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_),
null_bitmap_.get()));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
- *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_},
null_count_);
+ *out = ArrayData::Make(output_type, length_, {null_bitmap, data_},
null_count_);
- data_ = null_bitmap_ = nullptr;
+ data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_base.cc
b/cpp/src/arrow/array/builder_base.cc
index 321aa44..e805900 100644
--- a/cpp/src/arrow/array/builder_base.cc
+++ b/cpp/src/arrow/array/builder_base.cc
@@ -52,52 +52,21 @@ Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled,
ResizableBuffer* buf
}
Status ArrayBuilder::AppendToBitmap(bool is_valid) {
- if (length_ == capacity_) {
- // If the capacity was not already a multiple of 2, do so here
- // TODO(emkornfield) doubling isn't great default allocation practice
- // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md
- // fo discussion
- RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1)));
- }
+ RETURN_NOT_OK(Reserve(1));
UnsafeAppendToBitmap(is_valid);
return Status::OK();
}
Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t
length) {
RETURN_NOT_OK(Reserve(length));
-
UnsafeAppendToBitmap(valid_bytes, length);
return Status::OK();
}
Status ArrayBuilder::Resize(int64_t capacity) {
- // Target size of validity (null) bitmap data
- const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity);
RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-
- if (capacity_ == 0) {
- RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_bitmap_size,
&null_bitmap_));
- null_bitmap_data_ = null_bitmap_->mutable_data();
-
- // Padding is zeroed by AllocateResizableBuffer
- memset(null_bitmap_data_, 0, static_cast<size_t>(new_bitmap_size));
- } else {
- const int64_t old_bitmap_capacity = null_bitmap_->capacity();
- RETURN_NOT_OK(null_bitmap_->Resize(new_bitmap_size));
-
- const int64_t new_bitmap_capacity = null_bitmap_->capacity();
- null_bitmap_data_ = null_bitmap_->mutable_data();
-
- // Zero the region between the original capacity and the new capacity,
- // including padding, which has not been zeroed, unlike
- // AllocateResizableBuffer
- if (old_bitmap_capacity < new_bitmap_capacity) {
- memset(null_bitmap_data_ + old_bitmap_capacity, 0,
- static_cast<size_t>(new_bitmap_capacity - old_bitmap_capacity));
- }
- }
capacity_ = capacity;
- return Status::OK();
+ return null_bitmap_builder_.Resize(capacity);
}
Status ArrayBuilder::Advance(int64_t elements) {
@@ -105,7 +74,7 @@ Status ArrayBuilder::Advance(int64_t elements) {
return Status::Invalid("Builder must be expanded");
}
length_ += elements;
- return Status::OK();
+ return null_bitmap_builder_.Advance(elements);
}
Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
@@ -115,18 +84,9 @@ Status ArrayBuilder::Finish(std::shared_ptr<Array>* out) {
return Status::OK();
}
-Status ArrayBuilder::Reserve(int64_t additional_elements) {
- if (length_ + additional_elements > capacity_) {
- // TODO(emkornfield) power of 2 growth is potentially suboptimal
- int64_t new_size = BitUtil::NextPower2(length_ + additional_elements);
- return Resize(new_size);
- }
- return Status::OK();
-}
-
void ArrayBuilder::Reset() {
capacity_ = length_ = null_count_ = 0;
- null_bitmap_ = nullptr;
+ null_bitmap_builder_.Reset();
}
Status ArrayBuilder::SetNotNull(int64_t length) {
@@ -135,42 +95,15 @@ Status ArrayBuilder::SetNotNull(int64_t length) {
return Status::OK();
}
-void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t
length) {
- if (valid_bytes == nullptr) {
- UnsafeSetNotNull(length);
- return;
- }
- UnsafeAppendToBitmap(valid_bytes, valid_bytes + length);
-}
-
void ArrayBuilder::UnsafeAppendToBitmap(const std::vector<bool>& is_valid) {
- UnsafeAppendToBitmap(is_valid.begin(), is_valid.end());
+ for (bool element_valid : is_valid) {
+ UnsafeAppendToBitmap(element_valid);
+ }
}
void ArrayBuilder::UnsafeSetNotNull(int64_t length) {
- const int64_t new_length = length + length_;
-
- // Fill up the bytes until we have a byte alignment
- int64_t pad_to_byte = std::min<int64_t>(8 - (length_ % 8), length);
-
- if (pad_to_byte == 8) {
- pad_to_byte = 0;
- }
- for (int64_t i = length_; i < length_ + pad_to_byte; ++i) {
- BitUtil::SetBit(null_bitmap_data_, i);
- }
-
- // Fast bitsetting
- int64_t fast_length = (length - pad_to_byte) / 8;
- memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 0xFF,
- static_cast<size_t>(fast_length));
-
- // Trailing bits
- for (int64_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length;
++i) {
- BitUtil::SetBit(null_bitmap_data_, i);
- }
-
- length_ = new_length;
+ length_ += length;
+ null_bitmap_builder_.UnsafeAppend(length, true);
}
} // namespace arrow
diff --git a/cpp/src/arrow/array/builder_base.h
b/cpp/src/arrow/array/builder_base.h
index ae400fc..f4655fa 100644
--- a/cpp/src/arrow/array/builder_base.h
+++ b/cpp/src/arrow/array/builder_base.h
@@ -17,8 +17,6 @@
#pragma once
-#include "arrow/array/builder_base.h"
-
#include <algorithm> // IWYU pragma: keep
#include <array>
#include <cstddef>
@@ -31,7 +29,7 @@
#include <type_traits>
#include <vector>
-#include "arrow/buffer.h"
+#include "arrow/buffer-builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
@@ -61,13 +59,7 @@ constexpr int64_t kListMaximumElements =
std::numeric_limits<int32_t>::max() - 1
class ARROW_EXPORT ArrayBuilder {
public:
explicit ArrayBuilder(const std::shared_ptr<DataType>& type, MemoryPool*
pool)
- : type_(type),
- pool_(pool),
- null_bitmap_(NULLPTR),
- null_count_(0),
- null_bitmap_data_(NULLPTR),
- length_(0),
- capacity_(0) {}
+ : type_(type), pool_(pool), null_bitmap_builder_(pool) {}
virtual ~ArrayBuilder() = default;
@@ -98,7 +90,14 @@ class ARROW_EXPORT ArrayBuilder {
/// allocations in STL containers like std::vector
/// \param[in] additional_capacity the number of additional array values
/// \return Status
- Status Reserve(int64_t additional_capacity);
+ Status Reserve(int64_t additional_capacity) {
+ auto min_capacity = length() + additional_capacity;
+ if (min_capacity <= capacity()) return Status::OK();
+
+ // leave growth factor up to BufferBuilder
+ auto new_capacity = BufferBuilder::GrowByFactor(min_capacity);
+ return Resize(new_capacity);
+ }
/// Reset the builder.
virtual void Reset();
@@ -126,8 +125,6 @@ class ARROW_EXPORT ArrayBuilder {
std::shared_ptr<DataType> type() const { return type_; }
protected:
- ArrayBuilder() {}
-
/// Append to null bitmap
Status AppendToBitmap(bool is_valid);
@@ -144,49 +141,21 @@ class ARROW_EXPORT ArrayBuilder {
// Append to null bitmap, update the length
void UnsafeAppendToBitmap(bool is_valid) {
- if (is_valid) {
- BitUtil::SetBit(null_bitmap_data_, length_);
- } else {
- ++null_count_;
- }
+ null_bitmap_builder_.UnsafeAppend(is_valid);
++length_;
- }
-
- template <typename IterType>
- void UnsafeAppendToBitmap(const IterType& begin, const IterType& end) {
- int64_t byte_offset = length_ / 8;
- int64_t bit_offset = length_ % 8;
- uint8_t bitset = null_bitmap_data_[byte_offset];
-
- for (auto iter = begin; iter != end; ++iter) {
- if (bit_offset == 8) {
- bit_offset = 0;
- null_bitmap_data_[byte_offset] = bitset;
- byte_offset++;
- // TODO: Except for the last byte, this shouldn't be needed
- bitset = null_bitmap_data_[byte_offset];
- }
-
- if (*iter) {
- bitset |= BitUtil::kBitmask[bit_offset];
- } else {
- bitset &= BitUtil::kFlippedBitmask[bit_offset];
- ++null_count_;
- }
-
- bit_offset++;
- }
-
- if (bit_offset != 0) {
- null_bitmap_data_[byte_offset] = bitset;
- }
-
- length_ += std::distance(begin, end);
+ if (!is_valid) ++null_count_;
}
// Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
// assume all of length bits are valid.
- void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+ void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
+ if (valid_bytes == NULLPTR) {
+ return UnsafeSetNotNull(length);
+ }
+ null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
+ length_ += length;
+ null_count_ = null_bitmap_builder_.false_count();
+ }
void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
@@ -208,14 +177,12 @@ class ARROW_EXPORT ArrayBuilder {
std::shared_ptr<DataType> type_;
MemoryPool* pool_;
- // When null_bitmap are first appended to the builder, the null bitmap is
allocated
- std::shared_ptr<ResizableBuffer> null_bitmap_;
- int64_t null_count_;
- uint8_t* null_bitmap_data_;
+ TypedBufferBuilder<bool> null_bitmap_builder_;
+ int64_t null_count_ = 0;
// Array length, so far. Also, the index of the next element to be added
- int64_t length_;
- int64_t capacity_;
+ int64_t length_ = 0;
+ int64_t capacity_ = 0;
// Child value array builders. These are owned by this class
std::vector<std::shared_ptr<ArrayBuilder>> children_;
diff --git a/cpp/src/arrow/array/builder_binary.cc
b/cpp/src/arrow/array/builder_binary.cc
index 8739859..4fef135 100644
--- a/cpp/src/arrow/array/builder_binary.cc
+++ b/cpp/src/arrow/array/builder_binary.cc
@@ -54,7 +54,7 @@ Status BinaryBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
// one more then requested for offsets
- RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
+ RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
return ArrayBuilder::Resize(capacity);
}
@@ -78,12 +78,13 @@ Status
BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(AppendNextOffset());
// These buffers' padding zeroed by BufferBuilder
- std::shared_ptr<Buffer> offsets, value_data;
+ std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
- *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets, value_data},
null_count_,
- 0);
+ *out =
+ ArrayData::Make(type_, length_, {null_bitmap, offsets, value_data},
null_count_, 0);
Reset();
return Status::OK();
}
@@ -131,17 +132,17 @@ Status StringBuilder::AppendValues(const
std::vector<std::string>& values,
if (valid_bytes) {
for (std::size_t i = 0; i < values.size(); ++i) {
- RETURN_NOT_OK(AppendNextOffset());
+ UnsafeAppendNextOffset();
if (valid_bytes[i]) {
- RETURN_NOT_OK(value_data_builder_.Append(
- reinterpret_cast<const uint8_t*>(values[i].data()),
values[i].size()));
+ value_data_builder_.UnsafeAppend(
+ reinterpret_cast<const uint8_t*>(values[i].data()),
values[i].size());
}
}
} else {
for (std::size_t i = 0; i < values.size(); ++i) {
- RETURN_NOT_OK(AppendNextOffset());
- RETURN_NOT_OK(value_data_builder_.Append(
- reinterpret_cast<const uint8_t*>(values[i].data()),
values[i].size()));
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const
uint8_t*>(values[i].data()),
+ values[i].size());
}
}
@@ -170,11 +171,11 @@ Status StringBuilder::AppendValues(const char** values,
int64_t length,
if (valid_bytes) {
int64_t valid_bytes_offset = 0;
for (int64_t i = 0; i < length; ++i) {
- RETURN_NOT_OK(AppendNextOffset());
+ UnsafeAppendNextOffset();
if (valid_bytes[i]) {
if (values[i]) {
- RETURN_NOT_OK(value_data_builder_.Append(
- reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const
uint8_t*>(values[i]),
+ value_lengths[i]);
} else {
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i -
valid_bytes_offset);
UnsafeAppendToBitmap(false);
@@ -187,19 +188,19 @@ Status StringBuilder::AppendValues(const char** values,
int64_t length,
if (have_null_value) {
std::vector<uint8_t> valid_vector(length, 0);
for (int64_t i = 0; i < length; ++i) {
- RETURN_NOT_OK(AppendNextOffset());
+ UnsafeAppendNextOffset();
if (values[i]) {
- RETURN_NOT_OK(value_data_builder_.Append(
- reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const
uint8_t*>(values[i]),
+ value_lengths[i]);
valid_vector[i] = 1;
}
}
UnsafeAppendToBitmap(valid_vector.data(), length);
} else {
for (int64_t i = 0; i < length; ++i) {
- RETURN_NOT_OK(AppendNextOffset());
- RETURN_NOT_OK(value_data_builder_.Append(
- reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(reinterpret_cast<const
uint8_t*>(values[i]),
+ value_lengths[i]);
}
UnsafeAppendToBitmap(nullptr, length);
}
@@ -250,9 +251,10 @@ Status
FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> data;
RETURN_NOT_OK(byte_builder_.Finish(&data));
- *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_);
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_);
- null_bitmap_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_decimal.cc
b/cpp/src/arrow/array/builder_decimal.cc
index d64c4db..191a0ff 100644
--- a/cpp/src/arrow/array/builder_decimal.cc
+++ b/cpp/src/arrow/array/builder_decimal.cc
@@ -55,8 +55,10 @@ Status Decimal128Builder::Append(const Decimal128& value) {
Status Decimal128Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
std::shared_ptr<Buffer> data;
RETURN_NOT_OK(byte_builder_.Finish(&data));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
- *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_);
+ *out = ArrayData::Make(type_, length_, {null_bitmap, data}, null_count_);
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_nested.cc
b/cpp/src/arrow/array/builder_nested.cc
index 87c302a..2f600cd 100644
--- a/cpp/src/arrow/array/builder_nested.cc
+++ b/cpp/src/arrow/array/builder_nested.cc
@@ -77,7 +77,7 @@ Status ListBuilder::Resize(int64_t capacity) {
RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
// one more then requested for offsets
- RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
+ RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
return ArrayBuilder::Resize(capacity);
}
@@ -99,7 +99,9 @@ Status
ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(value_builder_->FinishInternal(&items));
}
- *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets}, null_count_);
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_);
(*out)->child_data.emplace_back(std::move(items));
Reset();
return Status::OK();
@@ -134,8 +136,9 @@ void StructBuilder::Reset() {
}
Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_),
null_bitmap_.get()));
- *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type_, length_, {null_bitmap}, null_count_);
(*out)->child_data.resize(children_.size());
for (size_t i = 0; i < children_.size(); ++i) {
@@ -146,7 +149,6 @@ Status
StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
RETURN_NOT_OK(children_[i]->FinishInternal(&(*out)->child_data[i]));
}
- null_bitmap_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_primitive.cc
b/cpp/src/arrow/array/builder_primitive.cc
index bc14000..a593f36 100644
--- a/cpp/src/arrow/array/builder_primitive.cc
+++ b/cpp/src/arrow/array/builder_primitive.cc
@@ -113,12 +113,12 @@ Status PrimitiveBuilder<T>::AppendValues(const
std::vector<value_type>& values)
template <typename T>
Status PrimitiveBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
- RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_),
null_bitmap_.get()));
RETURN_NOT_OK(TrimBuffer(TypeTraits<T>::bytes_required(length_),
data_.get()));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type_, length_, {null_bitmap, data_}, null_count_);
- *out = ArrayData::Make(type_, length_, {null_bitmap_, data_}, null_count_);
-
- data_ = null_bitmap_ = nullptr;
+ data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
@@ -195,12 +195,13 @@ Status
BooleanBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
data_->mutable_data()[length_ / 8] &=
BitUtil::kPrecedingBitmask[bit_offset];
}
- RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_),
null_bitmap_.get()));
+ std::shared_ptr<Buffer> null_bitmap;
+ RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), data_.get()));
- *out = ArrayData::Make(boolean(), length_, {null_bitmap_, data_},
null_count_);
+ *out = ArrayData::Make(boolean(), length_, {null_bitmap, data_},
null_count_);
- data_ = null_bitmap_ = nullptr;
+ data_ = nullptr;
capacity_ = length_ = null_count_ = 0;
return Status::OK();
}
diff --git a/cpp/src/arrow/array/builder_primitive.h
b/cpp/src/arrow/array/builder_primitive.h
index 13f6c22..d17a130 100644
--- a/cpp/src/arrow/array/builder_primitive.h
+++ b/cpp/src/arrow/array/builder_primitive.h
@@ -53,8 +53,8 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
using ArrayBuilder::Advance;
/// Write nulls as uint8_t* (0 value indicates null) into pre-allocated
memory
- /// The memory at the corresponding data slot is set to 0 to prevent
uninitialized
- /// memory access
+ /// The memory at the corresponding data slot is set to 0 to prevent
+ /// uninitialized memory access
Status AppendNulls(const uint8_t* valid_bytes, int64_t length) {
ARROW_RETURN_NOT_OK(Reserve(length));
memset(raw_data_ + length_, 0,
@@ -141,7 +141,10 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
std::copy(values_begin, values_end, raw_data_ + length_);
// this updates the length_
- UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
+ for (int64_t i = 0; i != length; ++i) {
+ UnsafeAppendToBitmap(*valid_begin);
+ ++valid_begin;
+ }
return Status::OK();
}
@@ -158,7 +161,10 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
- UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
+ for (int64_t i = 0; i != length; ++i) {
+ UnsafeAppendToBitmap(*valid_begin);
+ ++valid_begin;
+ }
}
return Status::OK();
@@ -188,6 +194,7 @@ class ARROW_EXPORT NumericBuilder : public
PrimitiveBuilder<T> {
: PrimitiveBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
using ArrayBuilder::UnsafeAppendNull;
+ using ArrayBuilder::UnsafeAppendToBitmap;
using PrimitiveBuilder<T>::AppendValues;
using PrimitiveBuilder<T>::Resize;
using PrimitiveBuilder<T>::Reserve;
@@ -205,13 +212,12 @@ class ARROW_EXPORT NumericBuilder : public
PrimitiveBuilder<T> {
/// This method does not capacity-check; make sure to call Reserve
/// beforehand.
void UnsafeAppend(const value_type val) {
- BitUtil::SetBit(null_bitmap_data_, length_);
- raw_data_[length_++] = val;
+ raw_data_[length_] = val;
+ UnsafeAppendToBitmap(true);
}
protected:
using PrimitiveBuilder<T>::length_;
- using PrimitiveBuilder<T>::null_bitmap_data_;
using PrimitiveBuilder<T>::raw_data_;
};
@@ -272,13 +278,12 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
/// Scalar append, without checking for capacity
void UnsafeAppend(const bool val) {
- BitUtil::SetBit(null_bitmap_data_, length_);
if (val) {
BitUtil::SetBit(raw_data_, length_);
} else {
BitUtil::ClearBit(raw_data_, length_);
}
- ++length_;
+ UnsafeAppendToBitmap(true);
}
void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
@@ -364,7 +369,10 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
[&iter]() -> bool { return *(iter++); });
// this updates length_
- ArrayBuilder::UnsafeAppendToBitmap(valid_begin, std::next(valid_begin,
length));
+ for (int64_t i = 0; i != length; ++i) {
+ ArrayBuilder::UnsafeAppendToBitmap(*valid_begin);
+ ++valid_begin;
+ }
return Status::OK();
}
@@ -383,7 +391,10 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
if (valid_begin == NULLPTR) {
UnsafeSetNotNull(length);
} else {
- UnsafeAppendToBitmap(valid_begin, std::next(valid_begin, length));
+ for (int64_t i = 0; i != length; ++i) {
+ ArrayBuilder::UnsafeAppendToBitmap(*valid_begin);
+ ++valid_begin;
+ }
}
return Status::OK();
diff --git a/cpp/src/arrow/buffer-builder.h b/cpp/src/arrow/buffer-builder.h
index dafa3ee..9344d5d 100644
--- a/cpp/src/arrow/buffer-builder.h
+++ b/cpp/src/arrow/buffer-builder.h
@@ -37,7 +37,8 @@ namespace arrow {
// Buffer builder classes
/// \class BufferBuilder
-/// \brief A class for incrementally building a contiguous chunk of in-memory
data
+/// \brief A class for incrementally building a contiguous chunk of in-memory
+/// data
class ARROW_EXPORT BufferBuilder {
public:
explicit BufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
@@ -45,23 +46,22 @@ class ARROW_EXPORT BufferBuilder {
/// \brief Resize the buffer to the nearest multiple of 64 bytes
///
- /// \param elements the new capacity of the of the builder. Will be rounded
- /// up to a multiple of 64 bytes for padding
- /// \param shrink_to_fit if new capacity is smaller than the existing size,
- /// reallocate internal buffer. Set to false to avoid reallocations when
- /// shrinking the builder.
+ /// \param new_capacity the new capacity of the of the builder. Will be
+ /// rounded up to a multiple of 64 bytes for padding \param shrink_to_fit if
+ /// new capacity is smaller than the existing size, reallocate internal
+ /// buffer. Set to false to avoid reallocations when shrinking the builder.
/// \return Status
- Status Resize(const int64_t elements, bool shrink_to_fit = true) {
+ Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
// Resize(0) is a no-op
- if (elements == 0) {
+ if (new_capacity == 0) {
return Status::OK();
}
int64_t old_capacity = capacity_;
if (buffer_ == NULLPTR) {
- ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, elements, &buffer_));
+ ARROW_RETURN_NOT_OK(AllocateResizableBuffer(pool_, new_capacity,
&buffer_));
} else {
- ARROW_RETURN_NOT_OK(buffer_->Resize(elements, shrink_to_fit));
+ ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
}
capacity_ = buffer_->capacity();
data_ = buffer_->mutable_data();
@@ -74,59 +74,72 @@ class ARROW_EXPORT BufferBuilder {
/// \brief Ensure that builder can accommodate the additional number of bytes
/// without the need to perform allocations
///
- /// \param size number of additional bytes to make space for
+ /// \param[in] additional_bytes number of additional bytes to make space for
+ /// \param[in] grow_by_factor if true, round up allocations using the
+ /// strategy in BufferBuilder::GrowByFactor
/// \return Status
- Status Reserve(const int64_t size) { return Resize(size_ + size, false); }
+ Status Reserve(const int64_t additional_bytes, bool grow_by_factor = false) {
+ auto min_capacity = size_ + additional_bytes;
+ if (min_capacity <= capacity_) return Status::OK();
+ if (grow_by_factor) {
+ min_capacity = GrowByFactor(min_capacity);
+ }
+ return Resize(min_capacity, false);
+ }
+
+ /// \brief Return a capacity expanded by a growth factor of 2
+ static int64_t GrowByFactor(const int64_t min_capacity) {
+ // If the capacity was not already a multiple of 2, do so here
+ // TODO(emkornfield) doubling isn't great default allocation practice
+ // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md
+ // for discussion
+ return BitUtil::NextPower2(min_capacity);
+ }
/// \brief Append the given data to the buffer
///
/// The buffer is automatically expanded if necessary.
- Status Append(const void* data, int64_t length) {
- if (capacity_ < length + size_) {
- int64_t new_capacity = BitUtil::NextPower2(length + size_);
- ARROW_RETURN_NOT_OK(Resize(new_capacity));
+ Status Append(const void* data, const int64_t length) {
+ if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
+ ARROW_RETURN_NOT_OK(Resize(GrowByFactor(size_ + length), false));
}
UnsafeAppend(data, length);
return Status::OK();
}
+ /// \brief Append copies of a value to the buffer
+ ///
+ /// The buffer is automatically expanded if necessary.
+ Status Append(const int64_t num_copies, uint8_t value) {
+ ARROW_RETURN_NOT_OK(Reserve(num_copies, true));
+ UnsafeAppend(num_copies, value);
+ return Status::OK();
+ }
+
/// \brief Append the given data to the buffer
///
/// The buffer is automatically expanded if necessary.
template <size_t NBYTES>
Status Append(const std::array<uint8_t, NBYTES>& data) {
constexpr auto nbytes = static_cast<int64_t>(NBYTES);
- if (capacity_ < nbytes + size_) {
- int64_t new_capacity = BitUtil::NextPower2(nbytes + size_);
- ARROW_RETURN_NOT_OK(Resize(new_capacity));
- }
-
- if (nbytes > 0) {
- std::copy(data.cbegin(), data.cend(), data_ + size_);
- size_ += nbytes;
- }
+ ARROW_RETURN_NOT_OK(Reserve(NBYTES, true));
+ std::copy(data.cbegin(), data.cend(), data_ + size_);
+ size_ += nbytes;
return Status::OK();
}
// Advance pointer and zero out memory
- Status Advance(const int64_t length) {
- if (capacity_ < length + size_) {
- int64_t new_capacity = BitUtil::NextPower2(length + size_);
- ARROW_RETURN_NOT_OK(Resize(new_capacity));
- }
- if (length > 0) {
- memset(data_ + size_, 0, static_cast<size_t>(length));
- size_ += length;
- }
- return Status::OK();
- }
+ Status Advance(const int64_t length) { return Append(length, 0); }
// Unsafe methods don't check existing size
- void UnsafeAppend(const void* data, int64_t length) {
- if (length > 0) {
- memcpy(data_ + size_, data, static_cast<size_t>(length));
- size_ += length;
- }
+ void UnsafeAppend(const void* data, const int64_t length) {
+ memcpy(data_ + size_, data, static_cast<size_t>(length));
+ size_ += length;
+ }
+
+ void UnsafeAppend(const int64_t num_copies, uint8_t value) {
+ memset(data_ + size_, value, static_cast<size_t>(num_copies));
+ size_ += num_copies;
}
/// \brief Return result of builder as a Buffer object.
@@ -153,8 +166,9 @@ class ARROW_EXPORT BufferBuilder {
int64_t capacity() const { return capacity_; }
int64_t length() const { return size_; }
const uint8_t* data() const { return data_; }
+ uint8_t* mutable_data() { return data_; }
- protected:
+ private:
std::shared_ptr<ResizableBuffer> buffer_;
MemoryPool* pool_;
uint8_t* data_;
@@ -162,42 +176,171 @@ class ARROW_EXPORT BufferBuilder {
int64_t size_;
};
-/// \brief A BufferBuilder subclass with convenience methods to append typed
data
+template <typename T, typename Enable = void>
+class TypedBufferBuilder;
+
+/// \brief A BufferBuilder for building a buffer of arithmetic elements
template <typename T>
-class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder {
+class TypedBufferBuilder<T, typename
std::enable_if<std::is_arithmetic<T>::value>::type> {
public:
- explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {}
+ explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
+ : bytes_builder_(pool) {}
- Status Append(T arithmetic_value) {
- static_assert(std::is_arithmetic<T>::value,
- "Convenience buffer append only supports arithmetic types");
- return BufferBuilder::Append(reinterpret_cast<uint8_t*>(&arithmetic_value),
- sizeof(T));
+ Status Append(T value) {
+ return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value),
sizeof(T));
}
- Status Append(const T* arithmetic_values, int64_t num_elements) {
- static_assert(std::is_arithmetic<T>::value,
- "Convenience buffer append only supports arithmetic types");
- return BufferBuilder::Append(reinterpret_cast<const
uint8_t*>(arithmetic_values),
+ Status Append(const T* values, int64_t num_elements) {
+ return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
num_elements * sizeof(T));
}
- void UnsafeAppend(T arithmetic_value) {
- static_assert(std::is_arithmetic<T>::value,
- "Convenience buffer append only supports arithmetic types");
- BufferBuilder::UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value),
sizeof(T));
+ Status Append(const int64_t num_copies, T value) {
+ ARROW_RETURN_NOT_OK(Resize(GrowByFactor(num_copies + length()), false));
+ UnsafeAppend(num_copies, value);
+ return Status::OK();
}
- void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) {
- static_assert(std::is_arithmetic<T>::value,
- "Convenience buffer append only supports arithmetic types");
- BufferBuilder::UnsafeAppend(reinterpret_cast<const
uint8_t*>(arithmetic_values),
+ void UnsafeAppend(T value) {
+ bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+ }
+
+ void UnsafeAppend(const T* values, int64_t num_elements) {
+ bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
num_elements * sizeof(T));
}
- const T* data() const { return reinterpret_cast<const T*>(data_); }
- int64_t length() const { return size_ / sizeof(T); }
- int64_t capacity() const { return capacity_ / sizeof(T); }
+ void UnsafeAppend(const int64_t num_copies, T value) {
+ auto data = mutable_data() + length();
+ bytes_builder_.UnsafeAppend(num_copies * sizeof(T), 0);
+ for (const auto end = data + num_copies; data != end; ++data) {
+ *data = value;
+ }
+ }
+
+ Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+ return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
+ }
+
+ Status Reserve(const int64_t additional_elements) {
+ return bytes_builder_.Reserve(additional_elements * sizeof(T));
+ }
+
+ Status Advance(const int64_t length) {
+ return bytes_builder_.Advance(length * sizeof(T));
+ }
+
+ Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+ return bytes_builder_.Finish(out, shrink_to_fit);
+ }
+
+ void Reset() { bytes_builder_.Reset(); }
+
+ int64_t length() const { return bytes_builder_.length() / sizeof(T); }
+ int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
+ const T* data() const { return reinterpret_cast<const
T*>(bytes_builder_.data()); }
+ T* mutable_data() { return
reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
+
+ private:
+ BufferBuilder bytes_builder_;
+};
+
+/// \brief A BufferBuilder for building a buffer containing a bitmap
+template <>
+class TypedBufferBuilder<bool> {
+ public:
+ explicit TypedBufferBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT)
+ : bytes_builder_(pool) {}
+
+ Status Append(bool value) {
+ ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + 1));
+ UnsafeAppend(value);
+ return Status::OK();
+ }
+
+ Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
+ ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_elements));
+ UnsafeAppend(valid_bytes, num_elements);
+ return Status::OK();
+ }
+
+ Status Append(const int64_t num_copies, bool value) {
+ ARROW_RETURN_NOT_OK(ResizeWithGrowthFactor(bit_length_ + num_copies));
+ UnsafeAppend(num_copies, value);
+ return Status::OK();
+ }
+
+ void UnsafeAppend(bool value) {
+ BitUtil::SetBitTo(mutable_data(), bit_length_, value);
+ if (!value) {
+ ++false_count_;
+ }
+ ++bit_length_;
+ }
+
+ void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
+ if (num_elements == 0) return;
+ int64_t i = 0;
+ internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
[&] {
+ bool value = bytes[i++];
+ if (!value) ++false_count_;
+ return value;
+ });
+ bit_length_ += num_elements;
+ }
+
+ void UnsafeAppend(const int64_t num_copies, bool value) {
+ BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
+ if (!value) {
+ false_count_ += num_copies;
+ }
+ bit_length_ += num_copies;
+ }
+
+ Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+ const int64_t old_byte_capacity = bytes_builder_.capacity();
+ const int64_t new_byte_capacity = BitUtil::BytesForBits(new_capacity);
+ ARROW_RETURN_NOT_OK(bytes_builder_.Resize(new_byte_capacity,
shrink_to_fit));
+ if (new_byte_capacity > old_byte_capacity) {
+ memset(mutable_data() + old_byte_capacity, 0,
+ static_cast<size_t>(new_byte_capacity - old_byte_capacity));
+ }
+ return Status::OK();
+ }
+
+ Status Reserve(const int64_t additional_elements) {
+ return Resize(bit_length_ + additional_elements, false);
+ }
+
+ Status Advance(const int64_t length) {
+ bit_length_ += length;
+ false_count_ += length;
+ return ResizeWithGrowthFactor(bit_length_);
+ }
+
+ Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+ bit_length_ = false_count_ = 0;
+ return bytes_builder_.Finish(out, shrink_to_fit);
+ }
+
+ void Reset() {
+ bytes_builder_.Reset();
+ bit_length_ = false_count_ = 0;
+ }
+
+ int64_t length() const { return bit_length_; }
+ int64_t capacity() const { return bytes_builder_.capacity() * 8; }
+ const uint8_t* data() const { return bytes_builder_.data(); }
+ uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
+ int64_t false_count() const { return false_count_; }
+
+ private:
+ Status ResizeWithGrowthFactor(const int64_t min_capacity) {
+ return Resize(BufferBuilder::GrowByFactor(min_capacity), false);
+ }
+ BufferBuilder bytes_builder_;
+ int64_t bit_length_ = 0;
+ int64_t false_count_ = 0;
};
} // namespace arrow
diff --git a/cpp/src/arrow/buffer-test.cc b/cpp/src/arrow/buffer-test.cc
index 7c54e13..da8295f 100644
--- a/cpp/src/arrow/buffer-test.cc
+++ b/cpp/src/arrow/buffer-test.cc
@@ -262,6 +262,82 @@ TEST(TestBufferBuilder, ResizeReserve) {
}
template <typename T>
+class TypedTestBufferBuilder : public ::testing::Test {};
+
+using BufferBuilderElements = ::testing::Types<int16_t, uint32_t, double>;
+
+TYPED_TEST_CASE(TypedTestBufferBuilder, BufferBuilderElements);
+
+TYPED_TEST(TypedTestBufferBuilder, BasicTypedBufferBuilderUsage) {
+ TypedBufferBuilder<TypeParam> builder;
+
+ ASSERT_OK(builder.Append(static_cast<TypeParam>(0)));
+ ASSERT_EQ(builder.length(), 1);
+ ASSERT_EQ(builder.capacity(), 64 / sizeof(TypeParam));
+
+ constexpr int nvalues = 4;
+ TypeParam values[nvalues];
+ for (int i = 0; i != nvalues; ++i) {
+ values[i] = static_cast<TypeParam>(i);
+ }
+ ASSERT_OK(builder.Append(values, nvalues));
+ ASSERT_EQ(builder.length(), nvalues + 1);
+
+ std::shared_ptr<Buffer> built;
+ ASSERT_OK(builder.Finish(&built));
+
+ auto data = reinterpret_cast<const TypeParam*>(built->data());
+ ASSERT_EQ(data[0], static_cast<TypeParam>(0));
+ for (auto value : values) {
+ ++data;
+ ASSERT_EQ(*data, value);
+ }
+}
+
+TEST(TestBufferBuilder, BasicBoolBufferBuilderUsage) {
+ TypedBufferBuilder<bool> builder;
+
+ ASSERT_OK(builder.Append(false));
+ ASSERT_EQ(builder.length(), 1);
+ ASSERT_EQ(builder.capacity(), 64 * 8);
+
+ constexpr int nvalues = 4;
+ uint8_t values[nvalues];
+ for (int i = 0; i != nvalues; ++i) {
+ values[i] = static_cast<uint8_t>(i);
+ }
+ ASSERT_OK(builder.Append(values, nvalues));
+ ASSERT_EQ(builder.length(), nvalues + 1);
+
+ ASSERT_EQ(builder.false_count(), 2);
+
+ std::shared_ptr<Buffer> built;
+ ASSERT_OK(builder.Finish(&built));
+
+ ASSERT_EQ(BitUtil::GetBit(built->data(), 0), false);
+ for (int i = 0; i != nvalues; ++i) {
+ ASSERT_EQ(BitUtil::GetBit(built->data(), i + 1),
static_cast<bool>(values[i]));
+ }
+}
+
+TEST(TestBufferBuilder, BoolBufferBuilderAppendCopies) {
+ TypedBufferBuilder<bool> builder;
+
+ ASSERT_OK(builder.Append(13, true));
+ ASSERT_OK(builder.Append(17, false));
+ ASSERT_EQ(builder.length(), 13 + 17);
+ ASSERT_EQ(builder.capacity(), 64 * 8);
+ ASSERT_EQ(builder.false_count(), 17);
+
+ std::shared_ptr<Buffer> built;
+ ASSERT_OK(builder.Finish(&built));
+
+ for (int i = 0; i != 13 + 17; ++i) {
+ EXPECT_EQ(BitUtil::GetBit(built->data(), i), i < 13) << "index = " << i;
+ }
+}
+
+template <typename T>
class TypedTestBuffer : public ::testing::Test {};
using BufferPtrs =
diff --git a/cpp/src/arrow/builder-benchmark.cc
b/cpp/src/arrow/builder-benchmark.cc
index fae9c89..e4a56bf 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -18,6 +18,7 @@
#include <algorithm>
#include <cstdint>
#include <limits>
+#include <numeric>
#include <random>
#include <string>
#include <vector>
@@ -148,7 +149,7 @@ static void BM_BuildBooleanArrayNoNulls(
constexpr uint8_t bit_pattern = 0xcc; // 0b11001100
uint64_t index = 0;
std::generate(data.begin(), data.end(),
- [&index]() -> uint8_t { return (bit_pattern >> ((index++) %
8)) & 1; });
+ [&]() -> uint8_t { return (bit_pattern >> ((index++) % 8)) &
1; });
while (state.KeepRunning()) {
BooleanBuilder builder;
@@ -292,13 +293,13 @@ static std::vector<std::string>
MakeStringDictFodder(int32_t n_values,
*it++ = "abcfgh";
// Add random strings
std::uniform_int_distribution<int32_t> length_dist(2, 20);
- std::independent_bits_engine<std::default_random_engine, 8, uint8_t>
bytes_gen(42);
+ std::independent_bits_engine<std::default_random_engine, 8, uint16_t>
bytes_gen(42);
- std::generate(it, values_dict.end(), [&]() {
+ std::generate(it, values_dict.end(), [&] {
auto length = length_dist(gen);
std::string s(length, 'X');
for (int32_t i = 0; i < length; ++i) {
- s[i] = bytes_gen();
+ s[i] = static_cast<char>(bytes_gen());
}
return s;
});
@@ -306,7 +307,7 @@ static std::vector<std::string>
MakeStringDictFodder(int32_t n_values,
{
std::uniform_int_distribution<int32_t> indices_dist(0, n_distinct - 1);
std::generate(values.begin(), values.end(),
- [&]() { return values_dict[indices_dist(gen)]; });
+ [&] { return values_dict[indices_dist(gen)]; });
}
return values;
}
@@ -349,7 +350,7 @@ static void BM_BuildStringDictionaryArray(
const auto fodder = MakeStringDictFodder(10000, 100);
auto type = binary();
auto fodder_size =
- std::accumulate(fodder.begin(), fodder.end(), 0,
+ std::accumulate(fodder.begin(), fodder.end(), static_cast<size_t>(0),
[&](size_t acc, const std::string& s) { return acc +
s.size(); });
while (state.KeepRunning()) {
@@ -394,9 +395,7 @@ BENCHMARK(BM_BuildAdaptiveUIntNoNullsScalarAppend)
BENCHMARK(BM_BuildBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_BuildChunkedBinaryArray)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
-BENCHMARK(BM_BuildFixedSizeBinaryArray)
- ->Repetitions(kRepetitions)
- ->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_BuildFixedSizeBinaryArray)->MinTime(3.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_BuildInt64DictionaryArrayRandom)
->Repetitions(kRepetitions)
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index b89f52f..fea1044 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -45,7 +45,7 @@ struct TypeTraits<UInt8Type> {
using ArrayType = UInt8Array;
using BuilderType = UInt8Builder;
using TensorType = UInt8Tensor;
- static inline int64_t bytes_required(int64_t elements) { return elements; }
+ static constexpr int64_t bytes_required(int64_t elements) { return elements;
}
constexpr static bool is_parameter_free = true;
static inline std::shared_ptr<DataType> type_singleton() { return uint8(); }
};
@@ -55,7 +55,7 @@ struct TypeTraits<Int8Type> {
using ArrayType = Int8Array;
using BuilderType = Int8Builder;
using TensorType = Int8Tensor;
- static inline int64_t bytes_required(int64_t elements) { return elements; }
+ static constexpr int64_t bytes_required(int64_t elements) { return elements;
}
constexpr static bool is_parameter_free = true;
static inline std::shared_ptr<DataType> type_singleton() { return int8(); }
};
@@ -66,7 +66,7 @@ struct TypeTraits<UInt16Type> {
using BuilderType = UInt16Builder;
using TensorType = UInt16Tensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(uint16_t);
}
constexpr static bool is_parameter_free = true;
@@ -79,7 +79,7 @@ struct TypeTraits<Int16Type> {
using BuilderType = Int16Builder;
using TensorType = Int16Tensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int16_t);
}
constexpr static bool is_parameter_free = true;
@@ -92,7 +92,7 @@ struct TypeTraits<UInt32Type> {
using BuilderType = UInt32Builder;
using TensorType = UInt32Tensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(uint32_t);
}
constexpr static bool is_parameter_free = true;
@@ -105,7 +105,7 @@ struct TypeTraits<Int32Type> {
using BuilderType = Int32Builder;
using TensorType = Int32Tensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int32_t);
}
constexpr static bool is_parameter_free = true;
@@ -118,7 +118,7 @@ struct TypeTraits<UInt64Type> {
using BuilderType = UInt64Builder;
using TensorType = UInt64Tensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(uint64_t);
}
constexpr static bool is_parameter_free = true;
@@ -131,7 +131,7 @@ struct TypeTraits<Int64Type> {
using BuilderType = Int64Builder;
using TensorType = Int64Tensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int64_t);
}
constexpr static bool is_parameter_free = true;
@@ -143,7 +143,7 @@ struct TypeTraits<Date64Type> {
using ArrayType = Date64Array;
using BuilderType = Date64Builder;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int64_t);
}
constexpr static bool is_parameter_free = true;
@@ -155,7 +155,7 @@ struct TypeTraits<Date32Type> {
using ArrayType = Date32Array;
using BuilderType = Date32Builder;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int32_t);
}
constexpr static bool is_parameter_free = true;
@@ -167,7 +167,7 @@ struct TypeTraits<TimestampType> {
using ArrayType = TimestampArray;
using BuilderType = TimestampBuilder;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int64_t);
}
constexpr static bool is_parameter_free = false;
@@ -178,7 +178,7 @@ struct TypeTraits<Time32Type> {
using ArrayType = Time32Array;
using BuilderType = Time32Builder;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int32_t);
}
constexpr static bool is_parameter_free = false;
@@ -189,7 +189,7 @@ struct TypeTraits<Time64Type> {
using ArrayType = Time64Array;
using BuilderType = Time64Builder;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(int64_t);
}
constexpr static bool is_parameter_free = false;
@@ -201,7 +201,7 @@ struct TypeTraits<HalfFloatType> {
using BuilderType = HalfFloatBuilder;
using TensorType = HalfFloatTensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return elements * sizeof(uint16_t);
}
constexpr static bool is_parameter_free = true;
@@ -214,7 +214,7 @@ struct TypeTraits<FloatType> {
using BuilderType = FloatBuilder;
using TensorType = FloatTensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return static_cast<int64_t>(elements * sizeof(float));
}
constexpr static bool is_parameter_free = true;
@@ -227,7 +227,7 @@ struct TypeTraits<DoubleType> {
using BuilderType = DoubleBuilder;
using TensorType = DoubleTensor;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return static_cast<int64_t>(elements * sizeof(double));
}
constexpr static bool is_parameter_free = true;
@@ -246,7 +246,7 @@ struct TypeTraits<BooleanType> {
using ArrayType = BooleanArray;
using BuilderType = BooleanBuilder;
- static inline int64_t bytes_required(int64_t elements) {
+ static constexpr int64_t bytes_required(int64_t elements) {
return BitUtil::BytesForBits(elements);
}
constexpr static bool is_parameter_free = true;
diff --git a/cpp/src/arrow/util/bit-util-test.cc
b/cpp/src/arrow/util/bit-util-test.cc
index 174e6d0..6709ae4 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -516,6 +516,43 @@ TEST(BitUtilTests, TestCountSetBits) {
}
}
+TEST(BitUtilTests, TestSetBitsTo) {
+ using BitUtil::SetBitsTo;
+ for (const auto fill_byte_int : {0x00, 0xff}) {
+ const uint8_t fill_byte = static_cast<uint8_t>(fill_byte_int);
+ {
+ // test set within a byte
+ uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+ SetBitsTo(bitmap, 2, 2, true);
+ SetBitsTo(bitmap, 4, 2, false);
+ ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & ~0x3C) |
0xC)});
+ }
+ {
+ // test straddling a single byte boundary
+ uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+ SetBitsTo(bitmap, 4, 7, true);
+ SetBitsTo(bitmap, 11, 7, false);
+ ASSERT_BYTES_EQ(bitmap, {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0),
0x7,
+ static_cast<uint8_t>(fill_byte & ~0x3)});
+ }
+ {
+ // test byte aligned end
+ uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+ SetBitsTo(bitmap, 4, 4, true);
+ SetBitsTo(bitmap, 8, 8, false);
+ ASSERT_BYTES_EQ(bitmap,
+ {static_cast<uint8_t>((fill_byte & 0xF) | 0xF0), 0x00,
fill_byte});
+ }
+ {
+ // test byte aligned end, multiple bytes
+ uint8_t bitmap[] = {fill_byte, fill_byte, fill_byte, fill_byte};
+ SetBitsTo(bitmap, 0, 24, false);
+ uint8_t false_byte = static_cast<uint8_t>(0);
+ ASSERT_BYTES_EQ(bitmap, {false_byte, false_byte, false_byte, fill_byte});
+ }
+ }
+}
+
TEST(BitUtilTests, TestCopyBitmap) {
const int kBufferSize = 1000;
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 415684e..8e6979f 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -53,6 +53,7 @@
#endif
#include <cstdint>
+#include <cstring>
#include <limits>
#include <memory>
#include <type_traits>
@@ -84,11 +85,11 @@ namespace BitUtil {
//
// Returns the ceil of value/divisor
-static inline int64_t CeilDiv(int64_t value, int64_t divisor) {
+constexpr int64_t CeilDiv(int64_t value, int64_t divisor) {
return value / divisor + (value % divisor != 0);
}
-static inline int64_t BytesForBits(int64_t bits) { return (bits + 7) >> 3; }
+constexpr int64_t BytesForBits(int64_t bits) { return (bits + 7) >> 3; }
// Returns the smallest power of two that contains v. If v is already a
// power of two, it is returned as is.
@@ -106,12 +107,12 @@ static inline int64_t NextPower2(int64_t n) {
return n;
}
-static inline bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
+constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
-static inline bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
+constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
// Returns 'value' rounded up to the nearest multiple of 'factor'
-static inline int64_t RoundUp(int64_t value, int64_t factor) {
+constexpr int64_t RoundUp(int64_t value, int64_t factor) {
return (value + (factor - 1)) / factor * factor;
}
@@ -119,16 +120,14 @@ static inline int64_t RoundUp(int64_t value, int64_t
factor) {
// is a power of two.
// The result is undefined on overflow, i.e. if `value > 2**64 - factor`,
// since we cannot return the correct result which would be 2**64.
-static inline int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
+constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
// DCHECK((factor > 0) && ((factor & (factor - 1)) == 0));
return (value + (factor - 1)) & ~(factor - 1);
}
-static inline int64_t RoundUpToMultipleOf8(int64_t num) {
- return RoundUpToPowerOf2(num, 8);
-}
+constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return
RoundUpToPowerOf2(num, 8); }
-static inline int64_t RoundUpToMultipleOf64(int64_t num) {
+constexpr int64_t RoundUpToMultipleOf64(int64_t num) {
return RoundUpToPowerOf2(num, 64);
}
@@ -329,6 +328,48 @@ static inline void SetBitTo(uint8_t* bits, int64_t i, bool
bit_is_set) {
kBitmask[i % 8];
}
+/// \brief set or clear a range of bits quickly
+static inline void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t
length,
+ bool bits_are_set) {
+ if (length == 0) return;
+
+ const auto i_begin = start_offset;
+ const auto i_end = start_offset + length;
+ const uint8_t fill_byte =
static_cast<uint8_t>(-static_cast<uint8_t>(bits_are_set));
+
+ const auto bytes_begin = i_begin / 8;
+ const auto bytes_end = i_end / 8 + 1;
+
+ const auto first_byte_mask = kPrecedingBitmask[i_begin % 8];
+ const auto last_byte_mask = kTrailingBitmask[i_end % 8];
+
+ if (bytes_end == bytes_begin + 1) {
+ // set bits within a single byte
+ const auto only_byte_mask =
+ i_end % 8 == 0 ? first_byte_mask
+ : static_cast<uint8_t>(first_byte_mask |
last_byte_mask);
+ bits[bytes_begin] &= only_byte_mask;
+ bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~only_byte_mask);
+ return;
+ }
+
+ // set/clear trailing bits of first byte
+ bits[bytes_begin] &= first_byte_mask;
+ bits[bytes_begin] |= static_cast<uint8_t>(fill_byte & ~first_byte_mask);
+
+ if (bytes_end - bytes_begin > 2) {
+ // set/clear whole bytes
+ std::memset(bits + bytes_begin + 1, fill_byte,
+ static_cast<size_t>(bytes_end - bytes_begin - 2));
+ }
+
+ if (i_end % 8 == 0) return;
+
+ // set/clear leading bits of last byte
+ bits[bytes_end - 1] &= last_byte_mask;
+ bits[bytes_end - 1] |= static_cast<uint8_t>(fill_byte & ~last_byte_mask);
+}
+
/// \brief Convert vector of bytes to bitmap buffer
ARROW_EXPORT
Status BytesToBits(const std::vector<uint8_t>&, MemoryPool*,
std::shared_ptr<Buffer>*);
@@ -578,8 +619,8 @@ Status CopyBitmap(MemoryPool* pool, const uint8_t* bitmap,
int64_t offset, int64
/// \param[in] offset bit offset into the source data
/// \param[in] length number of bits to copy
/// \param[in] dest_offset bit offset into the destination
-/// \param[out] dest the destination buffer, must have at least space for
(offset +
-/// length) bits
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
ARROW_EXPORT
void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length,
uint8_t* dest,
int64_t dest_offset);
@@ -590,8 +631,8 @@ void CopyBitmap(const uint8_t* bitmap, int64_t offset,
int64_t length, uint8_t*
/// \param[in] offset bit offset into the source data
/// \param[in] length number of bits to copy
/// \param[in] dest_offset bit offset into the destination
-/// \param[out] dest the destination buffer, must have at least space for
(offset +
-/// length) bits
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
ARROW_EXPORT
void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length,
uint8_t* dest,
int64_t dest_offset);
@@ -613,7 +654,8 @@ Status InvertBitmap(MemoryPool* pool, const uint8_t*
bitmap, int64_t offset,
///
/// \param[in] data a packed LSB-ordered bitmap as a byte array
/// \param[in] bit_offset a bitwise offset into the bitmap
-/// \param[in] length the number of bits to inspect in the bitmap relative to
the offset
+/// \param[in] length the number of bits to inspect in the bitmap relative to
+/// the offset
///
/// \return The number of set (1) bits in the range
ARROW_EXPORT
diff --git a/cpp/src/arrow/util/hashing-benchmark.cc
b/cpp/src/arrow/util/hashing-benchmark.cc
index 09d00af..ee70391 100644
--- a/cpp/src/arrow/util/hashing-benchmark.cc
+++ b/cpp/src/arrow/util/hashing-benchmark.cc
@@ -49,13 +49,13 @@ static std::vector<std::string> MakeStrings(int32_t
n_values, int32_t min_length
// Generate strings between 2 and 20 bytes
std::uniform_int_distribution<int32_t> length_dist(min_length, max_length);
- std::independent_bits_engine<std::default_random_engine, 8, uint8_t>
bytes_gen(42);
+ std::independent_bits_engine<std::default_random_engine, 8, uint16_t>
bytes_gen(42);
std::generate(values.begin(), values.end(), [&]() {
auto length = length_dist(gen);
std::string s(length, 'X');
for (int32_t i = 0; i < length; ++i) {
- s[i] = bytes_gen();
+ s[i] = static_cast<uint8_t>(bytes_gen());
}
return s;
});