Repository: arrow Updated Branches: refs/heads/master a4002c6e2 -> fbbee3d2d
ARROW-77: [C++] Conform bitmap interpretation to ARROW-62; 1 for nulls, 0 for non-nulls Author: Wes McKinney <w...@apache.org> Closes #35 from wesm/ARROW-77 and squashes the following commits: 848d1fe [Wes McKinney] Clean up variable names to indicate valid_bytes vs null_bytes and change nulls to null_bitmap to be more clear 8960c7d [Wes McKinney] Flip bit interpretation so that 1 is null and 0 is not-null. Do not compare null slots in primitive arrays Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fbbee3d2 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fbbee3d2 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fbbee3d2 Branch: refs/heads/master Commit: fbbee3d2db5beb1ae6c623fc6392095cffdf74fe Parents: a4002c6 Author: Wes McKinney <w...@apache.org> Authored: Thu Mar 24 09:31:56 2016 -0700 Committer: Wes McKinney <w...@apache.org> Committed: Thu Mar 24 09:31:56 2016 -0700 ---------------------------------------------------------------------- cpp/src/arrow/array-test.cc | 30 +++++----- cpp/src/arrow/array.cc | 10 ++-- cpp/src/arrow/array.h | 16 ++--- cpp/src/arrow/builder.cc | 16 ++--- cpp/src/arrow/builder.h | 14 ++--- cpp/src/arrow/column-benchmark.cc | 6 +- cpp/src/arrow/ipc/adapter.cc | 10 ++-- cpp/src/arrow/ipc/ipc-adapter-test.cc | 8 +-- cpp/src/arrow/test-util.h | 25 ++++---- cpp/src/arrow/types/construct.cc | 4 +- cpp/src/arrow/types/construct.h | 2 +- cpp/src/arrow/types/list.cc | 6 +- cpp/src/arrow/types/list.h | 23 ++++---- cpp/src/arrow/types/primitive-test.cc | 55 +++++++++++------- cpp/src/arrow/types/primitive.cc | 29 +++++++--- cpp/src/arrow/types/primitive.h | 93 ++++++++++++++++-------------- cpp/src/arrow/types/string-test.cc | 18 +++--- cpp/src/arrow/types/string.h | 8 +-- cpp/src/arrow/util/bit-util.h | 8 ++- 19 files changed, 213 insertions(+), 168 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index eded594..7c6eaf5 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -44,9 +44,9 @@ class TestArray : public ::testing::Test { TEST_F(TestArray, TestNullCount) { auto data = std::make_shared<PoolBuffer>(pool_); - auto nulls = std::make_shared<PoolBuffer>(pool_); + auto null_bitmap = std::make_shared<PoolBuffer>(pool_); - std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, nulls)); + std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, null_bitmap)); ASSERT_EQ(10, arr->null_count()); std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data)); @@ -61,28 +61,28 @@ TEST_F(TestArray, TestLength) { } TEST_F(TestArray, TestIsNull) { - std::vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 0, 1}; + std::vector<uint8_t> null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 0, 1}; int32_t null_count = 0; - for (uint8_t x : nulls) { - if (x > 0) ++null_count; + for (uint8_t x : null_bitmap) { + if (x == 0) ++null_count; } - std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(nulls.data(), - nulls.size()); + std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(null_bitmap.data(), + null_bitmap.size()); std::unique_ptr<Array> arr; - arr.reset(new Int32Array(nulls.size(), nullptr, null_count, null_buf)); + arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf)); ASSERT_EQ(null_count, arr->null_count()); ASSERT_EQ(5, null_buf->size()); - ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get())); + ASSERT_TRUE(arr->null_bitmap()->Equals(*null_buf.get())); - for (size_t i = 0; i < nulls.size(); ++i) { - ASSERT_EQ(static_cast<bool>(nulls[i]), arr->IsNull(i)); + for (size_t i = 0; i < null_bitmap.size(); ++i) { + EXPECT_EQ(static_cast<bool>(null_bitmap[i]), !arr->IsNull(i)) << i; } } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 5a5bc10..3736732 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -27,13 +27,13 @@ namespace arrow { // Base array class Array::Array(const TypePtr& type, int32_t length, int32_t null_count, - const std::shared_ptr<Buffer>& nulls) { + const std::shared_ptr<Buffer>& null_bitmap) { type_ = type; length_ = length; null_count_ = null_count; - nulls_ = nulls; - if (nulls_) { - null_bits_ = nulls_->data(); + null_bitmap_ = null_bitmap; + if (null_bitmap_) { + null_bitmap_data_ = null_bitmap_->data(); } } @@ -44,7 +44,7 @@ bool Array::EqualsExact(const Array& other) const { return false; } if (null_count_ > 0) { - return nulls_->Equals(*other.nulls_, util::bytes_for_bits(length_)); + return null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } else { return true; } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 65fc0aa..133adf3 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -32,8 +32,8 @@ class Buffer; // Immutable data array with some logical type and some length. Any memory is // owned by the respective Buffer instance (or its parents). // -// The base class is only required to have a nulls buffer if the null count is -// greater than 0 +// The base class is only required to have a null bitmap buffer if the null +// count is greater than 0 // // Any buffers used to initialize the array have their references "stolen". If // you wish to use the buffer beyond the lifetime of the array, you need to @@ -41,13 +41,13 @@ class Buffer; class Array { public: Array(const TypePtr& type, int32_t length, int32_t null_count = 0, - const std::shared_ptr<Buffer>& nulls = nullptr); + const std::shared_ptr<Buffer>& null_bitmap = nullptr); virtual ~Array() {} // Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && util::get_bit(null_bits_, i); + return null_count_ > 0 && util::bit_not_set(null_bitmap_data_, i); } int32_t length() const { return length_;} @@ -56,8 +56,8 @@ class Array { const std::shared_ptr<DataType>& type() const { return type_;} Type::type type_enum() const { return type_->type;} - const std::shared_ptr<Buffer>& nulls() const { - return nulls_; + const std::shared_ptr<Buffer>& null_bitmap() const { + return null_bitmap_; } bool EqualsExact(const Array& arr) const; @@ -68,8 +68,8 @@ class Array { int32_t null_count_; int32_t length_; - std::shared_ptr<Buffer> nulls_; - const uint8_t* null_bits_; + std::shared_ptr<Buffer> null_bitmap_; + const uint8_t* null_bitmap_data_; private: Array() {} http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ba70add..6a62dc3 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -28,20 +28,20 @@ namespace arrow { Status ArrayBuilder::Init(int32_t capacity) { capacity_ = capacity; int32_t to_alloc = util::ceil_byte(capacity) / 8; - nulls_ = std::make_shared<PoolBuffer>(pool_); - RETURN_NOT_OK(nulls_->Resize(to_alloc)); - null_bits_ = nulls_->mutable_data(); - memset(null_bits_, 0, to_alloc); + null_bitmap_ = std::make_shared<PoolBuffer>(pool_); + RETURN_NOT_OK(null_bitmap_->Resize(to_alloc)); + null_bitmap_data_ = null_bitmap_->mutable_data(); + memset(null_bitmap_data_, 0, to_alloc); return Status::OK(); } Status ArrayBuilder::Resize(int32_t new_bits) { int32_t new_bytes = util::ceil_byte(new_bits) / 8; - int32_t old_bytes = nulls_->size(); - RETURN_NOT_OK(nulls_->Resize(new_bytes)); - null_bits_ = nulls_->mutable_data(); + int32_t old_bytes = null_bitmap_->size(); + RETURN_NOT_OK(null_bitmap_->Resize(new_bytes)); + null_bitmap_data_ = null_bitmap_->mutable_data(); if (old_bytes < new_bytes) { - memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); + memset(null_bitmap_data_ + old_bytes, 0, new_bytes - old_bytes); } return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index d5d1fdf..308e54c 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -40,9 +40,9 @@ class ArrayBuilder { explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : pool_(pool), type_(type), - nulls_(nullptr), + null_bitmap_(nullptr), null_count_(0), - null_bits_(nullptr), + null_bitmap_data_(nullptr), length_(0), capacity_(0) {} @@ -66,7 +66,7 @@ class ArrayBuilder { // initialized independently Status Init(int32_t capacity); - // Resizes the nulls array + // Resizes the null_bitmap array Status Resize(int32_t new_bits); // For cases where raw data was memcpy'd into the internal buffers, allows us @@ -74,7 +74,7 @@ class ArrayBuilder { // this function responsibly. Status Advance(int32_t elements); - const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;} + const std::shared_ptr<PoolBuffer>& null_bitmap() const { return null_bitmap_;} // Creates new array object to hold the contents of the builder and transfers // ownership of the data @@ -89,10 +89,10 @@ class ArrayBuilder { std::shared_ptr<DataType> type_; - // When nulls are first appended to the builder, the null bitmap is allocated - std::shared_ptr<PoolBuffer> nulls_; + // When null_bitmap are first appended to the builder, the null bitmap is allocated + std::shared_ptr<PoolBuffer> null_bitmap_; int32_t null_count_; - uint8_t* null_bits_; + uint8_t* null_bitmap_data_; // Array length, so far. Also, the index of the next element to be added int32_t length_; http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/column-benchmark.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc index 69ee52c..335d581 100644 --- a/cpp/src/arrow/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -28,10 +28,10 @@ namespace { std::shared_ptr<Array> MakePrimitive(int32_t length, int32_t null_count = 0) { auto pool = default_memory_pool(); auto data = std::make_shared<PoolBuffer>(pool); - auto nulls = std::make_shared<PoolBuffer>(pool); + auto null_bitmap = std::make_shared<PoolBuffer>(pool); data->Resize(length * sizeof(typename ArrayType::value_type)); - nulls->Resize(util::bytes_for_bits(length)); - return std::make_shared<ArrayType>(length, data, 10, nulls); + null_bitmap->Resize(util::bytes_for_bits(length)); + return std::make_shared<ArrayType>(length, data, 10, null_bitmap); } } // anonymous namespace http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/ipc/adapter.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 8a7d818..c79e846 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -75,7 +75,7 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes flatbuf::FieldNode(prim_arr->length(), prim_arr->null_count())); if (prim_arr->null_count() > 0) { - buffers->push_back(prim_arr->nulls()); + buffers->push_back(prim_arr->null_bitmap()); } else { // Push a dummy zero-length buffer, not to be copied buffers->push_back(std::make_shared<Buffer>(nullptr, 0)); @@ -230,13 +230,13 @@ class RowBatchReader::Impl { FieldMetadata field_meta = metadata_->field(field_index_++); if (IsPrimitive(type.get())) { - std::shared_ptr<Buffer> nulls; + std::shared_ptr<Buffer> null_bitmap; std::shared_ptr<Buffer> data; if (field_meta.null_count == 0) { - nulls = nullptr; + null_bitmap = nullptr; ++buffer_index_; } else { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &nulls)); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); } if (field_meta.length > 0) { RETURN_NOT_OK(GetBuffer(buffer_index_++, &data)); @@ -244,7 +244,7 @@ class RowBatchReader::Impl { data.reset(new Buffer(nullptr, 0)); } return MakePrimitiveArray(type, field_meta.length, data, - field_meta.null_count, nulls, out); + field_meta.null_count, null_bitmap, out); } else { return Status::NotImplemented("Non-primitive types not complete yet"); } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/ipc/ipc-adapter-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index d75998f..79b4d71 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -77,14 +77,14 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { test::rand_uniform_int(length, 0, 0, std::numeric_limits<int32_t>::max(), reinterpret_cast<int32_t*>(data->mutable_data())); - auto nulls = std::make_shared<PoolBuffer>(pool_); + auto null_bitmap = std::make_shared<PoolBuffer>(pool_); int null_bytes = util::bytes_for_bits(length); - ASSERT_OK(nulls->Resize(null_bytes)); - test::random_bytes(null_bytes, 0, nulls->mutable_data()); + ASSERT_OK(null_bitmap->Resize(null_bytes)); + test::random_bytes(null_bytes, 0, null_bitmap->mutable_data()); auto a0 = std::make_shared<Int32Array>(length, data); auto a1 = std::make_shared<Int32Array>(length, data, - test::bitmap_popcount(nulls->data(), length), nulls); + test::bitmap_popcount(null_bitmap->data(), length), null_bitmap); RowBatch batch(schema, length, {a0, a1}); http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index a9fb2a7..ea3ce5f 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -72,10 +72,10 @@ class TestBase : public ::testing::Test { template <typename ArrayType> std::shared_ptr<Array> MakePrimitive(int32_t length, int32_t null_count = 0) { auto data = std::make_shared<PoolBuffer>(pool_); - auto nulls = std::make_shared<PoolBuffer>(pool_); + auto null_bitmap = std::make_shared<PoolBuffer>(pool_); EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); - return std::make_shared<ArrayType>(length, data, 10, nulls); + EXPECT_OK(null_bitmap->Resize(util::bytes_for_bits(length))); + return std::make_shared<ArrayType>(length, data, 10, null_bitmap); } protected: @@ -104,17 +104,22 @@ std::shared_ptr<Buffer> to_buffer(const std::vector<T>& values) { values.size() * sizeof(T)); } -void random_nulls(int64_t n, double pct_null, std::vector<uint8_t>* nulls) { +void random_null_bitmap(int64_t n, double pct_null, std::vector<uint8_t>* null_bitmap) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - nulls->push_back(static_cast<uint8_t>(rng.NextDoubleFraction() > pct_null)); + if (rng.NextDoubleFraction() > pct_null) { + null_bitmap->push_back(1); + } else { + // null + null_bitmap->push_back(0); + } } } -void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) { +void random_null_bitmap(int64_t n, double pct_null, std::vector<bool>* null_bitmap) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - nulls->push_back(rng.NextDoubleFraction() > pct_null); + null_bitmap->push_back(rng.NextDoubleFraction() > pct_null); } } @@ -145,10 +150,10 @@ static inline int bitmap_popcount(const uint8_t* data, int length) { return count; } -static inline int null_count(const std::vector<uint8_t>& nulls) { +static inline int null_count(const std::vector<uint8_t>& valid_bytes) { int result = 0; - for (size_t i = 0; i < nulls.size(); ++i) { - if (nulls[i] > 0) { + for (size_t i = 0; i < valid_bytes.size(); ++i) { + if (valid_bytes[i] == 0) { ++result; } } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/construct.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 290decd..df2317c 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -75,12 +75,12 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ case Type::ENUM: \ - out->reset(new ArrayType(type, length, data, null_count, nulls)); \ + out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ return Status::OK(); Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, const std::shared_ptr<Buffer>& data, - int32_t null_count, const std::shared_ptr<Buffer>& nulls, + int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/construct.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 089c484..228faec 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -35,7 +35,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, const std::shared_ptr<Buffer>& data, - int32_t null_count, const std::shared_ptr<Buffer>& nulls, + int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out); } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/list.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 670ee4d..d64c06d 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -27,13 +27,13 @@ bool ListArray::EqualsExact(const ListArray& other) const { bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, length_ + 1); - bool equal_nulls = true; + bool equal_null_bitmap = true; if (null_count_ > 0) { - equal_nulls = nulls_->Equals(*other.nulls_, + equal_null_bitmap = null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } - if (!(equal_offsets && equal_nulls)) { + if (!(equal_offsets && equal_null_bitmap)) { return false; } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/list.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 141f762..72e20e9 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -39,8 +39,8 @@ class ListArray : public Array { ListArray(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets, const ArrayPtr& values, int32_t null_count = 0, - std::shared_ptr<Buffer> nulls = nullptr) : - Array(type, length, null_count, nulls) { + std::shared_ptr<Buffer> null_bitmap = nullptr) : + Array(type, length, null_count, null_bitmap) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast<const int32_t*>(offset_buf_->data()); @@ -109,17 +109,17 @@ class ListBuilder : public Int32Builder { // Vector append // - // If passed, null_bytes is of equal length to values, and any nonzero byte + // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot - Status Append(value_type* values, int32_t length, uint8_t* null_bytes = nullptr) { + Status Append(value_type* values, int32_t length, uint8_t* valid_bytes = nullptr) { if (length_ + length > capacity_) { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (null_bytes != nullptr) { - AppendNulls(null_bytes, length); + if (valid_bytes != nullptr) { + AppendNulls(valid_bytes, length); } length_ += length; @@ -136,9 +136,9 @@ class ListBuilder : public Int32Builder { } auto result = std::make_shared<Container>(type_, length_, values_, items, - null_count_, nulls_); + null_count_, null_bitmap_); - values_ = nulls_ = nullptr; + values_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return result; @@ -159,16 +159,13 @@ class ListBuilder : public Int32Builder { } if (is_null) { ++null_count_; - util::set_bit(null_bits_, length_); + } else { + util::set_bit(null_bitmap_data_, length_); } raw_buffer()[length_++] = value_builder_->length(); return Status::OK(); } - // Status Append(int32_t* offsets, int length, uint8_t* null_bytes) { - // return Int32Builder::Append(offsets, length, null_bytes); - // } - Status AppendNull() { return Append(true); } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 7eae8cd..10ba113 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -71,10 +71,10 @@ PRIMITIVE_TEST(BooleanType, BOOL, "bool"); TEST_F(TestBuilder, TestResize) { builder_->Init(10); - ASSERT_EQ(2, builder_->nulls()->size()); + ASSERT_EQ(2, builder_->null_bitmap()->size()); builder_->Resize(30); - ASSERT_EQ(4, builder_->nulls()->size()); + ASSERT_EQ(4, builder_->null_bitmap()->size()); } template <typename Attrs> @@ -99,7 +99,7 @@ class TestPrimitiveBuilder : public TestBuilder { void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); - test::random_nulls(N, pct_null, &nulls_); + test::random_null_bitmap(N, pct_null, &valid_bytes_); } void CheckNullable() { @@ -109,10 +109,11 @@ class TestPrimitiveBuilder : public TestBuilder { reinterpret_cast<uint8_t*>(draws_.data()), size * sizeof(T)); - auto ex_nulls = test::bytes_to_null_buffer(nulls_.data(), size); - int32_t ex_null_count = test::null_count(nulls_); + auto ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_.data(), size); + int32_t ex_null_count = test::null_count(valid_bytes_); - auto expected = std::make_shared<ArrayType>(size, ex_data, ex_null_count, ex_nulls); + auto expected = std::make_shared<ArrayType>(size, ex_data, ex_null_count, + ex_null_bitmap); std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>( builder_->Finish()); @@ -123,8 +124,8 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); - ASSERT_TRUE(result->EqualsExact(*expected.get())); ASSERT_EQ(ex_null_count, result->null_count()); + ASSERT_TRUE(result->EqualsExact(*expected.get())); } void CheckNonNullable() { @@ -154,7 +155,7 @@ class TestPrimitiveBuilder : public TestBuilder { shared_ptr<BuilderType> builder_nn_; vector<T> draws_; - vector<uint8_t> nulls_; + vector<uint8_t> valid_bytes_; }; #define PTYPE_DECL(CapType, c_type) \ @@ -210,7 +211,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) { } TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { - int size = 10000; + int size = 1000; for (int i = 0; i < size; ++i) { ASSERT_OK(this->builder_->AppendNull()); } @@ -218,17 +219,17 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { auto result = this->builder_->Finish(); for (int i = 0; i < size; ++i) { - ASSERT_TRUE(result->IsNull(i)); + ASSERT_TRUE(result->IsNull(i)) << i; } } TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { DECL_T(); - int size = 10000; + int size = 1000; vector<T>& draws = this->draws_; - vector<uint8_t>& nulls = this->nulls_; + vector<uint8_t>& valid_bytes = this->valid_bytes_; int64_t memory_before = this->pool_->bytes_allocated(); @@ -236,7 +237,11 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { int i; for (i = 0; i < size; ++i) { - ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + if (valid_bytes[i] > 0) { + ASSERT_OK(this->builder_->Append(draws[i])); + } else { + ASSERT_OK(this->builder_->AppendNull()); + } } do { @@ -249,17 +254,21 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); - int size = 10000; + const int size = 10000; vector<T>& draws = this->draws_; - vector<uint8_t>& nulls = this->nulls_; + vector<uint8_t>& valid_bytes = this->valid_bytes_; this->RandomData(size); int i; // Append the first 1000 for (i = 0; i < 1000; ++i) { - ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + if (valid_bytes[i] > 0) { + ASSERT_OK(this->builder_->Append(draws[i])); + } else { + ASSERT_OK(this->builder_->AppendNull()); + } ASSERT_OK(this->builder_nn_->Append(draws[i])); } @@ -271,7 +280,11 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { // Append the next 9000 for (i = 1000; i < size; ++i) { - ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + if (valid_bytes[i] > 0) { + ASSERT_OK(this->builder_->Append(draws[i])); + } else { + ASSERT_OK(this->builder_->AppendNull()); + } ASSERT_OK(this->builder_nn_->Append(draws[i])); } @@ -293,12 +306,12 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { this->RandomData(size); vector<T>& draws = this->draws_; - vector<uint8_t>& nulls = this->nulls_; + vector<uint8_t>& valid_bytes = this->valid_bytes_; // first slug int K = 1000; - ASSERT_OK(this->builder_->Append(draws.data(), K, nulls.data())); + ASSERT_OK(this->builder_->Append(draws.data(), K, valid_bytes.data())); ASSERT_OK(this->builder_nn_->Append(draws.data(), K)); ASSERT_EQ(1000, this->builder_->length()); @@ -308,7 +321,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { ASSERT_EQ(1024, this->builder_nn_->capacity()); // Append the next 9000 - ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, nulls.data() + K)); + ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, valid_bytes.data() + K)); ASSERT_OK(this->builder_nn_->Append(draws.data() + K, size - K)); ASSERT_EQ(size, this->builder_->length()); @@ -338,7 +351,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { ASSERT_EQ(cap, this->builder_->capacity()); ASSERT_EQ(cap * sizeof(T), this->builder_->buffer()->size()); - ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->nulls()->size()); + ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->null_bitmap()->size()); } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/primitive.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 32b8bfa..ecd5d68 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -26,13 +26,14 @@ namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, +PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, int value_size, const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& nulls) : - Array(type, length, null_count, nulls) { + const std::shared_ptr<Buffer>& null_bitmap) : + Array(type, length, null_count, null_bitmap) { data_ = data; raw_data_ = data == nullptr? nullptr : data_->data(); + value_size_ = value_size; } bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { @@ -41,12 +42,26 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { return false; } - bool equal_data = data_->Equals(*other.data_, length_); if (null_count_ > 0) { - return equal_data && - nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8); + bool equal_bitmap = null_bitmap_->Equals(*other.null_bitmap_, + util::ceil_byte(length_) / 8); + if (!equal_bitmap) { + return false; + } + + const uint8_t* this_data = raw_data_; + const uint8_t* other_data = other.raw_data_; + + for (int i = 0; i < length_; ++i) { + if (!IsNull(i) && memcmp(this_data, other_data, value_size_)) { + return false; + } + this_data += value_size_; + other_data += value_size_; + } + return true; } else { - return equal_data; + return data_->Equals(*other.data_, length_); } } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/primitive.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index e01027c..4eaff43 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -37,10 +37,10 @@ class MemoryPool; // Base class for fixed-size logical types class PrimitiveArray : public Array { public: - PrimitiveArray(const TypePtr& type, int32_t length, + PrimitiveArray(const TypePtr& type, int32_t length, int value_size, const std::shared_ptr<Buffer>& data, int32_t null_count = 0, - const std::shared_ptr<Buffer>& nulls = nullptr); + const std::shared_ptr<Buffer>& null_bitmap = nullptr); virtual ~PrimitiveArray() {} const std::shared_ptr<Buffer>& data() const { return data_;} @@ -51,31 +51,38 @@ class PrimitiveArray : public Array { protected: std::shared_ptr<Buffer> data_; const uint8_t* raw_data_; + int value_size_; }; -#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ -class NAME : public PrimitiveArray { \ - public: \ - using value_type = T; \ - using PrimitiveArray::PrimitiveArray; \ - NAME(int32_t length, const std::shared_ptr<Buffer>& data, \ - int32_t null_count = 0, \ - const std::shared_ptr<Buffer>& nulls = nullptr) : \ - PrimitiveArray(std::make_shared<TypeClass>(), length, data, \ - null_count, nulls) {} \ - \ - bool EqualsExact(const NAME& other) const { \ - return PrimitiveArray::EqualsExact( \ - *static_cast<const PrimitiveArray*>(&other)); \ - } \ - \ - const T* raw_data() const { \ - return reinterpret_cast<const T*>(raw_data_); \ - } \ - \ - T Value(int i) const { \ - return raw_data()[i]; \ - } \ +#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ +class NAME : public PrimitiveArray { \ + public: \ + using value_type = T; \ + NAME(const TypePtr& type, int32_t length, \ + const std::shared_ptr<Buffer>& data, \ + int32_t null_count = 0, \ + const std::shared_ptr<Buffer>& null_bitmap = nullptr) : \ + PrimitiveArray(std::make_shared<TypeClass>(), length, \ + sizeof(T), data, null_count, null_bitmap) {} \ + \ + NAME(int32_t length, const std::shared_ptr<Buffer>& data, \ + int32_t null_count = 0, \ + const std::shared_ptr<Buffer>& null_bitmap = nullptr) : \ + PrimitiveArray(std::make_shared<TypeClass>(), length, \ + sizeof(T), data, null_count, null_bitmap) {} \ + \ + bool EqualsExact(const NAME& other) const { \ + return PrimitiveArray::EqualsExact( \ + *static_cast<const PrimitiveArray*>(&other)); \ + } \ + \ + const T* raw_data() const { \ + return reinterpret_cast<const T*>(raw_data_); \ + } \ + \ + T Value(int i) const { \ + return raw_data()[i]; \ + } \ }; NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type, uint8_t); @@ -137,25 +144,22 @@ class PrimitiveBuilder : public ArrayBuilder { } // Scalar append - Status Append(value_type val, bool is_null = false) { + Status Append(value_type val) { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (is_null) { - ++null_count_; - util::set_bit(null_bits_, length_); - } + util::set_bit(null_bitmap_data_, length_); raw_buffer()[length_++] = val; return Status::OK(); } // Vector append // - // If passed, null_bytes is of equal length to values, and any nonzero byte + // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot Status Append(const value_type* values, int32_t length, - const uint8_t* null_bytes = nullptr) { + const uint8_t* valid_bytes = nullptr) { if (length_ + length > capacity_) { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); @@ -164,21 +168,26 @@ class PrimitiveBuilder : public ArrayBuilder { memcpy(raw_buffer() + length_, values, length * elsize_); } - if (null_bytes != nullptr) { - AppendNulls(null_bytes, length); + if (valid_bytes != nullptr) { + AppendNulls(valid_bytes, length); + } else { + for (int i = 0; i < length; ++i) { + util::set_bit(null_bitmap_data_, length_ + i); + } } length_ += length; return Status::OK(); } - // Write nulls as uint8_t* into pre-allocated memory - void AppendNulls(const uint8_t* null_bytes, int32_t length) { - // If null_bytes is all not null, then none of the values are null + // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + void AppendNulls(const uint8_t* valid_bytes, int32_t length) { + // If valid_bytes is all not null, then none of the values are null for (int i = 0; i < length; ++i) { - if (static_cast<bool>(null_bytes[i])) { + if (valid_bytes[i] == 0) { ++null_count_; - util::set_bit(null_bits_, length_ + i); + } else { + util::set_bit(null_bitmap_data_, length_ + i); } } } @@ -189,15 +198,15 @@ class PrimitiveBuilder : public ArrayBuilder { RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } ++null_count_; - util::set_bit(null_bits_, length_++); + ++length_; return Status::OK(); } std::shared_ptr<Array> Finish() override { std::shared_ptr<ArrayType> result = std::make_shared<ArrayType>( - type_, length_, values_, null_count_, nulls_); + type_, length_, values_, null_count_, null_bitmap_); - values_ = nulls_ = nullptr; + values_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return result; } http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 7dc3d68..b329b4f 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -77,7 +77,7 @@ class TestStringContainer : public ::testing::Test { void SetUp() { chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; offsets_ = {0, 1, 1, 1, 3, 6}; - nulls_ = {0, 0, 1, 0, 0}; + valid_bytes_ = {1, 1, 0, 1, 1}; expected_ = {"a", "", "", "bb", "ccc"}; MakeArray(); @@ -92,23 +92,23 @@ class TestStringContainer : public ::testing::Test { offsets_buf_ = test::to_buffer(offsets_); - nulls_buf_ = test::bytes_to_null_buffer(nulls_.data(), nulls_.size()); - null_count_ = test::null_count(nulls_); + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_.data(), valid_bytes_.size()); + null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared<StringArray>(length_, offsets_buf_, values_, - null_count_, nulls_buf_); + null_count_, null_bitmap_); } protected: std::vector<int32_t> offsets_; std::vector<char> chars_; - std::vector<uint8_t> nulls_; + std::vector<uint8_t> valid_bytes_; std::vector<std::string> expected_; std::shared_ptr<Buffer> value_buf_; std::shared_ptr<Buffer> offsets_buf_; - std::shared_ptr<Buffer> nulls_buf_; + std::shared_ptr<Buffer> null_bitmap_; int null_count_; int length_; @@ -143,12 +143,12 @@ TEST_F(TestStringContainer, TestListFunctions) { TEST_F(TestStringContainer, TestDestructor) { auto arr = std::make_shared<StringArray>(length_, offsets_buf_, values_, - null_count_, nulls_buf_); + null_count_, null_bitmap_); } TEST_F(TestStringContainer, TestGetString) { for (size_t i = 0; i < expected_.size(); ++i) { - if (nulls_[i]) { + if (valid_bytes_[i] == 0) { ASSERT_TRUE(strings_->IsNull(i)); } else { ASSERT_EQ(expected_[i], strings_->GetString(i)); @@ -197,7 +197,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { Done(); ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps * test::null_count(is_null), result_->null_count()); + ASSERT_EQ(reps, result_->null_count()); ASSERT_EQ(reps * 6, result_->values()->length()); int32_t length; http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/types/string.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 2b3fba5..fda722b 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -68,8 +68,8 @@ class StringArray : public ListArray { const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr<Buffer>& nulls = nullptr) : - ListArray(type, length, offsets, values, null_count, nulls) { + const std::shared_ptr<Buffer>& null_bitmap = nullptr) : + ListArray(type, length, offsets, values, null_count, null_bitmap) { // For convenience bytes_ = static_cast<UInt8Array*>(values.get()); raw_bytes_ = bytes_->raw_data(); @@ -79,9 +79,9 @@ class StringArray : public ListArray { const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr<Buffer>& nulls = nullptr) : + const std::shared_ptr<Buffer>& null_bitmap = nullptr) : StringArray(std::make_shared<StringType>(), length, offsets, values, - null_count, nulls) {} + null_count, null_bitmap) {} // Compute the pointer t const uint8_t* GetValue(int i, int32_t* out_length) const { http://git-wip-us.apache.org/repos/asf/arrow/blob/fbbee3d2/cpp/src/arrow/util/bit-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 1d2d1d5..08222d5 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -40,8 +40,14 @@ static inline int64_t ceil_2bytes(int64_t size) { return (size + 15) & ~15; } +static constexpr uint8_t BITMASK[] = {1, 2, 4, 8, 16, 32, 64, 128}; + static inline bool get_bit(const uint8_t* bits, int i) { - return bits[i / 8] & (1 << (i % 8)); + return bits[i / 8] & BITMASK[i % 8]; +} + +static inline bool bit_not_set(const uint8_t* bits, int i) { + return (bits[i / 8] & BITMASK[i % 8]) == 0; } static inline void set_bit(uint8_t* bits, int i) {