Repository: arrow Updated Branches: refs/heads/master e41802085 -> b88b69e20
ARROW-20: Add null_count_ member to array containers, remove nullable_ member Based off of ARROW-19. After some contemplation / discussion, I believe it would be better to track nullability at the schema metadata level (if at all!) rather than making it a property of the data structures. This allows the data containers to be "plain ol' data" and thus both nullable data with `null_count == 0` and non-nullable data (implicitly `null_count == 0`) can be treated as semantically equivalent in algorithms code. If it is deemed useful we can validate (cheaply) that physical data meets the metadata requirements (e.g. non-nullable type metadata cannot be associated with data containers having nulls). Author: Wes McKinney <w...@apache.org> Closes #9 from wesm/ARROW-20 and squashes the following commits: 98be016 [Wes McKinney] ARROW-20: Add null_count_ member to Array containers, remove nullable member Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/b88b69e2 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/b88b69e2 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/b88b69e2 Branch: refs/heads/master Commit: b88b69e204b59fa8f19cd20dcb6c091fe9bde3a9 Parents: e418020 Author: Wes McKinney <w...@apache.org> Authored: Thu Mar 3 14:56:31 2016 -0800 Committer: Wes McKinney <w...@apache.org> Committed: Thu Mar 3 14:56:31 2016 -0800 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/array-test.cc | 57 +++++++++--------- cpp/src/arrow/array.cc | 11 ++-- cpp/src/arrow/array.h | 37 ++++++++---- cpp/src/arrow/builder.cc | 35 +++++------ cpp/src/arrow/builder.h | 29 ++++----- cpp/src/arrow/test-util.h | 10 ++++ cpp/src/arrow/type.h | 12 ++-- cpp/src/arrow/types/collection.h | 2 +- cpp/src/arrow/types/datetime.h | 12 ++-- cpp/src/arrow/types/json.h | 4 +- cpp/src/arrow/types/list-test.cc | 12 +--- cpp/src/arrow/types/list.h | 46 ++++++++------- cpp/src/arrow/types/primitive-test.cc | 34 ++++++----- cpp/src/arrow/types/primitive.cc | 11 ++-- cpp/src/arrow/types/primitive.h | 95 +++++++++++++++++------------- cpp/src/arrow/types/string-test.cc | 31 ++++------ cpp/src/arrow/types/string.cc | 2 +- cpp/src/arrow/types/string.h | 43 +++++++------- cpp/src/arrow/types/struct-test.cc | 6 +- cpp/src/arrow/types/struct.h | 5 +- cpp/src/arrow/types/test-common.h | 4 +- cpp/src/arrow/types/union.h | 10 ++-- cpp/src/arrow/util/bit-util.cc | 4 +- cpp/src/arrow/util/bit-util.h | 4 +- 25 files changed, 265 insertions(+), 253 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d2c840a..f0eb73d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -92,7 +92,7 @@ endif() # For CMAKE_BUILD_TYPE=Release # -O3: Enable all compiler optimizations # -g: Enable symbols for profiler tools (TODO: remove for shipping) -set(CXX_FLAGS_DEBUG "-ggdb") +set(CXX_FLAGS_DEBUG "-ggdb -O0") set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG") http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 16afb9b..df827aa 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -20,7 +20,6 @@ #include <cstdint> #include <cstdlib> #include <memory> -#include <string> #include <vector> #include "arrow/array.h" @@ -32,60 +31,60 @@ #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" -using std::string; -using std::vector; - namespace arrow { static TypePtr int32 = TypePtr(new Int32Type()); -static TypePtr int32_nn = TypePtr(new Int32Type(false)); - class TestArray : public ::testing::Test { public: void SetUp() { pool_ = GetDefaultMemoryPool(); - - auto data = std::make_shared<PoolBuffer>(pool_); - auto nulls = std::make_shared<PoolBuffer>(pool_); - - ASSERT_OK(data->Resize(400)); - ASSERT_OK(nulls->Resize(128)); - - arr_.reset(new Int32Array(100, data, nulls)); } protected: MemoryPool* pool_; - std::unique_ptr<Int32Array> arr_; }; -TEST_F(TestArray, TestNullable) { - std::shared_ptr<Buffer> tmp = arr_->data(); - std::unique_ptr<Int32Array> arr_nn(new Int32Array(100, tmp)); +TEST_F(TestArray, TestNullCount) { + auto data = std::make_shared<PoolBuffer>(pool_); + auto nulls = std::make_shared<PoolBuffer>(pool_); - ASSERT_TRUE(arr_->nullable()); - ASSERT_FALSE(arr_nn->nullable()); + std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, nulls)); + ASSERT_EQ(10, arr->null_count()); + + std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data)); + ASSERT_EQ(0, arr_no_nulls->null_count()); } TEST_F(TestArray, TestLength) { - ASSERT_EQ(arr_->length(), 100); + auto data = std::make_shared<PoolBuffer>(pool_); + std::unique_ptr<Int32Array> arr(new Int32Array(100, data)); + ASSERT_EQ(arr->length(), 100); } TEST_F(TestArray, TestIsNull) { - vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 0, 1}; + std::vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 0, 1}; + int32_t null_count = 0; + for (uint8_t x : nulls) { + if (x > 0) ++null_count; + } - std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(), nulls.size()); + std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(), + nulls.size()); std::unique_ptr<Array> arr; - arr.reset(new Array(int32, nulls.size(), null_buf)); + arr.reset(new Array(int32, nulls.size(), null_count, null_buf)); + + ASSERT_EQ(null_count, arr->null_count()); + ASSERT_EQ(5, null_buf->size()); + + ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get())); - ASSERT_EQ(null_buf->size(), 5); for (size_t i = 0; i < nulls.size(); ++i) { ASSERT_EQ(static_cast<bool>(nulls[i]), arr->IsNull(i)); } http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 1726a2f..ee4ef66 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -17,6 +17,8 @@ #include "arrow/array.h" +#include <cstdint> + #include "arrow/util/buffer.h" namespace arrow { @@ -24,18 +26,17 @@ namespace arrow { // ---------------------------------------------------------------------- // Base array class -Array::Array(const TypePtr& type, int64_t length, +Array::Array(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr<Buffer>& nulls) { - Init(type, length, nulls); + Init(type, length, null_count, nulls); } -void Array::Init(const TypePtr& type, int64_t length, +void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr<Buffer>& nulls) { type_ = type; length_ = length; + null_count_ = null_count; nulls_ = nulls; - - nullable_ = type->nullable; if (nulls_) { null_bits_ = nulls_->data(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 0eaa28d..3d748c1 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -30,38 +30,49 @@ namespace arrow { class Buffer; // Immutable data array with some logical type and some length. Any memory is -// owned by the respective Buffer instance (or its parents). May or may not be -// nullable. +// owned by the respective Buffer instance (or its parents). // -// The base class only has a null array (if the data type is nullable) +// The base class is only required to have a nulls buffer if the null count is +// greater than 0 // // Any buffers used to initialize the array have their references "stolen". If // you wish to use the buffer beyond the lifetime of the array, you need to // explicitly increment its reference count class Array { public: - Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {} - Array(const TypePtr& type, int64_t length, + Array() : + null_count_(0), + length_(0), + nulls_(nullptr), + null_bits_(nullptr) {} + + Array(const TypePtr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr); virtual ~Array() {} - void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& nulls); + void Init(const TypePtr& type, int32_t length, int32_t null_count, + const std::shared_ptr<Buffer>& nulls); - // Determine if a slot if null. For inner loops. Does *not* boundscheck - bool IsNull(int64_t i) const { - return nullable_ && util::get_bit(null_bits_, i); + // Determine if a slot is null. For inner loops. Does *not* boundscheck + bool IsNull(int i) const { + return null_count_ > 0 && util::get_bit(null_bits_, i); } - int64_t length() const { return length_;} - bool nullable() const { return nullable_;} + int32_t length() const { return length_;} + int32_t null_count() const { return null_count_;} + const TypePtr& type() const { return type_;} TypeEnum type_enum() const { return type_->type;} + const std::shared_ptr<Buffer>& nulls() const { + return nulls_; + } + protected: TypePtr type_; - bool nullable_; - int64_t length_; + int32_t null_count_; + int32_t length_; std::shared_ptr<Buffer> nulls_; const uint8_t* null_bits_; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index cb85067..ba70add 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -25,34 +25,29 @@ namespace arrow { -Status ArrayBuilder::Init(int64_t capacity) { +Status ArrayBuilder::Init(int32_t capacity) { capacity_ = capacity; - - if (nullable_) { - int64_t to_alloc = util::ceil_byte(capacity) / 8; - nulls_ = std::make_shared<PoolBuffer>(pool_); - RETURN_NOT_OK(nulls_->Resize(to_alloc)); - null_bits_ = nulls_->mutable_data(); - memset(null_bits_, 0, to_alloc); - } + int32_t to_alloc = util::ceil_byte(capacity) / 8; + nulls_ = std::make_shared<PoolBuffer>(pool_); + RETURN_NOT_OK(nulls_->Resize(to_alloc)); + null_bits_ = nulls_->mutable_data(); + memset(null_bits_, 0, to_alloc); return Status::OK(); } -Status ArrayBuilder::Resize(int64_t new_bits) { - if (nullable_) { - int64_t new_bytes = util::ceil_byte(new_bits) / 8; - int64_t old_bytes = nulls_->size(); - RETURN_NOT_OK(nulls_->Resize(new_bytes)); - null_bits_ = nulls_->mutable_data(); - if (old_bytes < new_bytes) { - memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); - } +Status ArrayBuilder::Resize(int32_t new_bits) { + int32_t new_bytes = util::ceil_byte(new_bits) / 8; + int32_t old_bytes = nulls_->size(); + RETURN_NOT_OK(nulls_->Resize(new_bytes)); + null_bits_ = nulls_->mutable_data(); + if (old_bytes < new_bytes) { + memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); } return Status::OK(); } -Status ArrayBuilder::Advance(int64_t elements) { - if (nullable_ && length_ + elements > capacity_) { +Status ArrayBuilder::Advance(int32_t elements) { + if (length_ + elements > capacity_) { return Status::Invalid("Builder must be expanded"); } length_ += elements; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 456bb04..491b913 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -32,7 +32,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8; +static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8; // Base class for all data array builders class ArrayBuilder { @@ -40,8 +40,9 @@ class ArrayBuilder { explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : pool_(pool), type_(type), - nullable_(type_->nullable), - nulls_(nullptr), null_bits_(nullptr), + nulls_(nullptr), + null_count_(0), + null_bits_(nullptr), length_(0), capacity_(0) {} @@ -57,21 +58,21 @@ class ArrayBuilder { return children_.size(); } - int64_t length() const { return length_;} - int64_t capacity() const { return capacity_;} - bool nullable() const { return nullable_;} + int32_t length() const { return length_;} + int32_t null_count() const { return null_count_;} + int32_t capacity() const { return capacity_;} // Allocates requires memory at this level, but children need to be // initialized independently - Status Init(int64_t capacity); + Status Init(int32_t capacity); - // Resizes the nulls array (if nullable) - Status Resize(int64_t new_bits); + // Resizes the nulls array + Status Resize(int32_t new_bits); // For cases where raw data was memcpy'd into the internal buffers, allows us // to advance the length of the builder. It is your responsibility to use // this function responsibly. - Status Advance(int64_t elements); + Status Advance(int32_t elements); const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;} @@ -83,15 +84,15 @@ class ArrayBuilder { MemoryPool* pool_; TypePtr type_; - bool nullable_; - // If the type is not nullable, then null_ is nullptr after initialization + // When nulls are first appended to the builder, the null bitmap is allocated std::shared_ptr<PoolBuffer> nulls_; + int32_t null_count_; uint8_t* null_bits_; // Array length, so far. Also, the index of the next element to be added - int64_t length_; - int64_t capacity_; + int32_t length_; + int32_t capacity_; // Child value array builders. These are owned by this class std::vector<std::unique_ptr<ArrayBuilder> > children_; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 2233a4f..0898c8e 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -84,6 +84,16 @@ void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) { } } +static inline int null_count(const std::vector<uint8_t>& nulls) { + int result = 0; + for (size_t i = 0; i < nulls.size(); ++i) { + if (nulls[i] > 0) { + ++result; + } + } + return result; +} + std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) { std::shared_ptr<Buffer> out; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 220f99f..12f1960 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -57,11 +57,9 @@ struct LayoutType { // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. a // timestamp encoded as an int64) -// -// Any data type can be nullable enum class TypeEnum: char { - // A degerate NULL type represented as 0 bytes/bits + // A degenerate NULL type represented as 0 bytes/bits NA = 0, // Little-endian integer types @@ -138,14 +136,12 @@ enum class TypeEnum: char { struct DataType { TypeEnum type; - bool nullable; - explicit DataType(TypeEnum type, bool nullable = true) - : type(type), nullable(nullable) {} + explicit DataType(TypeEnum type) + : type(type) {} virtual bool Equals(const DataType* other) { - return (this == other) || (this->type == other->type && - this->nullable == other->nullable); + return this == other || this->type == other->type; } virtual std::string ToString() const = 0; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/collection.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 59ba614..094b63f 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -29,7 +29,7 @@ template <TypeEnum T> struct CollectionType : public DataType { std::vector<TypePtr> child_types_; - explicit CollectionType(bool nullable = true) : DataType(T, nullable) {} + CollectionType() : DataType(T) {} const TypePtr& child(int i) const { return child_types_[i]; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/datetime.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index b4d6252..d90883c 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,12 +31,12 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY, bool nullable = true) - : DataType(TypeEnum::DATE, nullable), + explicit DateType(Unit unit = Unit::DAY) + : DataType(TypeEnum::DATE), unit(unit) {} DateType(const DateType& other) - : DateType(other.unit, other.nullable) {} + : DateType(other.unit) {} static char const *name() { return "date"; @@ -58,12 +58,12 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) - : DataType(TypeEnum::TIMESTAMP, nullable), + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(TypeEnum::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) - : TimestampType(other.unit, other.nullable) {} + : TimestampType(other.unit) {} static char const *name() { return "timestamp"; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/json.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index 91fd132..6c2b097 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,8 +28,8 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true, bool nullable = true) - : DataType(TypeEnum::JSON_SCALAR, nullable), + explicit JSONScalar(bool dense = true) + : DataType(TypeEnum::JSON_SCALAR), dense(dense) {} }; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/list-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index abfc8a3..1d9ddbe 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -44,11 +44,7 @@ TEST(TypesTest, TestListType) { std::shared_ptr<DataType> vt = std::make_shared<UInt8Type>(); ListType list_type(vt); - ListType list_type_nn(vt, false); - ASSERT_EQ(list_type.type, TypeEnum::LIST); - ASSERT_TRUE(list_type.nullable); - ASSERT_FALSE(list_type_nn.nullable); ASSERT_EQ(list_type.name(), string("list")); ASSERT_EQ(list_type.ToString(), string("list<uint8>")); @@ -132,8 +128,8 @@ TEST_F(TestListBuilder, TestBasics) { Done(); - ASSERT_TRUE(result_->nullable()); - ASSERT_TRUE(result_->values()->nullable()); + ASSERT_EQ(1, result_->null_count()); + ASSERT_EQ(0, result_->values()->null_count()); ASSERT_EQ(3, result_->length()); vector<int32_t> ex_offsets = {0, 3, 3, 7}; @@ -153,10 +149,6 @@ TEST_F(TestListBuilder, TestBasics) { } } -TEST_F(TestListBuilder, TestBasicsNonNullable) { -} - - TEST_F(TestListBuilder, TestZeroLength) { // All buffers are null Done(); http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/list.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 4ca0f13..4190b53 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -40,8 +40,8 @@ struct ListType : public DataType { // List can contain any other logical value type TypePtr value_type; - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(TypeEnum::LIST, nullable), + explicit ListType(const TypePtr& value_type) + : DataType(TypeEnum::LIST), value_type(value_type) {} static char const *name() { @@ -56,21 +56,25 @@ class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} - ListArray(const TypePtr& type, int64_t length, std::shared_ptr<Buffer> offsets, - const ArrayPtr& values, std::shared_ptr<Buffer> nulls = nullptr) { - Init(type, length, offsets, values, nulls); + ListArray(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets, + const ArrayPtr& values, + int32_t null_count = 0, + std::shared_ptr<Buffer> nulls = nullptr) { + Init(type, length, offsets, values, null_count, nulls); } virtual ~ListArray() {} - void Init(const TypePtr& type, int64_t length, std::shared_ptr<Buffer> offsets, - const ArrayPtr& values, std::shared_ptr<Buffer> nulls = nullptr) { + void Init(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets, + const ArrayPtr& values, + int32_t null_count = 0, + std::shared_ptr<Buffer> nulls = nullptr) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast<const int32_t*>(offset_buf_->data()); values_ = values; - Array::Init(type, length, nulls); + Array::Init(type, length, null_count, nulls); } // Return a shared pointer in case the requestor desires to share ownership @@ -108,7 +112,7 @@ class ListBuilder : public Int32Builder { value_builder_.reset(value_builder); } - Status Init(int64_t elements) { + Status Init(int32_t elements) { // One more than requested. // // XXX: This is slightly imprecise, because we might trigger null mask @@ -116,7 +120,7 @@ class ListBuilder : public Int32Builder { return Int32Builder::Init(elements + 1); } - Status Resize(int64_t capacity) { + Status Resize(int32_t capacity) { // Need space for the end offset RETURN_NOT_OK(Int32Builder::Resize(capacity + 1)); @@ -129,18 +133,15 @@ class ListBuilder : public Int32Builder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(T* values, int64_t length, uint8_t* null_bytes = nullptr) { + Status Append(T* values, int32_t length, uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { - int64_t new_capacity = util::next_power2(length_ + length); + int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (nullable_ && null_bytes != nullptr) { - // If null_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - util::set_bit(null_bits_, length_ + i, static_cast<bool>(null_bytes[i])); - } + if (null_bytes != nullptr) { + AppendNulls(null_bytes, length); } length_ += length; @@ -159,9 +160,10 @@ class ListBuilder : public Int32Builder { raw_buffer()[length_] = child_values->length(); } - out->Init(type_, length_, values_, ArrayPtr(child_values), nulls_); + out->Init(type_, length_, values_, ArrayPtr(child_values), + null_count_, nulls_); values_ = nulls_ = nullptr; - capacity_ = length_ = 0; + capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -181,10 +183,10 @@ class ListBuilder : public Int32Builder { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (nullable_) { - util::set_bit(null_bits_, length_, is_null); + if (is_null) { + ++null_count_; + util::set_bit(null_bits_, length_); } - raw_buffer()[length_++] = value_builder_->length(); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 3484294..9363443 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -53,15 +53,12 @@ TEST(TypesTest, TestBytesType) { #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ - KLASS tp_nn(false); \ \ ASSERT_EQ(tp.type, TypeEnum::ENUM); \ ASSERT_EQ(tp.name(), string(NAME)); \ - ASSERT_TRUE(tp.nullable); \ - ASSERT_FALSE(tp_nn.nullable); \ \ - KLASS tp_copy = tp_nn; \ - ASSERT_FALSE(tp_copy.nullable); \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, TypeEnum::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -100,17 +97,16 @@ class TestPrimitiveBuilder : public TestBuilder { TestBuilder::SetUp(); type_ = Attrs::type(); - type_nn_ = Attrs::type(false); ArrayBuilder* tmp; ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_.reset(static_cast<BuilderType*>(tmp)); - ASSERT_OK(make_builder(pool_, type_nn_, &tmp)); + ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_nn_.reset(static_cast<BuilderType*>(tmp)); } - void RandomData(int64_t N, double pct_null = 0.1) { + void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); random_nulls(N, pct_null, &nulls_); } @@ -118,28 +114,33 @@ class TestPrimitiveBuilder : public TestBuilder { void CheckNullable() { ArrayType result; ArrayType expected; - int64_t size = builder_->length(); + int size = builder_->length(); - auto ex_data = std::make_shared<Buffer>(reinterpret_cast<uint8_t*>(draws_.data()), + auto ex_data = std::make_shared<Buffer>( + reinterpret_cast<uint8_t*>(draws_.data()), size * sizeof(T)); auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size); - expected.Init(size, ex_data, ex_nulls); + int32_t ex_null_count = null_count(nulls_); + + expected.Init(size, ex_data, ex_null_count, ex_nulls); ASSERT_OK(builder_->Transfer(&result)); // Builder is now reset ASSERT_EQ(0, builder_->length()); ASSERT_EQ(0, builder_->capacity()); + ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); ASSERT_TRUE(result.Equals(expected)); + ASSERT_EQ(ex_null_count, result.null_count()); } void CheckNonNullable() { ArrayType result; ArrayType expected; - int64_t size = builder_nn_->length(); + int size = builder_nn_->length(); auto ex_data = std::make_shared<Buffer>(reinterpret_cast<uint8_t*>(draws_.data()), size * sizeof(T)); @@ -153,6 +154,7 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(nullptr, builder_nn_->buffer()); ASSERT_TRUE(result.Equals(expected)); + ASSERT_EQ(0, result.null_count()); } protected: @@ -171,14 +173,14 @@ class TestPrimitiveBuilder : public TestBuilder { typedef CapType##Type Type; \ typedef c_type T; \ \ - static TypePtr type(bool nullable = true) { \ - return TypePtr(new Type(nullable)); \ + static TypePtr type() { \ + return TypePtr(new Type()); \ } #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ PTYPE_DECL(CapType, c_type); \ - static void draw(int64_t N, vector<T>* draws) { \ + static void draw(int N, vector<T>* draws) { \ randint<T>(N, LOWER, UPPER, draws); \ } \ } @@ -208,7 +210,7 @@ TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); TYPED_TEST(TestPrimitiveBuilder, TestInit) { DECL_T(); - int64_t n = 1000; + int n = 1000; ASSERT_OK(this->builder_->Init(n)); ASSERT_EQ(n, this->builder_->capacity()); ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size()); http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/primitive.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 2612e8c..c86260b 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -26,20 +26,23 @@ namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -void PrimitiveArray::Init(const TypePtr& type, int64_t length, +void PrimitiveArray::Init(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data, + int32_t null_count, const std::shared_ptr<Buffer>& nulls) { - Array::Init(type, length, nulls); + Array::Init(type, length, null_count, nulls); data_ = data; raw_data_ = data == nullptr? nullptr : data_->data(); } bool PrimitiveArray::Equals(const PrimitiveArray& other) const { if (this == &other) return true; - if (type_->nullable != other.type_->nullable) return false; + if (null_count_ != other.null_count_) { + return false; + } bool equal_data = data_->Equals(*other.data_, length_); - if (type_->nullable) { + if (null_count_ > 0) { return equal_data && nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8); } else { http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/primitive.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index c5ae0f7..aa8f351 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -36,24 +36,24 @@ class MemoryPool; template <typename Derived> struct PrimitiveType : public DataType { - explicit PrimitiveType(bool nullable = true) - : DataType(Derived::type_enum, nullable) {} + PrimitiveType() + : DataType(Derived::type_enum) {} virtual std::string ToString() const { return std::string(static_cast<const Derived*>(this)->name()); } }; -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ - static constexpr int size = SIZE; \ - \ - explicit TYPENAME(bool nullable = true) \ - : PrimitiveType<TYPENAME>(nullable) {} \ - \ - static const char* name() { \ - return NAME; \ +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ + static constexpr int size = SIZE; \ + \ + TYPENAME() \ + : PrimitiveType<TYPENAME>() {} \ + \ + static const char* name() { \ + return NAME; \ } @@ -64,7 +64,9 @@ class PrimitiveArray : public Array { virtual ~PrimitiveArray() {} - void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& data, + void Init(const TypePtr& type, int32_t length, + const std::shared_ptr<Buffer>& data, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr); const std::shared_ptr<Buffer>& data() const { return data_;} @@ -84,15 +86,17 @@ class PrimitiveArrayImpl : public PrimitiveArray { PrimitiveArrayImpl() : PrimitiveArray() {} - PrimitiveArrayImpl(int64_t length, const std::shared_ptr<Buffer>& data, + PrimitiveArrayImpl(int32_t length, const std::shared_ptr<Buffer>& data, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - Init(length, data, nulls); + Init(length, data, null_count, nulls); } - void Init(int64_t length, const std::shared_ptr<Buffer>& data, + void Init(int32_t length, const std::shared_ptr<Buffer>& data, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - TypePtr type(new TypeClass(nulls != nullptr)); - PrimitiveArray::Init(type, length, data, nulls); + TypePtr type(new TypeClass()); + PrimitiveArray::Init(type, length, data, null_count, nulls); } bool Equals(const PrimitiveArrayImpl& other) const { @@ -101,7 +105,7 @@ class PrimitiveArrayImpl : public PrimitiveArray { const T* raw_data() const { return reinterpret_cast<const T*>(raw_data_);} - T Value(int64_t i) const { + T Value(int i) const { return raw_data()[i]; } @@ -124,7 +128,7 @@ class PrimitiveBuilder : public ArrayBuilder { virtual ~PrimitiveBuilder() {} - Status Resize(int64_t capacity) { + Status Resize(int32_t capacity) { // XXX: Set floor size for now if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; @@ -135,27 +139,26 @@ class PrimitiveBuilder : public ArrayBuilder { } else { RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); RETURN_NOT_OK(values_->Resize(capacity * elsize_)); - capacity_ = capacity; } + capacity_ = capacity; return Status::OK(); } - Status Init(int64_t capacity) { + Status Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - values_ = std::make_shared<PoolBuffer>(pool_); return values_->Resize(capacity * elsize_); } - Status Reserve(int64_t elements) { + Status Reserve(int32_t elements) { if (length_ + elements > capacity_) { - int64_t new_capacity = util::next_power2(length_ + elements); + int32_t new_capacity = util::next_power2(length_ + elements); return Resize(new_capacity); } return Status::OK(); } - Status Advance(int64_t elements) { + Status Advance(int32_t elements) { return ArrayBuilder::Advance(elements); } @@ -165,8 +168,9 @@ class PrimitiveBuilder : public ArrayBuilder { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (nullable_) { - util::set_bit(null_bits_, length_, is_null); + if (is_null) { + ++null_count_; + util::set_bit(null_bits_, length_); } raw_buffer()[length_++] = val; return Status::OK(); @@ -176,42 +180,49 @@ class PrimitiveBuilder : public ArrayBuilder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(const T* values, int64_t length, uint8_t* null_bytes = nullptr) { + Status Append(const T* values, int32_t length, + const uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { - int64_t new_capacity = util::next_power2(length_ + length); + int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (nullable_ && null_bytes != nullptr) { - // If null_bytes is all not null, then none of the values are null - for (int64_t i = 0; i < length; ++i) { - util::set_bit(null_bits_, length_ + i, static_cast<bool>(null_bytes[i])); - } + if (null_bytes != nullptr) { + AppendNulls(null_bytes, length); } length_ += length; return Status::OK(); } - Status AppendNull() { - if (!nullable_) { - return Status::Invalid("not nullable"); + // Write nulls as uint8_t* into pre-allocated memory + void AppendNulls(const uint8_t* null_bytes, int32_t length) { + // If null_bytes is all not null, then none of the values are null + for (int i = 0; i < length; ++i) { + if (static_cast<bool>(null_bytes[i])) { + ++null_count_; + util::set_bit(null_bits_, length_ + i); + } } + } + + Status AppendNull() { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - util::set_bit(null_bits_, length_++, true); + ++null_count_; + util::set_bit(null_bits_, length_++); return Status::OK(); } // Initialize an array type instance with the results of this builder // Transfers ownership of all buffers Status Transfer(PrimitiveArray* out) { - out->Init(type_, length_, values_, nulls_); + out->Init(type_, length_, values_, null_count_, nulls_); values_ = nulls_ = nullptr; - capacity_ = length_ = 0; + capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -236,7 +247,7 @@ class PrimitiveBuilder : public ArrayBuilder { protected: std::shared_ptr<PoolBuffer> values_; - int64_t elsize_; + int elsize_; }; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index a2d87ea..e1dcebe 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -39,7 +39,6 @@ TEST(TypesTest, TestCharType) { CharType t1(5); ASSERT_EQ(t1.type, TypeEnum::CHAR); - ASSERT_TRUE(t1.nullable); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.ToString(), std::string("char(5)")); @@ -47,7 +46,6 @@ TEST(TypesTest, TestCharType) { // Test copy constructor CharType t2 = t1; ASSERT_EQ(t2.type, TypeEnum::CHAR); - ASSERT_TRUE(t2.nullable); ASSERT_EQ(t2.size, 5); } @@ -56,7 +54,6 @@ TEST(TypesTest, TestVarcharType) { VarcharType t1(5); ASSERT_EQ(t1.type, TypeEnum::VARCHAR); - ASSERT_TRUE(t1.nullable); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.physical_type.size, 6); @@ -65,19 +62,14 @@ TEST(TypesTest, TestVarcharType) { // Test copy constructor VarcharType t2 = t1; ASSERT_EQ(t2.type, TypeEnum::VARCHAR); - ASSERT_TRUE(t2.nullable); ASSERT_EQ(t2.size, 5); ASSERT_EQ(t2.physical_type.size, 6); } TEST(TypesTest, TestStringType) { StringType str; - StringType str_nn(false); - ASSERT_EQ(str.type, TypeEnum::STRING); ASSERT_EQ(str.name(), std::string("string")); - ASSERT_TRUE(str.nullable); - ASSERT_FALSE(str_nn.nullable); } // ---------------------------------------------------------------------- @@ -96,7 +88,7 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int64_t nchars = chars_.size(); + int nchars = chars_.size(); value_buf_ = to_buffer(chars_); values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); @@ -104,7 +96,9 @@ class TestStringContainer : public ::testing::Test { offsets_buf_ = to_buffer(offsets_); nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size()); - strings_.Init(length_, offsets_buf_, values_, nulls_buf_); + null_count_ = null_count(nulls_); + + strings_.Init(length_, offsets_buf_, values_, null_count_, nulls_buf_); } protected: @@ -118,7 +112,8 @@ class TestStringContainer : public ::testing::Test { std::shared_ptr<Buffer> offsets_buf_; std::shared_ptr<Buffer> nulls_buf_; - int64_t length_; + int null_count_; + int length_; ArrayPtr values_; StringArray strings_; @@ -127,7 +122,7 @@ class TestStringContainer : public ::testing::Test { TEST_F(TestStringContainer, TestArrayBasics) { ASSERT_EQ(length_, strings_.length()); - ASSERT_TRUE(strings_.nullable()); + ASSERT_EQ(1, strings_.null_count()); } TEST_F(TestStringContainer, TestType) { @@ -149,7 +144,8 @@ TEST_F(TestStringContainer, TestListFunctions) { TEST_F(TestStringContainer, TestDestructor) { - auto arr = std::make_shared<StringArray>(length_, offsets_buf_, values_, nulls_buf_); + auto arr = std::make_shared<StringArray>(length_, offsets_buf_, values_, + null_count_, nulls_buf_); } TEST_F(TestStringContainer, TestGetString) { @@ -189,10 +185,6 @@ class TestStringBuilder : public TestBuilder { std::unique_ptr<StringArray> result_; }; -TEST_F(TestStringBuilder, TestAttrs) { - ASSERT_FALSE(builder_->value_builder()->nullable()); -} - TEST_F(TestStringBuilder, TestScalarAppend) { std::vector<std::string> strings = {"a", "bb", "", "", "ccc"}; std::vector<uint8_t> is_null = {0, 0, 0, 1, 0}; @@ -212,10 +204,11 @@ TEST_F(TestStringBuilder, TestScalarAppend) { Done(); ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps * null_count(is_null), result_->null_count()); ASSERT_EQ(reps * 6, result_->values()->length()); - int64_t length; - int64_t pos = 0; + int32_t length; + int32_t pos = 0; for (int i = 0; i < N * reps; ++i) { if (is_null[i % N]) { ASSERT_TRUE(result_->IsNull(i)); http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/string.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index f3dfbdc..dea42e1 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -35,6 +35,6 @@ std::string VarcharType::ToString() const { return s.str(); } -TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type(false)); +TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type()); } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/string.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index d0690d9..0845625 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -40,13 +40,13 @@ struct CharType : public DataType { BytesType physical_type; - explicit CharType(int size, bool nullable = true) - : DataType(TypeEnum::CHAR, nullable), + explicit CharType(int size) + : DataType(TypeEnum::CHAR), size(size), physical_type(BytesType(size)) {} CharType(const CharType& other) - : CharType(other.size, other.nullable) {} + : CharType(other.size) {} virtual std::string ToString() const; }; @@ -58,12 +58,12 @@ struct VarcharType : public DataType { BytesType physical_type; - explicit VarcharType(int size, bool nullable = true) - : DataType(TypeEnum::VARCHAR, nullable), + explicit VarcharType(int size) + : DataType(TypeEnum::VARCHAR), size(size), physical_type(BytesType(size + 1)) {} VarcharType(const VarcharType& other) - : VarcharType(other.size, other.nullable) {} + : VarcharType(other.size) {} virtual std::string ToString() const; }; @@ -73,11 +73,11 @@ static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true) - : DataType(TypeEnum::STRING, nullable) {} + StringType() + : DataType(TypeEnum::STRING) {} StringType(const StringType& other) - : StringType(other.nullable) {} + : StringType() {} const LayoutPtr& physical_type() { return physical_string; @@ -98,17 +98,19 @@ class StringArray : public ListArray { public: StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {} - StringArray(int64_t length, const std::shared_ptr<Buffer>& offsets, + StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - Init(length, offsets, values, nulls); + Init(length, offsets, values, null_count, nulls); } - void Init(const TypePtr& type, int64_t length, + void Init(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - ListArray::Init(type, length, offsets, values, nulls); + ListArray::Init(type, length, offsets, values, null_count, nulls); // TODO: type validation for values array @@ -117,23 +119,24 @@ class StringArray : public ListArray { raw_bytes_ = bytes_->raw_data(); } - void Init(int64_t length, const std::shared_ptr<Buffer>& offsets, + void Init(int32_t length, const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - TypePtr type(new StringType(nulls != nullptr)); - Init(type, length, offsets, values, nulls); + TypePtr type(new StringType()); + Init(type, length, offsets, values, null_count, nulls); } // Compute the pointer t - const uint8_t* GetValue(int64_t i, int64_t* out_length) const { + const uint8_t* GetValue(int i, int32_t* out_length) const { int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; return raw_bytes_ + pos; } // Construct a std::string - std::string GetString(int64_t i) const { - int64_t nchars; + std::string GetString(int i) const { + int32_t nchars; const uint8_t* str = GetValue(i, &nchars); return std::string(reinterpret_cast<const char*>(str), nchars); } @@ -161,7 +164,7 @@ class StringBuilder : public ListBuilder { value.size()); } - Status Append(const uint8_t* value, int64_t length); + Status Append(const uint8_t* value, int32_t length); Status Append(const std::vector<std::string>& values, uint8_t* null_bytes); http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/struct-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 644b545..1a9fc6b 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -43,11 +43,7 @@ TEST(TestStructType, Basics) { vector<Field> fields = {f0, f1, f2}; - StructType struct_type(fields, true); - StructType struct_type_nn(fields, false); - - ASSERT_TRUE(struct_type.nullable); - ASSERT_FALSE(struct_type_nn.nullable); + StructType struct_type(fields); ASSERT_TRUE(struct_type.field(0).Equals(f0)); ASSERT_TRUE(struct_type.field(1).Equals(f1)); http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/struct.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 7d8885b..afba19a 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -29,9 +29,8 @@ namespace arrow { struct StructType : public DataType { std::vector<Field> fields_; - StructType(const std::vector<Field>& fields, - bool nullable = true) - : DataType(TypeEnum::STRUCT, nullable) { + explicit StructType(const std::vector<Field>& fields) + : DataType(TypeEnum::STRUCT) { fields_ = fields; } http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/test-common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 3ecb0de..1744efc 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -36,15 +36,13 @@ class TestBuilder : public ::testing::Test { void SetUp() { pool_ = GetDefaultMemoryPool(); type_ = TypePtr(new UInt8Type()); - type_nn_ = TypePtr(new UInt8Type(false)); builder_.reset(new UInt8Builder(pool_, type_)); - builder_nn_.reset(new UInt8Builder(pool_, type_nn_)); + builder_nn_.reset(new UInt8Builder(pool_, type_)); } protected: MemoryPool* pool_; TypePtr type_; - TypePtr type_nn_; unique_ptr<ArrayBuilder> builder_; unique_ptr<ArrayBuilder> builder_nn_; }; http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/types/union.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 7b66c3b..62a3d1c 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -33,9 +33,8 @@ class Buffer; struct DenseUnionType : public CollectionType<TypeEnum::DENSE_UNION> { typedef CollectionType<TypeEnum::DENSE_UNION> Base; - DenseUnionType(const std::vector<TypePtr>& child_types, - bool nullable = true) - : Base(nullable) { + explicit DenseUnionType(const std::vector<TypePtr>& child_types) : + Base() { child_types_ = child_types; } @@ -46,9 +45,8 @@ struct DenseUnionType : public CollectionType<TypeEnum::DENSE_UNION> { struct SparseUnionType : public CollectionType<TypeEnum::SPARSE_UNION> { typedef CollectionType<TypeEnum::SPARSE_UNION> Base; - SparseUnionType(const std::vector<TypePtr>& child_types, - bool nullable = true) - : Base(nullable) { + explicit SparseUnionType(const std::vector<TypePtr>& child_types) : + Base() { child_types_ = child_types; } http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/util/bit-util.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index dbac0a4..292cb33 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -25,7 +25,9 @@ namespace arrow { void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) { for (int i = 0; i < length; ++i) { - set_bit(bits, i, static_cast<bool>(bytes[i])); + if (static_cast<bool>(bytes[i])) { + set_bit(bits, i); + } } } http://git-wip-us.apache.org/repos/asf/arrow/blob/b88b69e2/cpp/src/arrow/util/bit-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 9ae6127..841f617 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -41,8 +41,8 @@ static inline bool get_bit(const uint8_t* bits, int i) { return bits[i / 8] & (1 << (i % 8)); } -static inline void set_bit(uint8_t* bits, int i, bool is_set) { - bits[i / 8] |= (1 << (i % 8)) * is_set; +static inline void set_bit(uint8_t* bits, int i) { + bits[i / 8] |= 1 << (i % 8); } static inline int64_t next_power2(int64_t n) {