ARROW-33: [C++] Implement zero-copy array slicing, integrate with IPC code paths
This turned into a bit of a refactoring bloodbath. I have sorted through most of the issues that this turned up, so I should have this all completely working within a day or so. There will be some follow-up work to do to polish things up. Closes #56. Author: Wes McKinney <[email protected]> Closes #322 from wesm/ARROW-33 and squashes the following commits: 61afe42 [Wes McKinney] Some API cleaning in builder.h 86511a3 [Wes McKinney] Python fixes, clang warning fixes 9a00870 [Wes McKinney] Make ApproxEquals for floating point arrays work on slices 2a13929 [Wes McKinney] Implement slicing IPC logic for dense array 4f08628 [Wes McKinney] Add missing include 1a6fcb4 [Wes McKinney] Make some more progress. dense union needs more work c6d814d [Wes McKinney] Work on adding sliced array support to IPC code path, with pretty printer and comparison fixed for sliced bitmaps, etc. Not all working yet b6c511e [Wes McKinney] Add RecordBatch::Slice convenience method 8900d58 [Wes McKinney] Add Slice tests for DictionaryArray. Test recomputing the null count 55454d7 [Wes McKinney] Add slice tests for struct, union, string, list a72653d [Wes McKinney] Rename offsets to value_offsets in list/binary/string/union for better clarity. Test Slice for primitive arrays 0355f71 [Wes McKinney] Implement CopyBitmap function a228b50 [Wes McKinney] Implement Slice methods on Array classes e502901 [Wes McKinney] Move null_count and offset as last two parameters of all array ctors. 
Implement/test bitmap set bit count with offset bae6922 [Wes McKinney] Temporary work on adding offset parameter to Array classes for slicing Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5439b715 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5439b715 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5439b715 Branch: refs/heads/master Commit: 5439b71586f4b0f9a36544b9e2417ee6ad7b48e8 Parents: 74bc4dd Author: Wes McKinney <[email protected]> Authored: Mon Feb 6 11:25:18 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Mon Feb 6 11:25:18 2017 -0500 ---------------------------------------------------------------------- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/array-dictionary-test.cc | 62 ++++-- cpp/src/arrow/array-list-test.cc | 36 +++- cpp/src/arrow/array-primitive-test.cc | 78 ++++++- cpp/src/arrow/array-string-test.cc | 90 ++++++-- cpp/src/arrow/array-struct-test.cc | 19 +- cpp/src/arrow/array-test.cc | 32 ++- cpp/src/arrow/array-union-test.cc | 67 ++++++ cpp/src/arrow/array.cc | 233 ++++++++++++++------- cpp/src/arrow/array.h | 265 +++++++++++++++--------- cpp/src/arrow/buffer.cc | 16 ++ cpp/src/arrow/buffer.h | 21 +- cpp/src/arrow/builder.cc | 64 +++--- cpp/src/arrow/builder.h | 21 +- cpp/src/arrow/column-test.cc | 14 +- cpp/src/arrow/compare.cc | 122 ++++++++--- cpp/src/arrow/io/file.cc | 4 +- cpp/src/arrow/io/hdfs.cc | 8 +- cpp/src/arrow/io/io-hdfs-test.cc | 10 +- cpp/src/arrow/io/io-memory-test.cc | 4 +- cpp/src/arrow/ipc/adapter.cc | 260 +++++++++++++++++++---- cpp/src/arrow/ipc/adapter.h | 8 +- cpp/src/arrow/ipc/ipc-adapter-test.cc | 52 ++++- cpp/src/arrow/ipc/ipc-json-test.cc | 21 +- cpp/src/arrow/ipc/json-integration-test.cc | 6 +- cpp/src/arrow/ipc/json-internal.cc | 37 ++-- cpp/src/arrow/ipc/stream.cc | 15 +- cpp/src/arrow/ipc/stream.h | 8 + cpp/src/arrow/ipc/test-common.h | 79 ++++--- cpp/src/arrow/pretty_print-test.cc | 6 +- 
cpp/src/arrow/pretty_print.cc | 53 +++-- cpp/src/arrow/table-test.cc | 26 +++ cpp/src/arrow/table.cc | 19 +- cpp/src/arrow/table.h | 4 + cpp/src/arrow/test-util.h | 43 +--- cpp/src/arrow/type.cc | 6 +- cpp/src/arrow/type.h | 8 +- cpp/src/arrow/type_traits.h | 9 + cpp/src/arrow/util/bit-util-test.cc | 62 +++++- cpp/src/arrow/util/bit-util.cc | 83 +++++++- cpp/src/arrow/util/bit-util.h | 45 ++++ cpp/src/arrow/util/logging.h | 4 +- cpp/src/arrow/util/macros.h | 2 +- python/CMakeLists.txt | 2 +- python/pyarrow/includes/libarrow.pxd | 4 +- python/pyarrow/scalar.pyx | 2 +- python/src/pyarrow/adapters/builtin.cc | 2 +- python/src/pyarrow/adapters/pandas.cc | 20 +- python/src/pyarrow/io.cc | 21 +- 49 files changed, 1524 insertions(+), 550 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b002bb7..824ced1 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -53,6 +53,7 @@ ADD_ARROW_TEST(array-list-test) ADD_ARROW_TEST(array-primitive-test) ADD_ARROW_TEST(array-string-test) ADD_ARROW_TEST(array-struct-test) +ADD_ARROW_TEST(array-union-test) ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(column-test) ADD_ARROW_TEST(memory_pool-test) http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-dictionary-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-dictionary-test.cc b/cpp/src/arrow/array-dictionary-test.cc index 1a0d49a..61381b7 100644 --- a/cpp/src/arrow/array-dictionary-test.cc +++ b/cpp/src/arrow/array-dictionary-test.cc @@ -34,7 +34,7 @@ namespace arrow { TEST(TestDictionary, Basics) { std::vector<int32_t> values = {100, 1000, 10000, 100000}; std::shared_ptr<Array> dict; - ArrayFromVector<Int32Type, 
int32_t>(int32(), values, &dict); + ArrayFromVector<Int32Type, int32_t>(values, &dict); std::shared_ptr<DictionaryType> type1 = std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict)); @@ -54,45 +54,67 @@ TEST(TestDictionary, Equals) { std::shared_ptr<Array> dict; std::vector<std::string> dict_values = {"foo", "bar", "baz"}; - ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict); + ArrayFromVector<StringType, std::string>(dict_values, &dict); std::shared_ptr<DataType> dict_type = dictionary(int16(), dict); std::shared_ptr<Array> dict2; std::vector<std::string> dict2_values = {"foo", "bar", "baz", "qux"}; - ArrayFromVector<StringType, std::string>(utf8(), dict2_values, &dict2); + ArrayFromVector<StringType, std::string>(dict2_values, &dict2); std::shared_ptr<DataType> dict2_type = dictionary(int16(), dict2); std::shared_ptr<Array> indices; std::vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0}; - ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices_values, &indices); + ArrayFromVector<Int16Type, int16_t>(is_valid, indices_values, &indices); std::shared_ptr<Array> indices2; std::vector<int16_t> indices2_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices2_values, &indices2); + ArrayFromVector<Int16Type, int16_t>(is_valid, indices2_values, &indices2); std::shared_ptr<Array> indices3; std::vector<int16_t> indices3_values = {1, 1, 0, 0, 2, 0}; - ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices3_values, &indices3); + ArrayFromVector<Int16Type, int16_t>(is_valid, indices3_values, &indices3); - auto arr = std::make_shared<DictionaryArray>(dict_type, indices); - auto arr2 = std::make_shared<DictionaryArray>(dict_type, indices2); - auto arr3 = std::make_shared<DictionaryArray>(dict2_type, indices); - auto arr4 = std::make_shared<DictionaryArray>(dict_type, indices3); + auto array = std::make_shared<DictionaryArray>(dict_type, indices); + auto array2 = 
std::make_shared<DictionaryArray>(dict_type, indices2); + auto array3 = std::make_shared<DictionaryArray>(dict2_type, indices); + auto array4 = std::make_shared<DictionaryArray>(dict_type, indices3); - ASSERT_TRUE(arr->Equals(arr)); + ASSERT_TRUE(array->Equals(array)); // Equal, because the unequal index is masked by null - ASSERT_TRUE(arr->Equals(arr2)); + ASSERT_TRUE(array->Equals(array2)); // Unequal dictionaries - ASSERT_FALSE(arr->Equals(arr3)); + ASSERT_FALSE(array->Equals(array3)); // Unequal indices - ASSERT_FALSE(arr->Equals(arr4)); + ASSERT_FALSE(array->Equals(array4)); // RangeEquals - ASSERT_TRUE(arr->RangeEquals(3, 6, 3, arr4)); - ASSERT_FALSE(arr->RangeEquals(1, 3, 1, arr4)); + ASSERT_TRUE(array->RangeEquals(3, 6, 3, array4)); + ASSERT_FALSE(array->RangeEquals(1, 3, 1, array4)); + + // ARROW-33 Test slices + const int size = array->length(); + + std::shared_ptr<Array> slice, slice2; + slice = array->Array::Slice(2); + slice2 = array->Array::Slice(2); + ASSERT_EQ(size - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice)); + + // Chained slices + slice2 = array->Array::Slice(1)->Array::Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(1, 3); + slice2 = array->Slice(1, 3); + ASSERT_EQ(3, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 4, 0, slice)); } TEST(TestDictionary, Validate) { @@ -100,20 +122,20 @@ TEST(TestDictionary, Validate) { std::shared_ptr<Array> dict; std::vector<std::string> dict_values = {"foo", "bar", "baz"}; - ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict); + ArrayFromVector<StringType, std::string>(dict_values, &dict); std::shared_ptr<DataType> dict_type = dictionary(int16(), dict); std::shared_ptr<Array> indices; std::vector<uint8_t> indices_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector<UInt8Type, uint8_t>(uint8(), is_valid, indices_values, &indices); + 
ArrayFromVector<UInt8Type, uint8_t>(is_valid, indices_values, &indices); std::shared_ptr<Array> indices2; std::vector<float> indices2_values = {1., 2., 0., 0., 2., 0.}; - ArrayFromVector<FloatType, float>(float32(), is_valid, indices2_values, &indices2); + ArrayFromVector<FloatType, float>(is_valid, indices2_values, &indices2); std::shared_ptr<Array> indices3; std::vector<int64_t> indices3_values = {1, 2, 0, 0, 2, 0}; - ArrayFromVector<Int64Type, int64_t>(int64(), is_valid, indices3_values, &indices3); + ArrayFromVector<Int64Type, int64_t>(is_valid, indices3_values, &indices3); std::shared_ptr<Array> arr = std::make_shared<DictionaryArray>(dict_type, indices); std::shared_ptr<Array> arr2 = std::make_shared<DictionaryArray>(dict_type, indices2); http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-list-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc index 8e4d319..a144fd9 100644 --- a/cpp/src/arrow/array-list-test.cc +++ b/cpp/src/arrow/array-list-test.cc @@ -90,9 +90,9 @@ TEST_F(TestListBuilder, Equality) { Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get()); std::shared_ptr<Array> array, equal_array, unequal_array; - vector<int32_t> equal_offsets = {0, 1, 2, 5}; - vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2}; - vector<int32_t> unequal_offsets = {0, 1, 4}; + vector<int32_t> equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10}; + vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6}; + vector<int32_t> unequal_offsets = {0, 1, 4, 7}; vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5}; // setup two equal arrays @@ -122,7 +122,27 @@ TEST_F(TestListBuilder, Equality) { EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); - EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array)); 
+ + // Check with slices, ARROW-33 + std::shared_ptr<Array> slice, slice2; + + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(array->length() - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(1)->Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(1, 4); + slice2 = array->Slice(1, 4); + ASSERT_EQ(4, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); } TEST_F(TestListBuilder, TestResize) {} @@ -137,9 +157,9 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_EQ(0, result_->raw_offsets()[0]); - ASSERT_EQ(0, result_->offset(1)); - ASSERT_EQ(0, result_->offset(2)); + ASSERT_EQ(0, result_->raw_value_offsets()[0]); + ASSERT_EQ(0, result_->value_offset(1)); + ASSERT_EQ(0, result_->value_offset(2)); Int32Array* values = static_cast<Int32Array*>(result_->values().get()); ASSERT_EQ(0, values->length()); @@ -154,7 +174,7 @@ void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& valu ASSERT_EQ(3, result->length()); vector<int32_t> ex_offsets = {0, 3, 3, 7}; for (size_t i = 0; i < ex_offsets.size(); ++i) { - ASSERT_EQ(ex_offsets[i], result->offset(i)); + ASSERT_EQ(ex_offsets[i], result->value_offset(i)); } for (int i = 0; i < result->length(); ++i) { http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc index c839fb9..a20fdbf 100644 --- a/cpp/src/arrow/array-primitive-test.cc +++ b/cpp/src/arrow/array-primitive-test.cc @@ -121,7 +121,7 @@ class TestPrimitiveBuilder : public TestBuilder { } auto expected = - std::make_shared<ArrayType>(size, ex_data, ex_null_count, 
ex_null_bitmap); + std::make_shared<ArrayType>(size, ex_data, ex_null_bitmap, ex_null_count); std::shared_ptr<Array> out; ASSERT_OK(builder->Finish(&out)); @@ -217,7 +217,7 @@ void TestPrimitiveBuilder<PBoolean>::Check( } auto expected = - std::make_shared<BooleanArray>(size, ex_data, ex_null_count, ex_null_bitmap); + std::make_shared<BooleanArray>(size, ex_data, ex_null_bitmap, ex_null_count); std::shared_ptr<Array> out; ASSERT_OK(builder->Finish(&out)); @@ -235,15 +235,14 @@ void TestPrimitiveBuilder<PBoolean>::Check( for (int i = 0; i < result->length(); ++i) { if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } - bool actual = BitUtil::GetBit(result->raw_data(), i); + bool actual = BitUtil::GetBit(result->data()->data(), i); ASSERT_EQ(static_cast<bool>(draws_[i]), actual) << i; } ASSERT_TRUE(result->Equals(*expected)); } typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16, - PInt32, PInt64, PFloat, PDouble> - Primitives; + PInt32, PInt64, PFloat, PDouble> Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); @@ -347,6 +346,39 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) { array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array)); } +TYPED_TEST(TestPrimitiveBuilder, SliceEquality) { + DECL_T(); + + const int size = 1000; + this->RandomData(size); + vector<T>& draws = this->draws_; + vector<uint8_t>& valid_bytes = this->valid_bytes_; + auto builder = this->builder_.get(); + + std::shared_ptr<Array> array; + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array)); + + std::shared_ptr<Array> slice, slice2; + + slice = array->Slice(5); + slice2 = array->Slice(5); + ASSERT_EQ(size - 5, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, array->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(2)->Slice(3); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(5, 10); + slice2 = array->Slice(5, 10); + 
ASSERT_EQ(10, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, 15, 0, slice)); +} + TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); @@ -473,4 +505,40 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity()); } +template <typename TYPE> +void CheckSliceApproxEquals() { + using T = typename TYPE::c_type; + + const int kSize = 50; + std::vector<T> draws1; + std::vector<T> draws2; + + const uint32_t kSeed = 0; + test::random_real<T>(kSize, kSeed, 0, 100, &draws1); + test::random_real<T>(kSize, kSeed + 1, 0, 100, &draws2); + + // Make the draws equal in the sliced segment, but unequal elsewhere (to + // catch not using the slice offset) + for (int i = 10; i < 30; ++i) { + draws2[i] = draws1[i]; + } + + std::vector<bool> is_valid; + test::random_is_valid(kSize, 0.1, &is_valid); + + std::shared_ptr<Array> array1, array2; + ArrayFromVector<TYPE, T>(is_valid, draws1, &array1); + ArrayFromVector<TYPE, T>(is_valid, draws2, &array2); + + std::shared_ptr<Array> slice1 = array1->Slice(10, 20); + std::shared_ptr<Array> slice2 = array2->Slice(10, 20); + + ASSERT_TRUE(slice1->ApproxEquals(slice2)); +} + +TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) { + CheckSliceApproxEquals<FloatType>(); + CheckSliceApproxEquals<DoubleType>(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc index 5ea384a..8b7eb41 100644 --- a/cpp/src/arrow/array-string-test.cc +++ b/cpp/src/arrow/array-string-test.cc @@ -27,6 +27,7 @@ #include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" namespace arrow { @@ -70,7 +71,7 @@ class TestStringArray : public ::testing::Test { null_count_ = 
test::null_count(valid_bytes_); strings_ = std::make_shared<StringArray>( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } protected: @@ -114,7 +115,7 @@ TEST_F(TestStringArray, TestListFunctions) { TEST_F(TestStringArray, TestDestructor) { auto arr = std::make_shared<StringArray>( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } TEST_F(TestStringArray, TestGetString) { @@ -133,9 +134,9 @@ TEST_F(TestStringArray, TestEmptyStringComparison) { length_ = offsets_.size() - 1; auto strings_a = std::make_shared<StringArray>( - length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + length_, offsets_buf_, nullptr, null_bitmap_, null_count_); auto strings_b = std::make_shared<StringArray>( - length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + length_, offsets_buf_, nullptr, null_bitmap_, null_count_); ASSERT_TRUE(strings_a->Equals(strings_b)); } @@ -146,8 +147,7 @@ class TestStringBuilder : public TestBuilder { public: void SetUp() { TestBuilder::SetUp(); - type_ = TypePtr(new StringType()); - builder_.reset(new StringBuilder(pool_, type_)); + builder_.reset(new StringBuilder(pool_)); } void Done() { @@ -159,8 +159,6 @@ class TestStringBuilder : public TestBuilder { } protected: - TypePtr type_; - std::unique_ptr<StringBuilder> builder_; std::shared_ptr<StringArray> result_; }; @@ -195,7 +193,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { } else { ASSERT_FALSE(result_->IsNull(i)); result_->GetValue(i, &length); - ASSERT_EQ(pos, result_->offset(i)); + ASSERT_EQ(pos, result_->value_offset(i)); ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); ASSERT_EQ(strings[i % N], result_->GetString(i)); @@ -232,7 +230,7 @@ class TestBinaryArray : public ::testing::Test { null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared<BinaryArray>( - length_, offsets_buf_, value_buf_, 
null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } protected: @@ -276,7 +274,7 @@ TEST_F(TestBinaryArray, TestListFunctions) { TEST_F(TestBinaryArray, TestDestructor) { auto arr = std::make_shared<BinaryArray>( - length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_bitmap_, null_count_); } TEST_F(TestBinaryArray, TestGetValue) { @@ -306,8 +304,8 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) { ASSERT_OK(builder.Finish(&left_arr)); const BinaryArray& left = static_cast<const BinaryArray&>(*left_arr); - std::shared_ptr<Array> right = std::make_shared<BinaryArray>( - left.length(), left.offsets(), nullptr, left.null_count(), left.null_bitmap()); + std::shared_ptr<Array> right = std::make_shared<BinaryArray>(left.length(), + left.value_offsets(), nullptr, left.null_bitmap(), left.null_count()); ASSERT_TRUE(left.Equals(right)); ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right)); @@ -317,8 +315,7 @@ class TestBinaryBuilder : public TestBuilder { public: void SetUp() { TestBuilder::SetUp(); - type_ = TypePtr(new BinaryType()); - builder_.reset(new BinaryBuilder(pool_, type_)); + builder_.reset(new BinaryBuilder(pool_)); } void Done() { @@ -330,8 +327,6 @@ class TestBinaryBuilder : public TestBuilder { } protected: - TypePtr type_; - std::unique_ptr<BinaryBuilder> builder_; std::shared_ptr<BinaryArray> result_; }; @@ -348,8 +343,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { if (is_null[i]) { builder_->AppendNull(); } else { - builder_->Append( - reinterpret_cast<const uint8_t*>(strings[i].data()), strings[i].size()); + builder_->Append(strings[i]); } } } @@ -377,4 +371,62 @@ TEST_F(TestBinaryBuilder, TestZeroLength) { Done(); } +// ---------------------------------------------------------------------- +// Slice tests + +template <typename TYPE> +void CheckSliceEquality() { + using Traits = TypeTraits<TYPE>; + using BuilderType = typename Traits::BuilderType; + + 
BuilderType builder(default_memory_pool()); + + std::vector<std::string> strings = {"foo", "", "bar", "baz", "qux", ""}; + std::vector<uint8_t> is_null = {0, 1, 0, 1, 0, 0}; + + int N = strings.size(); + int reps = 10; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder.AppendNull(); + } else { + builder.Append(strings[i]); + } + } + } + + std::shared_ptr<Array> array; + ASSERT_OK(builder.Finish(&array)); + + std::shared_ptr<Array> slice, slice2; + + slice = array->Slice(5); + slice2 = array->Slice(5); + ASSERT_EQ(N * reps - 5, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, slice->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(2)->Slice(3); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(5, 20); + slice2 = array->Slice(5, 20); + ASSERT_EQ(20, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice)); +} + +TEST_F(TestBinaryArray, TestSliceEquality) { + CheckSliceEquality<BinaryType>(); +} + +TEST_F(TestStringArray, TestSliceEquality) { + CheckSliceEquality<BinaryType>(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-struct-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc index 5827c39..f4e7409 100644 --- a/cpp/src/arrow/array-struct-test.cc +++ b/cpp/src/arrow/array-struct-test.cc @@ -75,7 +75,7 @@ void ValidateBasicStructArray(const StructArray* result, ASSERT_EQ(4, list_char_arr->length()); ASSERT_EQ(10, list_char_arr->values()->length()); for (size_t i = 0; i < list_offsets.size(); ++i) { - ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]); + ASSERT_EQ(list_offsets[i], list_char_arr->raw_value_offsets()[i]); } for (size_t i = 0; i < list_values.size(); ++i) { ASSERT_EQ(list_values[i], 
char_arr->Value(i)); @@ -381,6 +381,23 @@ TEST_F(TestStructBuilder, TestEquality) { EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array)); EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array)); EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array)); + + // ARROW-33 Slice / equality + std::shared_ptr<Array> slice, slice2; + + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(array->length() - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice)); + + slice = array->Slice(1, 2); + slice2 = array->Slice(1, 2); + ASSERT_EQ(2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 3, 0, slice)); } TEST_F(TestStructBuilder, TestZeroLength) { http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index a1d8fdf..45130d8 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -43,7 +43,7 @@ TEST_F(TestArray, TestNullCount) { auto data = std::make_shared<PoolBuffer>(pool_); auto null_bitmap = std::make_shared<PoolBuffer>(pool_); - std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, null_bitmap)); + std::unique_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10)); ASSERT_EQ(10, arr->null_count()); std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data)); @@ -67,7 +67,7 @@ std::shared_ptr<Array> MakeArrayFromValidBytes( } std::shared_ptr<Array> arr( - new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf)); + new Int32Array(v.size(), value_builder.Finish(), null_buf, null_count)); return arr; } @@ -87,6 +87,32 @@ TEST_F(TestArray, TestEquality) { EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); } +TEST_F(TestArray, SliceRecomputeNullCount) { + std::vector<uint8_t> valid_bytes = 
{1, 0, 1, 1, 0, 1, 0, 0}; + + auto array = MakeArrayFromValidBytes(valid_bytes, pool_); + + ASSERT_EQ(4, array->null_count()); + + auto slice = array->Slice(1, 4); + ASSERT_EQ(2, slice->null_count()); + + slice = array->Slice(4); + ASSERT_EQ(1, slice->null_count()); + + slice = array->Slice(0); + ASSERT_EQ(4, slice->null_count()); + + // No bitmap, compute 0 + std::shared_ptr<MutableBuffer> data; + const int kBufferSize = 64; + ASSERT_OK(AllocateBuffer(pool_, kBufferSize, &data)); + memset(data->mutable_data(), 0, kBufferSize); + + auto arr = std::make_shared<Int32Array>(16, data, nullptr, -1); + ASSERT_EQ(0, arr->null_count()); +} + TEST_F(TestArray, TestIsNull) { // clang-format off std::vector<uint8_t> null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, @@ -102,7 +128,7 @@ TEST_F(TestArray, TestIsNull) { std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(null_bitmap); std::unique_ptr<Array> arr; - arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf)); + arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count)); ASSERT_EQ(null_count, arr->null_count()); ASSERT_EQ(5, null_buf->size()); http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-union-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc new file mode 100644 index 0000000..eb9bd7d --- /dev/null +++ b/cpp/src/arrow/array-union-test.cc @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Tests for UnionArray + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/ipc/test-common.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +TEST(TestUnionArrayAdHoc, TestSliceEquals) { + std::shared_ptr<RecordBatch> batch; + ASSERT_OK(ipc::MakeUnion(&batch)); + + const int size = batch->num_rows(); + + auto CheckUnion = [&size](std::shared_ptr<Array> array) { + std::shared_ptr<Array> slice, slice2; + slice = array->Slice(2); + slice2 = array->Slice(2); + ASSERT_EQ(size - 2, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice)); + + // Chained slices + slice2 = array->Slice(1)->Slice(1); + ASSERT_TRUE(slice->Equals(slice2)); + + slice = array->Slice(1, 5); + slice2 = array->Slice(1, 5); + ASSERT_EQ(5, slice->length()); + + ASSERT_TRUE(slice->Equals(slice2)); + ASSERT_TRUE(array->RangeEquals(1, 6, 0, slice)); + }; + + CheckUnion(batch->column(1)); + CheckUnion(batch->column(2)); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 6fc7fb6..f84023e 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -17,6 +17,7 @@ #include "arrow/array.h" +#include <algorithm> #include <cstdint> 
#include <cstring> #include <sstream> @@ -30,28 +31,37 @@ namespace arrow { -Status GetEmptyBitmap( - MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result) { - auto buffer = std::make_shared<PoolBuffer>(pool); - RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length))); - memset(buffer->mutable_data(), 0, buffer->size()); - - *result = buffer; - return Status::OK(); -} +// When slicing, we do not know the null count of the sliced range without +// doing some computation. To avoid doing this eagerly, we set the null count +// to -1 (any negative number will do). When Array::null_count is called the +// first time, the null count will be computed. See ARROW-33 +constexpr int32_t kUnknownNullCount = -1; // ---------------------------------------------------------------------- // Base array class -Array::Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap) { - type_ = type; - length_ = length; - null_count_ = null_count; - null_bitmap_ = null_bitmap; +Array::Array(const std::shared_ptr<DataType>& type, int32_t length, + const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset) + : type_(type), + length_(length), + offset_(offset), + null_count_(null_count), + null_bitmap_(null_bitmap), + null_bitmap_data_(nullptr) { if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); } } +int32_t Array::null_count() const { + if (null_count_ < 0) { + if (null_bitmap_) { + null_count_ = CountSetBits(null_bitmap_data_, offset_, length_); + } else { + null_count_ = 0; + } + } + return null_count_; +} + bool Array::Equals(const Array& arr) const { bool are_equal = false; Status error = ArrayEquals(*this, arr, &are_equal); @@ -86,10 +96,32 @@ bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_ return are_equal; } +// Last two parameters are in-out parameters +static inline void ConformSliceParams( + int32_t array_offset, int32_t 
array_length, int32_t* offset, int32_t* length) { + DCHECK_LE(*offset, array_length); + DCHECK_GE(offset, 0); + *length = std::min(array_length - *offset, *length); + *offset = array_offset + *offset; +} + +std::shared_ptr<Array> Array::Slice(int32_t offset) const { + int32_t slice_length = length_ - offset; + return Slice(offset, slice_length); +} + Status Array::Validate() const { return Status::OK(); } +NullArray::NullArray(int32_t length) : Array(null(), length, nullptr, length) {} + +std::shared_ptr<Array> NullArray::Slice(int32_t offset, int32_t length) const { + DCHECK_LE(offset, length_); + length = std::min(length_ - offset, length); + return std::make_shared<NullArray>(length); +} + Status NullArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } @@ -98,9 +130,9 @@ Status NullArray::Accept(ArrayVisitor* visitor) const { // Primitive array base PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap) - : Array(type, length, null_count, null_bitmap) { + const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap, + int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset) { data_ = data; raw_data_ = data == nullptr ? 
nullptr : data_->data(); } @@ -110,6 +142,13 @@ Status NumericArray<T>::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +template <typename T> +std::shared_ptr<Array> NumericArray<T>::Slice(int32_t offset, int32_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<NumericArray<T>>( + type_, length, data_, null_bitmap_, kUnknownNullCount, offset); +} + template class NumericArray<UInt8Type>; template class NumericArray<UInt16Type>; template class NumericArray<UInt32Type>; @@ -129,32 +168,33 @@ template class NumericArray<DoubleType>; // BooleanArray BooleanArray::BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data, - int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) - : PrimitiveArray( - std::make_shared<BooleanType>(), length, data, null_count, null_bitmap) {} - -BooleanArray::BooleanArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap) - : PrimitiveArray(type, length, data, null_count, null_bitmap) {} + const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset) + : PrimitiveArray(std::make_shared<BooleanType>(), length, data, null_bitmap, + null_count, offset) {} Status BooleanArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr<Array> BooleanArray::Slice(int32_t offset, int32_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<BooleanArray>( + length, data_, null_bitmap_, kUnknownNullCount, offset); +} + // ---------------------------------------------------------------------- // ListArray Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offsets_buffer_) { return Status::Invalid("offsets_buffer_ was null"); } - if (offsets_buffer_->size() / static_cast<int>(sizeof(int32_t)) < length_) 
{ + if (!value_offsets_) { return Status::Invalid("value_offsets_ was null"); } + if (value_offsets_->size() / static_cast<int>(sizeof(int32_t)) < length_) { std::stringstream ss; - ss << "offset buffer size (bytes): " << offsets_buffer_->size() + ss << "offset buffer size (bytes): " << value_offsets_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } - const int32_t last_offset = offset(length_); + const int32_t last_offset = this->value_offset(length_); if (last_offset > 0) { if (!values_) { return Status::Invalid("last offset was non-zero and values was null"); @@ -174,14 +214,15 @@ Status ListArray::Validate() const { } } - int32_t prev_offset = offset(0); + int32_t prev_offset = this->value_offset(0); if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } for (int32_t i = 1; i <= length_; ++i) { - int32_t current_offset = offset(i); + int32_t current_offset = this->value_offset(i); if (IsNull(i - 1) && current_offset != prev_offset) { std::stringstream ss; - ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot" - << current_offset << "!=" << prev_offset; + ss << "Offset invariant failure at: " << i + << " inconsistent value_offsets for null slot" << current_offset + << "!=" << prev_offset; return Status::Invalid(ss.str()); } if (current_offset < prev_offset) { @@ -200,26 +241,33 @@ Status ListArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr<Array> ListArray::Slice(int32_t offset, int32_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<ListArray>( + type_, length, value_offsets_, values_, null_bitmap_, kUnknownNullCount, offset); +} + // ---------------------------------------------------------------------- // String and binary static std::shared_ptr<DataType> kBinary = std::make_shared<BinaryType>(); static std::shared_ptr<DataType> kString = 
std::make_shared<StringType>(); -BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap) - : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {} +BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets, + const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap, + int32_t null_count, int32_t offset) + : BinaryArray(kBinary, length, value_offsets, data, null_bitmap, null_count, offset) { +} BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data, - int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) - : Array(type, length, null_count, null_bitmap), - offsets_buffer_(offsets), - offsets_(reinterpret_cast<const int32_t*>(offsets_buffer_->data())), - data_buffer_(data), - data_(nullptr) { - if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); } + const std::shared_ptr<Buffer>& value_offsets, const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset), + value_offsets_(value_offsets), + raw_value_offsets_(reinterpret_cast<const int32_t*>(value_offsets_->data())), + data_(data), + raw_data_(nullptr) { + if (data_ != nullptr) { raw_data_ = data_->data(); } } Status BinaryArray::Validate() const { @@ -231,10 +279,17 @@ Status BinaryArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } -StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap) - : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {} +std::shared_ptr<Array> BinaryArray::Slice(int32_t offset, int32_t length) 
const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<BinaryArray>( + length, value_offsets_, data_, null_bitmap_, kUnknownNullCount, offset); +} + +StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets, + const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap, + int32_t null_count, int32_t offset) + : BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) { +} Status StringArray::Validate() const { // TODO(emkornfield) Validate proper UTF8 code points? @@ -245,12 +300,26 @@ Status StringArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr<Array> StringArray::Slice(int32_t offset, int32_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<StringArray>( + length, value_offsets_, data_, null_bitmap_, kUnknownNullCount, offset); +} + // ---------------------------------------------------------------------- // Struct +StructArray::StructArray(const std::shared_ptr<DataType>& type, int32_t length, + const std::vector<std::shared_ptr<Array>>& children, + std::shared_ptr<Buffer> null_bitmap, int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset) { + type_ = type; + children_ = children; +} + std::shared_ptr<Array> StructArray::field(int32_t pos) const { - DCHECK_GT(field_arrays_.size(), 0); - return field_arrays_[pos]; + DCHECK_GT(children_.size(), 0); + return children_[pos]; } Status StructArray::Validate() const { @@ -260,11 +329,11 @@ Status StructArray::Validate() const { return Status::Invalid("Null count exceeds the length of this struct"); } - if (field_arrays_.size() > 0) { + if (children_.size() > 0) { // Validate fields - int32_t array_length = field_arrays_[0]->length(); + int32_t array_length = children_[0]->length(); size_t idx = 0; - for (auto it : field_arrays_) { + for (auto it : children_) { if (it->length() 
!= array_length) { std::stringstream ss; ss << "Length is not equal from field " << it->type()->ToString() @@ -293,19 +362,27 @@ Status StructArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr<Array> StructArray::Slice(int32_t offset, int32_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<StructArray>( + type_, length, children_, null_bitmap_, kUnknownNullCount, offset); +} + // ---------------------------------------------------------------------- // UnionArray UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int32_t length, const std::vector<std::shared_ptr<Array>>& children, - const std::shared_ptr<Buffer>& type_ids, const std::shared_ptr<Buffer>& offsets, - int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) - : Array(type, length, null_count, null_bitmap), + const std::shared_ptr<Buffer>& type_ids, const std::shared_ptr<Buffer>& value_offsets, + const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset) + : Array(type, length, null_bitmap, null_count, offset), children_(children), - type_ids_buffer_(type_ids), - offsets_buffer_(offsets) { - type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data()); - if (offsets) { offsets_ = reinterpret_cast<const int32_t*>(offsets->data()); } + type_ids_(type_ids), + value_offsets_(value_offsets) { + raw_type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data()); + if (value_offsets) { + raw_value_offsets_ = reinterpret_cast<const int32_t*>(value_offsets->data()); + } } std::shared_ptr<Array> UnionArray::child(int32_t pos) const { @@ -328,18 +405,24 @@ Status UnionArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr<Array> UnionArray::Slice(int32_t offset, int32_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<UnionArray>(type_, length, children_, type_ids_, value_offsets_, + 
null_bitmap_, kUnknownNullCount, offset); +} + // ---------------------------------------------------------------------- // DictionaryArray Status DictionaryArray::FromBuffer(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& indices, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<DictionaryArray>* out) { + const std::shared_ptr<Buffer>& indices, const std::shared_ptr<Buffer>& null_bitmap, + int32_t null_count, int32_t offset, std::shared_ptr<DictionaryArray>* out) { DCHECK_EQ(type->type, Type::DICTIONARY); const auto& dict_type = static_cast<const DictionaryType*>(type.get()); std::shared_ptr<Array> boxed_indices; - RETURN_NOT_OK(MakePrimitiveArray( - dict_type->index_type(), length, indices, null_count, null_bitmap, &boxed_indices)); + RETURN_NOT_OK(MakePrimitiveArray(dict_type->index_type(), length, indices, null_bitmap, + null_count, offset, &boxed_indices)); *out = std::make_shared<DictionaryArray>(type, boxed_indices); return Status::OK(); @@ -347,7 +430,8 @@ Status DictionaryArray::FromBuffer(const std::shared_ptr<DataType>& type, int32_ DictionaryArray::DictionaryArray( const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices) - : Array(type, indices->length(), indices->null_count(), indices->null_bitmap()), + : Array(type, indices->length(), indices->null_bitmap(), indices->null_count(), + indices->offset()), dict_type_(static_cast<const DictionaryType*>(type.get())), indices_(indices) { DCHECK_EQ(type->type, Type::DICTIONARY); @@ -369,16 +453,21 @@ Status DictionaryArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +std::shared_ptr<Array> DictionaryArray::Slice(int32_t offset, int32_t length) const { + std::shared_ptr<Array> sliced_indices = indices_->Slice(offset, length); + return std::make_shared<DictionaryArray>(type_, sliced_indices); +} + // ---------------------------------------------------------------------- -#define 
MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ - case Type::ENUM: \ - out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_bitmap, null_count, offset)); \ break; Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out) { + const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap, + int32_t null_count, int32_t offset, std::shared_ptr<Array>* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 3b6e93f..f3e8f9a 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -27,6 +27,7 @@ #include "arrow/buffer.h" #include "arrow/type.h" #include "arrow/type_fwd.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -71,23 +72,36 @@ class ArrayVisitor { /// /// The base class is only required to have a null bitmap buffer if the null /// count is greater than 0 +/// +/// If known, the null count can be provided in the base Array constructor. 
If +/// the null count is not known, pass -1 to indicate that the null count is to +/// be computed on the first call to null_count() class ARROW_EXPORT Array { public: - Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr); + Array(const std::shared_ptr<DataType>& type, int32_t length, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); virtual ~Array() = default; /// Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i); + return null_bitmap_data_ != nullptr && + BitUtil::BitNotSet(null_bitmap_data_, i + offset_); } /// Size in the number of elements this array contains. int32_t length() const { return length_; } - /// The number of null entries in the array. - int32_t null_count() const { return null_count_; } + /// A relative position into another array's data, to enable zero-copy + /// slicing. This value defaults to zero + int32_t offset() const { return offset_; } + + /// The number of null entries in the array. If the null count was not known + /// at time of construction (and set to a negative value), then the null + /// count will be computed and cached on the first invocation of this + /// function + int32_t null_count() const; std::shared_ptr<DataType> type() const { return type_; } Type::type type_enum() const { return type_->type; } @@ -95,11 +109,13 @@ class ARROW_EXPORT Array { /// Buffer for the null bitmap. /// /// Note that for `null_count == 0`, this can be a `nullptr`. + /// This buffer does not account for any slice offset std::shared_ptr<Buffer> null_bitmap() const { return null_bitmap_; } /// Raw pointer to the null bitmap. /// /// Note that for `null_count == 0`, this can be a `nullptr`. 
+ /// This buffer does not account for any slice offset const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } bool Equals(const Array& arr) const; @@ -120,10 +136,29 @@ class ARROW_EXPORT Array { virtual Status Accept(ArrayVisitor* visitor) const = 0; + /// Construct a zero-copy slice of the array with the indicated offset and + /// length + /// + /// \param[in] offset the position of the first element in the constructed slice + /// \param[in] length the length of the slice. If there are not enough elements in the + /// array, + /// the length will be adjusted accordingly + /// + /// \return a new object wrapped in std::shared_ptr<Array> + virtual std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const = 0; + + /// Slice from offset until end of the array + std::shared_ptr<Array> Slice(int32_t offset) const; + protected: std::shared_ptr<DataType> type_; - int32_t null_count_; int32_t length_; + int32_t offset_; + + // This member is marked mutable so that it can be modified when null_count() + // is called from a const context and the null count has to be computed (if + // it is not already known) + mutable int32_t null_count_; std::shared_ptr<Buffer> null_bitmap_; const uint8_t* null_bitmap_data_; @@ -138,28 +173,26 @@ class ARROW_EXPORT NullArray : public Array { public: using TypeClass = NullType; - NullArray(const std::shared_ptr<DataType>& type, int32_t length) - : Array(type, length, length, nullptr) {} - - explicit NullArray(int32_t length) : NullArray(std::make_shared<NullType>(), length) {} + explicit NullArray(int32_t length); Status Accept(ArrayVisitor* visitor) const override; -}; -Status ARROW_EXPORT GetEmptyBitmap( - MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result); + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; +}; /// Base class for fixed-size logical types class ARROW_EXPORT PrimitiveArray : public Array { public: - virtual ~PrimitiveArray() {} + 
PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, + const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); + /// The memory containing this array's data + /// This buffer does not account for any slice offset std::shared_ptr<Buffer> data() const { return data_; } protected: - PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& data, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr); std::shared_ptr<Buffer> data_; const uint8_t* raw_data_; }; @@ -169,21 +202,28 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { public: using TypeClass = TYPE; using value_type = typename TypeClass::c_type; - NumericArray(int32_t length, const std::shared_ptr<Buffer>& data, - int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr) - : PrimitiveArray( - std::make_shared<TypeClass>(), length, data, null_count, null_bitmap) {} - NumericArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& data, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr) - : PrimitiveArray(type, length, data, null_count, null_bitmap) {} + + using PrimitiveArray::PrimitiveArray; + + // Only enable this constructor without a type argument for types without additional + // metadata + template <typename T1 = TYPE> + NumericArray( + typename std::enable_if<TypeTraits<T1>::is_parameter_free, int32_t>::type length, + const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0) + : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap, + null_count, offset) {} const value_type* raw_data() const { - return reinterpret_cast<const value_type*>(raw_data_); + return reinterpret_cast<const value_type*>(raw_data_) + offset_; } Status 
Accept(ArrayVisitor* visitor) const override; + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; + value_type Value(int i) const { return raw_data()[i]; } }; @@ -191,17 +231,19 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: using TypeClass = BooleanType; + using PrimitiveArray::PrimitiveArray; + BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data, - int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr); - BooleanArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& data, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr); + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); Status Accept(ArrayVisitor* visitor) const override; - const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_); } + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; - bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } + bool Value(int i) const { + return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_data_), i + offset_); + } }; // ---------------------------------------------------------------------- @@ -212,39 +254,45 @@ class ARROW_EXPORT ListArray : public Array { using TypeClass = ListType; ListArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Array>& values, - int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr) - : Array(type, length, null_count, null_bitmap) { - offsets_buffer_ = offsets; - offsets_ = offsets == nullptr ? 
nullptr : reinterpret_cast<const int32_t*>( - offsets_buffer_->data()); + const std::shared_ptr<Buffer>& value_offsets, const std::shared_ptr<Array>& values, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0) + : Array(type, length, null_bitmap, null_count, offset) { + value_offsets_ = value_offsets; + raw_value_offsets_ = value_offsets == nullptr + ? nullptr + : reinterpret_cast<const int32_t*>(value_offsets_->data()); values_ = values; } Status Validate() const override; - virtual ~ListArray() = default; - // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr<Array> values() const { return values_; } - std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; } - std::shared_ptr<DataType> value_type() const { return values_->type(); } + /// Note that this buffer does not account for any slice offset + std::shared_ptr<Buffer> value_offsets() const { return value_offsets_; } - const int32_t* raw_offsets() const { return offsets_; } + std::shared_ptr<DataType> value_type() const { return values_->type(); } - int32_t offset(int i) const { return offsets_[i]; } + /// Return pointer to raw value offsets accounting for any slice offset + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return offsets_[i]; } - int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + int32_t value_offset(int i) const { return raw_value_offsets_[i + offset_]; } + int32_t value_length(int i) const { + i += offset_; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; + protected: - std::shared_ptr<Buffer> offsets_buffer_; - const int32_t* offsets_; + std::shared_ptr<Buffer> 
value_offsets_; + const int32_t* raw_value_offsets_; std::shared_ptr<Array> values_; }; @@ -255,55 +303,67 @@ class ARROW_EXPORT BinaryArray : public Array { public: using TypeClass = BinaryType; - BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const std::shared_ptr<Buffer>& data, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr); - - // Constructor that allows sub-classes/builders to propagate there logical type up the - // class hierarchy. - BinaryArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data, - int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr); + BinaryArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets, + const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); // Return the pointer to the given elements bytes // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy // pointer + offset const uint8_t* GetValue(int i, int32_t* out_length) const { - const int32_t pos = offsets_[i]; - *out_length = offsets_[i + 1] - pos; - return data_ + pos; + // Account for base offset + i += offset_; + + const int32_t pos = raw_value_offsets_[i]; + *out_length = raw_value_offsets_[i + 1] - pos; + return raw_data_ + pos; } - std::shared_ptr<Buffer> data() const { return data_buffer_; } - std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; } + /// Note that this buffer does not account for any slice offset + std::shared_ptr<Buffer> data() const { return data_; } - const int32_t* raw_offsets() const { return offsets_; } + /// Note that this buffer does not account for any slice offset + std::shared_ptr<Buffer> value_offsets() const { return value_offsets_; } - int32_t offset(int i) const { return offsets_[i]; } + const int32_t* raw_value_offsets() const { return 
raw_value_offsets_ + offset_; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) const { return offsets_[i]; } - int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + int32_t value_offset(int i) const { return raw_value_offsets_[i + offset_]; } + int32_t value_length(int i) const { + i += offset_; + return raw_value_offsets_[i + 1] - raw_value_offsets_[i]; + } Status Validate() const override; Status Accept(ArrayVisitor* visitor) const override; - private: - std::shared_ptr<Buffer> offsets_buffer_; - const int32_t* offsets_; + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; + + protected: + // Constructor that allows sub-classes/builders to propagate there logical type up the + // class hierarchy. + BinaryArray(const std::shared_ptr<DataType>& type, int32_t length, + const std::shared_ptr<Buffer>& value_offsets, const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); - std::shared_ptr<Buffer> data_buffer_; - const uint8_t* data_; + std::shared_ptr<Buffer> value_offsets_; + const int32_t* raw_value_offsets_; + + std::shared_ptr<Buffer> data_; + const uint8_t* raw_data_; }; class ARROW_EXPORT StringArray : public BinaryArray { public: using TypeClass = StringType; - StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const std::shared_ptr<Buffer>& data, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr); + StringArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets, + const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); // Construct a std::string // TODO: std::bad_alloc possibility @@ -316,6 +376,8 @@ class ARROW_EXPORT StringArray : public BinaryArray { Status Validate() const override; Status Accept(ArrayVisitor* visitor) const override; + + 
std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; }; // ---------------------------------------------------------------------- @@ -326,28 +388,25 @@ class ARROW_EXPORT StructArray : public Array { using TypeClass = StructType; StructArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::vector<std::shared_ptr<Array>>& field_arrays, int32_t null_count = 0, - std::shared_ptr<Buffer> null_bitmap = nullptr) - : Array(type, length, null_count, null_bitmap) { - type_ = type; - field_arrays_ = field_arrays; - } + const std::vector<std::shared_ptr<Array>>& children, + std::shared_ptr<Buffer> null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); Status Validate() const override; - virtual ~StructArray() {} - // Return a shared pointer in case the requestor desires to share ownership // with this array. std::shared_ptr<Array> field(int32_t pos) const; - const std::vector<std::shared_ptr<Array>>& fields() const { return field_arrays_; } + const std::vector<std::shared_ptr<Array>>& fields() const { return children_; } Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; + protected: // The child arrays corresponding to each field of the struct data type. 
- std::vector<std::shared_ptr<Array>> field_arrays_; + std::vector<std::shared_ptr<Array>> children_; }; // ---------------------------------------------------------------------- @@ -356,22 +415,25 @@ class ARROW_EXPORT StructArray : public Array { class ARROW_EXPORT UnionArray : public Array { public: using TypeClass = UnionType; + using type_id_t = uint8_t; UnionArray(const std::shared_ptr<DataType>& type, int32_t length, const std::vector<std::shared_ptr<Array>>& children, const std::shared_ptr<Buffer>& type_ids, - const std::shared_ptr<Buffer>& offsets = nullptr, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr); + const std::shared_ptr<Buffer>& value_offsets = nullptr, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0, + int32_t offset = 0); Status Validate() const override; - virtual ~UnionArray() {} + /// Note that this buffer does not account for any slice offset + std::shared_ptr<Buffer> type_ids() const { return type_ids_; } - std::shared_ptr<Buffer> type_ids() const { return type_ids_buffer_; } - const uint8_t* raw_type_ids() const { return type_ids_; } + /// Note that this buffer does not account for any slice offset + std::shared_ptr<Buffer> value_offsets() const { return value_offsets_; } - std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; } - const int32_t* raw_offsets() const { return offsets_; } + const type_id_t* raw_type_ids() const { return raw_type_ids_ + offset_; } + const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; } UnionMode mode() const { return static_cast<const UnionType&>(*type_.get()).mode; } @@ -381,14 +443,16 @@ class ARROW_EXPORT UnionArray : public Array { Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; + protected: std::vector<std::shared_ptr<Array>> children_; - std::shared_ptr<Buffer> type_ids_buffer_; - const uint8_t* type_ids_; + 
std::shared_ptr<Buffer> type_ids_; + const type_id_t* raw_type_ids_; - std::shared_ptr<Buffer> offsets_buffer_; - const int32_t* offsets_; + std::shared_ptr<Buffer> value_offsets_; + const int32_t* raw_value_offsets_; }; // ---------------------------------------------------------------------- @@ -419,8 +483,8 @@ class ARROW_EXPORT DictionaryArray : public Array { // Alternate ctor; other attributes (like null count) are inherited from the // passed indices array static Status FromBuffer(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& indices, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<DictionaryArray>* out); + const std::shared_ptr<Buffer>& indices, const std::shared_ptr<Buffer>& null_bitmap, + int32_t null_count, int32_t offset, std::shared_ptr<DictionaryArray>* out); Status Validate() const override; @@ -431,6 +495,8 @@ class ARROW_EXPORT DictionaryArray : public Array { Status Accept(ArrayVisitor* visitor) const override; + std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override; + protected: const DictionaryType* dict_type_; std::shared_ptr<Array> indices_; @@ -471,8 +537,9 @@ extern template class ARROW_EXPORT NumericArray<TimeType>; // Create new arrays for logical types that are backed by primitive arrays. 
Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type, - int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out); + int32_t length, const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset, + std::shared_ptr<Array>* out); } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/buffer.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 6cce0ef..fb5a010 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -116,4 +116,20 @@ Status PoolBuffer::Resize(int64_t new_size, bool shrink_to_fit) { return Status::OK(); } +Status AllocateBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr<MutableBuffer>* out) { + auto buffer = std::make_shared<PoolBuffer>(pool); + RETURN_NOT_OK(buffer->Resize(size)); + *out = buffer; + return Status::OK(); +} + +Status AllocateResizableBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr<ResizableBuffer>* out) { + auto buffer = std::make_shared<PoolBuffer>(pool); + RETURN_NOT_OK(buffer->Resize(size)); + *out = buffer; + return Status::OK(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/buffer.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index d43ab03..9c400b1 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
-#ifndef ARROW_UTIL_BUFFER_H -#define ARROW_UTIL_BUFFER_H +#ifndef ARROW_BUFFER_H +#define ARROW_BUFFER_H #include <algorithm> #include <cstdint> @@ -105,7 +105,7 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this<Buffer> { /// Construct a view on passed buffer at the indicated offset and length. This /// function cannot fail and does not error checking (except in debug builds) -ARROW_EXPORT std::shared_ptr<Buffer> SliceBuffer( +std::shared_ptr<Buffer> ARROW_EXPORT SliceBuffer( const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length); /// A Buffer whose contents can be mutated. May or may not own its data. @@ -232,6 +232,19 @@ class ARROW_EXPORT BufferBuilder { int64_t size_; }; +/// Allocate a new mutable buffer from a memory pool +/// +/// \param[in] pool a memory pool +/// \param[in] size size of buffer to allocate +/// \param[out] out the allocated buffer with padding +/// +/// \return Status message +Status ARROW_EXPORT AllocateBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr<MutableBuffer>* out); + +Status ARROW_EXPORT AllocateResizableBuffer( + MemoryPool* pool, int64_t size, std::shared_ptr<ResizableBuffer>* out); + } // namespace arrow -#endif // ARROW_UTIL_BUFFER_H +#endif // ARROW_BUFFER_H http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index b0dc41b..dddadee 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -185,7 +185,7 @@ Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) { RETURN_NOT_OK(data_->Resize(bytes_required)); } *out = std::make_shared<typename TypeTraits<T>::ArrayType>( - type_, length_, data_, null_count_, null_bitmap_); + type_, length_, data_, null_bitmap_, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -202,10 +202,19 @@ template class 
PrimitiveBuilder<Int32Type>; template class PrimitiveBuilder<Int64Type>; template class PrimitiveBuilder<DateType>; template class PrimitiveBuilder<TimestampType>; +template class PrimitiveBuilder<TimeType>; template class PrimitiveBuilder<HalfFloatType>; template class PrimitiveBuilder<FloatType>; template class PrimitiveBuilder<DoubleType>; +BooleanBuilder::BooleanBuilder(MemoryPool* pool) + : ArrayBuilder(pool, boolean()), data_(nullptr), raw_data_(nullptr) {} + +BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type) + : BooleanBuilder(pool) { + DCHECK_EQ(Type::BOOL, type->type); +} + Status BooleanBuilder::Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); data_ = std::make_shared<PoolBuffer>(pool_); @@ -244,7 +253,7 @@ Status BooleanBuilder::Finish(std::shared_ptr<Array>* out) { // Trim buffers RETURN_NOT_OK(data_->Resize(bytes_required)); } - *out = std::make_shared<BooleanArray>(type_, length_, data_, null_count_, null_bitmap_); + *out = std::make_shared<BooleanArray>(type_, length_, data_, null_bitmap_, null_count_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -313,7 +322,7 @@ Status ListBuilder::Finish(std::shared_ptr<Array>* out) { std::shared_ptr<Buffer> offsets = offset_builder_.Finish(); *out = std::make_shared<ListArray>( - type_, length_, offsets, items, null_count_, null_bitmap_); + type_, length_, offsets, items, null_bitmap_, null_count_); Reset(); @@ -333,14 +342,13 @@ std::shared_ptr<ArrayBuilder> ListBuilder::value_builder() const { // ---------------------------------------------------------------------- // String and binary -// This used to be a static member variable of BinaryBuilder, but it can cause -// valgrind to report a (spurious?) memory leak when needed in other shared -// libraries. 
The problem came up while adding explicit visibility to libarrow -// and libparquet_arrow -static TypePtr kBinaryValueType = TypePtr(new UInt8Type()); +BinaryBuilder::BinaryBuilder(MemoryPool* pool) + : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, uint8()), binary()) { + byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get()); +} BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) - : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, kBinaryValueType), type) { + : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, uint8()), type) { byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get()); } @@ -351,11 +359,13 @@ Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) { const auto list = std::dynamic_pointer_cast<ListArray>(result); auto values = std::dynamic_pointer_cast<UInt8Array>(list->values()); - *out = std::make_shared<BinaryArray>(list->length(), list->offsets(), values->data(), - list->null_count(), list->null_bitmap()); + *out = std::make_shared<BinaryArray>(list->length(), list->value_offsets(), + values->data(), list->null_bitmap(), list->null_count()); return Status::OK(); } +StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {} + Status StringBuilder::Finish(std::shared_ptr<Array>* out) { std::shared_ptr<Array> result; RETURN_NOT_OK(ListBuilder::Finish(&result)); @@ -363,8 +373,8 @@ Status StringBuilder::Finish(std::shared_ptr<Array>* out) { const auto list = std::dynamic_pointer_cast<ListArray>(result); auto values = std::dynamic_pointer_cast<UInt8Array>(list->values()); - *out = std::make_shared<StringArray>(list->length(), list->offsets(), values->data(), - list->null_count(), list->null_bitmap()); + *out = std::make_shared<StringArray>(list->length(), list->value_offsets(), + values->data(), list->null_bitmap(), list->null_count()); return Status::OK(); } @@ -377,7 +387,7 @@ Status StructBuilder::Finish(std::shared_ptr<Array>* out) { 
RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i])); } - *out = std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_); + *out = std::make_shared<StructArray>(type_, length_, fields, null_bitmap_, null_count_); null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -393,9 +403,9 @@ std::shared_ptr<ArrayBuilder> StructBuilder::field_builder(int pos) const { // ---------------------------------------------------------------------- // Helper functions -#define BUILDER_CASE(ENUM, BuilderType) \ - case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ +#define BUILDER_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + out->reset(new BuilderType(pool)); \ return Status::OK(); // Initially looked at doing this with vtables, but shared pointers makes it @@ -414,19 +424,17 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, BUILDER_CASE(UINT64, UInt64Builder); BUILDER_CASE(INT64, Int64Builder); BUILDER_CASE(DATE, DateBuilder); - BUILDER_CASE(TIMESTAMP, TimestampBuilder); - - BUILDER_CASE(BOOL, BooleanBuilder); - - BUILDER_CASE(FLOAT, FloatBuilder); - BUILDER_CASE(DOUBLE, DoubleBuilder); - - case Type::STRING: - out->reset(new StringBuilder(pool)); + case Type::TIMESTAMP: + out->reset(new TimestampBuilder(pool, type)); return Status::OK(); - case Type::BINARY: - out->reset(new BinaryBuilder(pool, type)); + case Type::TIME: + out->reset(new TimeBuilder(pool, type)); return Status::OK(); + BUILDER_CASE(BOOL, BooleanBuilder); + BUILDER_CASE(FLOAT, FloatBuilder); + BUILDER_CASE(DOUBLE, DoubleBuilder); + BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); case Type::LIST: { std::shared_ptr<ArrayBuilder> value_builder; std::shared_ptr<DataType> value_type = http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 
672d2d8..0b83b9f 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -141,9 +141,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { using value_type = typename Type::c_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) - : ArrayBuilder(pool, type), data_(nullptr) {} - - virtual ~PrimitiveBuilder() {} + : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {} using ArrayBuilder::Advance; @@ -233,6 +231,7 @@ using Int16Builder = NumericBuilder<Int16Type>; using Int32Builder = NumericBuilder<Int32Type>; using Int64Builder = NumericBuilder<Int64Type>; using TimestampBuilder = NumericBuilder<TimestampType>; +using TimeBuilder = NumericBuilder<TimeType>; using DateBuilder = NumericBuilder<DateType>; using HalfFloatBuilder = NumericBuilder<HalfFloatType>; @@ -241,10 +240,8 @@ using DoubleBuilder = NumericBuilder<DoubleType>; class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: - explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type = boolean()) - : ArrayBuilder(pool, type), data_(nullptr) {} - - virtual ~BooleanBuilder() {} + explicit BooleanBuilder(MemoryPool* pool); + explicit BooleanBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type); using ArrayBuilder::Advance; @@ -321,8 +318,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { ListBuilder( MemoryPool* pool, std::shared_ptr<Array> values, const TypePtr& type = nullptr); - virtual ~ListBuilder() {} - Status Init(int32_t elements) override; Status Resize(int32_t capacity) override; Status Finish(std::shared_ptr<Array>* out) override; @@ -368,8 +363,8 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { // BinaryBuilder : public ListBuilder class ARROW_EXPORT BinaryBuilder : public ListBuilder { public: + explicit BinaryBuilder(MemoryPool* pool); explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type); - virtual ~BinaryBuilder() {} Status Append(const uint8_t* value, int32_t length) { 
RETURN_NOT_OK(ListBuilder::Append()); @@ -391,11 +386,7 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { // String builder class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: - explicit StringBuilder(MemoryPool* pool = default_memory_pool()) - : BinaryBuilder(pool, utf8()) {} - - explicit StringBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type) - : BinaryBuilder(pool, type) {} + explicit StringBuilder(MemoryPool* pool); using BinaryBuilder::Append; http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/column-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc index 1e722ed..0bbfc83 100644 --- a/cpp/src/arrow/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -51,7 +51,7 @@ TEST_F(TestChunkedArray, BasicEquals) { std::vector<bool> null_bitmap(100, true); std::vector<int32_t> data(100, 1); std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array); + ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array); arrays_one_.push_back(array); arrays_another_.push_back(array); @@ -67,9 +67,9 @@ TEST_F(TestChunkedArray, EqualsDifferingTypes) { std::vector<int32_t> data32(100, 1); std::vector<int64_t> data64(100, 1); std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data32, &array); + ArrayFromVector<Int32Type, int32_t>(null_bitmap, data32, &array); arrays_one_.push_back(array); - ArrayFromVector<Int64Type, int64_t>(int64(), null_bitmap, data64, &array); + ArrayFromVector<Int64Type, int64_t>(null_bitmap, data64, &array); arrays_another_.push_back(array); Construct(); @@ -83,9 +83,9 @@ TEST_F(TestChunkedArray, EqualsDifferingLengths) { std::vector<int32_t> data100(100, 1); std::vector<int32_t> data101(101, 1); std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap100, data100, &array); + 
ArrayFromVector<Int32Type, int32_t>(null_bitmap100, data100, &array); arrays_one_.push_back(array); - ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap101, data101, &array); + ArrayFromVector<Int32Type, int32_t>(null_bitmap101, data101, &array); arrays_another_.push_back(array); Construct(); @@ -94,7 +94,7 @@ TEST_F(TestChunkedArray, EqualsDifferingLengths) { std::vector<bool> null_bitmap1(1, true); std::vector<int32_t> data1(1, 1); - ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap1, data1, &array); + ArrayFromVector<Int32Type, int32_t>(null_bitmap1, data1, &array); arrays_one_.push_back(array); Construct(); @@ -156,7 +156,7 @@ TEST_F(TestColumn, Equals) { std::vector<bool> null_bitmap(100, true); std::vector<int32_t> data(100, 1); std::shared_ptr<Array> array; - ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array); + ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array); arrays_one_.push_back(array); arrays_another_.push_back(array);
