ARROW-418: [C++] Array / Builder class code reorganization, flattening

I've been wanting to do this for a while -- it feels cleaner to me. I am also going to promote modules from arrow/util to the top level. I'm open to other ideas, too.
Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #236 from wesm/ARROW-418 and squashes the following commits: 6f556ea [Wes McKinney] Add missing math.h include for clang 9dc2e22 [Wes McKinney] Fix remaining old includes 6f7ae77 [Wes McKinney] Fixes, cpplint 66ac3f7 [Wes McKinney] Promote buffer.h/status.h/memory-pool.h to top level directory 8cdf059 [Wes McKinney] Consolidate Array and Builder classes in array.h, builder.h. Remove arrow/types subdirectory Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2c10d7cc Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2c10d7cc Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2c10d7cc Branch: refs/heads/master Commit: 2c10d7ccec3c07fb061e1988be16aecaf9916af4 Parents: 73fe556 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Mon Dec 12 17:17:31 2016 -0500 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Mon Dec 12 17:17:31 2016 -0500 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 15 +- cpp/src/arrow/CMakeLists.txt | 11 + cpp/src/arrow/api.h | 13 +- cpp/src/arrow/array-decimal-test.cc | 40 ++ cpp/src/arrow/array-list-test.cc | 237 ++++++++++++ cpp/src/arrow/array-primitive-test.cc | 476 +++++++++++++++++++++++ cpp/src/arrow/array-string-test.cc | 358 ++++++++++++++++++ cpp/src/arrow/array-struct-test.cc | 391 +++++++++++++++++++ cpp/src/arrow/array-test.cc | 5 +- cpp/src/arrow/array.cc | 443 +++++++++++++++++++++- cpp/src/arrow/array.h | 373 +++++++++++++++++- cpp/src/arrow/buffer-test.cc | 140 +++++++ cpp/src/arrow/buffer.cc | 102 +++++ cpp/src/arrow/buffer.h | 232 ++++++++++++ cpp/src/arrow/builder.cc | 329 +++++++++++++++- cpp/src/arrow/builder.h | 315 +++++++++++++++- cpp/src/arrow/column-benchmark.cc | 4 +- cpp/src/arrow/column-test.cc | 1 - cpp/src/arrow/column.cc | 2 +- cpp/src/arrow/io/file.cc | 6 +- cpp/src/arrow/io/hdfs.cc | 6 +- cpp/src/arrow/io/interfaces.cc 
| 4 +- cpp/src/arrow/io/io-file-test.cc | 2 +- cpp/src/arrow/io/io-hdfs-test.cc | 2 +- cpp/src/arrow/io/libhdfs_shim.cc | 2 +- cpp/src/arrow/io/memory.cc | 5 +- cpp/src/arrow/io/test-common.h | 4 +- cpp/src/arrow/ipc/adapter.cc | 9 +- cpp/src/arrow/ipc/file.cc | 4 +- cpp/src/arrow/ipc/ipc-adapter-test.cc | 10 +- cpp/src/arrow/ipc/ipc-file-test.cc | 11 +- cpp/src/arrow/ipc/ipc-json-test.cc | 18 +- cpp/src/arrow/ipc/ipc-metadata-test.cc | 2 +- cpp/src/arrow/ipc/json-integration-test.cc | 2 +- cpp/src/arrow/ipc/json-internal.cc | 10 +- cpp/src/arrow/ipc/json.cc | 6 +- cpp/src/arrow/ipc/metadata-internal.cc | 4 +- cpp/src/arrow/ipc/metadata.cc | 4 +- cpp/src/arrow/ipc/test-common.h | 9 +- cpp/src/arrow/ipc/util.h | 2 +- cpp/src/arrow/memory_pool-test.cc | 69 ++++ cpp/src/arrow/memory_pool.cc | 111 ++++++ cpp/src/arrow/memory_pool.h | 43 +++ cpp/src/arrow/pretty_print-test.cc | 5 +- cpp/src/arrow/pretty_print.cc | 5 +- cpp/src/arrow/status-test.cc | 38 ++ cpp/src/arrow/status.cc | 86 +++++ cpp/src/arrow/status.h | 192 ++++++++++ cpp/src/arrow/table-test.cc | 4 +- cpp/src/arrow/table.cc | 2 +- cpp/src/arrow/test-util.h | 43 ++- cpp/src/arrow/type.cc | 8 +- cpp/src/arrow/type.h | 2 +- cpp/src/arrow/types/CMakeLists.txt | 39 -- cpp/src/arrow/types/construct.cc | 124 ------ cpp/src/arrow/types/construct.h | 47 --- cpp/src/arrow/types/datetime.h | 27 -- cpp/src/arrow/types/decimal-test.cc | 40 -- cpp/src/arrow/types/decimal.cc | 31 -- cpp/src/arrow/types/decimal.h | 28 -- cpp/src/arrow/types/list-test.cc | 241 ------------ cpp/src/arrow/types/list.cc | 162 -------- cpp/src/arrow/types/list.h | 170 --------- cpp/src/arrow/types/primitive-test.cc | 478 ------------------------ cpp/src/arrow/types/primitive.cc | 294 --------------- cpp/src/arrow/types/primitive.h | 371 ------------------ cpp/src/arrow/types/string-test.cc | 360 ------------------ cpp/src/arrow/types/string.cc | 150 -------- cpp/src/arrow/types/string.h | 149 -------- cpp/src/arrow/types/struct-test.cc | 396 
-------------------- cpp/src/arrow/types/struct.cc | 108 ------ cpp/src/arrow/types/struct.h | 116 ------ cpp/src/arrow/types/test-common.h | 70 ---- cpp/src/arrow/types/union.cc | 27 -- cpp/src/arrow/types/union.h | 48 --- cpp/src/arrow/util/CMakeLists.txt | 6 - cpp/src/arrow/util/bit-util.cc | 4 +- cpp/src/arrow/util/buffer-test.cc | 140 ------- cpp/src/arrow/util/buffer.cc | 102 ----- cpp/src/arrow/util/buffer.h | 232 ------------ cpp/src/arrow/util/memory-pool-test.cc | 69 ---- cpp/src/arrow/util/memory-pool.cc | 111 ------ cpp/src/arrow/util/memory-pool.h | 43 --- cpp/src/arrow/util/status-test.cc | 38 -- cpp/src/arrow/util/status.cc | 86 ----- cpp/src/arrow/util/status.h | 192 ---------- python/src/pyarrow/adapters/builtin.cc | 2 +- python/src/pyarrow/adapters/pandas.cc | 2 +- python/src/pyarrow/common.cc | 4 +- python/src/pyarrow/common.h | 5 +- python/src/pyarrow/io.cc | 4 +- 91 files changed, 4103 insertions(+), 4630 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 798d75f..adcca0e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -743,25 +743,17 @@ set(ARROW_PRIVATE_LINK_LIBS set(ARROW_SRCS src/arrow/array.cc + src/arrow/buffer.cc src/arrow/builder.cc src/arrow/column.cc + src/arrow/memory_pool.cc src/arrow/pretty_print.cc src/arrow/schema.cc + src/arrow/status.cc src/arrow/table.cc src/arrow/type.cc - src/arrow/types/construct.cc - src/arrow/types/decimal.cc - src/arrow/types/list.cc - src/arrow/types/primitive.cc - src/arrow/types/string.cc - src/arrow/types/struct.cc - src/arrow/types/union.cc - src/arrow/util/bit-util.cc - src/arrow/util/buffer.cc - src/arrow/util/memory-pool.cc - src/arrow/util/status.cc ) add_library(arrow_objlib OBJECT @@ -823,7 +815,6 @@ endif() add_subdirectory(src/arrow) 
add_subdirectory(src/arrow/io) add_subdirectory(src/arrow/util) -add_subdirectory(src/arrow/types) #---------------------------------------------------------------------- # IPC library http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6c0dea2..7d7bc29 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -20,9 +20,12 @@ install(FILES api.h array.h column.h + buffer.h builder.h + memory_pool.h pretty_print.h schema.h + status.h table.h type.h type_fwd.h @@ -37,9 +40,17 @@ install(FILES set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) ADD_ARROW_TEST(array-test) +ADD_ARROW_TEST(array-decimal-test) +ADD_ARROW_TEST(array-list-test) +ADD_ARROW_TEST(array-primitive-test) +ADD_ARROW_TEST(array-string-test) +ADD_ARROW_TEST(array-struct-test) +ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(column-test) +ADD_ARROW_TEST(memory_pool-test) ADD_ARROW_TEST(pretty_print-test) ADD_ARROW_TEST(schema-test) +ADD_ARROW_TEST(status-test) ADD_ARROW_TEST(table-test) ADD_ARROW_BENCHMARK(column-benchmark) http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/api.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 2d317b4..51437d8 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -21,20 +21,13 @@ #define ARROW_API_H #include "arrow/array.h" +#include "arrow/buffer.h" #include "arrow/builder.h" #include "arrow/column.h" +#include "arrow/memory_pool.h" #include "arrow/schema.h" +#include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" -#include "arrow/types/construct.h" -#include "arrow/types/list.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" -#include "arrow/types/struct.h" - -#include "arrow/util/buffer.h" -#include 
"arrow/util/memory-pool.h" -#include "arrow/util/status.h" - #endif // ARROW_API_H http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array-decimal-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc new file mode 100644 index 0000000..9e00fd9 --- /dev/null +++ b/cpp/src/arrow/array-decimal-test.cc @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gtest/gtest.h" + +#include "arrow/type.h" + +namespace arrow { + +TEST(TypesTest, TestDecimalType) { + DecimalType t1(8, 4); + + ASSERT_EQ(t1.type, Type::DECIMAL); + ASSERT_EQ(t1.precision, 8); + ASSERT_EQ(t1.scale, 4); + + ASSERT_EQ(t1.ToString(), std::string("decimal(8, 4)")); + + // Test copy constructor + DecimalType t2 = t1; + ASSERT_EQ(t2.type, Type::DECIMAL); + ASSERT_EQ(t2.precision, 8); + ASSERT_EQ(t2.scale, 4); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array-list-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc new file mode 100644 index 0000000..8baaf06 --- /dev/null +++ b/cpp/src/arrow/array-list-test.cc @@ -0,0 +1,237 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <cstdint> +#include <cstdlib> +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +using std::shared_ptr; +using std::string; +using std::unique_ptr; +using std::vector; + +namespace arrow { + +TEST(TypesTest, TestListType) { + std::shared_ptr<DataType> vt = std::make_shared<UInt8Type>(); + + ListType list_type(vt); + ASSERT_EQ(list_type.type, Type::LIST); + + ASSERT_EQ(list_type.name(), string("list")); + ASSERT_EQ(list_type.ToString(), string("list<item: uint8>")); + + ASSERT_EQ(list_type.value_type()->type, vt->type); + ASSERT_EQ(list_type.value_type()->type, vt->type); + + std::shared_ptr<DataType> st = std::make_shared<StringType>(); + std::shared_ptr<DataType> lt = std::make_shared<ListType>(st); + ASSERT_EQ(lt->ToString(), string("list<item: string>")); + + ListType lt2(lt); + ASSERT_EQ(lt2.ToString(), string("list<item: list<item: string>>")); +} + +// ---------------------------------------------------------------------- +// List tests + +class TestListBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + + value_type_ = TypePtr(new Int32Type()); + type_ = TypePtr(new ListType(value_type_)); + + std::shared_ptr<ArrayBuilder> tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_ = std::dynamic_pointer_cast<ListBuilder>(tmp); + } + + void Done() { + std::shared_ptr<Array> out; + EXPECT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast<ListArray>(out); + } + + protected: + TypePtr value_type_; + TypePtr type_; + + shared_ptr<ListBuilder> builder_; + shared_ptr<ListArray> result_; +}; + +TEST_F(TestListBuilder, Equality) { + Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get()); + + ArrayPtr array, equal_array, unequal_array; + vector<int32_t> equal_offsets = {0, 1, 2, 5}; + vector<int32_t> 
equal_values = {1, 2, 3, 4, 5, 2, 2, 2}; + vector<int32_t> unequal_offsets = {0, 1, 4}; + vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5}; + + // setup two equal arrays + ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); + + ASSERT_OK(builder_->Finish(&array)); + ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); + + ASSERT_OK(builder_->Finish(&equal_array)); + // now an unequal one + ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size())); + ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size())); + + ASSERT_OK(builder_->Finish(&unequal_array)); + + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); + EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); + EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array)); +} + +TEST_F(TestListBuilder, TestResize) {} + +TEST_F(TestListBuilder, TestAppendNull) { + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->AppendNull()); + + Done(); + + ASSERT_OK(result_->Validate()); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); + + ASSERT_EQ(0, result_->raw_offsets()[0]); + ASSERT_EQ(0, result_->offset(1)); + ASSERT_EQ(0, result_->offset(2)); + + Int32Array* values = static_cast<Int32Array*>(result_->values().get()); + ASSERT_EQ(0, values->length()); +} + +void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& values, + const vector<uint8_t>& is_valid) { + 
ASSERT_OK(result->Validate()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + vector<int32_t> ex_offsets = {0, 3, 3, 7}; + for (size_t i = 0; i < ex_offsets.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->offset(i)); + } + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(!static_cast<bool>(is_valid[i]), result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + Int32Array* varr = static_cast<Int32Array*>(result->values().get()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } +} + +TEST_F(TestListBuilder, TestBasics) { + vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6}; + vector<int> lengths = {3, 0, 4}; + vector<uint8_t> is_valid = {1, 0, 1}; + + Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get()); + + ASSERT_OK(builder_->Reserve(lengths.size())); + ASSERT_OK(vb->Reserve(values.size())); + + int pos = 0; + for (size_t i = 0; i < lengths.size(); ++i) { + ASSERT_OK(builder_->Append(is_valid[i] > 0)); + for (int j = 0; j < lengths[i]; ++j) { + vb->Append(values[pos++]); + } + } + + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); +} + +TEST_F(TestListBuilder, BulkAppend) { + vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6}; + vector<int> lengths = {3, 0, 4}; + vector<uint8_t> is_valid = {1, 0, 1}; + vector<int32_t> offsets = {0, 3, 3}; + + Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get()); + ASSERT_OK(vb->Reserve(values.size())); + + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + for (int32_t value : values) { + vb->Append(value); + } + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); +} + +TEST_F(TestListBuilder, BulkAppendInvalid) { + vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6}; + vector<int> lengths = {3, 0, 4}; + vector<uint8_t> is_null = {0, 1, 0}; + vector<uint8_t> is_valid = {1, 0, 1}; + vector<int32_t> 
offsets = {0, 2, 4}; // should be 0, 3, 3 given the is_null array + + Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get()); + ASSERT_OK(vb->Reserve(values.size())); + + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + for (int32_t value : values) { + vb->Append(value); + } + + Done(); + ASSERT_RAISES(Invalid, result_->Validate()); +} + +TEST_F(TestListBuilder, TestZeroLength) { + // All buffers are null + Done(); + ASSERT_OK(result_->Validate()); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array-primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc new file mode 100644 index 0000000..a10e240 --- /dev/null +++ b/cpp/src/arrow/array-primitive-test.cc @@ -0,0 +1,476 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" + +using std::string; +using std::shared_ptr; +using std::unique_ptr; +using std::vector; + +namespace arrow { + +class Array; + +#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ + TEST(TypesTest, TestPrimitive_##ENUM) { \ + KLASS tp; \ + \ + ASSERT_EQ(tp.type, Type::ENUM); \ + ASSERT_EQ(tp.ToString(), string(NAME)); \ + \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ + } + +PRIMITIVE_TEST(Int8Type, INT8, "int8"); +PRIMITIVE_TEST(Int16Type, INT16, "int16"); +PRIMITIVE_TEST(Int32Type, INT32, "int32"); +PRIMITIVE_TEST(Int64Type, INT64, "int64"); +PRIMITIVE_TEST(UInt8Type, UINT8, "uint8"); +PRIMITIVE_TEST(UInt16Type, UINT16, "uint16"); +PRIMITIVE_TEST(UInt32Type, UINT32, "uint32"); +PRIMITIVE_TEST(UInt64Type, UINT64, "uint64"); + +PRIMITIVE_TEST(FloatType, FLOAT, "float"); +PRIMITIVE_TEST(DoubleType, DOUBLE, "double"); + +PRIMITIVE_TEST(BooleanType, BOOL, "bool"); + +// ---------------------------------------------------------------------- +// Primitive type tests + +TEST_F(TestBuilder, TestReserve) { + builder_->Init(10); + ASSERT_EQ(2, builder_->null_bitmap()->size()); + + builder_->Reserve(30); + ASSERT_EQ(4, builder_->null_bitmap()->size()); +} + +template <typename Attrs> +class TestPrimitiveBuilder : public TestBuilder { + public: + typedef typename Attrs::ArrayType ArrayType; + typedef typename Attrs::BuilderType BuilderType; + typedef typename Attrs::T T; + typedef typename Attrs::Type Type; + + virtual void SetUp() { + TestBuilder::SetUp(); + + type_ = Attrs::type(); + + std::shared_ptr<ArrayBuilder> tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_ = std::dynamic_pointer_cast<BuilderType>(tmp); + + 
ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_nn_ = std::dynamic_pointer_cast<BuilderType>(tmp); + } + + void RandomData(int N, double pct_null = 0.1) { + Attrs::draw(N, &draws_); + + valid_bytes_.resize(N); + test::random_null_bytes(N, pct_null, valid_bytes_.data()); + } + + void Check(const std::shared_ptr<BuilderType>& builder, bool nullable) { + int size = builder->length(); + + auto ex_data = std::make_shared<Buffer>( + reinterpret_cast<uint8_t*>(draws_.data()), size * sizeof(T)); + + std::shared_ptr<Buffer> ex_null_bitmap; + int32_t ex_null_count = 0; + + if (nullable) { + ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_); + ex_null_count = test::null_count(valid_bytes_); + } else { + ex_null_bitmap = nullptr; + } + + auto expected = + std::make_shared<ArrayType>(size, ex_data, ex_null_count, ex_null_bitmap); + + std::shared_ptr<Array> out; + ASSERT_OK(builder->Finish(&out)); + + std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(out); + + // Builder is now reset + ASSERT_EQ(0, builder->length()); + ASSERT_EQ(0, builder->capacity()); + ASSERT_EQ(0, builder->null_count()); + ASSERT_EQ(nullptr, builder->data()); + + ASSERT_EQ(ex_null_count, result->null_count()); + ASSERT_TRUE(result->EqualsExact(*expected.get())); + } + + protected: + std::shared_ptr<DataType> type_; + shared_ptr<BuilderType> builder_; + shared_ptr<BuilderType> builder_nn_; + + vector<T> draws_; + vector<uint8_t> valid_bytes_; +}; + +#define PTYPE_DECL(CapType, c_type) \ + typedef CapType##Array ArrayType; \ + typedef CapType##Builder BuilderType; \ + typedef CapType##Type Type; \ + typedef c_type T; \ + \ + static std::shared_ptr<DataType> type() { \ + return std::shared_ptr<DataType>(new Type()); \ + } + +#define PINT_DECL(CapType, c_type, LOWER, UPPER) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type); \ + static void draw(int N, vector<T>* draws) { \ + test::randint<T>(N, LOWER, UPPER, draws); \ + } \ + } + +#define PFLOAT_DECL(CapType, c_type, 
LOWER, UPPER) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type); \ + static void draw(int N, vector<T>* draws) { \ + test::random_real<T>(N, 0, LOWER, UPPER, draws); \ + } \ + } + +PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX); +PINT_DECL(UInt16, uint16_t, 0, UINT16_MAX); +PINT_DECL(UInt32, uint32_t, 0, UINT32_MAX); +PINT_DECL(UInt64, uint64_t, 0, UINT64_MAX); + +PINT_DECL(Int8, int8_t, INT8_MIN, INT8_MAX); +PINT_DECL(Int16, int16_t, INT16_MIN, INT16_MAX); +PINT_DECL(Int32, int32_t, INT32_MIN, INT32_MAX); +PINT_DECL(Int64, int64_t, INT64_MIN, INT64_MAX); + +PFLOAT_DECL(Float, float, -1000, 1000); +PFLOAT_DECL(Double, double, -1000, 1000); + +struct PBoolean { + PTYPE_DECL(Boolean, uint8_t); +}; + +template <> +void TestPrimitiveBuilder<PBoolean>::RandomData(int N, double pct_null) { + draws_.resize(N); + valid_bytes_.resize(N); + + test::random_null_bytes(N, 0.5, draws_.data()); + test::random_null_bytes(N, pct_null, valid_bytes_.data()); +} + +template <> +void TestPrimitiveBuilder<PBoolean>::Check( + const std::shared_ptr<BooleanBuilder>& builder, bool nullable) { + int size = builder->length(); + + auto ex_data = test::bytes_to_null_buffer(draws_); + + std::shared_ptr<Buffer> ex_null_bitmap; + int32_t ex_null_count = 0; + + if (nullable) { + ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_); + ex_null_count = test::null_count(valid_bytes_); + } else { + ex_null_bitmap = nullptr; + } + + auto expected = + std::make_shared<BooleanArray>(size, ex_data, ex_null_count, ex_null_bitmap); + + std::shared_ptr<Array> out; + ASSERT_OK(builder->Finish(&out)); + std::shared_ptr<BooleanArray> result = std::dynamic_pointer_cast<BooleanArray>(out); + + // Builder is now reset + ASSERT_EQ(0, builder->length()); + ASSERT_EQ(0, builder->capacity()); + ASSERT_EQ(0, builder->null_count()); + ASSERT_EQ(nullptr, builder->data()); + + ASSERT_EQ(ex_null_count, result->null_count()); + + ASSERT_EQ(expected->length(), result->length()); + + for (int i = 0; i < result->length(); 
++i) { + if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } + bool actual = BitUtil::GetBit(result->raw_data(), i); + ASSERT_EQ(static_cast<bool>(draws_[i]), actual) << i; + } + ASSERT_TRUE(result->EqualsExact(*expected.get())); +} + +typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16, + PInt32, PInt64, PFloat, PDouble> + Primitives; + +TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); + +#define DECL_T() typedef typename TestFixture::T T; + +#define DECL_TYPE() typedef typename TestFixture::Type Type; + +#define DECL_ARRAYTYPE() typedef typename TestFixture::ArrayType ArrayType; + +TYPED_TEST(TestPrimitiveBuilder, TestInit) { + DECL_TYPE(); + + int n = 1000; + ASSERT_OK(this->builder_->Reserve(n)); + ASSERT_EQ(BitUtil::NextPower2(n), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(TypeTraits<Type>::bytes_required(n)), + this->builder_->data()->size()); + + // unsure if this should go in all builder classes + ASSERT_EQ(0, this->builder_->num_children()); +} + +TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { + int size = 1000; + for (int i = 0; i < size; ++i) { + ASSERT_OK(this->builder_->AppendNull()); + } + + std::shared_ptr<Array> result; + ASSERT_OK(this->builder_->Finish(&result)); + + for (int i = 0; i < size; ++i) { + ASSERT_TRUE(result->IsNull(i)) << i; + } +} + +TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { + DECL_T(); + + int size = 1000; + + vector<T>& draws = this->draws_; + vector<uint8_t>& valid_bytes = this->valid_bytes_; + + int64_t memory_before = this->pool_->bytes_allocated(); + + this->RandomData(size); + + this->builder_->Reserve(size); + + int i; + for (i = 0; i < size; ++i) { + if (valid_bytes[i] > 0) { + this->builder_->Append(draws[i]); + } else { + this->builder_->AppendNull(); + } + } + + do { + std::shared_ptr<Array> result; + ASSERT_OK(this->builder_->Finish(&result)); + } while (false); + + ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); +} + 
+// Builds three arrays from the same random draws -- two identical and one
+// that differs in exactly one valid slot -- and checks Equals / RangeEquals.
+TYPED_TEST(TestPrimitiveBuilder, Equality) {
+  DECL_T();
+
+  const int size = 1000;
+  this->RandomData(size);
+  vector<T>& draws = this->draws_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;
+  ArrayPtr array, equal_array, unequal_array;
+  auto builder = this->builder_.get();
+  ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array));
+  ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array));
+
+  // Make the unequal array by bitwise-inverting the first valid element.
+  const auto first_valid = std::find_if(
+      valid_bytes.begin(), valid_bytes.end(), [](uint8_t valid) { return valid > 0; });
+  const int first_valid_idx =
+      static_cast<int>(std::distance(valid_bytes.begin(), first_valid));
+  // This should be true with a very high probability, but might introduce flakiness
+  ASSERT_LT(first_valid_idx, size - 1);
+
+  // Flip every bit of the element through an unsigned char view. The previous
+  // code read the element through an int64_t*, which over-reads any T narrower
+  // than 8 bytes and violates strict aliasing. An unsigned char view may alias
+  // any object and touches exactly sizeof(T) bytes. For integer and boolean T
+  // the stored value is the same as before (~element).
+  unsigned char* raw = reinterpret_cast<unsigned char*>(&draws[first_valid_idx]);
+  for (size_t b = 0; b < sizeof(T); ++b) {
+    raw[b] = static_cast<unsigned char>(~raw[b]);
+  }
+  ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array));
+
+  // test normal equality
+  EXPECT_TRUE(array->Equals(array));
+  EXPECT_TRUE(array->Equals(equal_array));
+  EXPECT_TRUE(equal_array->Equals(array));
+  EXPECT_FALSE(equal_array->Equals(unequal_array));
+  EXPECT_FALSE(unequal_array->Equals(equal_array));
+
+  // Test range equality: any range containing the modified slot must differ,
+  // the ranges strictly before and strictly after it must still match.
+  EXPECT_FALSE(array->RangeEquals(0, first_valid_idx + 1, 0, unequal_array));
+  EXPECT_FALSE(array->RangeEquals(first_valid_idx, size, first_valid_idx, unequal_array));
+  EXPECT_TRUE(array->RangeEquals(0, first_valid_idx, 0, unequal_array));
+  EXPECT_TRUE(
+      array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array));
+}
+
+// Appends values one at a time to a builder that receives nulls (builder_)
+// and to one that never does (builder_nn_), checking length and capacity
+// after the first 1000 values and again after all of them.
+TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) {
+  DECL_T();
+
+  const int size = 10000;
+
+  vector<T>& draws = this->draws_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;
+
+  this->RandomData(size);
+
+  // Reserve reports failure through its Status; do not drop it.
+  ASSERT_OK(this->builder_->Reserve(1000));
+  ASSERT_OK(this->builder_nn_->Reserve(1000));
+
+  int i;
+  int null_count = 0;
+  // Append the first 1000
+  for (i = 0; i < 1000; ++i) {
+    if (valid_bytes[i] > 0) {
+      this->builder_->Append(draws[i]);
+    } else {
+      ASSERT_OK(this->builder_->AppendNull());
+      ++null_count;
+    }
+    this->builder_nn_->Append(draws[i]);
+  }
+
+  ASSERT_EQ(null_count, this->builder_->null_count());
+
+  ASSERT_EQ(1000, this->builder_->length());
+  ASSERT_EQ(1024, this->builder_->capacity());
+
+  ASSERT_EQ(1000, this->builder_nn_->length());
+  ASSERT_EQ(1024, this->builder_nn_->capacity());
+
+  ASSERT_OK(this->builder_->Reserve(size - 1000));
+  ASSERT_OK(this->builder_nn_->Reserve(size - 1000));
+
+  // Append the next 9000
+  for (i = 1000; i < size; ++i) {
+    if (valid_bytes[i] > 0) {
+      this->builder_->Append(draws[i]);
+    } else {
+      ASSERT_OK(this->builder_->AppendNull());
+    }
+    this->builder_nn_->Append(draws[i]);
+  }
+
+  ASSERT_EQ(size, this->builder_->length());
+  ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());
+
+  ASSERT_EQ(size, this->builder_nn_->length());
+  ASSERT_EQ(BitUtil::NextPower2(size), this->builder_nn_->capacity());
+
+  this->Check(this->builder_, true);
+  this->Check(this->builder_nn_, false);
+}
+
+// Same scenario as TestAppendScalar, but using the bulk (pointer + length)
+// Append overloads: one call for the first K values, one for the remainder.
+TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) {
+  DECL_T();
+
+  int size = 10000;
+  this->RandomData(size);
+
+  vector<T>& draws = this->draws_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;
+
+  // first slug
+  int K = 1000;
+
+  ASSERT_OK(this->builder_->Append(draws.data(), K, valid_bytes.data()));
+  ASSERT_OK(this->builder_nn_->Append(draws.data(), K));
+
+  ASSERT_EQ(1000, this->builder_->length());
+  ASSERT_EQ(1024, this->builder_->capacity());
+
+  ASSERT_EQ(1000, this->builder_nn_->length());
+  ASSERT_EQ(1024, this->builder_nn_->capacity());
+
+  // Append the next 9000
+  ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, valid_bytes.data() + K));
+  ASSERT_OK(this->builder_nn_->Append(draws.data() + K, size - K));
+
+  ASSERT_EQ(size, this->builder_->length());
+  ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity());
+
+  this->Check(this->builder_, true);
+  this->Check(this->builder_nn_, false);
+}
+
+TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { + int n = 1000; + ASSERT_OK(this->builder_->Reserve(n)); + + ASSERT_OK(this->builder_->Advance(100)); + ASSERT_EQ(100, this->builder_->length()); + + ASSERT_OK(this->builder_->Advance(900)); + + int too_many = this->builder_->capacity() - 1000 + 1; + ASSERT_RAISES(Invalid, this->builder_->Advance(too_many)); +} + +TYPED_TEST(TestPrimitiveBuilder, TestResize) { + DECL_TYPE(); + + int cap = kMinBuilderCapacity * 2; + + ASSERT_OK(this->builder_->Reserve(cap)); + ASSERT_EQ(cap, this->builder_->capacity()); + + ASSERT_EQ(TypeTraits<Type>::bytes_required(cap), this->builder_->data()->size()); + ASSERT_EQ(BitUtil::BytesForBits(cap), this->builder_->null_bitmap()->size()); +} + +TYPED_TEST(TestPrimitiveBuilder, TestReserve) { + ASSERT_OK(this->builder_->Reserve(10)); + ASSERT_EQ(0, this->builder_->length()); + ASSERT_EQ(kMinBuilderCapacity, this->builder_->capacity()); + + ASSERT_OK(this->builder_->Reserve(90)); + ASSERT_OK(this->builder_->Advance(100)); + ASSERT_OK(this->builder_->Reserve(kMinBuilderCapacity)); + + ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity()); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array-string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc new file mode 100644 index 0000000..b144c63 --- /dev/null +++ b/cpp/src/arrow/array-string-test.cc @@ -0,0 +1,358 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <cstdint> +#include <cstdlib> +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +class Buffer; + +TEST(TypesTest, BinaryType) { + BinaryType t1; + BinaryType e1; + StringType t2; + EXPECT_TRUE(t1.Equals(&e1)); + EXPECT_FALSE(t1.Equals(&t2)); + ASSERT_EQ(t1.type, Type::BINARY); + ASSERT_EQ(t1.ToString(), std::string("binary")); +} + +TEST(TypesTest, TestStringType) { + StringType str; + ASSERT_EQ(str.type, Type::STRING); + ASSERT_EQ(str.ToString(), std::string("string")); +} + +// ---------------------------------------------------------------------- +// String container + +class TestStringContainer : public ::testing::Test { + public: + void SetUp() { + chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; + offsets_ = {0, 1, 1, 1, 3, 6}; + valid_bytes_ = {1, 1, 0, 1, 1}; + expected_ = {"a", "", "", "bb", "ccc"}; + + MakeArray(); + } + + void MakeArray() { + length_ = offsets_.size() - 1; + value_buf_ = test::GetBufferFromVector(chars_); + offsets_buf_ = test::GetBufferFromVector(offsets_); + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); + null_count_ = test::null_count(valid_bytes_); + + strings_ = std::make_shared<StringArray>( + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); + } + + protected: + std::vector<int32_t> offsets_; + std::vector<char> chars_; + std::vector<uint8_t> valid_bytes_; + + std::vector<std::string> expected_; + + 
std::shared_ptr<Buffer> value_buf_; + std::shared_ptr<Buffer> offsets_buf_; + std::shared_ptr<Buffer> null_bitmap_; + + int null_count_; + int length_; + + std::shared_ptr<StringArray> strings_; +}; + +TEST_F(TestStringContainer, TestArrayBasics) { + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(strings_->Validate()); +} + +TEST_F(TestStringContainer, TestType) { + TypePtr type = strings_->type(); + + ASSERT_EQ(Type::STRING, type->type); + ASSERT_EQ(Type::STRING, strings_->type_enum()); +} + +TEST_F(TestStringContainer, TestListFunctions) { + int pos = 0; + for (size_t i = 0; i < expected_.size(); ++i) { + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i)); + pos += expected_[i].size(); + } +} + +TEST_F(TestStringContainer, TestDestructor) { + auto arr = std::make_shared<StringArray>( + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); +} + +TEST_F(TestStringContainer, TestGetString) { + for (size_t i = 0; i < expected_.size(); ++i) { + if (valid_bytes_[i] == 0) { + ASSERT_TRUE(strings_->IsNull(i)); + } else { + ASSERT_EQ(expected_[i], strings_->GetString(i)); + } + } +} + +TEST_F(TestStringContainer, TestEmptyStringComparison) { + offsets_ = {0, 0, 0, 0, 0, 0}; + offsets_buf_ = test::GetBufferFromVector(offsets_); + length_ = offsets_.size() - 1; + + auto strings_a = std::make_shared<StringArray>( + length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + auto strings_b = std::make_shared<StringArray>( + length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + ASSERT_TRUE(strings_a->Equals(strings_b)); +} + +// ---------------------------------------------------------------------- +// String builder tests + +class TestStringBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + type_ = TypePtr(new StringType()); + builder_.reset(new StringBuilder(pool_, type_)); + } + + void Done() { + 
std::shared_ptr<Array> out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast<StringArray>(out); + result_->Validate(); + } + + protected: + TypePtr type_; + + std::unique_ptr<StringBuilder> builder_; + std::shared_ptr<StringArray> result_; +}; + +TEST_F(TestStringBuilder, TestScalarAppend) { + std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; + std::vector<uint8_t> is_null = {0, 0, 0, 1, 0}; + + int N = strings.size(); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder_->AppendNull(); + } else { + builder_->Append(strings[i]); + } + } + } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->data()->size()); + + int32_t length; + int32_t pos = 0; + for (int i = 0; i < N * reps; ++i) { + if (is_null[i % N]) { + ASSERT_TRUE(result_->IsNull(i)); + } else { + ASSERT_FALSE(result_->IsNull(i)); + result_->GetValue(i, &length); + ASSERT_EQ(pos, result_->offset(i)); + ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); + ASSERT_EQ(strings[i % N], result_->GetString(i)); + + pos += length; + } + } +} + +TEST_F(TestStringBuilder, TestZeroLength) { + // All buffers are null + Done(); +} + +// Binary container type +// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating +// with String +class TestBinaryContainer : public ::testing::Test { + public: + void SetUp() { + chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; + offsets_ = {0, 1, 1, 1, 3, 6}; + valid_bytes_ = {1, 1, 0, 1, 1}; + expected_ = {"a", "", "", "bb", "ccc"}; + + MakeArray(); + } + + void MakeArray() { + length_ = offsets_.size() - 1; + value_buf_ = test::GetBufferFromVector(chars_); + offsets_buf_ = test::GetBufferFromVector(offsets_); + + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); + null_count_ = test::null_count(valid_bytes_); + + strings_ = std::make_shared<BinaryArray>( + length_, 
offsets_buf_, value_buf_, null_count_, null_bitmap_); + } + + protected: + std::vector<int32_t> offsets_; + std::vector<char> chars_; + std::vector<uint8_t> valid_bytes_; + + std::vector<std::string> expected_; + + std::shared_ptr<Buffer> value_buf_; + std::shared_ptr<Buffer> offsets_buf_; + std::shared_ptr<Buffer> null_bitmap_; + + int null_count_; + int length_; + + std::shared_ptr<BinaryArray> strings_; +}; + +TEST_F(TestBinaryContainer, TestArrayBasics) { + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(strings_->Validate()); +} + +TEST_F(TestBinaryContainer, TestType) { + TypePtr type = strings_->type(); + + ASSERT_EQ(Type::BINARY, type->type); + ASSERT_EQ(Type::BINARY, strings_->type_enum()); +} + +TEST_F(TestBinaryContainer, TestListFunctions) { + int pos = 0; + for (size_t i = 0; i < expected_.size(); ++i) { + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(static_cast<int>(expected_[i].size()), strings_->value_length(i)); + pos += expected_[i].size(); + } +} + +TEST_F(TestBinaryContainer, TestDestructor) { + auto arr = std::make_shared<BinaryArray>( + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); +} + +TEST_F(TestBinaryContainer, TestGetValue) { + for (size_t i = 0; i < expected_.size(); ++i) { + if (valid_bytes_[i] == 0) { + ASSERT_TRUE(strings_->IsNull(i)); + } else { + int32_t len = -1; + const uint8_t* bytes = strings_->GetValue(i, &len); + ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len)); + } + } +} + +class TestBinaryBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + type_ = TypePtr(new BinaryType()); + builder_.reset(new BinaryBuilder(pool_, type_)); + } + + void Done() { + std::shared_ptr<Array> out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast<BinaryArray>(out); + result_->Validate(); + } + + protected: + TypePtr type_; + + std::unique_ptr<BinaryBuilder> builder_; + std::shared_ptr<BinaryArray> result_; 
+}; + +TEST_F(TestBinaryBuilder, TestScalarAppend) { + std::vector<std::string> strings = {"", "bb", "a", "", "ccc"}; + std::vector<uint8_t> is_null = {0, 0, 0, 1, 0}; + + int N = strings.size(); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder_->AppendNull(); + } else { + builder_->Append( + reinterpret_cast<const uint8_t*>(strings[i].data()), strings[i].size()); + } + } + } + Done(); + ASSERT_OK(result_->Validate()); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->data()->size()); + + int32_t length; + for (int i = 0; i < N * reps; ++i) { + if (is_null[i % N]) { + ASSERT_TRUE(result_->IsNull(i)); + } else { + ASSERT_FALSE(result_->IsNull(i)); + const uint8_t* vals = result_->GetValue(i, &length); + ASSERT_EQ(static_cast<int>(strings[i % N].size()), length); + ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length)); + } + } +} + +TEST_F(TestBinaryBuilder, TestZeroLength) { + // All buffers are null + Done(); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array-struct-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc new file mode 100644 index 0000000..58386fe --- /dev/null +++ b/cpp/src/arrow/array-struct-test.cc @@ -0,0 +1,391 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +using std::shared_ptr; +using std::string; +using std::vector; + +namespace arrow { + +TEST(TestStructType, Basics) { + TypePtr f0_type = TypePtr(new Int32Type()); + auto f0 = std::make_shared<Field>("f0", f0_type); + + TypePtr f1_type = TypePtr(new StringType()); + auto f1 = std::make_shared<Field>("f1", f1_type); + + TypePtr f2_type = TypePtr(new UInt8Type()); + auto f2 = std::make_shared<Field>("f2", f2_type); + + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + + StructType struct_type(fields); + + ASSERT_TRUE(struct_type.child(0)->Equals(f0)); + ASSERT_TRUE(struct_type.child(1)->Equals(f1)); + ASSERT_TRUE(struct_type.child(2)->Equals(f2)); + + ASSERT_EQ(struct_type.ToString(), "struct<f0: int32, f1: string, f2: uint8>"); + + // TODO(wesm): out of bounds for field(...) 
+} + +void ValidateBasicStructArray(const StructArray* result, + const vector<uint8_t>& struct_is_valid, const vector<char>& list_values, + const vector<uint8_t>& list_is_valid, const vector<int>& list_lengths, + const vector<int>& list_offsets, const vector<int32_t>& int_values) { + ASSERT_EQ(4, result->length()); + ASSERT_OK(result->Validate()); + + auto list_char_arr = static_cast<ListArray*>(result->field(0).get()); + auto char_arr = static_cast<Int8Array*>(list_char_arr->values().get()); + auto int32_arr = static_cast<Int32Array*>(result->field(1).get()); + + ASSERT_EQ(0, result->null_count()); + ASSERT_EQ(1, list_char_arr->null_count()); + ASSERT_EQ(0, int32_arr->null_count()); + + // List<char> + ASSERT_EQ(4, list_char_arr->length()); + ASSERT_EQ(10, list_char_arr->values()->length()); + for (size_t i = 0; i < list_offsets.size(); ++i) { + ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]); + } + for (size_t i = 0; i < list_values.size(); ++i) { + ASSERT_EQ(list_values[i], char_arr->Value(i)); + } + + // Int32 + ASSERT_EQ(4, int32_arr->length()); + for (size_t i = 0; i < int_values.size(); ++i) { + ASSERT_EQ(int_values[i], int32_arr->Value(i)); + } +} + +// ---------------------------------------------------------------------------------- +// Struct test +class TestStructBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + + auto int32_type = TypePtr(new Int32Type()); + auto char_type = TypePtr(new Int8Type()); + auto list_type = TypePtr(new ListType(char_type)); + + std::vector<TypePtr> types = {list_type, int32_type}; + std::vector<FieldPtr> fields; + fields.push_back(FieldPtr(new Field("list", list_type))); + fields.push_back(FieldPtr(new Field("int", int32_type))); + + type_ = TypePtr(new StructType(fields)); + value_fields_ = fields; + + std::shared_ptr<ArrayBuilder> tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + + builder_ = std::dynamic_pointer_cast<StructBuilder>(tmp); + ASSERT_EQ(2, 
static_cast<int>(builder_->field_builders().size())); + } + + void Done() { + std::shared_ptr<Array> out; + ASSERT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast<StructArray>(out); + } + + protected: + std::vector<FieldPtr> value_fields_; + TypePtr type_; + + std::shared_ptr<StructBuilder> builder_; + std::shared_ptr<StructArray> result_; +}; + +TEST_F(TestStructBuilder, TestAppendNull) { + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->AppendNull()); + ASSERT_EQ(2, static_cast<int>(builder_->field_builders().size())); + + ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get()); + ASSERT_OK(list_vb->AppendNull()); + ASSERT_OK(list_vb->AppendNull()); + ASSERT_EQ(2, list_vb->length()); + + Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get()); + ASSERT_OK(int_vb->AppendNull()); + ASSERT_OK(int_vb->AppendNull()); + ASSERT_EQ(2, int_vb->length()); + + Done(); + + ASSERT_OK(result_->Validate()); + + ASSERT_EQ(2, static_cast<int>(result_->fields().size())); + ASSERT_EQ(2, result_->length()); + ASSERT_EQ(2, result_->field(0)->length()); + ASSERT_EQ(2, result_->field(1)->length()); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); + ASSERT_TRUE(result_->field(0)->IsNull(0)); + ASSERT_TRUE(result_->field(0)->IsNull(1)); + ASSERT_TRUE(result_->field(1)->IsNull(0)); + ASSERT_TRUE(result_->field(1)->IsNull(1)); + + ASSERT_EQ(Type::LIST, result_->field(0)->type_enum()); + ASSERT_EQ(Type::INT32, result_->field(1)->type_enum()); +} + +TEST_F(TestStructBuilder, TestBasics) { + vector<int32_t> int_values = {1, 2, 3, 4}; + vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector<int> list_lengths = {3, 0, 3, 4}; + vector<int> list_offsets = {0, 3, 3, 6, 10}; + vector<uint8_t> list_is_valid = {1, 0, 1, 1}; + vector<uint8_t> struct_is_valid = {1, 1, 1, 1}; + + ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get()); + 
Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get()); + ASSERT_EQ(2, static_cast<int>(builder_->field_builders().size())); + + EXPECT_OK(builder_->Resize(list_lengths.size())); + EXPECT_OK(char_vb->Resize(list_values.size())); + EXPECT_OK(int_vb->Resize(int_values.size())); + + int pos = 0; + for (size_t i = 0; i < list_lengths.size(); ++i) { + ASSERT_OK(list_vb->Append(list_is_valid[i] > 0)); + int_vb->UnsafeAppend(int_values[i]); + for (int j = 0; j < list_lengths[i]; ++j) { + char_vb->UnsafeAppend(list_values[pos++]); + } + } + + for (size_t i = 0; i < struct_is_valid.size(); ++i) { + ASSERT_OK(builder_->Append(struct_is_valid[i] > 0)); + } + + Done(); + + ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, + list_lengths, list_offsets, int_values); +} + +TEST_F(TestStructBuilder, BulkAppend) { + vector<int32_t> int_values = {1, 2, 3, 4}; + vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector<int> list_lengths = {3, 0, 3, 4}; + vector<int> list_offsets = {0, 3, 3, 6}; + vector<uint8_t> list_is_valid = {1, 0, 1, 1}; + vector<uint8_t> struct_is_valid = {1, 1, 1, 1}; + + ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get()); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + Done(); + 
ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, + list_lengths, list_offsets, int_values); +} + +TEST_F(TestStructBuilder, BulkAppendInvalid) { + vector<int32_t> int_values = {1, 2, 3, 4}; + vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector<int> list_lengths = {3, 0, 3, 4}; + vector<int> list_offsets = {0, 3, 3, 6}; + vector<uint8_t> list_is_valid = {1, 0, 1, 1}; + vector<uint8_t> struct_is_valid = {1, 0, 1, 1}; // should be 1, 1, 1, 1 + + ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get()); + + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + Done(); + // Even null bitmap of the parent Struct is not valid, Validate() will ignore it. 
+ ASSERT_OK(result_->Validate()); +} + +TEST_F(TestStructBuilder, TestEquality) { + ArrayPtr array, equal_array; + ArrayPtr unequal_bitmap_array, unequal_offsets_array, unequal_values_array; + + vector<int32_t> int_values = {1, 2, 3, 4}; + vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector<int> list_lengths = {3, 0, 3, 4}; + vector<int> list_offsets = {0, 3, 3, 6}; + vector<uint8_t> list_is_valid = {1, 0, 1, 1}; + vector<uint8_t> struct_is_valid = {1, 1, 1, 1}; + + vector<int32_t> unequal_int_values = {4, 2, 3, 1}; + vector<char> unequal_list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'l', 'u', 'c', 'y'}; + vector<int> unequal_list_offsets = {0, 3, 4, 6}; + vector<uint8_t> unequal_list_is_valid = {1, 1, 1, 1}; + vector<uint8_t> unequal_struct_is_valid = {1, 0, 0, 1}; + + ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast<Int32Builder*>(builder_->field_builder(1).get()); + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); + + // setup two equal arrays, one of which takes an unequal bitmap + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + ASSERT_OK(builder_->Finish(&array)); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + 
char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + ASSERT_OK(builder_->Finish(&equal_array)); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup an unequal one with the unequal bitmap + builder_->Append(unequal_struct_is_valid.size(), unequal_struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + ASSERT_OK(builder_->Finish(&unequal_bitmap_array)); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup an unequal one with unequal offsets + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(unequal_list_offsets.data(), unequal_list_offsets.size(), + unequal_list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + ASSERT_OK(builder_->Finish(&unequal_offsets_array)); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup anunequal one with unequal values + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : unequal_list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : unequal_int_values) { + int_vb->UnsafeAppend(value); + } + + ASSERT_OK(builder_->Finish(&unequal_values_array)); + + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + 
EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_bitmap_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(equal_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_values_array)); + EXPECT_FALSE(unequal_values_array->Equals(unequal_bitmap_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_offsets_array)); + EXPECT_FALSE(unequal_offsets_array->Equals(unequal_bitmap_array)); + + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 4, 0, equal_array)); + EXPECT_TRUE(array->RangeEquals(3, 4, 3, unequal_bitmap_array)); + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array)); + EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array)); + EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array)); +} + +TEST_F(TestStructBuilder, TestZeroLength) { + // All buffers are null + Done(); + ASSERT_OK(result_->Validate()); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 1581244..783104e 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -24,11 +24,10 @@ #include "gtest/gtest.h" #include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/memory_pool.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/primitive.h" -#include "arrow/util/buffer.h" -#include "arrow/util/memory-pool.h" namespace arrow { http://git-wip-us.apache.org/repos/asf/arrow/blob/2c10d7cc/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 
1f0bb66..7ab61f5 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -19,10 +19,13 @@ #include <cstdint> #include <cstring> +#include <sstream> +#include "arrow/buffer.h" +#include "arrow/status.h" +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" -#include "arrow/util/buffer.h" -#include "arrow/util/status.h" +#include "arrow/util/logging.h" namespace arrow { @@ -85,4 +88,440 @@ Status NullArray::Accept(ArrayVisitor* visitor) const { return visitor->Visit(*this); } +// ---------------------------------------------------------------------- +// Primitive array base + +PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, + const std::shared_ptr<Buffer>& data, int32_t null_count, + const std::shared_ptr<Buffer>& null_bitmap) + : Array(type, length, null_count, null_bitmap) { + data_ = data; + raw_data_ = data == nullptr ? nullptr : data_->data(); +} + +bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { + if (this == &other) { return true; } + if (null_count_ != other.null_count_) { return false; } + + if (null_count_ > 0) { + bool equal_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::CeilByte(length_) / 8); + if (!equal_bitmap) { return false; } + + const uint8_t* this_data = raw_data_; + const uint8_t* other_data = other.raw_data_; + + auto size_meta = dynamic_cast<const FixedWidthType*>(type_.get()); + int value_byte_size = size_meta->bit_width() / 8; + DCHECK_GT(value_byte_size, 0); + + for (int i = 0; i < length_; ++i) { + if (!IsNull(i) && memcmp(this_data, other_data, value_byte_size)) { return false; } + this_data += value_byte_size; + other_data += value_byte_size; + } + return true; + } else { + if (length_ == 0 && other.length_ == 0) { return true; } + return data_->Equals(*other.data_, length_); + } +} + +bool PrimitiveArray::Equals(const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != 
arr->type_enum()) { return false; } + return EqualsExact(*static_cast<const PrimitiveArray*>(arr.get())); +} + +template <typename T> +Status NumericArray<T>::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + +template class NumericArray<UInt8Type>; +template class NumericArray<UInt16Type>; +template class NumericArray<UInt32Type>; +template class NumericArray<UInt64Type>; +template class NumericArray<Int8Type>; +template class NumericArray<Int16Type>; +template class NumericArray<Int32Type>; +template class NumericArray<Int64Type>; +template class NumericArray<TimestampType>; +template class NumericArray<HalfFloatType>; +template class NumericArray<FloatType>; +template class NumericArray<DoubleType>; + +// ---------------------------------------------------------------------- +// BooleanArray + +BooleanArray::BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data, + int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) + : PrimitiveArray( + std::make_shared<BooleanType>(), length, data, null_count, null_bitmap) {} + +BooleanArray::BooleanArray(const TypePtr& type, int32_t length, + const std::shared_ptr<Buffer>& data, int32_t null_count, + const std::shared_ptr<Buffer>& null_bitmap) + : PrimitiveArray(type, length, data, null_count, null_bitmap) {} + +bool BooleanArray::EqualsExact(const BooleanArray& other) const { + if (this == &other) return true; + if (null_count_ != other.null_count_) { return false; } + + if (null_count_ > 0) { + bool equal_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); + if (!equal_bitmap) { return false; } + + const uint8_t* this_data = raw_data_; + const uint8_t* other_data = other.raw_data_; + + for (int i = 0; i < length_; ++i) { + if (!IsNull(i) && BitUtil::GetBit(this_data, i) != BitUtil::GetBit(other_data, i)) { + return false; + } + } + return true; + } else { + return data_->Equals(*other.data_, BitUtil::BytesForBits(length_)); + } +} + +bool 
BooleanArray::Equals(const ArrayPtr& arr) const { + if (this == arr.get()) return true; + if (Type::BOOL != arr->type_enum()) { return false; } + return EqualsExact(*static_cast<const BooleanArray*>(arr.get())); +} + +bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx, + int32_t other_start_idx, const ArrayPtr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast<BooleanArray*>(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) { + return false; + } + } + return true; +} + +Status BooleanArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + +// ---------------------------------------------------------------------- +// ListArray + +bool ListArray::EqualsExact(const ListArray& other) const { + if (this == &other) { return true; } + if (null_count_ != other.null_count_) { return false; } + + bool equal_offsets = + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } + bool equal_null_bitmap = true; + if (null_count_ > 0) { + equal_null_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); + } + + if (!equal_null_bitmap) { return false; } + + return values()->Equals(other.values()); +} + +bool ListArray::Equals(const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } + return EqualsExact(*static_cast<const ListArray*>(arr.get())); +} + +bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if 
(this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast<ListArray*>(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = offset(i); + const int32_t end_offset = offset(i + 1); + const int32_t other_begin_offset = other->offset(o_i); + const int32_t other_end_offset = other->offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != other_end_offset - other_begin_offset) { + return false; + } + if (!values_->RangeEquals( + begin_offset, end_offset, other_begin_offset, other->values())) { + return false; + } + } + return true; +} + +Status ListArray::Validate() const { + if (length_ < 0) { return Status::Invalid("Length was negative"); } + if (!offset_buffer_) { return Status::Invalid("offset_buffer_ was null"); } + if (offset_buffer_->size() / static_cast<int>(sizeof(int32_t)) < length_) { + std::stringstream ss; + ss << "offset buffer size (bytes): " << offset_buffer_->size() + << " isn't large enough for length: " << length_; + return Status::Invalid(ss.str()); + } + const int32_t last_offset = offset(length_); + if (last_offset > 0) { + if (!values_) { + return Status::Invalid("last offset was non-zero and values was null"); + } + if (values_->length() != last_offset) { + std::stringstream ss; + ss << "Final offset invariant not equal to values length: " << last_offset + << "!=" << values_->length(); + return Status::Invalid(ss.str()); + } + + const Status child_valid = values_->Validate(); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString(); + return Status::Invalid(ss.str()); + } + } + + int32_t prev_offset = offset(0); + if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } + for (int32_t i = 1; i <= length_; ++i) { 
+ int32_t current_offset = offset(i); + if (IsNull(i - 1) && current_offset != prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot" + << current_offset << "!=" << prev_offset; + return Status::Invalid(ss.str()); + } + if (current_offset < prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure: " << i + << " inconsistent offset for non-null slot: " << current_offset << "<" + << prev_offset; + return Status::Invalid(ss.str()); + } + prev_offset = current_offset; + } + return Status::OK(); +} + +Status ListArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + +// ---------------------------------------------------------------------- +// String and binary + +static std::shared_ptr<DataType> kBinary = std::make_shared<BinaryType>(); +static std::shared_ptr<DataType> kString = std::make_shared<StringType>(); + +BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets, + const std::shared_ptr<Buffer>& data, int32_t null_count, + const std::shared_ptr<Buffer>& null_bitmap) + : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {} + +BinaryArray::BinaryArray(const TypePtr& type, int32_t length, + const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data, + int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) + : Array(type, length, null_count, null_bitmap), + offset_buffer_(offsets), + offsets_(reinterpret_cast<const int32_t*>(offset_buffer_->data())), + data_buffer_(data), + data_(nullptr) { + if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); } +} + +Status BinaryArray::Validate() const { + // TODO(wesm): what to do here? 
+ return Status::OK(); +} + +bool BinaryArray::EqualsExact(const BinaryArray& other) const { + if (!Array::EqualsExact(other)) { return false; } + + bool equal_offsets = + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } + + if (!data_buffer_ && !(other.data_buffer_)) { return true; } + + return data_buffer_->Equals(*other.data_buffer_, data_buffer_->size()); +} + +bool BinaryArray::Equals(const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } + return EqualsExact(*static_cast<const BinaryArray*>(arr.get())); +} + +bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast<const BinaryArray*>(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = offset(i); + const int32_t end_offset = offset(i + 1); + const int32_t other_begin_offset = other->offset(o_i); + const int32_t other_end_offset = other->offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != other_end_offset - other_begin_offset) { + return false; + } + + if (std::memcmp(data_ + begin_offset, other->data_ + other_begin_offset, + end_offset - begin_offset)) { + return false; + } + } + return true; +} + +Status BinaryArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + +StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, + const std::shared_ptr<Buffer>& data, int32_t null_count, + const std::shared_ptr<Buffer>& 
null_bitmap) + : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {} + +Status StringArray::Validate() const { + // TODO(emkornfield) Validate proper UTF8 code points? + return BinaryArray::Validate(); +} + +Status StringArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + +// ---------------------------------------------------------------------- +// Struct + +std::shared_ptr<Array> StructArray::field(int32_t pos) const { + DCHECK_GT(field_arrays_.size(), 0); + return field_arrays_[pos]; +} + +bool StructArray::Equals(const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + if (null_count_ != arr->null_count()) { return false; } + return RangeEquals(0, length_, 0, arr); +} + +bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (Type::STRUCT != arr->type_enum()) { return false; } + const auto other = static_cast<StructArray*>(arr.get()); + + bool equal_fields = true; + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + if (IsNull(i) != arr->IsNull(o_i)) { return false; } + if (IsNull(i)) continue; + for (size_t j = 0; j < field_arrays_.size(); ++j) { + // TODO: really we should be comparing stretches of non-null data rather + // than looking at one value at a time. 
+ equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other->field(j)); + if (!equal_fields) { return false; } + } + } + + return true; +} + +Status StructArray::Validate() const { + if (length_ < 0) { return Status::Invalid("Length was negative"); } + + if (null_count() > length_) { + return Status::Invalid("Null count exceeds the length of this struct"); + } + + if (field_arrays_.size() > 0) { + // Validate fields + int32_t array_length = field_arrays_[0]->length(); + size_t idx = 0; + for (auto it : field_arrays_) { + if (it->length() != array_length) { + std::stringstream ss; + ss << "Length is not equal from field " << it->type()->ToString() + << " at position {" << idx << "}"; + return Status::Invalid(ss.str()); + } + + const Status child_valid = it->Validate(); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx + << "}"; + return Status::Invalid(ss.str()); + } + ++idx; + } + + if (array_length > 0 && array_length != length_) { + return Status::Invalid("Struct's length is not equal to its child arrays"); + } + } + return Status::OK(); +} + +Status StructArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + +// ---------------------------------------------------------------------- + +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ + break; + +Status MakePrimitiveArray(const TypePtr& type, int32_t length, + const std::shared_ptr<Buffer>& data, int32_t null_count, + const std::shared_ptr<Buffer>& null_bitmap, ArrayPtr* out) { + switch (type->type) { + MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); + MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT8, Int8Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT16, UInt16Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT16, Int16Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT32, UInt32Array); + 
MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); + MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); + default: + return Status::NotImplemented(type->ToString()); + } +#ifdef NDEBUG + return Status::OK(); +#else + return (*out)->Validate(); +#endif +} + } // namespace arrow