ARROW-261: Refactor String/Binary code paths to reflect unnested (non-list-based) structure
Per discussions on the mailing list. This should in theory match the Java implementation. Author: Wes McKinney <[email protected]> Closes #176 from wesm/ARROW-261 and squashes the following commits: dca39ce [Wes McKinney] Make binary/string constants static to avoid memory-access-related segfaults in third party libraries 1e65b01 [Wes McKinney] Deprecate pyarrow::Status in favor of just arrow::Status. Conform pyarrow use of ArrayBuilder::Finish 9a1f77e [Wes McKinney] Add license header to index.rst bd70cab [Wes McKinney] Complete refactoring, fix up IPC tests for flattened string/binary buffer/metadata layout ae64f2e [Wes McKinney] Refactoring to reflect collaprsed list-like structure of Binary and String types. Not yet complete Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/732a2059 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/732a2059 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/732a2059 Branch: refs/heads/master Commit: 732a2059d0c4493e451c566160b9d5d01dfe87be Parents: 8e8b17f Author: Wes McKinney <[email protected]> Authored: Mon Oct 17 13:44:34 2016 -0400 Committer: Wes McKinney <[email protected]> Committed: Mon Oct 17 13:44:34 2016 -0400 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 1 - cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/ipc/adapter.cc | 47 +++++---- cpp/src/arrow/ipc/test-common.h | 19 ++-- cpp/src/arrow/type.h | 20 +--- cpp/src/arrow/types/CMakeLists.txt | 1 - cpp/src/arrow/types/binary.h | 28 ------ cpp/src/arrow/types/construct.cc | 31 +----- cpp/src/arrow/types/construct.h | 8 -- cpp/src/arrow/types/json.cc | 37 ------- cpp/src/arrow/types/json.h | 36 ------- cpp/src/arrow/types/list-test.cc | 15 ++- cpp/src/arrow/types/list.cc | 42 +++++++- cpp/src/arrow/types/list.h | 49 ++-------- cpp/src/arrow/types/primitive-test.cc | 26 +++-- cpp/src/arrow/types/primitive.cc | 31 ++++-- cpp/src/arrow/types/primitive.h | 31 +++--- cpp/src/arrow/types/string-test.cc | 33 +++---- cpp/src/arrow/types/string.cc | 101 +++++++++++++++---- cpp/src/arrow/types/string.h | 49 ++++++---- cpp/src/arrow/types/struct-test.cc | 21 ++-- cpp/src/arrow/types/struct.cc | 14 +++ cpp/src/arrow/types/struct.h | 17 +--- cpp/src/arrow/util/status.cc | 6 ++ cpp/src/arrow/util/status.h | 17 +++- python/CMakeLists.txt | 2 - python/doc/index.rst | 18 +++- python/pyarrow/error.pxd | 4 +- python/pyarrow/error.pyx | 10 +- python/pyarrow/includes/pyarrow.pxd | 35 ++----- python/pyarrow/io.pyx | 56 +++++------ python/pyarrow/ipc.pyx | 18 ++-- python/pyarrow/parquet.pyx | 14 +-- python/src/pyarrow/adapters/builtin.cc | 39 ++++---- python/src/pyarrow/adapters/builtin.h | 9 +- python/src/pyarrow/adapters/pandas.cc | 32 +++--- python/src/pyarrow/adapters/pandas.h | 15 ++- python/src/pyarrow/api.h | 2 - python/src/pyarrow/common.cc | 12 +-- python/src/pyarrow/common.h | 7 -- python/src/pyarrow/io.cc | 59 +++++------ python/src/pyarrow/status.cc | 92 ------------------ python/src/pyarrow/status.h | 146 ---------------------------- 43 files changed, 484 insertions(+), 768 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d682dc7..6f95483 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -681,7 +681,6 @@ set(ARROW_SRCS src/arrow/types/construct.cc src/arrow/types/decimal.cc - src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc src/arrow/types/string.cc http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 646a6f2..cef17e5 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -93,7 +93,7 @@ class ARROW_EXPORT ArrayBuilder { // Creates new array object to hold the contents of the builder and transfers // ownership of the data. This resets all variables on the builder. - virtual std::shared_ptr<Array> Finish() = 0; + virtual Status Finish(std::shared_ptr<Array>* out) = 0; const std::shared_ptr<DataType>& type() const { return type_; } http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/ipc/adapter.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index cd8ab53..f84cb26 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -78,22 +78,6 @@ static bool IsPrimitive(const DataType* type) { } } -static bool IsListType(const DataType* type) { - DCHECK(type != nullptr); - switch (type->type) { - // TODO(emkornfield) grouping like this are used in a few places in the - // code consider using pattern like: - // http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type - // - case Type::BINARY: - case Type::LIST: - case Type::STRING: - return true; - default: - return false; - } -} - // ---------------------------------------------------------------------- // Record batch write path @@ -115,7 +99,11 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes if (IsPrimitive(arr_type)) { const auto prim_arr = static_cast<const PrimitiveArray*>(arr); buffers->push_back(prim_arr->data()); - } else if (IsListType(arr_type)) { + } else if (arr->type_enum() == Type::STRING || arr->type_enum() == Type::BINARY) { + const auto binary_arr = static_cast<const BinaryArray*>(arr); + buffers->push_back(binary_arr->offsets()); + buffers->push_back(binary_arr->data()); + } else if (arr->type_enum() == Type::LIST) { const auto list_arr = static_cast<const ListArray*>(arr); buffers->push_back(list_arr->offset_buffer()); RETURN_NOT_OK(VisitArray( @@ -331,9 +319,21 @@ class RecordBatchReader::RecordBatchReaderImpl { } return MakePrimitiveArray( type, field_meta.length, data, field_meta.null_count, null_bitmap, out); - } + } else if (type->type == Type::STRING || type->type == Type::BINARY) { + std::shared_ptr<Buffer> offsets; + std::shared_ptr<Buffer> values; + RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &values)); - if (IsListType(type.get())) { + if (type->type == Type::STRING) { + *out = std::make_shared<StringArray>( + field_meta.length, offsets, values, field_meta.null_count, null_bitmap); + } else { + *out = std::make_shared<BinaryArray>( + field_meta.length, offsets, values, field_meta.null_count, null_bitmap); + } + return Status::OK(); + } else if (type->type == Type::LIST) { std::shared_ptr<Buffer> offsets; RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); const int num_children = type->num_children(); @@ -346,11 +346,10 @@ class RecordBatchReader::RecordBatchReaderImpl { std::shared_ptr<Array> values_array; RETURN_NOT_OK( NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array)); - return MakeListArray(type, field_meta.length, offsets, values_array, - field_meta.null_count, null_bitmap, out); - } - - if (type->type == Type::STRUCT) { + *out = std::make_shared<ListArray>(type, field_meta.length, offsets, values_array, + field_meta.null_count, null_bitmap); + return Status::OK(); + } else if (type->type == Type::STRUCT) { const int num_children = type->num_children(); std::vector<ArrayPtr> fields; fields.reserve(num_children); http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/ipc/test-common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 7d02bc3..13bbbeb 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -42,7 +42,7 @@ const auto kListInt32 = std::make_shared<ListType>(kInt32); const auto kListListInt32 = std::make_shared<ListType>(kListInt32); Status MakeRandomInt32Array( - int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) { + int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) { std::shared_ptr<PoolBuffer> data; test::MakeRandomInt32PoolBuffer(length, pool, &data); const auto kInt32 = std::make_shared<Int32Type>(); @@ -52,16 +52,14 @@ Status MakeRandomInt32Array( test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); RETURN_NOT_OK(builder.Append( reinterpret_cast<const int32_t*>(data->data()), length, valid_bytes->data())); - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } RETURN_NOT_OK(builder.Append(reinterpret_cast<const int32_t*>(data->data()), length)); - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* array) { + bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) { // Create the null list values std::vector<uint8_t> valid_lists(num_lists); const double null_percent = include_nulls ? 0.1 : 0; @@ -90,8 +88,8 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li } ListBuilder builder(pool, child_array); RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); - *array = builder.Finish(); - return (*array)->Validate(); + RETURN_NOT_OK(builder.Finish(out)); + return (*out)->Validate(); } typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out); @@ -115,7 +113,7 @@ Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) { template <class Builder, class RawType> Status MakeRandomBinaryArray( - const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) { + const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) { const std::vector<std::string> values = { "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; Builder builder(pool, type); @@ -130,8 +128,7 @@ Status MakeRandomBinaryArray( builder.Append(reinterpret_cast<const RawType*>(value.data()), value.size())); } } - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) { http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index b4c3721..ea8516f 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -242,7 +242,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType<DoubleType> { struct ARROW_EXPORT ListType : public DataType { // List can contain any other logical value type explicit ListType(const std::shared_ptr<DataType>& value_type) - : ListType(value_type, Type::LIST) {} + : ListType(std::make_shared<Field>("item", value_type)) {} explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST) { children_ = {value_field}; @@ -255,26 +255,17 @@ struct ARROW_EXPORT ListType : public DataType { static char const* name() { return "list"; } std::string ToString() const override; - - protected: - // Constructor for classes that are implemented as List Arrays. - ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type) - : DataType(logical_type) { - // TODO ARROW-187 this can technically fail, make a constructor method ? - children_ = {std::make_shared<Field>("item", value_type)}; - } }; // BinaryType type is reprsents lists of 1-byte values. -struct ARROW_EXPORT BinaryType : public ListType { +struct ARROW_EXPORT BinaryType : public DataType { BinaryType() : BinaryType(Type::BINARY) {} static char const* name() { return "binary"; } std::string ToString() const override; protected: // Allow subclasses to change the logical type. - explicit BinaryType(Type::type logical_type) - : ListType(std::shared_ptr<DataType>(new UInt8Type()), logical_type) {} + explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} }; // UTF encoded strings @@ -284,9 +275,6 @@ struct ARROW_EXPORT StringType : public BinaryType { static char const* name() { return "string"; } std::string ToString() const override; - - protected: - explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} }; struct ARROW_EXPORT StructType : public DataType { @@ -300,7 +288,7 @@ struct ARROW_EXPORT StructType : public DataType { // These will be defined elsewhere template <typename T> -struct type_traits {}; +struct TypeTraits {}; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 72a8e77..9f78169 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -25,7 +25,6 @@ install(FILES construct.h datetime.h decimal.h - json.h list.h primitive.h string.h http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/binary.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/binary.h b/cpp/src/arrow/types/binary.h deleted file mode 100644 index 201fbb6..0000000 --- a/cpp/src/arrow/types/binary.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_BINARY_H -#define ARROW_TYPES_BINARY_H - -#include <string> -#include <vector> - -#include "arrow/type.h" - -namespace arrow {} // namespace arrow - -#endif // ARROW_TYPES_BINARY_H http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/construct.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 0b71ea9..67245f8 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -59,6 +59,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, BUILDER_CASE(DOUBLE, DoubleBuilder); BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); case Type::LIST: { std::shared_ptr<ArrayBuilder> value_builder; @@ -105,10 +106,10 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); default: return Status::NotImplemented(type->ToString()); @@ -120,30 +121,4 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, #endif } -Status MakeListArray(const TypePtr& type, int32_t length, - const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap, ArrayPtr* out) { - switch (type->type) { - case Type::BINARY: - out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap)); - break; - - case Type::LIST: - out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); - break; - - case Type::DECIMAL_TEXT: - case Type::STRING: - out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); - break; - default: - return Status::NotImplemented(type->ToString()); - } -#ifdef NDEBUG - return Status::OK(); -#else - return (*out)->Validate(); -#endif -} - } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/construct.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index afdadbe..e18e946 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -42,14 +42,6 @@ Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out); -// Create new list arrays for logical types that are backed by ListArrays (e.g. list of -// primitives and strings) -// TODO(emkornfield) split up string vs list? -Status ARROW_EXPORT MakeListArray(const std::shared_ptr<DataType>& type, int32_t length, - const std::shared_ptr<Buffer>& offests, const std::shared_ptr<Array>& values, - int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap, - std::shared_ptr<Array>* out); - } // namespace arrow #endif // ARROW_BUILDER_H_ http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/json.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc deleted file mode 100644 index 89240fc..0000000 --- a/cpp/src/arrow/types/json.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/json.h" - -#include <vector> - -#include "arrow/type.h" -#include "arrow/types/union.h" - -namespace arrow { - -static const TypePtr Null(new NullType()); -static const TypePtr Int32(new Int32Type()); -static const TypePtr String(new StringType()); -static const TypePtr Double(new DoubleType()); -static const TypePtr Bool(new BooleanType()); - -static const std::vector<TypePtr> kJsonTypes = {Null, Int32, String, Double, Bool}; -TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(kJsonTypes)); -TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(kJsonTypes)); - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/json.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h deleted file mode 100644 index 9de961f..0000000 --- a/cpp/src/arrow/types/json.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_JSON_H -#define ARROW_TYPES_JSON_H - -#include "arrow/type.h" - -namespace arrow { - -struct JSONScalar : public DataType { - bool dense; - - static TypePtr dense_type; - static TypePtr sparse_type; - - explicit JSONScalar(bool dense = true) : DataType(Type::JSON_SCALAR), dense(dense) {} -}; - -} // namespace arrow - -#endif // ARROW_TYPES_JSON_H http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/list-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 2e41b4a..12c5394 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -76,7 +76,11 @@ class TestListBuilder : public TestBuilder { builder_ = std::dynamic_pointer_cast<ListBuilder>(tmp); } - void Done() { result_ = std::dynamic_pointer_cast<ListArray>(builder_->Finish()); } + void Done() { + std::shared_ptr<Array> out; + EXPECT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast<ListArray>(out); + } protected: TypePtr value_type_; @@ -98,14 +102,17 @@ TEST_F(TestListBuilder, Equality) { // setup two equal arrays ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); - array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&array)); ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); - equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&equal_array)); // now an unequal one ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size())); ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size())); - unequal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality EXPECT_TRUE(array->Equals(array)); http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/list.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 6334054..ef2ec22 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -25,7 +25,7 @@ bool ListArray::EqualsExact(const ListArray& other) const { if (null_count_ != other.null_count_) { return false; } bool equal_offsets = - offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); if (!equal_offsets) { return false; } bool equal_null_bitmap = true; if (null_count_ > 0) { @@ -72,10 +72,10 @@ bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } - if (offset_buf_->size() / static_cast<int>(sizeof(int32_t)) < length_) { + if (!offset_buffer_) { return Status::Invalid("offset_buffer_ was null"); } + if (offset_buffer_->size() / static_cast<int>(sizeof(int32_t)) < length_) { std::stringstream ss; - ss << "offset buffer size (bytes): " << offset_buf_->size() + ss << "offset buffer size (bytes): " << offset_buffer_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } @@ -121,4 +121,38 @@ Status ListArray::Validate() const { return Status::OK(); } +Status ListBuilder::Init(int32_t elements) { + DCHECK_LT(elements, std::numeric_limits<int32_t>::max()); + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + // one more then requested for offsets + return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); +} + +Status ListBuilder::Resize(int32_t capacity) { + DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); + // one more then requested for offsets + RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); +} + +Status ListBuilder::Finish(std::shared_ptr<Array>* out) { + std::shared_ptr<Array> items = values_; + if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); } + + RETURN_NOT_OK(offset_builder_.Append<int32_t>(items->length())); + std::shared_ptr<Buffer> offsets = offset_builder_.Finish(); + + *out = std::make_shared<ListArray>( + type_, length_, offsets, items, null_count_, null_bitmap_); + + Reset(); + + return Status::OK(); +} + +void ListBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/list.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f389451..9440ffe 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -43,9 +43,9 @@ class ARROW_EXPORT ListArray : public Array { const ArrayPtr& values, int32_t null_count = 0, std::shared_ptr<Buffer> null_bitmap = nullptr) : Array(type, length, null_count, null_bitmap) { - offset_buf_ = offsets; - offsets_ = offsets == nullptr ? nullptr - : reinterpret_cast<const int32_t*>(offset_buf_->data()); + offset_buffer_ = offsets; + offsets_ = offsets == nullptr ? nullptr : reinterpret_cast<const int32_t*>( + offset_buffer_->data()); values_ = values; } @@ -57,7 +57,7 @@ class ARROW_EXPORT ListArray : public Array { // with this array. const std::shared_ptr<Array>& values() const { return values_; } const std::shared_ptr<Buffer> offset_buffer() const { - return std::static_pointer_cast<Buffer>(offset_buf_); + return std::static_pointer_cast<Buffer>(offset_buffer_); } const std::shared_ptr<DataType>& value_type() const { return values_->type(); } @@ -77,7 +77,7 @@ class ARROW_EXPORT ListArray : public Array { const ArrayPtr& arr) const override; protected: - std::shared_ptr<Buffer> offset_buf_; + std::shared_ptr<Buffer> offset_buffer_; const int32_t* offsets_; ArrayPtr values_; }; @@ -119,19 +119,9 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { virtual ~ListBuilder() {} - Status Init(int32_t elements) override { - DCHECK_LT(elements, std::numeric_limits<int32_t>::max()); - RETURN_NOT_OK(ArrayBuilder::Init(elements)); - // one more then requested for offsets - return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); - } - - Status Resize(int32_t capacity) override { - DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); - // one more then requested for offsets - RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); - return ArrayBuilder::Resize(capacity); - } + Status Init(int32_t elements) override; + Status Resize(int32_t capacity) override; + Status Finish(std::shared_ptr<Array>* out) override; // Vector append // @@ -145,27 +135,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { return Status::OK(); } - // The same as Finalize but allows for overridding the c++ type - template <typename Container> - std::shared_ptr<Array> Transfer() { - std::shared_ptr<Array> items = values_; - if (!items) { items = value_builder_->Finish(); } - - offset_builder_.Append<int32_t>(items->length()); - - const auto offsets_buffer = offset_builder_.Finish(); - auto result = std::make_shared<Container>( - type_, length_, offsets_buffer, items, null_count_, null_bitmap_); - - // TODO(emkornfield) make a reset method - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; - - return result; - } - - std::shared_ptr<Array> Finish() override { return Transfer<ListArray>(); } - // Start a new variable-length list slot // // This function should be called before beginning to append elements to the @@ -188,6 +157,8 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { BufferBuilder offset_builder_; std::shared_ptr<ArrayBuilder> value_builder_; std::shared_ptr<Array> values_; + + void Reset(); }; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/primitive-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 5ac2867..121bd47 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -123,8 +123,11 @@ class TestPrimitiveBuilder : public TestBuilder { auto expected = std::make_shared<ArrayType>(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr<ArrayType> result = - std::dynamic_pointer_cast<ArrayType>(builder->Finish()); + + std::shared_ptr<Array> out; + ASSERT_OK(builder->Finish(&out)); + + std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -216,8 +219,10 @@ void TestPrimitiveBuilder<PBoolean>::Check( auto expected = std::make_shared<BooleanArray>(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr<BooleanArray> result = - std::dynamic_pointer_cast<BooleanArray>(builder->Finish()); + + std::shared_ptr<Array> out; + ASSERT_OK(builder->Finish(&out)); + std::shared_ptr<BooleanArray> result = std::dynamic_pointer_cast<BooleanArray>(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -254,7 +259,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) { int n = 1000; ASSERT_OK(this->builder_->Reserve(n)); ASSERT_EQ(util::next_power2(n), this->builder_->capacity()); - ASSERT_EQ(util::next_power2(type_traits<Type>::bytes_required(n)), + ASSERT_EQ(util::next_power2(TypeTraits<Type>::bytes_required(n)), this->builder_->data()->size()); // unsure if this should go in all builder classes @@ -267,7 +272,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { ASSERT_OK(this->builder_->AppendNull()); } - auto result = this->builder_->Finish(); + std::shared_ptr<Array> result; + ASSERT_OK(this->builder_->Finish(&result)); for (int i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)) << i; @@ -298,7 +304,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { } do { - std::shared_ptr<Array> result = this->builder_->Finish(); + std::shared_ptr<Array> result; + ASSERT_OK(this->builder_->Finish(&result)); } while (false); ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); @@ -315,8 +322,7 @@ Status MakeArray(const vector<uint8_t>& valid_bytes, const vector<T>& draws, int RETURN_NOT_OK(builder->AppendNull()); } } - *out = builder->Finish(); - return Status::OK(); + return builder->Finish(out); } TYPED_TEST(TestPrimitiveBuilder, Equality) { @@ -465,7 +471,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { ASSERT_OK(this->builder_->Reserve(cap)); ASSERT_EQ(cap, this->builder_->capacity()); - ASSERT_EQ(type_traits<Type>::bytes_required(cap), this->builder_->data()->size()); + ASSERT_EQ(TypeTraits<Type>::bytes_required(cap), this->builder_->data()->size()); ASSERT_EQ(util::bytes_for_bits(cap), this->builder_->null_bitmap()->size()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/primitive.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 9ba2ebd..3a05ccf 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -69,12 +69,25 @@ bool PrimitiveArray::Equals(const std::shared_ptr<Array>& arr) const { return EqualsExact(*static_cast<const PrimitiveArray*>(arr.get())); } +template class NumericArray<UInt8Type>; +template class NumericArray<UInt16Type>; +template class NumericArray<UInt32Type>; +template class NumericArray<UInt64Type>; +template class NumericArray<Int8Type>; +template class NumericArray<Int16Type>; +template class NumericArray<Int32Type>; +template class NumericArray<Int64Type>; +template class NumericArray<TimestampType>; +template class NumericArray<FloatType>; +template class NumericArray<DoubleType>; +template class NumericArray<BooleanType>; + template <typename T> Status PrimitiveBuilder<T>::Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); data_ = std::make_shared<PoolBuffer>(pool_); - int64_t nbytes = type_traits<T>::bytes_required(capacity); + int64_t nbytes = TypeTraits<T>::bytes_required(capacity); RETURN_NOT_OK(data_->Resize(nbytes)); // TODO(emkornfield) valgrind complains without this memset(data_->mutable_data(), 0, nbytes); @@ -93,10 +106,9 @@ Status PrimitiveBuilder<T>::Resize(int32_t capacity) { } else { RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); const int64_t old_bytes = data_->size(); - const int64_t new_bytes = type_traits<T>::bytes_required(capacity); + const int64_t new_bytes = TypeTraits<T>::bytes_required(capacity); RETURN_NOT_OK(data_->Resize(new_bytes)); raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data()); - memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); } return Status::OK(); @@ -108,7 +120,7 @@ Status PrimitiveBuilder<T>::Append( RETURN_NOT_OK(Reserve(length)); if (length > 0) { - memcpy(raw_data_ + length_, values, type_traits<T>::bytes_required(length)); + memcpy(raw_data_ + length_, values, TypeTraits<T>::bytes_required(length)); } // length_ is update by these @@ -118,13 +130,18 @@ Status PrimitiveBuilder<T>::Append( } template <typename T> -std::shared_ptr<Array> PrimitiveBuilder<T>::Finish() { - std::shared_ptr<Array> result = std::make_shared<typename type_traits<T>::ArrayType>( +Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) { + const int64_t bytes_required = TypeTraits<T>::bytes_required(length_); + if (bytes_required > 0 && bytes_required < data_->size()) { + // Trim buffers + RETURN_NOT_OK(data_->Resize(bytes_required)); + } + *out = std::make_shared<typename TypeTraits<T>::ArrayType>( type_, length_, data_, null_count_, null_bitmap_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; - return result; + return Status::OK(); } template <> http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/primitive.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index c643783..f21470d 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -91,7 +91,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int i) const { return raw_data()[i]; } }; -#define NUMERIC_ARRAY_DECL(NAME, TypeClass) using NAME = NumericArray<TypeClass>; +#define NUMERIC_ARRAY_DECL(NAME, TypeClass) \ + using NAME = NumericArray<TypeClass>; \ + extern template class ARROW_EXPORT NumericArray<TypeClass>; NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type); NUMERIC_ARRAY_DECL(Int8Array, Int8Type); @@ -139,8 +141,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { Status Append( const value_type* values, int32_t length, const uint8_t* valid_bytes = nullptr); - std::shared_ptr<Array> Finish() override; - + Status Finish(std::shared_ptr<Array>* out) override; Status Init(int32_t capacity) override; // Increase the capacity of the builder to accommodate at least the indicated @@ -183,77 +184,77 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> { }; template <> -struct type_traits<UInt8Type> { +struct TypeTraits<UInt8Type> { typedef UInt8Array ArrayType; static inline int bytes_required(int elements) { return elements; } }; template <> -struct type_traits<Int8Type> { +struct TypeTraits<Int8Type> { typedef Int8Array ArrayType; static inline int bytes_required(int elements) { return elements; } }; template <> -struct type_traits<UInt16Type> { +struct TypeTraits<UInt16Type> { typedef UInt16Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } }; template <> -struct type_traits<Int16Type> { +struct TypeTraits<Int16Type> { typedef Int16Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } }; template <> -struct type_traits<UInt32Type> { +struct TypeTraits<UInt32Type> { typedef UInt32Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } }; template <> -struct type_traits<Int32Type> { +struct TypeTraits<Int32Type> { typedef Int32Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } }; template <> -struct type_traits<UInt64Type> { +struct TypeTraits<UInt64Type> { typedef UInt64Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } }; template <> -struct type_traits<Int64Type> { +struct TypeTraits<Int64Type> { typedef Int64Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; template <> -struct type_traits<TimestampType> { +struct TypeTraits<TimestampType> { typedef TimestampArray ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; template <> -struct type_traits<FloatType> { +struct TypeTraits<FloatType> { typedef FloatArray ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(float); } }; template <> -struct type_traits<DoubleType> { +struct TypeTraits<DoubleType> { typedef DoubleArray ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(double); } @@ -293,7 +294,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { }; template <> -struct type_traits<BooleanType> { +struct TypeTraits<BooleanType> { typedef BooleanArray ArrayType; static inline int bytes_required(int elements) { http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/string-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 6807b00..d897e30 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -66,18 +66,13 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int nchars = chars_.size(); - value_buf_ = test::to_buffer(chars_); - values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); - offsets_buf_ = test::to_buffer(offsets_); - null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared<StringArray>( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } protected: @@ -94,7 +89,6 @@ class TestStringContainer : public ::testing::Test { int null_count_; int length_; - ArrayPtr values_; std::shared_ptr<StringArray> strings_; }; @@ -122,7 +116,7 @@ TEST_F(TestStringContainer, TestListFunctions) { TEST_F(TestStringContainer, TestDestructor) { auto arr = std::make_shared<StringArray>( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } TEST_F(TestStringContainer, TestGetString) { @@ -147,7 +141,10 @@ class TestStringBuilder : public TestBuilder { } void Done() { - result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish()); + std::shared_ptr<Array> out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast<StringArray>(out); result_->Validate(); } @@ -178,7 +175,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->values()->length()); + ASSERT_EQ(reps * 6, result_->data()->size()); int32_t length; int32_t pos = 0; @@ -218,18 +215,14 @@ class TestBinaryContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int nchars = chars_.size(); - value_buf_ = test::to_buffer(chars_); - values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); - offsets_buf_ = test::to_buffer(offsets_); null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared<BinaryArray>( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } protected: @@ -246,7 +239,6 @@ class TestBinaryContainer : public ::testing::Test { int null_count_; int length_; - ArrayPtr values_; std::shared_ptr<BinaryArray> strings_; }; @@ -274,7 +266,7 @@ TEST_F(TestBinaryContainer, TestListFunctions) { TEST_F(TestBinaryContainer, TestDestructor) { auto arr = std::make_shared<BinaryArray>( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } TEST_F(TestBinaryContainer, TestGetValue) { @@ -298,7 +290,10 @@ class TestBinaryBuilder : public TestBuilder { } void Done() { - result_ = std::dynamic_pointer_cast<BinaryArray>(builder_->Finish()); + std::shared_ptr<Array> out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast<BinaryArray>(out); result_->Validate(); } @@ -330,7 +325,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { ASSERT_OK(result_->Validate()); ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->values()->length()); + ASSERT_EQ(reps * 6, result_->data()->size()); int32_t length; for (int i = 0; i < N * reps; ++i) { http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/string.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 745ed8f..d692e13 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -17,6 +17,7 @@ #include "arrow/types/string.h" +#include <cstring> #include <sstream> #include <string> @@ -24,37 +25,77 @@ namespace arrow { -const std::shared_ptr<DataType> BINARY(new BinaryType()); -const std::shared_ptr<DataType> STRING(new StringType()); +static std::shared_ptr<DataType> kBinary = std::make_shared<BinaryType>(); +static std::shared_ptr<DataType> kString = std::make_shared<StringType>(); BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const ArrayPtr& values, int32_t null_count, + const std::shared_ptr<Buffer>& data, int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) - : BinaryArray(BINARY, length, offsets, values, null_count, null_bitmap) {} + : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {} BinaryArray::BinaryArray(const TypePtr& type, int32_t length, - const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, int32_t null_count, - const std::shared_ptr<Buffer>& null_bitmap) - : ListArray(type, length, offsets, values, null_count, null_bitmap), - bytes_(std::dynamic_pointer_cast<UInt8Array>(values).get()), - raw_bytes_(bytes_->raw_data()) { - // Check in case the dynamic cast fails. - DCHECK(bytes_); + const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data, + int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) + : Array(type, length, null_count, null_bitmap), + offset_buffer_(offsets), + offsets_(reinterpret_cast<const int32_t*>(offset_buffer_->data())), + data_buffer_(data), + data_(nullptr) { + if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); } } Status BinaryArray::Validate() const { - if (values()->null_count() > 0) { - std::stringstream ss; - ss << type()->ToString() << " can have null values in the value array"; - Status::Invalid(ss.str()); + // TODO(wesm): what to do here? + return Status::OK(); +} + +bool BinaryArray::EqualsExact(const BinaryArray& other) const { + if (!Array::EqualsExact(other)) { return false; } + + bool equal_offsets = + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } + + return data_buffer_->Equals(*other.data_buffer_, data_buffer_->size()); +} + +bool BinaryArray::Equals(const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } + return EqualsExact(*static_cast<const BinaryArray*>(arr.get())); +} + +bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr<Array>& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast<const BinaryArray*>(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = offset(i); + const int32_t end_offset = offset(i + 1); + const int32_t other_begin_offset = other->offset(o_i); + const int32_t other_end_offset = other->offset(o_i + 1); + // Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != other_end_offset - other_begin_offset) { + return false; + } + + if (std::memcmp(data_ + begin_offset, other->data_ + other_begin_offset, + end_offset - begin_offset)) { + return false; + } } - return ListArray::Validate(); + return true; } StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const ArrayPtr& values, int32_t null_count, + const std::shared_ptr<Buffer>& data, int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap) - : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {} Status StringArray::Validate() const { // TODO(emkornfield) Validate proper UTF8 code points? @@ -72,4 +113,28 @@ BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get()); } +Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) { + std::shared_ptr<Array> result; + RETURN_NOT_OK(ListBuilder::Finish(&result)); + + const auto list = std::dynamic_pointer_cast<ListArray>(result); + auto values = std::dynamic_pointer_cast<UInt8Array>(list->values()); + + *out = std::make_shared<BinaryArray>(list->length(), list->offset_buffer(), + values->data(), list->null_count(), list->null_bitmap()); + return Status::OK(); +} + +Status StringBuilder::Finish(std::shared_ptr<Array>* out) { + std::shared_ptr<Array> result; + RETURN_NOT_OK(ListBuilder::Finish(&result)); + + const auto list = std::dynamic_pointer_cast<ListArray>(result); + auto values = std::dynamic_pointer_cast<UInt8Array>(list->values()); + + *out = std::make_shared<StringArray>(list->length(), list->offset_buffer(), + values->data(), list->null_count(), list->null_bitmap()); + return Status::OK(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/string.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index bab0c58..aaba49c 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -35,15 +35,16 @@ namespace arrow { class Buffer; class MemoryPool; -class ARROW_EXPORT BinaryArray : public ListArray { +class ARROW_EXPORT BinaryArray : public Array { public: BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr<Buffer>& data, int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr); + // Constructor that allows sub-classes/builders to propagate there logical type up the // class hierarchy. BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr<Buffer>& data, int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr); // Return the pointer to the given elements bytes @@ -53,28 +54,38 @@ class ARROW_EXPORT BinaryArray : public ListArray { DCHECK(out_length); const int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; - return raw_bytes_ + pos; + return data_ + pos; } + std::shared_ptr<Buffer> data() const { return data_buffer_; } + std::shared_ptr<Buffer> offsets() const { return offset_buffer_; } + + int32_t offset(int i) const { return offsets_[i]; } + + // Neither of these functions will perform boundschecking + int32_t value_offset(int i) const { return offsets_[i]; } + int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + + bool EqualsExact(const BinaryArray& other) const; + bool Equals(const std::shared_ptr<Array>& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const ArrayPtr& arr) const override; + Status Validate() const override; private: - UInt8Array* bytes_; - const uint8_t* raw_bytes_; + std::shared_ptr<Buffer> offset_buffer_; + const int32_t* offsets_; + + std::shared_ptr<Buffer> data_buffer_; + const uint8_t* data_; }; class ARROW_EXPORT StringArray : public BinaryArray { public: StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr<Buffer>& data, int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr); - // Constructor that allows overriding the logical type, so subclasses can propagate - // there - // up the class hierarchy. - StringArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets, - const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr<Buffer>& null_bitmap = nullptr) - : BinaryArray(type, length, offsets, values, null_count, null_bitmap) {} // Construct a std::string // TODO: std::bad_alloc possibility @@ -98,9 +109,7 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { return byte_builder_->Append(value, length); } - std::shared_ptr<Array> Finish() override { - return ListBuilder::Transfer<BinaryArray>(); - } + Status Finish(std::shared_ptr<Array>* out) override; protected: UInt8Builder* byte_builder_; @@ -112,6 +121,8 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : BinaryBuilder(pool, type) {} + Status Finish(std::shared_ptr<Array>* out) override; + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } Status Append(const char* value, int32_t length) { @@ -119,10 +130,6 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { } Status Append(const std::vector<std::string>& values, uint8_t* null_bytes); - - std::shared_ptr<Array> Finish() override { - return ListBuilder::Transfer<StringArray>(); - } }; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/struct-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index ccf5a52..8e82c38 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -119,7 +119,11 @@ class TestStructBuilder : public TestBuilder { ASSERT_EQ(2, static_cast<int>(builder_->field_builders().size())); } - void Done() { result_ = std::dynamic_pointer_cast<StructArray>(builder_->Finish()); } + void Done() { + std::shared_ptr<Array> out; + ASSERT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast<StructArray>(out); + } protected: std::vector<FieldPtr> value_fields_; @@ -294,7 +298,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -308,7 +313,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&equal_array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -323,7 +329,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - unequal_bitmap_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_bitmap_array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -339,7 +346,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - unequal_offsets_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_offsets_array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -354,7 +362,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : unequal_int_values) { int_vb->UnsafeAppend(value); } - unequal_values_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_values_array)); // Test array equality EXPECT_TRUE(array->Equals(array)); http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/struct.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index e8176f0..369c29d 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -87,4 +87,18 @@ Status StructArray::Validate() const { return Status::OK(); } +Status StructBuilder::Finish(std::shared_ptr<Array>* out) { + std::vector<std::shared_ptr<Array>> fields(field_builders_.size()); + for (size_t i = 0; i < field_builders_.size(); ++i) { + RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i])); + } + + *out = std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_); + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + + return Status::OK(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/types/struct.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 63955eb..65b8daf 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -73,6 +73,8 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { field_builders_ = field_builders; } + Status Finish(std::shared_ptr<Array>* out) override; + // Null bitmap is of equal length to every child field, and any zero byte // will be considered as a null for that field, but users must using app- // end methods or advance methods of the child builders' independently to @@ -83,21 +85,6 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - std::shared_ptr<Array> Finish() override { - std::vector<ArrayPtr> fields; - for (auto it : field_builders_) { - fields.push_back(it->Finish()); - } - - auto result = - std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_); - - null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - - return result; - } - // Append an element to the Struct. All child-builders' Append method must // be called independently to maintain data-structure consistency. Status Append(bool is_valid = true) { http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/util/status.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index 8dd07d0..08e9ae3 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -49,12 +49,18 @@ std::string Status::CodeAsString() const { case StatusCode::KeyError: type = "Key error"; break; + case StatusCode::TypeError: + type = "Type error"; + break; case StatusCode::Invalid: type = "Invalid"; break; case StatusCode::IOError: type = "IOError"; break; + case StatusCode::UnknownError: + type = "Unknown error"; + break; case StatusCode::NotImplemented: type = "NotImplemented"; break; http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/cpp/src/arrow/util/status.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index d558531..05f5b74 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -78,9 +78,10 @@ enum class StatusCode : char { OK = 0, OutOfMemory = 1, KeyError = 2, - Invalid = 3, - IOError = 4, - + TypeError = 3, + Invalid = 4, + IOError = 5, + UnknownError = 9, NotImplemented = 10, }; @@ -106,6 +107,14 @@ class ARROW_EXPORT Status { return Status(StatusCode::KeyError, msg, -1); } + static Status TypeError(const std::string& msg) { + return Status(StatusCode::TypeError, msg, -1); + } + + static Status UnknownError(const std::string& msg) { + return Status(StatusCode::UnknownError, msg, -1); + } + static Status NotImplemented(const std::string& msg) { return Status(StatusCode::NotImplemented, msg, -1); } @@ -125,6 +134,8 @@ class ARROW_EXPORT Status { bool IsKeyError() const { return code() == StatusCode::KeyError; } bool IsInvalid() const { return code() == StatusCode::Invalid; } bool IsIOError() const { return code() == StatusCode::IOError; } + + bool IsUnknownError() const { return code() == StatusCode::UnknownError; } bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } // Return a string representation of this status suitable for printing. http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 55f6d05..4357fa0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -438,8 +438,6 @@ set(PYARROW_SRCS src/pyarrow/config.cc src/pyarrow/helpers.cc src/pyarrow/io.cc - src/pyarrow/status.cc - src/pyarrow/adapters/builtin.cc src/pyarrow/adapters/pandas.cc ) http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/doc/index.rst ---------------------------------------------------------------------- diff --git a/python/doc/index.rst b/python/doc/index.rst index 550e544..88725ba 100644 --- a/python/doc/index.rst +++ b/python/doc/index.rst @@ -1,3 +1,20 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + Apache Arrow (Python) ===================== @@ -25,4 +42,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/pyarrow/error.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index 891d1ac..4fb46c2 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -16,7 +16,5 @@ # under the License. from pyarrow.includes.libarrow cimport CStatus -from pyarrow.includes.pyarrow cimport PyStatus -cdef int check_cstatus(const CStatus& status) nogil except -1 -cdef int check_status(const PyStatus& status) nogil except -1 +cdef int check_status(const CStatus& status) nogil except -1 http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/pyarrow/error.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index a2c53fe..b8a82b3 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -22,15 +22,7 @@ from pyarrow.compat import frombytes class ArrowException(Exception): pass -cdef int check_cstatus(const CStatus& status) nogil except -1: - if status.ok(): - return 0 - - cdef c_string c_message = status.ToString() - with gil: - raise ArrowException(frombytes(c_message)) - -cdef int check_status(const PyStatus& status) nogil except -1: +cdef int check_status(const CStatus& status) nogil except -1: if status.ok(): return 0 http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/pyarrow/includes/pyarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 7c47f21..e1da191 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -25,36 +25,19 @@ cimport pyarrow.includes.libarrow_io as arrow_io cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: - # We can later add more of the common status factory methods as needed - cdef PyStatus PyStatus_OK "Status::OK"() - - cdef cppclass PyStatus "pyarrow::Status": - PyStatus() - - c_string ToString() - - c_bool ok() - c_bool IsOutOfMemory() - c_bool IsKeyError() - c_bool IsTypeError() - c_bool IsIOError() - c_bool IsValueError() - c_bool IsNotImplemented() - c_bool IsArrowError() - shared_ptr[CDataType] GetPrimitiveType(Type type) - PyStatus ConvertPySequence(object obj, shared_ptr[CArray]* out) + CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out) - PyStatus PandasToArrow(MemoryPool* pool, object ao, - shared_ptr[CArray]* out) - PyStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, - shared_ptr[CArray]* out) + CStatus PandasToArrow(MemoryPool* pool, object ao, + shared_ptr[CArray]* out) + CStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, + shared_ptr[CArray]* out) - PyStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, - object py_ref, PyObject** out) + CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, + object py_ref, PyObject** out) - PyStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, - object py_ref, PyObject** out) + CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, + object py_ref, PyObject** out) MemoryPool* get_memory_pool() http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/pyarrow/io.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 8970e06..16ebfa1 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -28,7 +28,7 @@ cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.libarrow_io cimport * from pyarrow.compat import frombytes, tobytes -from pyarrow.error cimport check_cstatus +from pyarrow.error cimport check_status cimport cpython as cp @@ -57,9 +57,9 @@ cdef class NativeFile: if self.is_open: with nogil: if self.is_readonly: - check_cstatus(self.rd_file.get().Close()) + check_status(self.rd_file.get().Close()) else: - check_cstatus(self.wr_file.get().Close()) + check_status(self.wr_file.get().Close()) self.is_open = False cdef read_handle(self, shared_ptr[ReadableFileInterface]* file): @@ -88,22 +88,22 @@ cdef class NativeFile: cdef int64_t size self._assert_readable() with nogil: - check_cstatus(self.rd_file.get().GetSize(&size)) + check_status(self.rd_file.get().GetSize(&size)) return size def tell(self): cdef int64_t position with nogil: if self.is_readonly: - check_cstatus(self.rd_file.get().Tell(&position)) + check_status(self.rd_file.get().Tell(&position)) else: - check_cstatus(self.wr_file.get().Tell(&position)) + check_status(self.wr_file.get().Tell(&position)) return position def seek(self, int64_t position): self._assert_readable() with nogil: - check_cstatus(self.rd_file.get().Seek(position)) + check_status(self.rd_file.get().Seek(position)) def write(self, data): """ @@ -116,7 +116,7 @@ cdef class NativeFile: cdef const uint8_t* buf = <const uint8_t*> cp.PyBytes_AS_STRING(data) cdef int64_t bufsize = len(data) with nogil: - check_cstatus(self.wr_file.get().Write(buf, bufsize)) + check_status(self.wr_file.get().Write(buf, bufsize)) def read(self, int nbytes): cdef: @@ -127,8 +127,7 @@ cdef class NativeFile: self._assert_readable() with nogil: - check_cstatus(self.rd_file.get() - .ReadB(nbytes, &out)) + check_status(self.rd_file.get().ReadB(nbytes, &out)) result = cp.PyBytes_FromStringAndSize( <const char*>out.get().data(), out.get().size()) @@ -223,7 +222,7 @@ cdef class InMemoryOutputStream(NativeFile): def get_result(self): cdef Buffer result = Buffer() - check_cstatus(self.wr_file.get().Close()) + check_status(self.wr_file.get().Close()) result.init(<shared_ptr[CBuffer]> self.buffer) self.is_open = False @@ -270,7 +269,7 @@ except ImportError: def have_libhdfs(): try: - check_cstatus(ConnectLibHdfs()) + check_status(ConnectLibHdfs()) return True except: return False @@ -304,7 +303,7 @@ cdef class HdfsClient: def close(self): self._ensure_client() with nogil: - check_cstatus(self.client.get().Disconnect()) + check_status(self.client.get().Disconnect()) self.is_open = False cdef _ensure_client(self): @@ -341,8 +340,7 @@ cdef class HdfsClient: conf.user = tobytes(user) with nogil: - check_cstatus( - CHdfsClient.Connect(&conf, &out.client)) + check_status(CHdfsClient.Connect(&conf, &out.client)) out.is_open = True return out @@ -383,8 +381,8 @@ cdef class HdfsClient: self._ensure_client() with nogil: - check_cstatus(self.client.get() - .ListDirectory(c_path, &listing)) + check_status(self.client.get() + .ListDirectory(c_path, &listing)) cdef const HdfsPathInfo* info for i in range(<int> listing.size()): @@ -422,8 +420,8 @@ cdef class HdfsClient: cdef c_string c_path = tobytes(path) with nogil: - check_cstatus(self.client.get() - .CreateDirectory(c_path)) + check_status(self.client.get() + .CreateDirectory(c_path)) def delete(self, path, bint recursive=False): """ @@ -439,8 +437,8 @@ cdef class HdfsClient: cdef c_string c_path = tobytes(path) with nogil: - check_cstatus(self.client.get() - .Delete(c_path, recursive)) + check_status(self.client.get() + .Delete(c_path, recursive)) def open(self, path, mode='rb', buffer_size=None, replication=None, default_block_size=None): @@ -473,7 +471,7 @@ cdef class HdfsClient: append = True with nogil: - check_cstatus( + check_status( self.client.get() .OpenWriteable(c_path, append, c_buffer_size, c_replication, c_default_block_size, @@ -484,8 +482,8 @@ cdef class HdfsClient: out.is_readonly = False else: with nogil: - check_cstatus(self.client.get() - .OpenReadable(c_path, &rd_handle)) + check_status(self.client.get() + .OpenReadable(c_path, &rd_handle)) out.rd_file = <shared_ptr[ReadableFileInterface]> rd_handle out.is_readonly = True @@ -579,9 +577,9 @@ cdef class HdfsFile(NativeFile): try: with nogil: while total_bytes < nbytes: - check_cstatus(self.rd_file.get() - .Read(rpc_chunksize, &bytes_read, - buf + total_bytes)) + check_status(self.rd_file.get() + .Read(rpc_chunksize, &bytes_read, + buf + total_bytes)) total_bytes += bytes_read @@ -647,8 +645,8 @@ cdef class HdfsFile(NativeFile): try: while True: with nogil: - check_cstatus(self.rd_file.get() - .Read(self.buffer_size, &bytes_read, buf)) + check_status(self.rd_file.get() + .Read(self.buffer_size, &bytes_read, buf)) total_bytes += bytes_read http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/pyarrow/ipc.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/ipc.pyx b/python/pyarrow/ipc.pyx index f8da3a7..46deb5a 100644 --- a/python/pyarrow/ipc.pyx +++ b/python/pyarrow/ipc.pyx @@ -26,7 +26,7 @@ from pyarrow.includes.libarrow_io cimport * from pyarrow.includes.libarrow_ipc cimport * cimport pyarrow.includes.pyarrow as pyarrow -from pyarrow.error cimport check_cstatus +from pyarrow.error cimport check_status from pyarrow.io cimport NativeFile from pyarrow.schema cimport Schema from pyarrow.table cimport RecordBatch @@ -89,8 +89,8 @@ cdef class ArrowFileWriter: get_writer(sink, &self.sink) with nogil: - check_cstatus(CFileWriter.Open(self.sink.get(), schema.sp_schema, - &self.writer)) + check_status(CFileWriter.Open(self.sink.get(), schema.sp_schema, + &self.writer)) self.closed = False @@ -101,12 +101,12 @@ cdef class ArrowFileWriter: def write_record_batch(self, RecordBatch batch): cdef CRecordBatch* bptr = batch.batch with nogil: - check_cstatus(self.writer.get() - .WriteRecordBatch(bptr.columns(), bptr.num_rows())) + check_status(self.writer.get() + .WriteRecordBatch(bptr.columns(), bptr.num_rows())) def close(self): with nogil: - check_cstatus(self.writer.get().Close()) + check_status(self.writer.get().Close()) self.closed = True @@ -124,9 +124,9 @@ cdef class ArrowFileReader: with nogil: if offset != 0: - check_cstatus(CFileReader.Open2(reader, offset, &self.reader)) + check_status(CFileReader.Open2(reader, offset, &self.reader)) else: - check_cstatus(CFileReader.Open(reader, &self.reader)) + check_status(CFileReader.Open(reader, &self.reader)) property num_dictionaries: @@ -147,7 +147,7 @@ cdef class ArrowFileReader: raise ValueError('Batch number {0} out of range'.format(i)) with nogil: - check_cstatus(self.reader.get().GetRecordBatch(i, &batch)) + check_status(self.reader.get().GetRecordBatch(i, &batch)) result = RecordBatch() result.init(batch) http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/pyarrow/parquet.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 2abe57b..019dd2c 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -26,7 +26,7 @@ cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.compat import tobytes from pyarrow.error import ArrowException -from pyarrow.error cimport check_cstatus +from pyarrow.error cimport check_status from pyarrow.io import NativeFile from pyarrow.table cimport Table @@ -62,7 +62,7 @@ cdef class ParquetReader: cdef shared_ptr[ReadableFileInterface] cpp_handle file.read_handle(&cpp_handle) - check_cstatus(OpenFile(cpp_handle, &self.allocator, &self.reader)) + check_status(OpenFile(cpp_handle, &self.allocator, &self.reader)) def read_all(self): cdef: @@ -70,8 +70,8 @@ cdef class ParquetReader: shared_ptr[CTable] ctable with nogil: - check_cstatus(self.reader.get() - .ReadFlatTable(&ctable)) + check_status(self.reader.get() + .ReadFlatTable(&ctable)) table.init(ctable) return table @@ -80,7 +80,7 @@ cdef class ParquetReader: def read_table(source, columns=None): """ Read a Table from Parquet format - + Returns ------- pyarrow.table.Table @@ -176,5 +176,5 @@ def write_table(table, filename, chunk_size=None, version=None, sink.reset(new LocalFileOutputStream(tobytes(filename))) with nogil: - check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, - chunk_size_, properties_builder.build())) + check_status(WriteFlatTable(ctable_, default_memory_pool(), sink, + chunk_size_, properties_builder.build())) http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/src/pyarrow/adapters/builtin.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 680f3a5..c034fbd 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -20,13 +20,14 @@ #include "pyarrow/adapters/builtin.h" -#include <arrow/api.h> +#include "arrow/api.h" +#include "arrow/util/status.h" #include "pyarrow/helpers.h" -#include "pyarrow/status.h" using arrow::ArrayBuilder; using arrow::DataType; +using arrow::Status; using arrow::Type; namespace pyarrow { @@ -129,7 +130,7 @@ class SeqVisitor { PyObject* item = item_ref.obj(); if (PyList_Check(item)) { - PY_RETURN_NOT_OK(Visit(item, level + 1)); + RETURN_NOT_OK(Visit(item, level + 1)); } else if (PyDict_Check(item)) { return Status::NotImplemented("No type inference for dicts"); } else { @@ -164,9 +165,9 @@ class SeqVisitor { Status Validate() const { if (scalars_.total_count() > 0) { if (num_nesting_levels() > 1) { - return Status::ValueError("Mixed nesting levels not supported"); + return Status::Invalid("Mixed nesting levels not supported"); } else if (max_observed_level() < max_nesting_level_) { - return Status::ValueError("Mixed nesting levels not supported"); + return Status::Invalid("Mixed nesting levels not supported"); } } return Status::OK(); @@ -216,8 +217,8 @@ static Status InferArrowType(PyObject* obj, int64_t* size, } SeqVisitor seq_visitor; - PY_RETURN_NOT_OK(seq_visitor.Visit(obj)); - PY_RETURN_NOT_OK(seq_visitor.Validate()); + RETURN_NOT_OK(seq_visitor.Visit(obj)); + RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); @@ -259,7 +260,7 @@ class BoolConverter : public TypedConverter<arrow::BooleanBuilder> { public: Status AppendData(PyObject* seq) override { Py_ssize_t size = PySequence_Size(seq); - RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -281,7 +282,7 @@ class Int64Converter : public TypedConverter<arrow::Int64Builder> { Status AppendData(PyObject* seq) override { int64_t val; Py_ssize_t size = PySequence_Size(seq); - RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -301,7 +302,7 @@ class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> { Status AppendData(PyObject* seq) override { double val; Py_ssize_t size = PySequence_Size(seq); - RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -330,7 +331,7 @@ class StringConverter : public TypedConverter<arrow::StringBuilder> { OwnedRef holder(item); if (item == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + RETURN_NOT_OK(typed_builder_->AppendNull()); continue; } else if (PyUnicode_Check(item)) { tmp.reset(PyUnicode_AsUTF8String(item)); @@ -344,7 +345,7 @@ class StringConverter : public TypedConverter<arrow::StringBuilder> { // No error checking length = PyBytes_GET_SIZE(bytes_obj); bytes = PyBytes_AS_STRING(bytes_obj); - RETURN_ARROW_NOT_OK(typed_builder_->Append(bytes, length)); + RETURN_NOT_OK(typed_builder_->Append(bytes, length)); } return Status::OK(); } @@ -359,10 +360,10 @@ class ListConverter : public TypedConverter<arrow::ListBuilder> { for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + RETURN_NOT_OK(typed_builder_->AppendNull()); } else { typed_builder_->Append(); - PY_RETURN_NOT_OK(value_converter_->AppendData(item.obj())); + RETURN_NOT_OK(value_converter_->AppendData(item.obj())); } } return Status::OK(); @@ -408,7 +409,7 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) { Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) { std::shared_ptr<DataType> type; int64_t size; - PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + RETURN_NOT_OK(InferArrowType(obj, &size, &type)); // Handle NA / NullType case if (type->type == Type::NA) { @@ -426,14 +427,12 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) { // Give the sequence converter an array builder std::shared_ptr<ArrayBuilder> builder; - RETURN_ARROW_NOT_OK(arrow::MakeBuilder(get_memory_pool(), type, &builder)); + RETURN_NOT_OK(arrow::MakeBuilder(get_memory_pool(), type, &builder)); converter->Init(builder); - PY_RETURN_NOT_OK(converter->AppendData(obj)); + RETURN_NOT_OK(converter->AppendData(obj)); - *out = builder->Finish(); - - return Status::OK(); + return builder->Finish(out); } } // namespace pyarrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/src/pyarrow/adapters/builtin.h ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 4e997e3..2ddfdaa 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -30,14 +30,15 @@ #include "pyarrow/common.h" #include "pyarrow/visibility.h" -namespace arrow { class Array; } +namespace arrow { +class Array; +class Status; +} namespace pyarrow { -class Status; - PYARROW_EXPORT -Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out); +arrow::Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out); } // namespace pyarrow http://git-wip-us.apache.org/repos/asf/arrow/blob/732a2059/python/src/pyarrow/adapters/pandas.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index b2fcd37..5902b83 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -31,10 +31,10 @@ #include "arrow/api.h" #include "arrow/util/bit-util.h" +#include "arrow/util/status.h" #include "pyarrow/common.h" #include "pyarrow/config.h" -#include "pyarrow/status.h" namespace pyarrow { @@ -42,6 +42,8 @@ using arrow::Array; using arrow::Column; using arrow::Field; using arrow::DataType; +using arrow::Status; + namespace util = arrow::util; // ---------------------------------------------------------------------- @@ -149,7 +151,7 @@ class ArrowSerializer { int null_bytes = util::bytes_for_bits(length_); null_bitmap_ = std::make_shared<arrow::PoolBuffer>(pool_); - RETURN_ARROW_NOT_OK(null_bitmap_->Resize(null_bytes)); + RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); null_bitmap_data_ = null_bitmap_->mutable_data(); memset(null_bitmap_data_, 0, null_bytes); @@ -171,9 +173,9 @@ class ArrowSerializer { PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); arrow::TypePtr string_type(new arrow::StringType()); arrow::StringBuilder string_builder(pool_, string_type); - RETURN_ARROW_NOT_OK(string_builder.Resize(length_)); + RETURN_NOT_OK(string_builder.Resize(length_)); - arrow::Status s; + Status s; PyObject* obj; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; @@ -187,18 +189,16 @@ class ArrowSerializer { s = string_builder.Append(PyBytes_AS_STRING(obj), length); Py_DECREF(obj); if (!s.ok()) { - return Status::ArrowError(s.ToString()); + return s; } } else if (PyBytes_Check(obj)) { const int32_t length = PyBytes_GET_SIZE(obj); - RETURN_ARROW_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length)); + RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length)); } else { string_builder.AppendNull(); } } - *out = std::shared_ptr<arrow::Array>(string_builder.Finish()); - - return Status::OK(); + return string_builder.Finish(out); } Status ConvertBooleans(std::shared_ptr<Array>* out) { @@ -208,7 +208,7 @@ class ArrowSerializer { int nbytes = util::bytes_for_bits(length_); auto data = std::make_shared<arrow::PoolBuffer>(pool_); - RETURN_ARROW_NOT_OK(data->Resize(nbytes)); + RETURN_NOT_OK(data->Resize(nbytes)); uint8_t* bitmap = data->mutable_data(); memset(bitmap, 0, nbytes); @@ -305,7 +305,7 @@ inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(std::shared_ptr<DataTy unit = arrow::TimestampType::Unit::NANO; break; default: - return Status::ValueError("Unknown NumPy datetime unit"); + return Status::Invalid("Unknown NumPy datetime unit"); } out->reset(new arrow::TimestampType(unit)); @@ -330,7 +330,7 @@ inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) { RETURN_NOT_OK(ConvertData()); std::shared_ptr<DataType> type; RETURN_NOT_OK(MakeDataType(&type)); - RETURN_ARROW_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out)); + RETURN_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out)); return Status::OK(); } @@ -389,7 +389,7 @@ template <int TYPE> inline Status ArrowSerializer<TYPE>::ConvertData() { // TODO(wesm): strided arrays if (is_strided()) { - return Status::ValueError("no support for strided data yet"); + return Status::Invalid("no support for strided data yet"); } data_ = std::make_shared<NumPyBuffer>(arr_); @@ -399,12 +399,12 @@ inline Status ArrowSerializer<TYPE>::ConvertData() { template <> inline Status ArrowSerializer<NPY_BOOL>::ConvertData() { if (is_strided()) { - return Status::ValueError("no support for strided data yet"); + return Status::Invalid("no support for strided data yet"); } int nbytes = util::bytes_for_bits(length_); auto buffer = std::make_shared<arrow::PoolBuffer>(pool_); - RETURN_ARROW_NOT_OK(buffer->Resize(nbytes)); + RETURN_NOT_OK(buffer->Resize(nbytes)); const uint8_t* values = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_)); @@ -446,7 +446,7 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, } if (PyArray_NDIM(arr) != 1) { - return Status::ValueError("only handle 1-dimensional arrays"); + return Status::Invalid("only handle 1-dimensional arrays"); } switch(PyArray_DESCR(arr)->type_num) {
