Repository: arrow Updated Branches: refs/heads/master 949249d9e -> 7d433dc27
ARROW-483: [C++/Python] Provide access to "custom_metadata" Field attribute in IPC setting Author: Phillip Cloud <[email protected]> Closes #588 from cpcloud/ARROW-483 and squashes the following commits: f671ba4 [Phillip Cloud] ARROW-483: [C++/Python] Provide access to "custom_metadata" Field attribute in IPC setting Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7d433dc2 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7d433dc2 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7d433dc2 Branch: refs/heads/master Commit: 7d433dc27bf70b5d80b8c88261a19cdc615defdb Parents: 949249d Author: Phillip Cloud <[email protected]> Authored: Tue Apr 25 17:36:31 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Tue Apr 25 17:36:31 2017 -0400 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 1 + cpp/src/arrow/array.cc | 2 +- cpp/src/arrow/builder.cc | 13 ++- cpp/src/arrow/ipc/metadata.cc | 30 ++++++- cpp/src/arrow/type-test.cc | 34 ++++++++ cpp/src/arrow/type.cc | 20 ++++- cpp/src/arrow/type.h | 10 ++- cpp/src/arrow/util/CMakeLists.txt | 2 + cpp/src/arrow/util/key-value-metadata-test.cc | 87 +++++++++++++++++++ cpp/src/arrow/util/key_value_metadata.cc | 99 ++++++++++++++++++++++ cpp/src/arrow/util/key_value_metadata.h | 56 ++++++++++++ format/Schema.fbs | 2 +- python/.gitignore | 1 + python/pyarrow/_array.pxd | 2 + python/pyarrow/_array.pyx | 7 ++ python/pyarrow/_table.pyx | 64 ++++++++------ python/pyarrow/includes/common.pxd | 3 +- python/pyarrow/includes/libarrow.pxd | 11 ++- 18 files changed, 401 insertions(+), 43 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2d8c00f..5abe5f1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -944,6 +944,7 @@ set(ARROW_SRCS src/arrow/util/bit-util.cc src/arrow/util/decimal.cc + src/arrow/util/key_value_metadata.cc ) if (ARROW_IPC) http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index e640bbd..76dda2c 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -113,7 +113,7 @@ Status Array::Validate() const { static inline void ConformSliceParams( int64_t array_offset, int64_t array_length, int64_t* offset, int64_t* length) { DCHECK_LE(*offset, array_length); - DCHECK_GE(offset, 0); + DCHECK_NE(offset, nullptr); *length = std::min(array_length - *offset, *length); *offset = array_offset + *offset; } http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index d85eb32..4ecb8d3 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -363,8 +363,6 @@ ARROW_EXPORT Status DecimalBuilder::Append(const decimal::Decimal128& value) { return Status::OK(); } -template ARROW_EXPORT Status DecimalBuilder::Append(const decimal::Decimal128& val); - Status DecimalBuilder::Init(int64_t capacity) { RETURN_NOT_OK(FixedSizeBinaryBuilder::Init(capacity)); if (byte_width_ == 16) { @@ -408,16 +406,17 @@ Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) { ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> value_builder, const std::shared_ptr<DataType>& type) - : ArrayBuilder( - pool, type ? type : std::static_pointer_cast<DataType>( - std::make_shared<ListType>(value_builder->type()))), + : ArrayBuilder(pool, + type ? type : std::static_pointer_cast<DataType>( + std::make_shared<ListType>(value_builder->type()))), offset_builder_(pool), value_builder_(value_builder) {} ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values, const std::shared_ptr<DataType>& type) - : ArrayBuilder(pool, type ? type : std::static_pointer_cast<DataType>( - std::make_shared<ListType>(values->type()))), + : ArrayBuilder(pool, + type ? type : std::static_pointer_cast<DataType>( + std::make_shared<ListType>(values->type()))), offset_builder_(pool), values_(values) {} http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/ipc/metadata.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index 791948b..c0b518a 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -45,6 +45,7 @@ namespace ipc { using FBB = flatbuffers::FlatBufferBuilder; using DictionaryOffset = flatbuffers::Offset<flatbuf::DictionaryEncoding>; using FieldOffset = flatbuffers::Offset<flatbuf::Field>; +using KeyValueOffset = flatbuffers::Offset<flatbuf::KeyValue>; using RecordBatchOffset = flatbuffers::Offset<flatbuf::RecordBatch>; using VectorLayoutOffset = flatbuffers::Offset<arrow::flatbuf::VectorLayout>; using Offset = flatbuffers::Offset<void>; @@ -583,6 +584,7 @@ flatbuf::Endianness endianness() { static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema, DictionaryMemo* dictionary_memo, flatbuffers::Offset<flatbuf::Schema>* out) { + /// Fields std::vector<FieldOffset> field_offsets; for (int i = 0; i < schema.num_fields(); ++i) { std::shared_ptr<Field> field = schema.field(i); @@ -591,7 +593,20 @@ static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema, field_offsets.push_back(offset); } - *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets)); + /// Custom metadata + const auto& custom_metadata_ = schema.custom_metadata(); + std::vector<KeyValueOffset> key_value_offsets; + size_t metadata_size = custom_metadata_.size(); + key_value_offsets.reserve(metadata_size); + for (size_t i = 0; i < metadata_size; ++i) { + const auto& key = custom_metadata_.key(i); + const auto& value = custom_metadata_.value(i); + key_value_offsets.push_back( + flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value))); + } + + *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets), + fbb.CreateVector(key_value_offsets)); return Status::OK(); } @@ -939,7 +954,18 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem const flatbuf::Field* field = schema->fields()->Get(i); RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i])); } - *out = std::make_shared<Schema>(fields); + + KeyValueMetadata custom_metadata; + auto fb_metadata = schema->custom_metadata(); + if (fb_metadata != nullptr) { + custom_metadata.reserve(fb_metadata->size()); + + for (const auto& pair : *fb_metadata) { + custom_metadata.Append(pair->key()->str(), pair->value()->str()); + } + } + + *out = std::make_shared<Schema>(fields, custom_metadata); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/type-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index dec7268..8e2dfd5 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -117,6 +117,40 @@ TEST_F(TestSchema, GetFieldByName) { ASSERT_TRUE(result == nullptr); } +TEST_F(TestSchema, TestCustomMetadataConstruction) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"}); + auto schema = std::make_shared<Schema>(fields, metadata); + ASSERT_TRUE(metadata.Equals(schema->custom_metadata())); +} + +TEST_F(TestSchema, TestAddCustomMetadata) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"}); + auto schema = std::make_shared<Schema>(fields); + std::shared_ptr<Schema> new_schema; + schema->AddCustomMetadata(metadata, &new_schema); + ASSERT_TRUE(metadata.Equals(new_schema->custom_metadata())); +} + +TEST_F(TestSchema, TestRemoveCustomMetadata) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"}); + auto schema = std::make_shared<Schema>(fields); + std::shared_ptr<Schema> new_schema; + schema->RemoveCustomMetadata(&new_schema); + ASSERT_EQ(0, new_schema->custom_metadata().size()); +} + #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 2e454ae..f59f8fb 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -24,6 +24,7 @@ #include "arrow/array.h" #include "arrow/compare.h" #include "arrow/status.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/stl.h" #include "arrow/visitor.h" @@ -231,7 +232,9 @@ std::string NullType::ToString() const { // ---------------------------------------------------------------------- // Schema implementation -Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields) : fields_(fields) {} +Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields, + const KeyValueMetadata& custom_metadata) + : fields_(fields), custom_metadata_(custom_metadata) {} bool Schema::Equals(const Schema& other) const { if (this == &other) { return true; } @@ -263,7 +266,18 @@ Status Schema::AddField( DCHECK_GE(i, 0); DCHECK_LE(i, this->num_fields()); - *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field)); + *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field), custom_metadata_); + return Status::OK(); +} + +Status Schema::AddCustomMetadata( + const KeyValueMetadata& custom_metadata, std::shared_ptr<Schema>* out) const { + *out = std::make_shared<Schema>(fields_, custom_metadata); + return Status::OK(); +} + +Status Schema::RemoveCustomMetadata(std::shared_ptr<Schema>* out) { + *out = std::make_shared<Schema>(fields_, KeyValueMetadata()); return Status::OK(); } @@ -271,7 +285,7 @@ Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const { DCHECK_GE(i, 0); DCHECK_LT(i, this->num_fields()); - *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i)); + *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i), custom_metadata_); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index ea4ea03..dc94561 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -28,6 +28,7 @@ #include "arrow/status.h" #include "arrow/type_fwd.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" #include "arrow/visitor.h" @@ -677,7 +678,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { class ARROW_EXPORT Schema { public: - explicit Schema(const std::vector<std::shared_ptr<Field>>& fields); + explicit Schema(const std::vector<std::shared_ptr<Field>>& fields, + const KeyValueMetadata& custom_metadata = KeyValueMetadata()); // Returns true if all of the schema fields are equal bool Equals(const Schema& other) const; @@ -689,6 +691,7 @@ class ARROW_EXPORT Schema { std::shared_ptr<Field> GetFieldByName(const std::string& name); const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; } + const KeyValueMetadata& custom_metadata() const { return custom_metadata_; } // Render a string representation of the schema suitable for debugging std::string ToString() const; @@ -697,11 +700,16 @@ class ARROW_EXPORT Schema { int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const; Status RemoveField(int i, std::shared_ptr<Schema>* out) const; + Status AddCustomMetadata( + const KeyValueMetadata& metadata, std::shared_ptr<Schema>* out) const; + Status RemoveCustomMetadata(std::shared_ptr<Schema>* out); + int num_fields() const { return static_cast<int>(fields_.size()); } private: std::vector<std::shared_ptr<Field>> fields_; std::unordered_map<std::string, int> name_to_index_; + KeyValueMetadata custom_metadata_; }; // ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index b22c8ac..ac7e866 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -26,6 +26,7 @@ install(FILES macros.h random.h visibility.h + key_value_metadata.h DESTINATION include/arrow/util) ####################################### @@ -52,3 +53,4 @@ endif() ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(stl-util-test) ADD_ARROW_TEST(decimal-test) +ADD_ARROW_TEST(key-value-metadata-test) http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/key-value-metadata-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/key-value-metadata-test.cc b/cpp/src/arrow/util/key-value-metadata-test.cc new file mode 100644 index 0000000..aadc989 --- /dev/null +++ b/cpp/src/arrow/util/key-value-metadata-test.cc @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include "arrow/util/key_value_metadata.h" + +#include "arrow/test-util.h" + +namespace arrow { + +TEST(KeyValueMetadataTest, SimpleConstruction) { + KeyValueMetadata metadata; + ASSERT_EQ(0, metadata.size()); +} + +TEST(KeyValueMetadataTest, StringVectorConstruction) { + std::vector<std::string> keys = {"foo", "bar"}; + std::vector<std::string> values = {"bizz", "buzz"}; + + KeyValueMetadata metadata(keys, values); + ASSERT_EQ("foo", metadata.key(0)); + ASSERT_EQ("bar", metadata.key(1)); + ASSERT_EQ("bizz", metadata.value(0)); + ASSERT_EQ("buzz", metadata.value(1)); + ASSERT_EQ(2, metadata.size()); +} + +TEST(KeyValueMetadataTest, StringMapConstruction) { + std::unordered_map<std::string, std::string> pairs = {{"foo", "bizz"}, {"bar", "buzz"}}; + std::unordered_map<std::string, std::string> result_map; + result_map.reserve(pairs.size()); + + KeyValueMetadata metadata(pairs); + metadata.ToUnorderedMap(&result_map); + ASSERT_EQ(pairs, result_map); + ASSERT_EQ(2, metadata.size()); +} + +TEST(KeyValueMetadataTest, StringAppend) { + std::vector<std::string> keys = {"foo", "bar"}; + std::vector<std::string> values = {"bizz", "buzz"}; + + KeyValueMetadata metadata(keys, values); + ASSERT_EQ("foo", metadata.key(0)); + ASSERT_EQ("bar", metadata.key(1)); + ASSERT_EQ("bizz", metadata.value(0)); + ASSERT_EQ("buzz", metadata.value(1)); + ASSERT_EQ(2, metadata.size()); + + metadata.Append("purple", "orange"); + metadata.Append("blue", "red"); + + ASSERT_EQ("purple", metadata.key(2)); + ASSERT_EQ("blue", metadata.key(3)); + + ASSERT_EQ("orange", metadata.value(2)); + ASSERT_EQ("red", metadata.value(3)); +} + +TEST(KeyValueMetadataTest, Equals) { + std::vector<std::string> keys = {"foo", "bar"}; + std::vector<std::string> values = {"bizz", "buzz"}; + + KeyValueMetadata metadata(keys, values); + KeyValueMetadata metadata2(keys, values); + KeyValueMetadata metadata3(keys, {"buzz", "bizz"}); + + ASSERT_TRUE(metadata.Equals(metadata2)); + ASSERT_FALSE(metadata.Equals(metadata3)); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/key_value_metadata.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc new file mode 100644 index 0000000..c91478b --- /dev/null +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> + +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" + +namespace arrow { + +static std::vector<std::string> UnorderedMapKeys( + const std::unordered_map<std::string, std::string>& map) { + std::vector<std::string> keys; + keys.reserve(map.size()); + for (const auto& pair : map) { + keys.push_back(pair.first); + } + return keys; +} + +static std::vector<std::string> UnorderedMapValues( + const std::unordered_map<std::string, std::string>& map) { + std::vector<std::string> values; + values.reserve(map.size()); + for (const auto& pair : map) { + values.push_back(pair.second); + } + return values; +} + +KeyValueMetadata::KeyValueMetadata() : keys_(), values_() {} + +KeyValueMetadata::KeyValueMetadata( + const std::unordered_map<std::string, std::string>& map) + : keys_(UnorderedMapKeys(map)), values_(UnorderedMapValues(map)) {} + +KeyValueMetadata::KeyValueMetadata( + const std::vector<std::string>& keys, const std::vector<std::string>& values) + : keys_(keys), values_(values) { + DCHECK_EQ(keys.size(), values.size()); +} + +void KeyValueMetadata::ToUnorderedMap( + std::unordered_map<std::string, std::string>* out) const { + DCHECK_NE(out, nullptr); + const int64_t n = size(); + out->reserve(n); + for (int64_t i = 0; i < n; ++i) { + out->insert(std::make_pair(key(i), value(i))); + } +} + +void KeyValueMetadata::Append(const std::string& key, const std::string& value) { + keys_.push_back(key); + values_.push_back(value); +} + +void KeyValueMetadata::reserve(int64_t n) { + DCHECK_GE(n, 0); + const auto m = static_cast<size_t>(n); + keys_.reserve(m); + values_.reserve(m); +} + +int64_t KeyValueMetadata::size() const { + DCHECK_EQ(keys_.size(), values_.size()); + return static_cast<int64_t>(keys_.size()); +} + +std::string KeyValueMetadata::key(int64_t i) const { + DCHECK_GE(i, 0); + return keys_[static_cast<size_t>(i)]; +} + +std::string KeyValueMetadata::value(int64_t i) const { + DCHECK_GE(i, 0); + return values_[static_cast<size_t>(i)]; +} + +bool KeyValueMetadata::Equals(const KeyValueMetadata& other) const { + return size() == other.size() && + std::equal(keys_.cbegin(), keys_.cend(), other.keys_.cbegin()) && + std::equal(values_.cbegin(), values_.cend(), other.values_.cbegin()); +} +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/cpp/src/arrow/util/key_value_metadata.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h new file mode 100644 index 0000000..713b2c0 --- /dev/null +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_KEY_VALUE_METADATA_H +#define ARROW_UTIL_KEY_VALUE_METADATA_H + +#include <cstdint> +#include <string> +#include <unordered_map> +#include <vector> + +#include "arrow/util/visibility.h" + +namespace arrow { + +class ARROW_EXPORT KeyValueMetadata { + public: + KeyValueMetadata(); + KeyValueMetadata( + const std::vector<std::string>& keys, const std::vector<std::string>& values); + explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map); + + void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const; + + void Append(const std::string& key, const std::string& value); + + void reserve(int64_t n); + int64_t size() const; + + std::string key(int64_t i) const; + std::string value(int64_t i) const; + + bool Equals(const KeyValueMetadata& other) const; + + private: + std::vector<std::string> keys_; + std::vector<std::string> values_; +}; + +} // namespace arrow + +#endif // ARROW_UTIL_KEY_VALUE_METADATA_H http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/format/Schema.fbs ---------------------------------------------------------------------- diff --git a/format/Schema.fbs b/format/Schema.fbs index b48859f..8de5c6d 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -200,7 +200,7 @@ table VectorLayout { table KeyValue { key: string; - value: [ubyte]; + value: string; } /// ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/.gitignore ---------------------------------------------------------------------- diff --git a/python/.gitignore b/python/.gitignore index ba40c3e..6c0d5a9 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -33,3 +33,4 @@ coverage.xml # benchmark working dir .asv +pyarrow/_table_api.h http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/_array.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd index 464de31..4d5db86 100644 --- a/python/pyarrow/_array.pxd +++ b/python/pyarrow/_array.pxd @@ -81,6 +81,8 @@ cdef class Schema: cdef init(self, const vector[shared_ptr[CField]]& fields) cdef init_schema(self, const shared_ptr[CSchema]& schema) + cpdef dict custom_metadata(self) + cdef class Scalar: cdef readonly: http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/_array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index 1c571ba..2fb20b7 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -244,6 +244,13 @@ cdef class Schema: self.schema = schema.get() self.sp_schema = schema + cpdef dict custom_metadata(self): + cdef: + CKeyValueMetadata metadata = self.schema.custom_metadata() + unordered_map[c_string, c_string] result + metadata.ToUnorderedMap(&result) + return result + def equals(self, other): """ Test if this schema is equal to the other http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/_table.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx index 78fec75..ed0782b 100644 --- a/python/pyarrow/_table.pyx +++ b/python/pyarrow/_table.pyx @@ -34,7 +34,6 @@ from pyarrow._error import ArrowException from pyarrow._array import field from pyarrow.compat import frombytes, tobytes - from collections import OrderedDict @@ -273,15 +272,22 @@ cdef class Column: return chunked_array -cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema): +cdef CKeyValueMetadata key_value_metadata_from_dict(dict metadata): + cdef: + unordered_map[c_string, c_string] unordered_metadata = metadata + CKeyValueMetadata c_metadata = CKeyValueMetadata(unordered_metadata) + return c_metadata + + +cdef int _schema_from_arrays( + arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1: cdef: Array arr Column col c_string c_name vector[shared_ptr[CField]] fields - cdef shared_ptr[CDataType] type_ - - cdef int K = len(arrays) + shared_ptr[CDataType] type_ + int K = len(arrays) fields.resize(K) @@ -306,15 +312,16 @@ cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema): else: raise TypeError(type(arrays[0])) - schema.reset(new CSchema(fields)) - + schema.reset(new CSchema(fields, key_value_metadata_from_dict(metadata))) + return 0 -cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema): +cdef tuple _dataframe_to_arrays(df, bint timestamps_to_ms, Schema schema): cdef: list names = [] list arrays = [] DataType type = None + dict metadata = {} for name in df.columns: col = df[name] @@ -326,7 +333,7 @@ cdef _dataframe_to_arrays(df, timestamps_to_ms, Schema schema): names.append(name) arrays.append(arr) - return names, arrays + return names, arrays, metadata cdef class RecordBatch: @@ -486,11 +493,11 @@ cdef class RecordBatch: ------- pyarrow.table.RecordBatch """ - names, arrays = _dataframe_to_arrays(df, False, schema) - return cls.from_arrays(arrays, names) + names, arrays, metadata = _dataframe_to_arrays(df, False, schema) + return cls.from_arrays(arrays, names, metadata) @staticmethod - def from_arrays(arrays, names): + def from_arrays(list arrays, list names, dict metadata=None): """ Construct a RecordBatch from multiple pyarrow.Arrays @@ -512,15 +519,17 @@ cdef class RecordBatch: shared_ptr[CRecordBatch] batch vector[shared_ptr[CArray]] c_arrays int64_t num_rows + int64_t i + int64_t number_of_arrays = len(arrays) - if len(arrays) == 0: + if not number_of_arrays: raise ValueError('Record batch cannot contain no arrays (for now)') num_rows = len(arrays[0]) - _schema_from_arrays(arrays, names, &schema) + _schema_from_arrays(arrays, names, metadata or {}, &schema) - for i in range(len(arrays)): - arr = arrays[i] + c_arrays.reserve(len(arrays)) + for arr in arrays: c_arrays.push_back(arr.sp_array) batch.reset(new CRecordBatch(schema, num_rows, c_arrays)) @@ -656,13 +665,13 @@ cdef class Table: >>> pa.Table.from_pandas(df) <pyarrow.table.Table object at 0x7f05d1fb1b40> """ - names, arrays = _dataframe_to_arrays(df, + names, arrays, metadata = _dataframe_to_arrays(df, timestamps_to_ms=timestamps_to_ms, schema=schema) - return cls.from_arrays(arrays, names=names) + return cls.from_arrays(arrays, names=names, metadata=metadata) @staticmethod - def from_arrays(arrays, names=None): + def from_arrays(arrays, names=None, dict metadata=None): """ Construct a Table from Arrow arrays or columns @@ -680,22 +689,25 @@ cdef class Table: """ cdef: - vector[shared_ptr[CField]] fields vector[shared_ptr[CColumn]] columns shared_ptr[CSchema] schema shared_ptr[CTable] table + size_t K = len(arrays) - _schema_from_arrays(arrays, names, &schema) + _schema_from_arrays(arrays, names, metadata or {}, &schema) - cdef int K = len(arrays) - columns.resize(K) + columns.reserve(K) for i in range(K): if isinstance(arrays[i], Array): - columns[i].reset(new CColumn(schema.get().field(i), - (<Array> arrays[i]).sp_array)) + columns.push_back( + make_shared[CColumn]( + schema.get().field(i), + (<Array> arrays[i]).sp_array + ) + ) elif isinstance(arrays[i], Column): - columns[i] = (<Column> arrays[i]).sp_column + columns.push_back((<Column> arrays[i]).sp_column) else: raise ValueError(type(arrays[i])) http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/includes/common.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 44723fa..cc3b4b6 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -19,9 +19,10 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool -from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.memory cimport shared_ptr, unique_ptr, make_shared from libcpp.string cimport string as c_string from libcpp.vector cimport vector +from libcpp.unordered_map cimport unordered_map from cpython cimport PyObject cimport cpython http://git-wip-us.apache.org/repos/asf/arrow/blob/7d433dc2/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 473a0b9..ef1a332 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1,4 +1,4 @@ -#t Licensed to the Apache Software Foundation (ASF) under one +# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file @@ -19,6 +19,12 @@ from pyarrow.includes.common cimport * +cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: + cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata": + CKeyValueMetadata() + CKeyValueMetadata(const unordered_map[c_string, c_string]&) + void ToUnorderedMap(unordered_map[c_string, c_string]*) const + cdef extern from "arrow/api.h" namespace "arrow" nogil: enum Type" arrow::Type::type": @@ -170,10 +176,13 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) + CSchema(const vector[shared_ptr[CField]]& fields, + const CKeyValueMetadata& custom_metadata) c_bool Equals(const CSchema& other) shared_ptr[CField] field(int i) + const CKeyValueMetadata& custom_metadata() const shared_ptr[CField] GetFieldByName(c_string& name) int num_fields() c_string ToString()
