Repository: arrow Updated Branches: refs/heads/master 3fdeac74c -> e876abbdf
ARROW-898: [C++/Python] Use shared_ptr to avoid copying KeyValueMetadata, add to Field type also This also adds support for adding and removing schema-level metadata to the Python Schema wrapper object. Need to do the same for Field but putting this up for @cpcloud to review since he's working on using this in parquet-cpp Author: Wes McKinney <[email protected]> Closes #605 from wesm/ARROW-898 and squashes the following commits: 03873f7 [Wes McKinney] RemoveMetadata not return Status c621c2c [Wes McKinney] Add metadata methods to Field, some code cleaning 581b9fa [Wes McKinney] Add unit tests for passing metadata to Field constructor 51fae29 [Wes McKinney] Add metadata to Field 2ce4003 [Wes McKinney] Test sharing metadata 48aa3ca [Wes McKinney] Use shared_ptr<const T> for KeyValueMetadata so metadata can be shared / not copied Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/e876abbd Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/e876abbd Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/e876abbd Branch: refs/heads/master Commit: e876abbdf4f7bac53ae5d56f4680259f021ea8d9 Parents: 3fdeac7 Author: Wes McKinney <[email protected]> Authored: Thu Apr 27 09:28:54 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Thu Apr 27 09:28:54 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/builder.cc | 11 +- cpp/src/arrow/io/io-memory-benchmark.cc | 6 +- cpp/src/arrow/io/memory.cc | 4 +- cpp/src/arrow/ipc/metadata.cc | 38 ++-- cpp/src/arrow/type-test.cc | 61 ++++-- cpp/src/arrow/type.cc | 60 ++++-- cpp/src/arrow/type.h | 32 ++- cpp/src/arrow/util/key-value-metadata-test.cc | 9 + cpp/src/arrow/util/key_value_metadata.cc | 4 + cpp/src/arrow/util/key_value_metadata.h | 6 + cpp/src/arrow/util/memory.h | 8 +- python/pyarrow/_array.pxd | 2 - python/pyarrow/_array.pyx | 220 +++++++++++++++++---- python/pyarrow/_table.pyx | 7 +- python/pyarrow/includes/libarrow.pxd | 31 ++- python/pyarrow/tests/pandas_examples.py | 4 +- python/pyarrow/tests/test_convert_pandas.py | 64 +++--- python/pyarrow/tests/test_schema.py | 48 ++++- 18 files changed, 466 insertions(+), 149 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 4ecb8d3..ab43c2a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -406,17 +406,16 @@ Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) { ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> value_builder, const std::shared_ptr<DataType>& type) - : ArrayBuilder(pool, - type ? type : std::static_pointer_cast<DataType>( - std::make_shared<ListType>(value_builder->type()))), + : ArrayBuilder( + pool, type ? type : std::static_pointer_cast<DataType>( + std::make_shared<ListType>(value_builder->type()))), offset_builder_(pool), value_builder_(value_builder) {} ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values, const std::shared_ptr<DataType>& type) - : ArrayBuilder(pool, - type ? type : std::static_pointer_cast<DataType>( - std::make_shared<ListType>(values->type()))), + : ArrayBuilder(pool, type ? type : std::static_pointer_cast<DataType>( + std::make_shared<ListType>(values->type()))), offset_builder_(pool), values_(values) {} http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/io/io-memory-benchmark.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/io-memory-benchmark.cc b/cpp/src/arrow/io/io-memory-benchmark.cc index 59b511a..6aa9577 100644 --- a/cpp/src/arrow/io/io-memory-benchmark.cc +++ b/cpp/src/arrow/io/io-memory-benchmark.cc @@ -26,7 +26,7 @@ namespace arrow { static void BM_SerialMemcopy(benchmark::State& state) { // NOLINT non-const reference - constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB + constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB auto buffer1 = std::make_shared<PoolBuffer>(default_memory_pool()); buffer1->Resize(kTotalSize); @@ -43,7 +43,7 @@ static void BM_SerialMemcopy(benchmark::State& state) { // NOLINT non-const ref } static void BM_ParallelMemcopy(benchmark::State& state) { // NOLINT non-const reference - constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB + constexpr int64_t kTotalSize = 100 * 1024 * 1024; // 100MB auto buffer1 = std::make_shared<PoolBuffer>(default_memory_pool()); buffer1->Resize(kTotalSize); @@ -72,4 +72,4 @@ BENCHMARK(BM_ParallelMemcopy) ->MinTime(1.0) ->UseRealTime(); -} // namespace arrow +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/io/memory.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 95c6206..faf02d2 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -140,8 +140,8 @@ Status FixedSizeBufferWriter::Tell(int64_t* position) { Status FixedSizeBufferWriter::Write(const uint8_t* data, int64_t nbytes) { if (nbytes > memcopy_threshold_ && memcopy_num_threads_ > 1) { - parallel_memcopy(mutable_data_ + position_, data, nbytes, - memcopy_blocksize_, memcopy_num_threads_); + parallel_memcopy(mutable_data_ + position_, data, nbytes, memcopy_blocksize_, + memcopy_num_threads_); } else { memcpy(mutable_data_ + position_, data, nbytes); } http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/ipc/metadata.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index c0b518a..706ab2e 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -593,20 +593,27 @@ static Status SchemaToFlatbuffer(FBB& fbb, const Schema& schema, field_offsets.push_back(offset); } + auto fb_offsets = fbb.CreateVector(field_offsets); + /// Custom metadata - const auto& custom_metadata_ = schema.custom_metadata(); - std::vector<KeyValueOffset> key_value_offsets; - size_t metadata_size = custom_metadata_.size(); - key_value_offsets.reserve(metadata_size); - for (size_t i = 0; i < metadata_size; ++i) { - const auto& key = custom_metadata_.key(i); - const auto& value = custom_metadata_.value(i); - key_value_offsets.push_back( - flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value))); + const KeyValueMetadata* metadata = schema.metadata().get(); + + if (metadata != nullptr) { + std::vector<KeyValueOffset> key_value_offsets; + size_t metadata_size = metadata->size(); + key_value_offsets.reserve(metadata_size); + for (size_t i = 0; i < metadata_size; ++i) { + const auto& key = metadata->key(i); + const auto& value = metadata->value(i); + key_value_offsets.push_back( + flatbuf::CreateKeyValue(fbb, fbb.CreateString(key), fbb.CreateString(value))); + } + *out = flatbuf::CreateSchema( + fbb, endianness(), fb_offsets, fbb.CreateVector(key_value_offsets)); + } else { + *out = flatbuf::CreateSchema(fbb, endianness(), fb_offsets); } - *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets), - fbb.CreateVector(key_value_offsets)); return Status::OK(); } @@ -955,17 +962,16 @@ Status GetSchema(const void* opaque_schema, const DictionaryMemo& dictionary_mem RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i])); } - KeyValueMetadata custom_metadata; + auto metadata = std::make_shared<KeyValueMetadata>(); auto fb_metadata = schema->custom_metadata(); if (fb_metadata != nullptr) { - custom_metadata.reserve(fb_metadata->size()); - + metadata->reserve(fb_metadata->size()); for (const auto& pair : *fb_metadata) { - custom_metadata.Append(pair->key()->str(), pair->value()->str()); + metadata->Append(pair->key()->str(), pair->value()->str()); } } - *out = std::make_shared<Schema>(fields, custom_metadata); + *out = std::make_shared<Schema>(fields, metadata); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/type-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc index 8e2dfd5..e73adec 100644 --- a/cpp/src/arrow/type-test.cc +++ b/cpp/src/arrow/type-test.cc @@ -23,6 +23,7 @@ #include "gtest/gtest.h" +#include "arrow/test-util.h" #include "arrow/type.h" using std::shared_ptr; @@ -50,6 +51,40 @@ TEST(TestField, Equals) { ASSERT_FALSE(f0.Equals(f0_nn)); } +TEST(TestField, TestMetadataConstruction) { + auto metadata = std::shared_ptr<KeyValueMetadata>( + new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); + auto metadata2 = metadata->Copy(); + auto f0 = field("f0", int32(), true, metadata); + auto f1 = field("f0", int32(), true, metadata2); + ASSERT_TRUE(metadata->Equals(*f0->metadata())); + ASSERT_TRUE(f0->Equals(*f1)); +} + +TEST(TestField, TestAddMetadata) { + auto metadata = std::shared_ptr<KeyValueMetadata>( + new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); + auto f0 = field("f0", int32()); + auto f1 = field("f0", int32(), true, metadata); + std::shared_ptr<Field> f2; + ASSERT_OK(f0->AddMetadata(metadata, &f2)); + + ASSERT_FALSE(f2->Equals(*f0)); + ASSERT_TRUE(f2->Equals(*f1)); + + // Not copied + ASSERT_TRUE(metadata.get() == f1->metadata().get()); +} + +TEST(TestField, TestRemoveMetadata) { + auto metadata = std::shared_ptr<KeyValueMetadata>( + new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); + auto f0 = field("f0", int32()); + auto f1 = field("f0", int32(), true, metadata); + std::shared_ptr<Field> f2 = f1->RemoveMetadata(); + ASSERT_TRUE(f2->metadata() == nullptr); +} + class TestSchema : public ::testing::Test { public: void SetUp() {} @@ -117,38 +152,42 @@ TEST_F(TestSchema, GetFieldByName) { ASSERT_TRUE(result == nullptr); } -TEST_F(TestSchema, TestCustomMetadataConstruction) { +TEST_F(TestSchema, TestMetadataConstruction) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); vector<shared_ptr<Field>> fields = {f0, f1, f2}; - KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"}); + auto metadata = std::shared_ptr<KeyValueMetadata>( + new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); auto schema = std::make_shared<Schema>(fields, metadata); - ASSERT_TRUE(metadata.Equals(schema->custom_metadata())); + ASSERT_TRUE(metadata->Equals(*schema->metadata())); } -TEST_F(TestSchema, TestAddCustomMetadata) { +TEST_F(TestSchema, TestAddMetadata) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); vector<shared_ptr<Field>> fields = {f0, f1, f2}; - KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"}); + auto metadata = std::shared_ptr<KeyValueMetadata>( + new KeyValueMetadata({"foo", "bar"}, {"bizz", "buzz"})); auto schema = std::make_shared<Schema>(fields); std::shared_ptr<Schema> new_schema; - schema->AddCustomMetadata(metadata, &new_schema); - ASSERT_TRUE(metadata.Equals(new_schema->custom_metadata())); + schema->AddMetadata(metadata, &new_schema); + ASSERT_TRUE(metadata->Equals(*new_schema->metadata())); + + // Not copied + ASSERT_TRUE(metadata.get() == new_schema->metadata().get()); } -TEST_F(TestSchema, TestRemoveCustomMetadata) { +TEST_F(TestSchema, TestRemoveMetadata) { auto f0 = field("f0", int32()); auto f1 = field("f1", uint8(), false); auto f2 = field("f2", utf8()); vector<shared_ptr<Field>> fields = {f0, f1, f2}; KeyValueMetadata metadata({"foo", "bar"}, {"bizz", "buzz"}); auto schema = std::make_shared<Schema>(fields); - std::shared_ptr<Schema> new_schema; - schema->RemoveCustomMetadata(&new_schema); - ASSERT_EQ(0, new_schema->custom_metadata().size()); + std::shared_ptr<Schema> new_schema = schema->RemoveMetadata(); + ASSERT_TRUE(new_schema->metadata() == nullptr); } #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f59f8fb..b1e322c 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -31,10 +31,31 @@ namespace arrow { +Status Field::AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata, + std::shared_ptr<Field>* out) const { + *out = std::make_shared<Field>(name_, type_, nullable_, metadata); + return Status::OK(); +} + +std::shared_ptr<Field> Field::RemoveMetadata() const { + return std::make_shared<Field>(name_, type_, nullable_); +} + bool Field::Equals(const Field& other) const { - return (this == &other) || - (this->name_ == other.name_ && this->nullable_ == other.nullable_ && - this->type_->Equals(*other.type_.get())); + if (this == &other) { + return true; + } + if (this->name_ == other.name_ && this->nullable_ == other.nullable_ && + this->type_->Equals(*other.type_.get())) { + if (metadata_ == nullptr && other.metadata_ == nullptr) { + return true; + } else if ((metadata_ == nullptr) ^ (other.metadata_ == nullptr)) { + return false; + } else { + return metadata_->Equals(*other.metadata_); + } + } + return false; } bool Field::Equals(const std::shared_ptr<Field>& other) const { @@ -233,8 +254,8 @@ std::string NullType::ToString() const { // Schema implementation Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields, - const KeyValueMetadata& custom_metadata) - : fields_(fields), custom_metadata_(custom_metadata) {} + const std::shared_ptr<const KeyValueMetadata>& metadata) + : fields_(fields), metadata_(metadata) {} bool Schema::Equals(const Schema& other) const { if (this == &other) { return true; } @@ -266,26 +287,25 @@ Status Schema::AddField( DCHECK_GE(i, 0); DCHECK_LE(i, this->num_fields()); - *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field), custom_metadata_); + *out = std::make_shared<Schema>(AddVectorElement(fields_, i, field), metadata_); return Status::OK(); } -Status Schema::AddCustomMetadata( - const KeyValueMetadata& custom_metadata, std::shared_ptr<Schema>* out) const { - *out = std::make_shared<Schema>(fields_, custom_metadata); +Status Schema::AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata, + std::shared_ptr<Schema>* out) const { + *out = std::make_shared<Schema>(fields_, metadata); return Status::OK(); } -Status Schema::RemoveCustomMetadata(std::shared_ptr<Schema>* out) { - *out = std::make_shared<Schema>(fields_, KeyValueMetadata()); - return Status::OK(); +std::shared_ptr<Schema> Schema::RemoveMetadata() const { + return std::make_shared<Schema>(fields_); } Status Schema::RemoveField(int i, std::shared_ptr<Schema>* out) const { DCHECK_GE(i, 0); DCHECK_LT(i, this->num_fields()); - *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i), custom_metadata_); + *out = std::make_shared<Schema>(DeleteVectorElement(fields_, i), metadata_); return Status::OK(); } @@ -298,6 +318,15 @@ std::string Schema::ToString() const { buffer << field->ToString(); ++i; } + + if (metadata_) { + buffer << "\n-- metadata --"; + for (int64_t i = 0; i < metadata_->size(); ++i) { + buffer << "\n" << metadata_->key(i) << ": " + << metadata_->value(i); + } + } + return buffer.str(); } @@ -391,8 +420,9 @@ std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type } std::shared_ptr<Field> field( - const std::string& name, const TypePtr& type, bool nullable) { - return std::make_shared<Field>(name, type, nullable); + const std::string& name, const std::shared_ptr<DataType>& type, bool nullable, + const std::shared_ptr<const KeyValueMetadata>& metadata) { + return std::make_shared<Field>(name, type, nullable, metadata); } std::shared_ptr<DataType> decimal(int precision, int scale) { http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index dc94561..bb25857 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -203,8 +203,16 @@ class NoExtraMeta {}; class ARROW_EXPORT Field { public: Field(const std::string& name, const std::shared_ptr<DataType>& type, - bool nullable = true) - : name_(name), type_(type), nullable_(nullable) {} + bool nullable = true, + const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr) + : name_(name), type_(type), nullable_(nullable), metadata_(metadata) {} + + std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; } + + Status AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata, + std::shared_ptr<Field>* out) const; + + std::shared_ptr<Field> RemoveMetadata() const; bool Equals(const Field& other) const; bool Equals(const std::shared_ptr<Field>& other) const; @@ -224,6 +232,9 @@ class ARROW_EXPORT Field { // Fields can be nullable bool nullable_; + + // The field's metadata, if any + std::shared_ptr<const KeyValueMetadata> metadata_; }; typedef std::shared_ptr<Field> FieldPtr; @@ -679,7 +690,7 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType { class ARROW_EXPORT Schema { public: explicit Schema(const std::vector<std::shared_ptr<Field>>& fields, - const KeyValueMetadata& custom_metadata = KeyValueMetadata()); + const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr); // Returns true if all of the schema fields are equal bool Equals(const Schema& other) const; @@ -691,7 +702,7 @@ class ARROW_EXPORT Schema { std::shared_ptr<Field> GetFieldByName(const std::string& name); const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; } - const KeyValueMetadata& custom_metadata() const { return custom_metadata_; } + std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; } // Render a string representation of the schema suitable for debugging std::string ToString() const; @@ -700,16 +711,18 @@ class ARROW_EXPORT Schema { int i, const std::shared_ptr<Field>& field, std::shared_ptr<Schema>* out) const; Status RemoveField(int i, std::shared_ptr<Schema>* out) const; - Status AddCustomMetadata( - const KeyValueMetadata& metadata, std::shared_ptr<Schema>* out) const; - Status RemoveCustomMetadata(std::shared_ptr<Schema>* out); + Status AddMetadata(const std::shared_ptr<const KeyValueMetadata>& metadata, + std::shared_ptr<Schema>* out) const; + + std::shared_ptr<Schema> RemoveMetadata() const; int num_fields() const { return static_cast<int>(fields_.size()); } private: std::vector<std::shared_ptr<Field>> fields_; std::unordered_map<std::string, int> name_to_index_; - KeyValueMetadata custom_metadata_; + + std::shared_ptr<const KeyValueMetadata> metadata_; }; // ---------------------------------------------------------------------- @@ -741,7 +754,8 @@ std::shared_ptr<DataType> ARROW_EXPORT dictionary( const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values); std::shared_ptr<Field> ARROW_EXPORT field( - const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true); + const std::string& name, const std::shared_ptr<DataType>& type, bool nullable = true, + const std::shared_ptr<const KeyValueMetadata>& metadata = nullptr); // ---------------------------------------------------------------------- // http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/key-value-metadata-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/key-value-metadata-test.cc b/cpp/src/arrow/util/key-value-metadata-test.cc index aadc989..59cfdf5 100644 --- a/cpp/src/arrow/util/key-value-metadata-test.cc +++ b/cpp/src/arrow/util/key-value-metadata-test.cc @@ -72,6 +72,15 @@ TEST(KeyValueMetadataTest, StringAppend) { ASSERT_EQ("red", metadata.value(3)); } +TEST(KeyValueMetadataTest, Copy) { + std::vector<std::string> keys = {"foo", "bar"}; + std::vector<std::string> values = {"bizz", "buzz"}; + + KeyValueMetadata metadata(keys, values); + auto metadata2 = metadata.Copy(); + ASSERT_TRUE(metadata.Equals(*metadata2)); +} + TEST(KeyValueMetadataTest, Equals) { std::vector<std::string> keys = {"foo", "bar"}; std::vector<std::string> values = {"bizz", "buzz"}; http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/key_value_metadata.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index c91478b..8bddd5d 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -91,6 +91,10 @@ std::string KeyValueMetadata::value(int64_t i) const { return values_[static_cast<size_t>(i)]; } +std::shared_ptr<KeyValueMetadata> KeyValueMetadata::Copy() const { + return std::make_shared<KeyValueMetadata>(keys_, values_); +} + bool KeyValueMetadata::Equals(const KeyValueMetadata& other) const { return size() == other.size() && std::equal(keys_.cbegin(), keys_.cend(), other.keys_.cbegin()) && http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/key_value_metadata.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index 713b2c0..bae4ad8 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -19,10 +19,12 @@ #define ARROW_UTIL_KEY_VALUE_METADATA_H #include <cstdint> +#include <memory> #include <string> #include <unordered_map> #include <vector> +#include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { @@ -44,11 +46,15 @@ class ARROW_EXPORT KeyValueMetadata { std::string key(int64_t i) const; std::string value(int64_t i) const; + std::shared_ptr<KeyValueMetadata> Copy() const; + bool Equals(const KeyValueMetadata& other) const; private: std::vector<std::string> keys_; std::vector<std::string> values_; + + DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata); }; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/cpp/src/arrow/util/memory.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/memory.h b/cpp/src/arrow/util/memory.h index 7feeb29..c5c17ef 100644 --- a/cpp/src/arrow/util/memory.h +++ b/cpp/src/arrow/util/memory.h @@ -31,7 +31,7 @@ uint8_t* pointer_logical_and(const uint8_t* address, uintptr_t bits) { // A helper function for doing memcpy with multiple threads. This is required // to saturate the memory bandwidth of modern cpus. void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, - uintptr_t block_size, int num_threads) { + uintptr_t block_size, int num_threads) { std::vector<std::thread> threadpool(num_threads); uint8_t* left = pointer_logical_and(src + block_size - 1, ~(block_size - 1)); uint8_t* right = pointer_logical_and(src + nbytes, ~(block_size - 1)); @@ -52,8 +52,8 @@ void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, // Start all threads first and handle leftovers while threads run. for (int i = 0; i < num_threads; i++) { - threadpool[i] = std::thread(memcpy, dst + prefix + i * chunk_size, - left + i * chunk_size, chunk_size); + threadpool[i] = std::thread( + memcpy, dst + prefix + i * chunk_size, left + i * chunk_size, chunk_size); } memcpy(dst, src, prefix); @@ -64,6 +64,6 @@ void parallel_memcopy(uint8_t* dst, const uint8_t* src, int64_t nbytes, } } -} // namespace arrow +} // namespace arrow #endif // ARROW_UTIL_MEMORY_H http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/_array.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd index 4d5db86..464de31 100644 --- a/python/pyarrow/_array.pxd +++ b/python/pyarrow/_array.pxd @@ -81,8 +81,6 @@ cdef class Schema: cdef init(self, const vector[shared_ptr[CField]]& fields) cdef init_schema(self, const shared_ptr[CSchema]& schema) - cpdef dict custom_metadata(self) - cdef class Scalar: cdef readonly: http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/_array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index 2fb20b7..658f4b3 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -172,7 +172,14 @@ cdef class DecimalType(FixedSizeBinaryType): cdef class Field: + """ + Represents a named field, with a data type, nullability, and optional + metadata + Notes + ----- + Do not use this class's constructor directly; use pyarrow.field + """ def __cinit__(self): pass @@ -181,32 +188,77 @@ cdef class Field: self.field = field.get() self.type = box_data_type(field.get().type()) - @classmethod - def from_py(cls, object name, DataType type, bint nullable=True): - cdef Field result = Field() - result.type = type - result.sp_field.reset(new CField(tobytes(name), type.sp_type, - nullable)) - result.field = result.sp_field.get() + def equals(self, Field other): + """ + Test if this field is equal to the other + """ + return self.field.Equals(deref(other.field)) - return result + def __str__(self): + self._check_null() + return 'pyarrow.Field<{0}>'.format(frombytes(self.field.ToString())) def __repr__(self): - return 'Field({0!r}, type={1})'.format(self.name, str(self.type)) + return self.__str__() property nullable: def __get__(self): + self._check_null() return self.field.nullable() property name: def __get__(self): - if box_field(self.sp_field) is None: - raise ReferenceError( - 'Field not initialized (references NULL pointer)') + self._check_null() return frombytes(self.field.name()) + property metadata: + + def __get__(self): + self._check_null() + return box_metadata(self.field.metadata().get()) + + def _check_null(self): + if self.field == NULL: + raise ReferenceError( + 'Field not initialized (references NULL pointer)') + + def add_metadata(self, dict metadata): + """ + Add metadata as dict of string keys and values to Field + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + field : pyarrow.Field + """ + cdef shared_ptr[CKeyValueMetadata] c_meta + convert_metadata(metadata, &c_meta) + + cdef shared_ptr[CField] new_field + with nogil: + check_status(self.field.AddMetadata(c_meta, &new_field)) + + return box_field(new_field) + + def remove_metadata(self): + """ + Create new field without metadata, if any + + Returns + ------- + field : pyarrow.Field + """ + cdef shared_ptr[CField] new_field + with nogil: + new_field = self.field.RemoveMetadata() + return box_field(new_field) + cdef class Schema: @@ -226,6 +278,14 @@ cdef class Schema: return result + cdef init(self, const vector[shared_ptr[CField]]& fields): + self.schema = new CSchema(fields) + self.sp_schema.reset(self.schema) + + cdef init_schema(self, const shared_ptr[CSchema]& schema): + self.schema = schema.get() + self.sp_schema = schema + property names: def __get__(self): @@ -236,20 +296,10 @@ cdef class Schema: result.append(name) return result - cdef init(self, const vector[shared_ptr[CField]]& fields): - self.schema = new CSchema(fields) - self.sp_schema.reset(self.schema) - - cdef init_schema(self, const shared_ptr[CSchema]& schema): - self.schema = schema.get() - self.sp_schema = schema + property metadata: - cpdef dict custom_metadata(self): - cdef: - CKeyValueMetadata metadata = self.schema.custom_metadata() - unordered_map[c_string, c_string] result - metadata.ToUnorderedMap(&result) - return result + def __get__(self): + return box_metadata(self.schema.metadata().get()) def equals(self, other): """ @@ -274,23 +324,40 @@ cdef class Schema: """ return box_field(self.schema.GetFieldByName(tobytes(name))) - @classmethod - def from_fields(cls, fields): - cdef: - Schema result - Field field - vector[shared_ptr[CField]] c_fields + def add_metadata(self, dict metadata): + """ + Add metadata as dict of string keys and values to Schema - c_fields.resize(len(fields)) + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes - for i in range(len(fields)): - field = fields[i] - c_fields[i] = field.sp_field + Returns + ------- + schema : pyarrow.Schema + """ + cdef shared_ptr[CKeyValueMetadata] c_meta + convert_metadata(metadata, &c_meta) - result = Schema() - result.init(c_fields) + cdef shared_ptr[CSchema] new_schema + with nogil: + check_status(self.schema.AddMetadata(c_meta, &new_schema)) - return result + return box_schema(new_schema) + + def remove_metadata(self): + """ + Create new schema without metadata, if any + + Returns + ------- + schema : pyarrow.Schema + """ + cdef shared_ptr[CSchema] new_schema + with nogil: + new_schema = self.schema.RemoveMetadata() + return box_schema(new_schema) def __str__(self): return frombytes(self.schema.ToString()) @@ -299,6 +366,15 @@ cdef class Schema: return self.__str__() +cdef box_metadata(const CKeyValueMetadata* metadata): + cdef unordered_map[c_string, c_string] result + if metadata != NULL: + metadata.ToUnorderedMap(&result) + return result + else: + return None + + cdef dict _type_cache = {} @@ -315,8 +391,49 @@ cdef DataType primitive_type(Type type): #------------------------------------------------------------ # Type factory functions -def field(name, type, bint nullable=True): - return Field.from_py(name, type, nullable) +cdef int convert_metadata(dict metadata, + shared_ptr[CKeyValueMetadata]* out) except -1: + cdef: + shared_ptr[CKeyValueMetadata] meta = ( + make_shared[CKeyValueMetadata]()) + c_string key, value + + for py_key, py_value in metadata.items(): + key = tobytes(py_key) + value = tobytes(py_value) + meta.get().Append(key, value) + out[0] = meta + return 0 + + +def field(name, DataType type, bint nullable=True, dict metadata=None): + """ + Create a pyarrow.Field instance + + Parameters + ---------- + name : string or bytes + type : pyarrow.DataType + nullable : boolean, default True + metadata : dict, default None + Keys and values must be coercible to bytes + + Returns + ------- + field : pyarrow.Field + """ + cdef: + shared_ptr[CKeyValueMetadata] c_meta + Field result = Field() + + if metadata is not None: + convert_metadata(metadata, &c_meta) + + result.sp_field.reset(new CField(tobytes(name), type.sp_type, + nullable, c_meta)) + result.field = result.sp_field.get() + result.type = type + return result cdef set PRIMITIVE_TYPES = set([ @@ -554,7 +671,28 @@ def struct(fields): def schema(fields): - return Schema.from_fields(fields) + """ + Construct pyarrow.Schema from collection of fields + + Parameters + ---------- + field : list or iterable + + Returns + ------- + schema : pyarrow.Schema + """ + cdef: + Schema result + Field field + vector[shared_ptr[CField]] c_fields + + for i, field in enumerate(fields): + c_fields.push_back(field.sp_field) + + result = Schema() + result.init(c_fields) + return result cdef DataType box_data_type(const shared_ptr[CDataType]& type): http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/_table.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_table.pyx b/python/pyarrow/_table.pyx index ed0782b..599e046 100644 --- a/python/pyarrow/_table.pyx +++ b/python/pyarrow/_table.pyx @@ -272,11 +272,12 @@ cdef class Column: return chunked_array -cdef CKeyValueMetadata key_value_metadata_from_dict(dict metadata): +cdef shared_ptr[const CKeyValueMetadata] key_value_metadata_from_dict( + dict metadata): cdef: unordered_map[c_string, c_string] unordered_metadata = metadata - CKeyValueMetadata c_metadata = CKeyValueMetadata(unordered_metadata) - return c_metadata + return (<shared_ptr[const CKeyValueMetadata]> + make_shared[CKeyValueMetadata](unordered_metadata)) cdef int _schema_from_arrays( http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index ef1a332..8a730b3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -23,6 +23,10 @@ cdef extern from "arrow/util/key_value_metadata.h" namespace "arrow" nogil: cdef cppclass CKeyValueMetadata" arrow::KeyValueMetadata": CKeyValueMetadata() CKeyValueMetadata(const unordered_map[c_string, c_string]&) + + c_bool Equals(const CKeyValueMetadata& other) + + void Append(const c_string& key, const c_string& value) void ToUnorderedMap(unordered_map[c_string, c_string]*) const cdef extern from "arrow/api.h" namespace "arrow" nogil: @@ -168,25 +172,48 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CDataType] type() c_bool nullable() + c_string ToString() + c_bool Equals(const CField& other) + + shared_ptr[const CKeyValueMetadata] metadata() + CField(const c_string& name, const shared_ptr[CDataType]& type, c_bool nullable) + CField(const c_string& name, const shared_ptr[CDataType]& type, + c_bool nullable, const shared_ptr[CKeyValueMetadata]& metadata) + + # Removed const in Cython so don't have to cast to get code to generate + CStatus AddMetadata(const shared_ptr[CKeyValueMetadata]& metadata, + shared_ptr[CField]* out) + shared_ptr[CField] RemoveMetadata() + + cdef cppclass CStructType" arrow::StructType"(CDataType): CStructType(const vector[shared_ptr[CField]]& fields) cdef cppclass CSchema" arrow::Schema": CSchema(const vector[shared_ptr[CField]]& fields) CSchema(const vector[shared_ptr[CField]]& fields, - const CKeyValueMetadata& custom_metadata) + const shared_ptr[const CKeyValueMetadata]& metadata) + + # Does not actually exist, but gets Cython to not complain + CSchema(const vector[shared_ptr[CField]]& fields, + const shared_ptr[CKeyValueMetadata]& metadata) c_bool Equals(const CSchema& other) shared_ptr[CField] field(int i) - const CKeyValueMetadata& custom_metadata() const + shared_ptr[const CKeyValueMetadata] metadata() shared_ptr[CField] GetFieldByName(c_string& name) int num_fields() c_string ToString() + # Removed const in Cython so don't have to cast to get code to generate + CStatus AddMetadata(const shared_ptr[CKeyValueMetadata]& metadata, + shared_ptr[CSchema]* out) + shared_ptr[CSchema] RemoveMetadata() + cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): c_bool Value(int i) http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/tests/pandas_examples.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py index e081c38..313a3ae 100644 --- a/python/pyarrow/tests/pandas_examples.py +++ b/python/pyarrow/tests/pandas_examples.py @@ -73,7 +73,7 @@ def dataframe_with_arrays(): ] df = pd.DataFrame(arrays) - schema = pa.Schema.from_fields(fields) + schema = pa.schema(fields) return df, schema @@ -114,6 +114,6 @@ def dataframe_with_lists(): ] df = pd.DataFrame(arrays) - schema = pa.Schema.from_fields(fields) + schema = pa.schema(fields) return df, schema http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index f360234..2779da3 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -106,10 +106,10 @@ class TestPandasConversion(unittest.TestCase): for numpy_dtype, arrow_dtype in dtypes: values = np.random.randn(num_values) data[numpy_dtype] = values.astype(numpy_dtype) - fields.append(pa.Field.from_py(numpy_dtype, arrow_dtype)) + fields.append(pa.field(numpy_dtype, arrow_dtype)) df = pd.DataFrame(data) - schema = pa.Schema.from_fields(fields) + schema = pa.schema(fields) self._check_pandas_roundtrip(df, expected_schema=schema) def test_float_nulls(self): @@ -127,7 +127,7 @@ class TestPandasConversion(unittest.TestCase): arr = pa.Array.from_pandas(values, null_mask) arrays.append(arr) - fields.append(pa.Field.from_py(name, arrow_dtype)) + fields.append(pa.field(name, arrow_dtype)) values[null_mask] = np.nan expected_cols.append(values) @@ -136,7 +136,7 @@ class TestPandasConversion(unittest.TestCase): columns=names) table = pa.Table.from_arrays(arrays, names) - assert table.schema.equals(pa.Schema.from_fields(fields)) + assert table.schema.equals(pa.schema(fields)) result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) @@ -159,10 +159,10 @@ class TestPandasConversion(unittest.TestCase): min(info.max, np.iinfo('i8').max), size=num_values) data[dtype] = values.astype(dtype) - fields.append(pa.Field.from_py(dtype, arrow_dtype)) + fields.append(pa.field(dtype, arrow_dtype)) df = pd.DataFrame(data) - schema = pa.Schema.from_fields(fields) + schema = pa.schema(fields) self._check_pandas_roundtrip(df, expected_schema=schema) def test_integer_with_nulls(self): @@ -200,8 +200,8 @@ class TestPandasConversion(unittest.TestCase): np.random.seed(0) df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) - field = pa.Field.from_py('bools', pa.bool_()) - schema = pa.Schema.from_fields([field]) + field = pa.field('bools', pa.bool_()) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, expected_schema=schema) def test_boolean_nulls(self): @@ -217,8 +217,8 @@ class TestPandasConversion(unittest.TestCase): expected = values.astype(object) expected[mask] = None - field = pa.Field.from_py('bools', pa.bool_()) - schema = pa.Schema.from_fields([field]) + field = pa.field('bools', pa.bool_()) + schema = pa.schema([field]) ex_frame = pd.DataFrame({'bools': expected}) table = pa.Table.from_arrays([arr], ['bools']) @@ -230,16 +230,16 @@ class TestPandasConversion(unittest.TestCase): def test_boolean_object_nulls(self): arr = np.array([False, None, True] * 100, dtype=object) df = pd.DataFrame({'bools': arr}) - field = pa.Field.from_py('bools', pa.bool_()) - schema = pa.Schema.from_fields([field]) + field = pa.field('bools', pa.bool_()) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, expected_schema=schema) def test_unicode(self): repeats = 1000 values = [u'foo', None, u'bar', u'mañana', np.nan] df = pd.DataFrame({'strings': values * repeats}) - field = pa.Field.from_py('strings', pa.string()) - schema = pa.Schema.from_fields([field]) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, expected_schema=schema) @@ -257,7 +257,7 @@ class TestPandasConversion(unittest.TestCase): def test_fixed_size_bytes(self): values = [b'foo', None, b'bar', None, None, b'hey'] df = pd.DataFrame({'strings': values}) - schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))]) + schema = pa.schema([pa.field('strings', pa.binary(3))]) table = pa.Table.from_pandas(df, schema=schema) assert table.schema[0].type == schema[0].type assert table.schema[0].name == schema[0].name @@ -267,7 +267,7 @@ class TestPandasConversion(unittest.TestCase): def test_fixed_size_bytes_does_not_accept_varying_lengths(self): values = [b'foo', None, b'ba', None, None, b'hey'] df = pd.DataFrame({'strings': values}) - schema = pa.Schema.from_fields([pa.field('strings', pa.binary(3))]) + schema = pa.schema([pa.field('strings', pa.binary(3))]) with self.assertRaises(pa.ArrowInvalid): pa.Table.from_pandas(df, schema=schema) @@ -279,8 +279,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - field = pa.Field.from_py('datetime64', pa.timestamp('ms')) - schema = pa.Schema.from_fields([field]) + field = pa.field('datetime64', pa.timestamp('ms')) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema) @@ -291,8 +291,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - field = pa.Field.from_py('datetime64', pa.timestamp('ns')) - schema = pa.Schema.from_fields([field]) + field = pa.field('datetime64', pa.timestamp('ns')) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema) @@ -304,8 +304,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - field = pa.Field.from_py('datetime64', pa.timestamp('ms')) - schema = pa.Schema.from_fields([field]) + field = pa.field('datetime64', pa.timestamp('ms')) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema) @@ -316,8 +316,8 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - field = pa.Field.from_py('datetime64', pa.timestamp('ns')) - schema = pa.Schema.from_fields([field]) + field = pa.field('datetime64', pa.timestamp('ns')) + schema = pa.schema([field]) self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema) @@ -353,8 +353,8 @@ class TestPandasConversion(unittest.TestCase): datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)]}) table = pa.Table.from_pandas(df) - field = pa.Field.from_py('date', pa.date32()) - schema = pa.Schema.from_fields([field]) + field = pa.field('date', pa.date32()) + schema = pa.schema([field]) assert table.schema.equals(schema) result = table.to_pandas() expected = df.copy() @@ -526,8 +526,8 @@ class TestPandasConversion(unittest.TestCase): ] }) converted = pa.Table.from_pandas(expected) - field = pa.Field.from_py('decimals', pa.decimal(7, 3)) - schema = pa.Schema.from_fields([field]) + field = pa.field('decimals', pa.decimal(7, 3)) + schema = pa.schema([field]) assert converted.schema.equals(schema) def test_decimal_32_to_pandas(self): @@ -549,8 +549,8 @@ class TestPandasConversion(unittest.TestCase): ] }) converted = pa.Table.from_pandas(expected) - field = pa.Field.from_py('decimals', pa.decimal(12, 6)) - schema = pa.Schema.from_fields([field]) + field = pa.field('decimals', pa.decimal(12, 6)) + schema = pa.schema([field]) assert converted.schema.equals(schema) def test_decimal_64_to_pandas(self): @@ -572,8 +572,8 @@ class TestPandasConversion(unittest.TestCase): ] }) converted = pa.Table.from_pandas(expected) - field = pa.Field.from_py('decimals', pa.decimal(26, 11)) - schema = pa.Schema.from_fields([field]) + field = pa.field('decimals', pa.decimal(26, 11)) + schema = pa.schema([field]) assert converted.schema.equals(schema) def test_decimal_128_to_pandas(self): http://git-wip-us.apache.org/repos/asf/arrow/blob/e876abbd/python/pyarrow/tests/test_schema.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index da704f3..b3abc0f 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -118,7 +118,7 @@ def test_field(): assert f.name == 'foo' assert f.nullable assert f.type is t - assert repr(f) == "Field('foo', type=string)" + assert repr(f) == "pyarrow.Field<foo: string>" f = pa.field('foo', t, False) assert not f.nullable @@ -152,6 +152,52 @@ def test_field_empty(): repr(f) +def test_field_add_remove_metadata(): + f0 = pa.field('foo', pa.int32()) + + assert f0.metadata is None + + metadata = {b'foo': b'bar', b'pandas': b'badger'} + + f1 = f0.add_metadata(metadata) + assert f1.metadata == metadata + + f3 = f1.remove_metadata() + assert f3.metadata is None + + # idempotent + f4 = f3.remove_metadata() + assert f4.metadata is None + + f5 = pa.field('foo', pa.int32(), True, metadata) + f6 = f0.add_metadata(metadata) + assert f5.equals(f6) + + +def test_schema_add_remove_metadata(): + fields = [ + pa.field('foo', pa.int32()), + pa.field('bar', pa.string()), + pa.field('baz', pa.list_(pa.int8())) + ] + + s1 = pa.schema(fields) + + assert s1.metadata is None + + metadata = {b'foo': b'bar', b'pandas': b'badger'} + + s2 = s1.add_metadata(metadata) + assert s2.metadata == metadata + + s3 = s2.remove_metadata() + assert s3.metadata is None + + # idempotent + s4 = s3.remove_metadata() + assert s4.metadata is None + + def test_schema_equals(): fields = [ pa.field('foo', pa.int32()),
