http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/ipc/metadata.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc new file mode 100644 index 0000000..642f21a --- /dev/null +++ b/cpp/src/arrow/ipc/metadata.cc @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/ipc/metadata.h" + +#include <flatbuffers/flatbuffers.h> +#include <cstdint> +#include <memory> +#include <vector> + +// Generated C++ flatbuffer IDL +#include "arrow/ipc/Message_generated.h" +#include "arrow/ipc/metadata-internal.h" + +#include "arrow/schema.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +namespace ipc { + +Status WriteSchema(const Schema* schema, std::shared_ptr<Buffer>* out) { + MessageBuilder message; + RETURN_NOT_OK(message.SetSchema(schema)); + RETURN_NOT_OK(message.Finish()); + return message.GetBuffer(out); +} + +//---------------------------------------------------------------------- +// Message reader + +class Message::Impl { + public: + explicit Impl(const std::shared_ptr<Buffer>& buffer, + const flatbuf::Message* message) : + buffer_(buffer), + message_(message) {} + + Message::Type type() const { + switch (message_->header_type()) { + case flatbuf::MessageHeader_Schema: + return Message::SCHEMA; + case flatbuf::MessageHeader_DictionaryBatch: + return Message::DICTIONARY_BATCH; + case flatbuf::MessageHeader_RecordBatch: + return Message::RECORD_BATCH; + default: + return Message::NONE; + } + } + + const void* header() const { + return message_->header(); + } + + int64_t body_length() const { + return message_->bodyLength(); + } + + private: + // Owns the memory this message accesses + std::shared_ptr<Buffer> buffer_; + + const flatbuf::Message* message_; +}; + +class SchemaMessage::Impl { + public: + explicit Impl(const void* schema) : + schema_(static_cast<const flatbuf::Schema*>(schema)) {} + + const flatbuf::Field* field(int i) const { + return schema_->fields()->Get(i); + } + + int num_fields() const { + return schema_->fields()->size(); + } + + private: + const flatbuf::Schema* schema_; +}; + +Message::Message() {} + +Status Message::Open(const std::shared_ptr<Buffer>& buffer, + std::shared_ptr<Message>* out) { + 
std::shared_ptr<Message> result(new Message()); + + // The buffer is prefixed by its size as int32_t + const uint8_t* fb_head = buffer->data() + sizeof(int32_t); + const flatbuf::Message* message = flatbuf::GetMessage(fb_head); + + // TODO(wesm): verify message + result->impl_.reset(new Impl(buffer, message)); + *out = result; + + return Status::OK(); +} + +Message::Type Message::type() const { + return impl_->type(); +} + +int64_t Message::body_length() const { + return impl_->body_length(); +} + +std::shared_ptr<Message> Message::get_shared_ptr() { + return this->shared_from_this(); +} + +std::shared_ptr<SchemaMessage> Message::GetSchema() { + return std::make_shared<SchemaMessage>(this->shared_from_this(), + impl_->header()); +} + +SchemaMessage::SchemaMessage(const std::shared_ptr<Message>& message, + const void* schema) { + message_ = message; + impl_.reset(new Impl(schema)); +} + +int SchemaMessage::num_fields() const { + return impl_->num_fields(); +} + +Status SchemaMessage::GetField(int i, std::shared_ptr<Field>* out) const { + const flatbuf::Field* field = impl_->field(i); + return FieldFromFlatbuffer(field, out); +} + +Status SchemaMessage::GetSchema(std::shared_ptr<Schema>* out) const { + std::vector<std::shared_ptr<Field>> fields(num_fields()); + for (int i = 0; i < this->num_fields(); ++i) { + RETURN_NOT_OK(GetField(i, &fields[i])); + } + *out = std::make_shared<Schema>(fields); + return Status::OK(); +} + +class RecordBatchMessage::Impl { + public: + explicit Impl(const void* batch) : + batch_(static_cast<const flatbuf::RecordBatch*>(batch)) { + nodes_ = batch_->nodes(); + buffers_ = batch_->buffers(); + } + + const flatbuf::FieldNode* field(int i) const { + return nodes_->Get(i); + } + + const flatbuf::Buffer* buffer(int i) const { + return buffers_->Get(i); + } + + int32_t length() const { + return batch_->length(); + } + + int num_buffers() const { + return batch_->buffers()->size(); + } + + int num_fields() const { + return 
batch_->nodes()->size(); + } + + private: + const flatbuf::RecordBatch* batch_; + const flatbuffers::Vector<const flatbuf::FieldNode*>* nodes_; + const flatbuffers::Vector<const flatbuf::Buffer*>* buffers_; +}; + +std::shared_ptr<RecordBatchMessage> Message::GetRecordBatch() { + return std::make_shared<RecordBatchMessage>(this->shared_from_this(), + impl_->header()); +} + +RecordBatchMessage::RecordBatchMessage(const std::shared_ptr<Message>& message, + const void* batch) { + message_ = message; + impl_.reset(new Impl(batch)); +} + +// TODO(wesm): Copying the flatbuffer data isn't great, but this will do for +// now +FieldMetadata RecordBatchMessage::field(int i) const { + const flatbuf::FieldNode* node = impl_->field(i); + + FieldMetadata result; + result.length = node->length(); + result.null_count = node->null_count(); + return result; +} + +BufferMetadata RecordBatchMessage::buffer(int i) const { + const flatbuf::Buffer* buffer = impl_->buffer(i); + + BufferMetadata result; + result.page = buffer->page(); + result.offset = buffer->offset(); + result.length = buffer->length(); + return result; +} + +int32_t RecordBatchMessage::length() const { + return impl_->length(); +} + +int RecordBatchMessage::num_buffers() const { + return impl_->num_buffers(); +} + +int RecordBatchMessage::num_fields() const { + return impl_->num_fields(); +} + +} // namespace ipc +} // namespace arrow
// Patch metadata preserved from the commit notification:
// http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/ipc/metadata.h
// ----------------------------------------------------------------------
// diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h
// new file mode 100644
// index 0000000..c728852
// --- /dev/null
// +++ b/cpp/src/arrow/ipc/metadata.h
// @@ -0,0 +1,146 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// C++ object model and user API for interprocess schema messaging

#ifndef ARROW_IPC_METADATA_H
#define ARROW_IPC_METADATA_H

#include <cstdint>
#include <memory>

namespace arrow {

class Buffer;
struct Field;
class Schema;
class Status;

namespace ipc {

//----------------------------------------------------------------------
// Message read/write APIs

// Serialize arrow::Schema as a Flatbuffer
Status WriteSchema(const Schema* schema, std::shared_ptr<Buffer>* out);

//----------------------------------------------------------------------

// Read interface classes. We do not fully deserialize the flatbuffers so that
// individual fields metadata can be retrieved from very large schema without
// having to construct the complete Schema first (see SchemaMessage::GetField
// versus SchemaMessage::GetSchema).

class Message;

// Container for serialized Schema metadata contained in an IPC message
class SchemaMessage {
 public:
  // Accepts an opaque flatbuffer pointer
  SchemaMessage(const std::shared_ptr<Message>& message, const void* schema);

  int num_fields() const;

  // Construct an arrow::Field for the i-th value in the metadata
  Status GetField(int i, std::shared_ptr<Field>* out) const;

  // Construct a complete Schema from the message. May be expensive for very
  // large schemas if you are only interested in a few fields
  Status GetSchema(std::shared_ptr<Schema>* out) const;

 private:
  // Parent, owns the flatbuffer data
  std::shared_ptr<Message> message_;

  class Impl;
  std::unique_ptr<Impl> impl_;
};

// Field metadata
struct FieldMetadata {
  int32_t length;
  int32_t null_count;
};

// Buffer metadata: plain copies of the page, offset and length values of one
// serialized buffer entry
struct BufferMetadata {
  int32_t page;
  int64_t offset;
  int64_t length;
};

// Container for serialized record batch metadata contained in an IPC message
class RecordBatchMessage {
 public:
  // Accepts an opaque flatbuffer pointer
  RecordBatchMessage(const std::shared_ptr<Message>& message,
      const void* batch_meta);

  // Return by-value copies of the i-th field node / buffer entry
  FieldMetadata field(int i) const;
  BufferMetadata buffer(int i) const;

  int32_t length() const;
  int num_buffers() const;
  int num_fields() const;

 private:
  // Parent, owns the flatbuffer data
  std::shared_ptr<Message> message_;

  class Impl;
  std::unique_ptr<Impl> impl_;
};

// NOTE(review): declared here, but no member definitions appear in the
// metadata.cc that accompanies this header in the patch -- confirm before use
class DictionaryBatchMessage {
 public:
  int64_t id() const;
  std::unique_ptr<RecordBatchMessage> data() const;
};

// A message read from a buffer. Instances are created through Open(), which
// hands back a shared_ptr; the constructor is private.
class Message : public std::enable_shared_from_this<Message> {
 public:
  enum Type {
    NONE,
    SCHEMA,
    DICTIONARY_BATCH,
    RECORD_BATCH
  };

  static Status Open(const std::shared_ptr<Buffer>& buffer,
      std::shared_ptr<Message>* out);

  std::shared_ptr<Message> get_shared_ptr();

  int64_t body_length() const;

  Type type() const;

  // These methods only to be invoked if you have checked the message type
  std::shared_ptr<SchemaMessage> GetSchema();
  std::shared_ptr<RecordBatchMessage> GetRecordBatch();
  // NOTE(review): no definition of GetDictionaryBatch appears in the
  // metadata.cc that accompanies this header in the patch
  std::shared_ptr<DictionaryBatchMessage> GetDictionaryBatch();

 private:
  Message();

  // Hide serialization details from user API
  class Impl;
  std::unique_ptr<Impl> impl_;
};

} // namespace ipc
} // namespace arrow

#endif // ARROW_IPC_METADATA_H

// Patch metadata preserved from the commit notification:
// http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/ipc/test-common.h
// ----------------------------------------------------------------------
// diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
// new file mode 100644
// index 0000000..0fccce9
// --- /dev/null
// +++ b/cpp/src/arrow/ipc/test-common.h
// @@ -0,0 +1,53 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_IPC_TEST_COMMON_H
#define ARROW_IPC_TEST_COMMON_H

#include <unistd.h>

#include <cstdint>
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

namespace arrow {
namespace ipc {

// Test helper that creates files of a requested size and deletes them again.
//
// Every file successfully opened by CreateFile() is remembered and removed by
// TearDown().
class MemoryMapFixture {
 public:
  // Deletes every file registered by CreateFile() and forgets them, so calling
  // TearDown() twice does not try to remove the same paths again
  void TearDown() {
    for (const auto& path : tmp_files_) {
      std::remove(path.c_str());
    }
    tmp_files_.clear();
  }

  // Creates (or truncates) the file at `path` and sets its length to `size`
  // bytes. If the file cannot be opened this is a no-op; the original code
  // went on to call fileno()/fclose() on the null stream.
  void CreateFile(const std::string& path, int64_t size) {
    FILE* file = fopen(path.c_str(), "w");
    if (file == nullptr) {
      return;
    }
    tmp_files_.push_back(path);

    // Best effort: the signature has no way to report a failed resize, and the
    // file must be closed and stay registered for cleanup either way
    const int rc = ftruncate(fileno(file), size);
    static_cast<void>(rc);
    fclose(file);
  }

 private:
  std::vector<std::string> tmp_files_;
};

} // namespace ipc
} // namespace arrow

#endif // ARROW_IPC_TEST_COMMON_H

// Patch metadata preserved from the commit notification:
// http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/schema-test.cc
// ----------------------------------------------------------------------
// diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc
// new file mode 100644
// index 0000000..a1de1dc
// --- /dev/null
// +++ b/cpp/src/arrow/schema-test.cc
// @@ -0,0 +1,104 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+ +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/schema.h" +#include "arrow/type.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { + +const auto INT32 = std::make_shared<Int32Type>(); + +TEST(TestField, Basics) { + Field f0("f0", INT32); + Field f0_nn("f0", INT32, false); + + ASSERT_EQ(f0.name, "f0"); + ASSERT_EQ(f0.type->ToString(), INT32->ToString()); + + ASSERT_TRUE(f0.nullable); + ASSERT_FALSE(f0_nn.nullable); +} + +TEST(TestField, Equals) { + Field f0("f0", INT32); + Field f0_nn("f0", INT32, false); + Field f0_other("f0", INT32); + + ASSERT_EQ(f0, f0_other); + ASSERT_NE(f0, f0_nn); +} + +class TestSchema : public ::testing::Test { + public: + void SetUp() {} +}; + +TEST_F(TestSchema, Basics) { + auto f0 = std::make_shared<Field>("f0", INT32); + auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(), false); + auto f1_optional = std::make_shared<Field>("f1", std::make_shared<UInt8Type>()); + + auto f2 = std::make_shared<Field>("f2", std::make_shared<StringType>()); + + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + auto schema = std::make_shared<Schema>(fields); + + ASSERT_EQ(3, schema->num_fields()); + ASSERT_EQ(f0, schema->field(0)); + ASSERT_EQ(f1, schema->field(1)); + ASSERT_EQ(f2, schema->field(2)); + + auto schema2 = std::make_shared<Schema>(fields); + + vector<shared_ptr<Field>> fields3 = {f0, f1_optional, f2}; + auto schema3 = std::make_shared<Schema>(fields3); + ASSERT_TRUE(schema->Equals(schema2)); + ASSERT_FALSE(schema->Equals(schema3)); + + ASSERT_TRUE(schema->Equals(*schema2.get())); + ASSERT_FALSE(schema->Equals(*schema3.get())); +} + +TEST_F(TestSchema, ToString) { + auto f0 = std::make_shared<Field>("f0", INT32); + auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(), false); + auto f2 = std::make_shared<Field>("f2", std::make_shared<StringType>()); + auto f3 = std::make_shared<Field>("f3", + 
std::make_shared<ListType>(std::make_shared<Int16Type>())); + + vector<shared_ptr<Field>> fields = {f0, f1, f2, f3}; + auto schema = std::make_shared<Schema>(fields); + + std::string result = schema->ToString(); + std::string expected = R"(f0: int32 +f1: uint8 not null +f2: string +f3: list<item: int16>)"; + + ASSERT_EQ(expected, result); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/schema.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc new file mode 100644 index 0000000..18aad0e --- /dev/null +++ b/cpp/src/arrow/schema.cc @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/schema.h" + +#include <memory> +#include <string> +#include <sstream> +#include <vector> + +#include "arrow/type.h" + +namespace arrow { + +Schema::Schema(const std::vector<std::shared_ptr<Field>>& fields) : + fields_(fields) {} + +bool Schema::Equals(const Schema& other) const { + if (this == &other) return true; + if (num_fields() != other.num_fields()) { + return false; + } + for (int i = 0; i < num_fields(); ++i) { + if (!field(i)->Equals(*other.field(i).get())) { + return false; + } + } + return true; +} + +bool Schema::Equals(const std::shared_ptr<Schema>& other) const { + return Equals(*other.get()); +} + +std::string Schema::ToString() const { + std::stringstream buffer; + + int i = 0; + for (auto field : fields_) { + if (i > 0) { + buffer << std::endl; + } + buffer << field->ToString(); + ++i; + } + return buffer.str(); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/schema.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h new file mode 100644 index 0000000..52f3c1c --- /dev/null +++ b/cpp/src/arrow/schema.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
// See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef ARROW_SCHEMA_H
#define ARROW_SCHEMA_H

#include <memory>
#include <string>
#include <vector>

namespace arrow {

struct Field;

// An ordered collection of fields. The vector passed to the constructor is
// copied, and no member function declared here modifies it afterwards.
class Schema {
 public:
  explicit Schema(const std::vector<std::shared_ptr<Field>>& fields);

  // Returns true if all of the schema fields are equal
  bool Equals(const Schema& other) const;
  bool Equals(const std::shared_ptr<Schema>& other) const;

  // Return the ith schema element. Does not boundscheck
  const std::shared_ptr<Field>& field(int i) const {
    return fields_[i];
  }

  // Render a string representation of the schema suitable for debugging
  std::string ToString() const;

  // Number of fields. The size_t -> int conversion is spelled out instead of
  // being left as an implicit narrowing
  int num_fields() const {
    return static_cast<int>(fields_.size());
  }

 private:
  std::vector<std::shared_ptr<Field>> fields_;
};

} // namespace arrow

#endif // ARROW_SCHEMA_H

// Patch metadata preserved from the commit notification:
// http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table-test.cc
// ----------------------------------------------------------------------
// diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
// new file mode 100644
// index 0000000..4c7b8f8
// --- /dev/null
// +++ b/cpp/src/arrow/table-test.cc
// @@ -0,0 +1,128 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/types/primitive.h" +#include "arrow/util/status.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { + +const auto INT16 = std::make_shared<Int16Type>(); +const auto UINT8 = std::make_shared<UInt8Type>(); +const auto INT32 = std::make_shared<Int32Type>(); + +class TestTable : public TestBase { + public: + void MakeExample1(int length) { + auto f0 = std::make_shared<Field>("f0", INT32); + auto f1 = std::make_shared<Field>("f1", UINT8); + auto f2 = std::make_shared<Field>("f2", INT16); + + vector<shared_ptr<Field>> fields = {f0, f1, f2}; + schema_ = std::make_shared<Schema>(fields); + + columns_ = { + std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)), + std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)), + std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length)) + }; + } + + protected: + std::unique_ptr<Table> table_; + shared_ptr<Schema> schema_; + vector<std::shared_ptr<Column>> columns_; +}; + +TEST_F(TestTable, EmptySchema) { + auto empty_schema = shared_ptr<Schema>(new Schema({})); + table_.reset(new Table("data", empty_schema, columns_)); + ASSERT_OK(table_->ValidateColumns()); + ASSERT_EQ(0, table_->num_rows()); + ASSERT_EQ(0, table_->num_columns()); +} + +TEST_F(TestTable, Ctors) { + int length = 100; + MakeExample1(length); + + std::string name = "data"; + + table_.reset(new Table(name, schema_, columns_)); + ASSERT_OK(table_->ValidateColumns()); + ASSERT_EQ(name, table_->name()); + ASSERT_EQ(length, table_->num_rows()); + ASSERT_EQ(3, table_->num_columns()); + + table_.reset(new Table(name, schema_, columns_, length)); + 
ASSERT_OK(table_->ValidateColumns()); + ASSERT_EQ(name, table_->name()); + ASSERT_EQ(length, table_->num_rows()); +} + +TEST_F(TestTable, Metadata) { + int length = 100; + MakeExample1(length); + + std::string name = "data"; + table_.reset(new Table(name, schema_, columns_)); + + ASSERT_TRUE(table_->schema()->Equals(schema_)); + + auto col = table_->column(0); + ASSERT_EQ(schema_->field(0)->name, col->name()); + ASSERT_EQ(schema_->field(0)->type, col->type()); +} + +TEST_F(TestTable, InvalidColumns) { + // Check that columns are all the same length + int length = 100; + MakeExample1(length); + + table_.reset(new Table("data", schema_, columns_, length - 1)); + ASSERT_RAISES(Invalid, table_->ValidateColumns()); + + columns_.clear(); + + // Wrong number of columns + table_.reset(new Table("data", schema_, columns_, length)); + ASSERT_RAISES(Invalid, table_->ValidateColumns()); + + columns_ = { + std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)), + std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)), + std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length - 1)) + }; + + table_.reset(new Table("data", schema_, columns_, length)); + ASSERT_RAISES(Invalid, table_->ValidateColumns()); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc new file mode 100644 index 0000000..e405c1d --- /dev/null +++ b/cpp/src/arrow/table.cc @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/table.h" + +#include <cstdlib> +#include <memory> +#include <sstream> + +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/util/status.h" + +namespace arrow { + +RowBatch::RowBatch(const std::shared_ptr<Schema>& schema, int num_rows, + const std::vector<std::shared_ptr<Array>>& columns) : + schema_(schema), + num_rows_(num_rows), + columns_(columns) {} + +const std::string& RowBatch::column_name(int i) const { + return schema_->field(i)->name; +} + +Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema, + const std::vector<std::shared_ptr<Column>>& columns) : + name_(name), + schema_(schema), + columns_(columns) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } +} + +Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema, + const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows) : + name_(name), + schema_(schema), + columns_(columns), + num_rows_(num_rows) {} + +Status Table::ValidateColumns() const { + if (num_columns() != schema_->num_fields()) { + return Status::Invalid("Number of columns did not match schema"); + } + + // Make sure columns are all the same length + for (size_t i = 0; i < columns_.size(); ++i) { + const Column* col = columns_[i].get(); + if (col == nullptr) { + std::stringstream ss; + ss << "Column " << i << " named " << 
col->name() + << " was null"; + return Status::Invalid(ss.str()); + } + if (col->length() != num_rows_) { + std::stringstream ss; + ss << "Column " << i << " named " << col->name() + << " expected length " + << num_rows_ + << " but got length " + << col->length(); + return Status::Invalid(ss.str()); + } + } + return Status::OK(); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h new file mode 100644 index 0000000..e2f73a2 --- /dev/null +++ b/cpp/src/arrow/table.h @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TABLE_H +#define ARROW_TABLE_H + +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +namespace arrow { + +class Array; +class Column; +class Schema; +class Status; + +// A row batch is a simpler and more rigid table data structure intended for +// use primarily in shared memory IPC. 
It contains a schema (metadata) and a +// corresponding vector of equal-length Arrow arrays +class RowBatch { + public: + // num_rows is a parameter to allow for row batches of a particular size not + // having any materialized columns. Each array should have the same length as + // num_rows + RowBatch(const std::shared_ptr<Schema>& schema, int num_rows, + const std::vector<std::shared_ptr<Array>>& columns); + + // @returns: the table's schema + const std::shared_ptr<Schema>& schema() const { + return schema_; + } + + // @returns: the i-th column + // Note: Does not boundscheck + const std::shared_ptr<Array>& column(int i) const { + return columns_[i]; + } + + const std::string& column_name(int i) const; + + // @returns: the number of columns in the table + int num_columns() const { + return columns_.size(); + } + + // @returns: the number of rows (the corresponding length of each column) + int64_t num_rows() const { + return num_rows_; + } + + private: + std::shared_ptr<Schema> schema_; + int num_rows_; + std::vector<std::shared_ptr<Array>> columns_; +}; + +// Immutable container of fixed-length columns conforming to a particular schema +class Table { + public: + // If columns is zero-length, the table's number of rows is zero + Table(const std::string& name, const std::shared_ptr<Schema>& schema, + const std::vector<std::shared_ptr<Column>>& columns); + + // num_rows is a parameter to allow for tables of a particular size not + // having any materialized columns. 
Each column should therefore have the + // same length as num_rows -- you can validate this using + // Table::ValidateColumns + Table(const std::string& name, const std::shared_ptr<Schema>& schema, + const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows); + + // @returns: the table's name, if any (may be length 0) + const std::string& name() const { + return name_; + } + + // @returns: the table's schema + const std::shared_ptr<Schema>& schema() const { + return schema_; + } + + // Note: Does not boundscheck + // @returns: the i-th column + const std::shared_ptr<Column>& column(int i) const { + return columns_[i]; + } + + // @returns: the number of columns in the table + int num_columns() const { + return columns_.size(); + } + + // @returns: the number of rows (the corresponding length of each column) + int64_t num_rows() const { + return num_rows_; + } + + // After construction, perform any checks to validate the input arguments + Status ValidateColumns() const; + + private: + // The table's name, optional + std::string name_; + + std::shared_ptr<Schema> schema_; + std::vector<std::shared_ptr<Column>> columns_; + + int64_t num_rows_; +}; + +} // namespace arrow + +#endif // ARROW_TABLE_H http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt deleted file mode 100644 index d9f00e7..0000000 --- a/cpp/src/arrow/table/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -####################################### -# arrow_table -####################################### - -# Headers: top level -install(FILES - column.h - schema.h - table.h - DESTINATION include/arrow/table) - -ADD_ARROW_TEST(column-test) -ADD_ARROW_TEST(schema-test) -ADD_ARROW_TEST(table-test) - -ADD_ARROW_BENCHMARK(column-benchmark) http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/column-benchmark.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/column-benchmark.cc b/cpp/src/arrow/table/column-benchmark.cc deleted file mode 100644 index c01146d..0000000 --- a/cpp/src/arrow/table/column-benchmark.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- - -#include "benchmark/benchmark.h" - -#include "arrow/test-util.h" -#include "arrow/table/test-common.h" -#include "arrow/types/integer.h" -#include "arrow/util/memory-pool.h" - -namespace arrow { -namespace { - template <typename ArrayType> - std::shared_ptr<Array> MakePrimitive(int32_t length, int32_t null_count = 0) { - auto pool = GetDefaultMemoryPool(); - auto data = std::make_shared<PoolBuffer>(pool); - auto nulls = std::make_shared<PoolBuffer>(pool); - data->Resize(length * sizeof(typename ArrayType::value_type)); - nulls->Resize(util::bytes_for_bits(length)); - return std::make_shared<ArrayType>(length, data, 10, nulls); - } -} // anonymous namespace - - -static void BM_BuildInt32ColumnByChunk(benchmark::State& state) { //NOLINT non-const reference - ArrayVector arrays; - for (int chunk_n = 0; chunk_n < state.range_x(); ++chunk_n) { - arrays.push_back(MakePrimitive<Int32Array>(100, 10)); - } - const auto INT32 = std::make_shared<Int32Type>(); - const auto field = std::make_shared<Field>("c0", INT32); - std::unique_ptr<Column> column; - while (state.KeepRunning()) { - column.reset(new Column(field, arrays)); - } -} - -BENCHMARK(BM_BuildInt32ColumnByChunk)->Range(5, 50000); - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/column-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc deleted file mode 100644 index 3b102e4..0000000 --- a/cpp/src/arrow/table/column-test.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <gtest/gtest.h> -#include <cstdint> -#include <memory> -#include <string> -#include <vector> - -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/test-common.h" -#include "arrow/test-util.h" -#include "arrow/type.h" -#include "arrow/types/integer.h" - -using std::shared_ptr; -using std::vector; - -namespace arrow { - -const auto INT32 = std::make_shared<Int32Type>(); - -class TestColumn : public TestBase { - protected: - std::shared_ptr<ChunkedArray> data_; - std::unique_ptr<Column> column_; -}; - -TEST_F(TestColumn, BasicAPI) { - ArrayVector arrays; - arrays.push_back(MakePrimitive<Int32Array>(100)); - arrays.push_back(MakePrimitive<Int32Array>(100, 10)); - arrays.push_back(MakePrimitive<Int32Array>(100, 20)); - - auto field = std::make_shared<Field>("c0", INT32); - column_.reset(new Column(field, arrays)); - - ASSERT_EQ("c0", column_->name()); - ASSERT_TRUE(column_->type()->Equals(INT32)); - ASSERT_EQ(300, column_->length()); - ASSERT_EQ(30, column_->null_count()); - ASSERT_EQ(3, column_->data()->num_chunks()); -} - -TEST_F(TestColumn, ChunksInhomogeneous) { - ArrayVector arrays; - arrays.push_back(MakePrimitive<Int32Array>(100)); - arrays.push_back(MakePrimitive<Int32Array>(100, 10)); - - auto field = std::make_shared<Field>("c0", INT32); - column_.reset(new Column(field, arrays)); - - ASSERT_OK(column_->ValidateData()); - - arrays.push_back(MakePrimitive<Int16Array>(100, 10)); - column_.reset(new Column(field, arrays)); - ASSERT_RAISES(Invalid, column_->ValidateData()); -} - -} // namespace 
arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/column.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/table/column.cc deleted file mode 100644 index 573e650..0000000 --- a/cpp/src/arrow/table/column.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/table/column.h" - -#include <memory> -#include <sstream> - -#include "arrow/type.h" -#include "arrow/util/status.h" - -namespace arrow { - -ChunkedArray::ChunkedArray(const ArrayVector& chunks) : - chunks_(chunks) { - length_ = 0; - for (const std::shared_ptr<Array>& chunk : chunks) { - length_ += chunk->length(); - null_count_ += chunk->null_count(); - } -} - -Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks) : - field_(field) { - data_ = std::make_shared<ChunkedArray>(chunks); -} - -Column::Column(const std::shared_ptr<Field>& field, - const std::shared_ptr<Array>& data) : - field_(field) { - data_ = std::make_shared<ChunkedArray>(ArrayVector({data})); -} - -Column::Column(const std::shared_ptr<Field>& field, - const std::shared_ptr<ChunkedArray>& data) : - field_(field), - data_(data) {} - -Status Column::ValidateData() { - for (int i = 0; i < data_->num_chunks(); ++i) { - const std::shared_ptr<DataType>& type = data_->chunk(i)->type(); - if (!this->type()->Equals(type)) { - std::stringstream ss; - ss << "In chunk " << i << " expected type " - << this->type()->ToString() - << " but saw " - << type->ToString(); - return Status::Invalid(ss.str()); - } - } - return Status::OK(); -} - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/column.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/table/column.h deleted file mode 100644 index dfc7516..0000000 --- a/cpp/src/arrow/table/column.h +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TABLE_COLUMN_H -#define ARROW_TABLE_COLUMN_H - -#include <memory> -#include <string> -#include <vector> - -#include "arrow/array.h" -#include "arrow/type.h" - -namespace arrow { - -typedef std::vector<std::shared_ptr<Array> > ArrayVector; - -// A data structure managing a list of primitive Arrow arrays logically as one -// large array -class ChunkedArray { - public: - explicit ChunkedArray(const ArrayVector& chunks); - - // @returns: the total length of the chunked array; computed on construction - int64_t length() const { - return length_; - } - - int64_t null_count() const { - return null_count_; - } - - int num_chunks() const { - return chunks_.size(); - } - - const std::shared_ptr<Array>& chunk(int i) const { - return chunks_[i]; - } - - protected: - ArrayVector chunks_; - int64_t length_; - int64_t null_count_; -}; - -// An immutable column data structure consisting of a field (type metadata) and -// a logical chunked data array (which can be validated as all being the same -// type). 
-class Column { - public: - Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks); - Column(const std::shared_ptr<Field>& field, - const std::shared_ptr<ChunkedArray>& data); - - Column(const std::shared_ptr<Field>& field, const std::shared_ptr<Array>& data); - - int64_t length() const { - return data_->length(); - } - - int64_t null_count() const { - return data_->null_count(); - } - - // @returns: the column's name in the passed metadata - const std::string& name() const { - return field_->name; - } - - // @returns: the column's type according to the metadata - const std::shared_ptr<DataType>& type() const { - return field_->type; - } - - // @returns: the column's data as a chunked logical array - const std::shared_ptr<ChunkedArray>& data() const { - return data_; - } - // Verify that the column's array data is consistent with the passed field's - // metadata - Status ValidateData(); - - protected: - std::shared_ptr<Field> field_; - std::shared_ptr<ChunkedArray> data_; -}; - -} // namespace arrow - -#endif // ARROW_TABLE_COLUMN_H http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/schema-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/table/schema-test.cc deleted file mode 100644 index 9dfade2..0000000 --- a/cpp/src/arrow/table/schema-test.cc +++ /dev/null @@ -1,110 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <gtest/gtest.h> -#include <memory> -#include <string> -#include <vector> - -#include "arrow/table/schema.h" -#include "arrow/type.h" -#include "arrow/types/string.h" - -using std::shared_ptr; -using std::vector; - -namespace arrow { - -const auto INT32 = std::make_shared<Int32Type>(); - -TEST(TestField, Basics) { - shared_ptr<DataType> ftype = INT32; - shared_ptr<DataType> ftype_nn = std::make_shared<Int32Type>(false); - Field f0("f0", ftype); - Field f0_nn("f0", ftype_nn); - - ASSERT_EQ(f0.name, "f0"); - ASSERT_EQ(f0.type->ToString(), ftype->ToString()); - - ASSERT_TRUE(f0.nullable()); - ASSERT_FALSE(f0_nn.nullable()); -} - -TEST(TestField, Equals) { - shared_ptr<DataType> ftype = INT32; - shared_ptr<DataType> ftype_nn = std::make_shared<Int32Type>(false); - - Field f0("f0", ftype); - Field f0_nn("f0", ftype_nn); - Field f0_other("f0", ftype); - - ASSERT_EQ(f0, f0_other); - ASSERT_NE(f0, f0_nn); -} - -class TestSchema : public ::testing::Test { - public: - void SetUp() {} -}; - -TEST_F(TestSchema, Basics) { - auto f0 = std::make_shared<Field>("f0", INT32); - auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(false)); - auto f1_optional = std::make_shared<Field>("f1", std::make_shared<UInt8Type>()); - - auto f2 = std::make_shared<Field>("f2", std::make_shared<StringType>()); - - vector<shared_ptr<Field> > fields = {f0, f1, f2}; - auto schema = std::make_shared<Schema>(fields); - - ASSERT_EQ(3, schema->num_fields()); - ASSERT_EQ(f0, schema->field(0)); - ASSERT_EQ(f1, schema->field(1)); - ASSERT_EQ(f2, schema->field(2)); 
- - auto schema2 = std::make_shared<Schema>(fields); - - vector<shared_ptr<Field> > fields3 = {f0, f1_optional, f2}; - auto schema3 = std::make_shared<Schema>(fields3); - ASSERT_TRUE(schema->Equals(schema2)); - ASSERT_FALSE(schema->Equals(schema3)); - - ASSERT_TRUE(schema->Equals(*schema2.get())); - ASSERT_FALSE(schema->Equals(*schema3.get())); -} - -TEST_F(TestSchema, ToString) { - auto f0 = std::make_shared<Field>("f0", std::make_shared<Int32Type>()); - auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(false)); - auto f2 = std::make_shared<Field>("f2", std::make_shared<StringType>()); - auto f3 = std::make_shared<Field>("f3", - std::make_shared<ListType>(std::make_shared<Int16Type>())); - - vector<shared_ptr<Field> > fields = {f0, f1, f2, f3}; - auto schema = std::make_shared<Schema>(fields); - - std::string result = schema->ToString(); - std::string expected = R"(f0 int32 -f1 uint8 not null -f2 string -f3 list<int16> -)"; - - ASSERT_EQ(expected, result); -} - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/schema.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/schema.cc b/cpp/src/arrow/table/schema.cc deleted file mode 100644 index d49d0a7..0000000 --- a/cpp/src/arrow/table/schema.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/table/schema.h" - -#include <memory> -#include <string> -#include <sstream> -#include <vector> - -#include "arrow/type.h" - -namespace arrow { - -Schema::Schema(const std::vector<std::shared_ptr<Field> >& fields) : - fields_(fields) {} - -bool Schema::Equals(const Schema& other) const { - if (this == &other) return true; - if (num_fields() != other.num_fields()) { - return false; - } - for (int i = 0; i < num_fields(); ++i) { - if (!field(i)->Equals(*other.field(i).get())) { - return false; - } - } - return true; -} - -bool Schema::Equals(const std::shared_ptr<Schema>& other) const { - return Equals(*other.get()); -} - -std::string Schema::ToString() const { - std::stringstream buffer; - - for (auto field : fields_) { - buffer << field->ToString() << std::endl; - } - return buffer.str(); -} - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/schema.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/schema.h b/cpp/src/arrow/table/schema.h deleted file mode 100644 index 103f01b..0000000 --- a/cpp/src/arrow/table/schema.h +++ /dev/null @@ -1,55 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_SCHEMA_H -#define ARROW_SCHEMA_H - -#include <memory> -#include <string> -#include <vector> - -#include "arrow/type.h" - -namespace arrow { - -class Schema { - public: - explicit Schema(const std::vector<std::shared_ptr<Field> >& fields); - - // Returns true if all of the schema fields are equal - bool Equals(const Schema& other) const; - bool Equals(const std::shared_ptr<Schema>& other) const; - - // Return the ith schema element. Does not boundscheck - const std::shared_ptr<Field>& field(int i) const { - return fields_[i]; - } - - // Render a string representation of the schema suitable for debugging - std::string ToString() const; - - int num_fields() const { - return fields_.size(); - } - - private: - std::vector<std::shared_ptr<Field> > fields_; -}; - -} // namespace arrow - -#endif // ARROW_FIELD_H http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/table-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table/table-test.cc deleted file mode 100644 index 8b354e8..0000000 --- a/cpp/src/arrow/table/table-test.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <gtest/gtest.h> -#include <cstdint> -#include <memory> -#include <string> -#include <vector> - -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" -#include "arrow/table/test-common.h" -#include "arrow/test-util.h" -#include "arrow/type.h" -#include "arrow/types/integer.h" - -using std::shared_ptr; -using std::vector; - -namespace arrow { - -const auto INT16 = std::make_shared<Int16Type>(); -const auto UINT8 = std::make_shared<UInt8Type>(); -const auto INT32 = std::make_shared<Int32Type>(); - -class TestTable : public TestBase { - public: - void MakeExample1(int length) { - auto f0 = std::make_shared<Field>("f0", INT32); - auto f1 = std::make_shared<Field>("f1", UINT8); - auto f2 = std::make_shared<Field>("f2", INT16); - - vector<shared_ptr<Field> > fields = {f0, f1, f2}; - schema_ = std::make_shared<Schema>(fields); - - columns_ = { - std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)), - std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)), - std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length)) - }; - } - - protected: - std::unique_ptr<Table> table_; - shared_ptr<Schema> schema_; - vector<std::shared_ptr<Column> > columns_; -}; - 
-TEST_F(TestTable, EmptySchema) { - auto empty_schema = shared_ptr<Schema>(new Schema({})); - table_.reset(new Table("data", empty_schema, columns_)); - ASSERT_OK(table_->ValidateColumns()); - ASSERT_EQ(0, table_->num_rows()); - ASSERT_EQ(0, table_->num_columns()); -} - -TEST_F(TestTable, Ctors) { - int length = 100; - MakeExample1(length); - - std::string name = "data"; - - table_.reset(new Table(name, schema_, columns_)); - ASSERT_OK(table_->ValidateColumns()); - ASSERT_EQ(name, table_->name()); - ASSERT_EQ(length, table_->num_rows()); - ASSERT_EQ(3, table_->num_columns()); - - table_.reset(new Table(name, schema_, columns_, length)); - ASSERT_OK(table_->ValidateColumns()); - ASSERT_EQ(name, table_->name()); - ASSERT_EQ(length, table_->num_rows()); -} - -TEST_F(TestTable, Metadata) { - int length = 100; - MakeExample1(length); - - std::string name = "data"; - table_.reset(new Table(name, schema_, columns_)); - - ASSERT_TRUE(table_->schema()->Equals(schema_)); - - auto col = table_->column(0); - ASSERT_EQ(schema_->field(0)->name, col->name()); - ASSERT_EQ(schema_->field(0)->type, col->type()); -} - -TEST_F(TestTable, InvalidColumns) { - // Check that columns are all the same length - int length = 100; - MakeExample1(length); - - table_.reset(new Table("data", schema_, columns_, length - 1)); - ASSERT_RAISES(Invalid, table_->ValidateColumns()); - - columns_.clear(); - - // Wrong number of columns - table_.reset(new Table("data", schema_, columns_, length)); - ASSERT_RAISES(Invalid, table_->ValidateColumns()); - - columns_ = { - std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)), - std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)), - std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length - 1)) - }; - - table_.reset(new Table("data", schema_, columns_, length)); - ASSERT_RAISES(Invalid, table_->ValidateColumns()); -} - -} // namespace arrow 
http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/table.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table/table.cc deleted file mode 100644 index 0c788b8..0000000 --- a/cpp/src/arrow/table/table.cc +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/table/table.h" - -#include <memory> -#include <sstream> - -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/type.h" -#include "arrow/util/status.h" - -namespace arrow { - -Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column> >& columns) : - name_(name), - schema_(schema), - columns_(columns) { - if (columns.size() == 0) { - num_rows_ = 0; - } else { - num_rows_ = columns[0]->length(); - } -} - -Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column> >& columns, int64_t num_rows) : - name_(name), - schema_(schema), - columns_(columns), - num_rows_(num_rows) {} - -Status Table::ValidateColumns() const { - if (num_columns() != schema_->num_fields()) { - return Status::Invalid("Number of columns did not match schema"); - } - - if (columns_.size() == 0) { - return Status::OK(); - } - - // Make sure columns are all the same length - for (size_t i = 0; i < columns_.size(); ++i) { - const Column* col = columns_[i].get(); - if (col->length() != num_rows_) { - std::stringstream ss; - ss << "Column " << i << " expected length " - << num_rows_ - << " but got length " - << col->length(); - return Status::Invalid(ss.str()); - } - } - return Status::OK(); -} - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/table.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/table.h b/cpp/src/arrow/table/table.h deleted file mode 100644 index b012938..0000000 --- a/cpp/src/arrow/table/table.h +++ /dev/null @@ -1,82 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TABLE_TABLE_H -#define ARROW_TABLE_TABLE_H - -#include <memory> -#include <string> -#include <vector> - -namespace arrow { - -class Column; -class Schema; -class Status; - -// Immutable container of fixed-length columns conforming to a particular schema -class Table { - public: - // If columns is zero-length, the table's number of rows is zero - Table(const std::string& name, const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column> >& columns); - - Table(const std::string& name, const std::shared_ptr<Schema>& schema, - const std::vector<std::shared_ptr<Column> >& columns, int64_t num_rows); - - // @returns: the table's name, if any (may be length 0) - const std::string& name() const { - return name_; - } - - // @returns: the table's schema - const std::shared_ptr<Schema>& schema() const { - return schema_; - } - - // Note: Does not boundscheck - // @returns: the i-th column - const std::shared_ptr<Column>& column(int i) const { - return columns_[i]; - } - - // @returns: the number of columns in the table - int num_columns() const { - return columns_.size(); - } - - // @returns: the number of rows (the corresponding length of each column) - int64_t num_rows() const { - return num_rows_; - } - - // After construction, perform any checks to validate the input arguments - Status ValidateColumns() const; - - private: - // The table's name, optional 
- std::string name_; - - std::shared_ptr<Schema> schema_; - std::vector<std::shared_ptr<Column> > columns_; - - int64_t num_rows_; -}; - -} // namespace arrow - -#endif // ARROW_TABLE_TABLE_H http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/table/test-common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h deleted file mode 100644 index 50a5f6a..0000000 --- a/cpp/src/arrow/table/test-common.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include <gtest/gtest.h> -#include <cstdint> -#include <memory> -#include <string> -#include <vector> - -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" -#include "arrow/test-util.h" -#include "arrow/type.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/buffer.h" -#include "arrow/util/memory-pool.h" - -namespace arrow { - -class TestBase : public ::testing::Test { - public: - void SetUp() { - pool_ = GetDefaultMemoryPool(); - } - - template <typename ArrayType> - std::shared_ptr<Array> MakePrimitive(int32_t length, int32_t null_count = 0) { - auto data = std::make_shared<PoolBuffer>(pool_); - auto nulls = std::make_shared<PoolBuffer>(pool_); - EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); - return std::make_shared<ArrayType>(length, data, 10, nulls); - } - - protected: - MemoryPool* pool_; -}; - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 0898c8e..a9fb2a7 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -18,26 +18,39 @@ #ifndef ARROW_TEST_UTIL_H_ #define ARROW_TEST_UTIL_H_ -#include <gtest/gtest.h> +#include <cstdint> #include <memory> +#include <random> #include <string> #include <vector> +#include "gtest/gtest.h" + +#include "arrow/type.h" +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" #include "arrow/util/random.h" #include "arrow/util/status.h" #define ASSERT_RAISES(ENUM, expr) \ do { \ Status s = (expr); \ - ASSERT_TRUE(s.Is##ENUM()); \ + if (!s.Is##ENUM()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) #define ASSERT_OK(expr) \ do { \ Status s = 
(expr); \ - ASSERT_TRUE(s.ok()); \ + if (!s.ok()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) @@ -50,6 +63,27 @@ namespace arrow { +class TestBase : public ::testing::Test { + public: + void SetUp() { + pool_ = default_memory_pool(); + } + + template <typename ArrayType> + std::shared_ptr<Array> MakePrimitive(int32_t length, int32_t null_count = 0) { + auto data = std::make_shared<PoolBuffer>(pool_); + auto nulls = std::make_shared<PoolBuffer>(pool_); + EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); + EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); + return std::make_shared<ArrayType>(length, data, 10, nulls); + } + + protected: + MemoryPool* pool_; +}; + +namespace test { + template <typename T> void randint(int64_t N, T lower, T upper, std::vector<T>* out) { Random rng(random_seed()); @@ -84,6 +118,33 @@ void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) { } } +static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution<int> d(0, 255); + + for (int i = 0; i < n; ++i) { + out[i] = d(gen) & 0xFF; + } +} + +template <typename T> +void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution<T> d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +static inline int bitmap_popcount(const uint8_t* data, int length) { + int count = 0; + for (int i = 0; i < length; ++i) { + // TODO: accelerate this + if (util::get_bit(data, i)) ++count; + } + return count; +} + static inline int null_count(const std::vector<uint8_t>& nulls) { int result = 0; for (size_t i = 0; i < nulls.size(); ++i) { @@ -102,6 +163,7 @@ std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) { return out; } +} // namespace test } // namespace arrow #endif // ARROW_TEST_UTIL_H_ 
http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 0a2e817..f7f835e 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -24,45 +24,37 @@ namespace arrow { std::string Field::ToString() const { std::stringstream ss; - ss << this->name << " " << this->type->ToString(); + ss << this->name << ": " << this->type->ToString(); + if (!this->nullable) { + ss << " not null"; + } return ss.str(); } DataType::~DataType() {} -StringType::StringType(bool nullable) - : DataType(LogicalType::STRING, nullable) {} - -StringType::StringType(const StringType& other) - : StringType(other.nullable) {} +StringType::StringType() : DataType(Type::STRING) {} std::string StringType::ToString() const { std::string result(name()); - if (!nullable) { - result.append(" not null"); - } return result; } std::string ListType::ToString() const { std::stringstream s; - s << "list<" << value_type->ToString() << ">"; - if (!this->nullable) { - s << " not null"; - } + s << "list<" << value_field()->ToString() << ">"; return s.str(); } std::string StructType::ToString() const { std::stringstream s; s << "struct<"; - for (size_t i = 0; i < fields_.size(); ++i) { + for (int i = 0; i < this->num_children(); ++i) { if (i > 0) s << ", "; - const std::shared_ptr<Field>& field = fields_[i]; + const std::shared_ptr<Field>& field = this->child(i); s << field->name << ": " << field->type->ToString(); } s << ">"; - if (!nullable) s << " not null"; return s.str(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 00b01ea..5984b67 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -18,62 +18,34 @@ #ifndef ARROW_TYPE_H #define ARROW_TYPE_H +#include <cstdint> 
#include <memory> #include <string> #include <vector> namespace arrow { -// Physical data type that describes the memory layout of values. See details -// for each type -enum class LayoutEnum: char { - // A physical type consisting of some non-negative number of bytes - BYTE = 0, - - // A physical type consisting of some non-negative number of bits - BIT = 1, - - // A parametric variable-length value type. Full specification requires a - // child logical type - LIST = 2, - - // A collection of multiple equal-length child arrays. Parametric type taking - // 1 or more child logical types - STRUCT = 3, - - // An array with heterogeneous value types. Parametric types taking 1 or more - // child logical types - DENSE_UNION = 4, - SPARSE_UNION = 5 -}; - - -struct LayoutType { - LayoutEnum type; - explicit LayoutType(LayoutEnum type) : type(type) {} -}; - // Data types in this library are all *logical*. They can be expressed as // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. 
a // timestamp encoded as an int64) -struct LogicalType { +struct Type { enum type { // A degenerate NULL type represented as 0 bytes/bits NA = 0, - // Little-endian integer types - UINT8 = 1, - INT8 = 2, - UINT16 = 3, - INT16 = 4, - UINT32 = 5, - INT32 = 6, - UINT64 = 7, - INT64 = 8, - // A boolean value represented as 1 bit - BOOL = 9, + BOOL = 1, + + // Little-endian integer types + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + UINT32 = 6, + INT32 = 7, + UINT64 = 8, + INT64 = 9, // 4-byte floating point value FLOAT = 10, @@ -131,30 +103,38 @@ struct LogicalType { }; }; +struct Field; + struct DataType { - LogicalType::type type; - bool nullable; + Type::type type; - explicit DataType(LogicalType::type type, bool nullable = true) : - type(type), - nullable(nullable) {} + std::vector<std::shared_ptr<Field>> children_; + + explicit DataType(Type::type type) : + type(type) {} virtual ~DataType(); bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses - return this == other || (this->type == other->type && - this->nullable == other->nullable); + return this == other || (this->type == other->type); } bool Equals(const std::shared_ptr<DataType>& other) { return Equals(other.get()); } + const std::shared_ptr<Field>& child(int i) const { + return children_[i]; + } + + int num_children() const { + return children_.size(); + } + virtual std::string ToString() const = 0; }; -typedef std::shared_ptr<LayoutType> LayoutPtr; typedef std::shared_ptr<DataType> TypePtr; // A field is a piece of metadata that includes (for now) a name and a data @@ -166,9 +146,13 @@ struct Field { // The field's data type TypePtr type; - Field(const std::string& name, const TypePtr& type) : + // Fields can be nullable + bool nullable; + + Field(const std::string& name, const TypePtr& type, bool nullable = true) : name(name), - type(type) {} + type(type), + nullable(nullable) {} bool operator==(const Field& other) const { return this->Equals(other); @@ 
-180,6 +164,7 @@ struct Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && + this->nullable == other.nullable && this->type->Equals(other.type.get())); } @@ -187,36 +172,12 @@ struct Field { return Equals(*other.get()); } - bool nullable() const { - return this->type->nullable; - } - std::string ToString() const; }; -struct BytesType : public LayoutType { - int size; - - explicit BytesType(int size) - : LayoutType(LayoutEnum::BYTE), - size(size) {} - - BytesType(const BytesType& other) - : BytesType(other.size) {} -}; - -struct ListLayoutType : public LayoutType { - LayoutPtr value_type; - - explicit ListLayoutType(const LayoutPtr& value_type) - : LayoutType(LayoutEnum::BYTE), - value_type(value_type) {} -}; - template <typename Derived> struct PrimitiveType : public DataType { - explicit PrimitiveType(bool nullable = true) - : DataType(Derived::type_enum, nullable) {} + PrimitiveType() : DataType(Derived::type_enum) {} std::string ToString() const override; }; @@ -224,22 +185,19 @@ struct PrimitiveType : public DataType { template <typename Derived> inline std::string PrimitiveType<Derived>::ToString() const { std::string result(static_cast<const Derived*>(this)->name()); - if (!nullable) { - result.append(" not null"); - } return result; } -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr LogicalType::type type_enum = LogicalType::ENUM; \ - static constexpr int size = SIZE; \ - \ - explicit TYPENAME(bool nullable = true) \ - : PrimitiveType<TYPENAME>(nullable) {} \ - \ - static const char* name() { \ - return NAME; \ +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr Type::type type_enum = Type::ENUM; \ + static constexpr int size = SIZE; \ + \ + TYPENAME() \ + : PrimitiveType<TYPENAME>() {} \ + \ + static const char* name() { \ + return NAME; \ } struct NullType : public PrimitiveType<NullType> { 
@@ -292,11 +250,23 @@ struct DoubleType : public PrimitiveType<DoubleType> { struct ListType : public DataType { // List can contain any other logical value type - TypePtr value_type; + explicit ListType(const std::shared_ptr<DataType>& value_type) + : DataType(Type::LIST) { + children_ = {std::make_shared<Field>("item", value_type)}; + } + + explicit ListType(const std::shared_ptr<Field>& value_field) + : DataType(Type::LIST) { + children_ = {value_field}; + } - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(LogicalType::LIST, nullable), - value_type(value_type) {} + const std::shared_ptr<Field>& value_field() const { + return children_[0]; + } + + const std::shared_ptr<DataType>& value_type() const { + return children_[0]->type; + } static char const *name() { return "list"; @@ -307,9 +277,7 @@ struct ListType : public DataType { // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true); - - StringType(const StringType& other); + StringType(); static char const *name() { return "string"; @@ -319,20 +287,9 @@ struct StringType : public DataType { }; struct StructType : public DataType { - std::vector<std::shared_ptr<Field> > fields_; - - explicit StructType(const std::vector<std::shared_ptr<Field> >& fields, - bool nullable = true) - : DataType(LogicalType::STRUCT, nullable) { - fields_ = fields; - } - - const std::shared_ptr<Field>& field(int i) const { - return fields_[i]; - } - - int num_children() const { - return fields_.size(); + explicit StructType(const std::vector<std::shared_ptr<Field>>& fields) + : DataType(Type::STRUCT) { + children_ = fields; } std::string ToString() const override; http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/CMakeLists.txt 
b/cpp/src/arrow/types/CMakeLists.txt index 57cabde..595b3be 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -26,8 +26,6 @@ install(FILES construct.h datetime.h decimal.h - floating.h - integer.h json.h list.h primitive.h http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/boolean.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index a5023d7..1cb91f9 100644 --- a/cpp/src/arrow/types/boolean.h +++ b/cpp/src/arrow/types/boolean.h @@ -22,7 +22,7 @@ namespace arrow { -typedef PrimitiveArrayImpl<BooleanType> BooleanArray; +// typedef PrimitiveArrayImpl<BooleanType> BooleanArray; class BooleanBuilder : public ArrayBuilder { }; http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/collection.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 42a9c92..46d84f1 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -25,7 +25,7 @@ namespace arrow { -template <LogicalType::type T> +template <Type::type T> struct CollectionType : public DataType { std::vector<TypePtr> child_types_; http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/construct.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 43f01a3..290decd 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -19,24 +19,26 @@ #include <memory> -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" +#include "arrow/type.h" +#include "arrow/types/primitive.h" #include "arrow/types/list.h" #include "arrow/types/string.h" +#include "arrow/util/buffer.h" #include "arrow/util/status.h" namespace arrow { class 
ArrayBuilder; -// Initially looked at doing this with vtables, but shared pointers makes it -// difficult - #define BUILDER_CASE(ENUM, BuilderType) \ - case LogicalType::ENUM: \ + case Type::ENUM: \ out->reset(new BuilderType(pool, type)); \ return Status::OK(); +// Initially looked at doing this with vtables, but shared pointers makes it +// difficult +// +// TODO(wesm): come up with a less monolithic strategy Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, std::shared_ptr<ArrayBuilder>* out) { switch (type->type) { @@ -56,30 +58,41 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, BUILDER_CASE(STRING, StringBuilder); - case LogicalType::LIST: + case Type::LIST: { std::shared_ptr<ArrayBuilder> value_builder; const std::shared_ptr<DataType>& value_type = static_cast<ListType*>( - type.get())->value_type; + type.get())->value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new ListBuilder(pool, type, value_builder)); return Status::OK(); } - // BUILDER_CASE(CHAR, CharBuilder); - - // BUILDER_CASE(VARCHAR, VarcharBuilder); - // BUILDER_CASE(BINARY, BinaryBuilder); - - // BUILDER_CASE(DATE, DateBuilder); - // BUILDER_CASE(TIMESTAMP, TimestampBuilder); - // BUILDER_CASE(TIME, TimeBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} - // BUILDER_CASE(LIST, ListBuilder); - // BUILDER_CASE(STRUCT, StructBuilder); - // BUILDER_CASE(DENSE_UNION, DenseUnionBuilder); - // BUILDER_CASE(SPARSE_UNION, SparseUnionBuilder); +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_count, nulls)); \ + return Status::OK(); +Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, + int32_t length, const std::shared_ptr<Buffer>& data, + int32_t null_count, const std::shared_ptr<Buffer>& nulls, + std::shared_ptr<Array>* out) { + switch (type->type) { + MAKE_PRIMITIVE_ARRAY_CASE(UINT8, 
UInt8Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT8, Int8Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT16, UInt16Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT16, Int16Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT32, UInt32Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); + MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); default: return Status::NotImplemented(type->ToString()); } http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/construct.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 59ebe1a..089c484 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -18,19 +18,26 @@ #ifndef ARROW_TYPES_CONSTRUCT_H #define ARROW_TYPES_CONSTRUCT_H +#include <cstdint> #include <memory> -#include "arrow/type.h" - namespace arrow { +class Array; class ArrayBuilder; +class Buffer; +struct DataType; class MemoryPool; class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, std::shared_ptr<ArrayBuilder>* out); +Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, + int32_t length, const std::shared_ptr<Buffer>& data, + int32_t null_count, const std::shared_ptr<Buffer>& nulls, + std::shared_ptr<Array>* out); + } // namespace arrow #endif // ARROW_BUILDER_H_ http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/datetime.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index 765fc29..e57b66a 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,8 +31,8 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY, bool nullable = true) - : 
DataType(LogicalType::DATE, nullable), + explicit DateType(Unit unit = Unit::DAY) + : DataType(Type::DATE), unit(unit) {} DateType(const DateType& other) @@ -41,10 +41,6 @@ struct DateType : public DataType { static char const *name() { return "date"; } - - // virtual std::string ToString() { - // return name(); - // } }; @@ -58,8 +54,8 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) - : DataType(LogicalType::TIMESTAMP, nullable), + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(Type::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) @@ -68,10 +64,6 @@ struct TimestampType : public DataType { static char const *name() { return "timestamp"; } - - // virtual std::string ToString() { - // return name(); - // } }; } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/floating.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/floating.cc b/cpp/src/arrow/types/floating.cc deleted file mode 100644 index bde2826..0000000 --- a/cpp/src/arrow/types/floating.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/floating.h" - -namespace arrow { - -} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/65db0da8/cpp/src/arrow/types/floating.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/floating.h b/cpp/src/arrow/types/floating.h deleted file mode 100644 index e752278..0000000 --- a/cpp/src/arrow/types/floating.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_FLOATING_H -#define ARROW_TYPES_FLOATING_H - -#include <string> - -#include "arrow/types/primitive.h" -#include "arrow/type.h" - -namespace arrow { - -typedef PrimitiveArrayImpl<FloatType> FloatArray; -typedef PrimitiveArrayImpl<DoubleType> DoubleArray; - -typedef PrimitiveBuilder<FloatType, FloatArray> FloatBuilder; -typedef PrimitiveBuilder<DoubleType, DoubleArray> DoubleBuilder; - -} // namespace arrow - -#endif // ARROW_TYPES_FLOATING_H