Repository: arrow Updated Branches: refs/heads/master 3aac4adef -> d2d27555b
ARROW-658: [C++] Implement a prototype in-memory arrow::Tensor type I haven't implemented much beyond the data container and automatically computing row major strides. If we agree on the basics, then I will implement IPC read/writes of this data structure in a follow up patch. cc @pcmoritz @robertnishihara @JohanMabille @sylvaincorlay Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #438 from wesm/ARROW-658 and squashes the following commits: 7f82028 [Wes McKinney] Include numeric STL header 8160393 [Wes McKinney] std::accumulate is in algorithm header bdd4c55 [Wes McKinney] No need to special case 0-dim 471c719 [Wes McKinney] Add test for 0-d tensor. Use std::accumulate in Tensor::size 8d4a13a [Wes McKinney] Make std::vector args const-refs 8bd9716 [Wes McKinney] Add extern templates for numeric tensors 7d805bf [Wes McKinney] cpplint 8b65aea [Wes McKinney] Implement a prototype in-memory arrow::Tensor type Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/d2d27555 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/d2d27555 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/d2d27555 Branch: refs/heads/master Commit: d2d27555b4b2f3f0ba26539211bfe8b4d1b52481 Parents: 3aac4ad Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Mon Mar 27 10:43:56 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Mon Mar 27 10:43:56 2017 -0400 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 1 + cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/buffer.cc | 4 - cpp/src/arrow/buffer.h | 7 +- cpp/src/arrow/tensor-test.cc | 73 ++++++++++++++++++ cpp/src/arrow/tensor.cc | 116 ++++++++++++++++++++++++++++ cpp/src/arrow/tensor.h | 158 ++++++++++++++++++++++++++++++++++++++ cpp/src/arrow/type_fwd.h | 13 +++- 8 files changed, 359 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c77cf60..e4c18ca 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -792,6 +792,7 @@ set(ARROW_SRCS src/arrow/schema.cc src/arrow/status.cc src/arrow/table.cc + src/arrow/tensor.cc src/arrow/type.cc src/arrow/visitor.cc http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 0e83aac..f965f1d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -65,6 +65,7 @@ ADD_ARROW_TEST(pretty_print-test) ADD_ARROW_TEST(status-test) ADD_ARROW_TEST(type-test) ADD_ARROW_TEST(table-test) +ADD_ARROW_TEST(tensor-test) ADD_ARROW_BENCHMARK(builder-benchmark) ADD_ARROW_BENCHMARK(column-benchmark) http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/buffer.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc index 28edf5e..be747e1 100644 --- a/cpp/src/arrow/buffer.cc +++ b/cpp/src/arrow/buffer.cc @@ -68,10 +68,6 @@ bool Buffer::Equals(const Buffer& other) const { static_cast<size_t>(size_)))); } -std::shared_ptr<Buffer> MutableBuffer::GetImmutableView() { - return std::make_shared<Buffer>(this->get_shared_ptr(), 0, size()); -} - PoolBuffer::PoolBuffer(MemoryPool* pool) : ResizableBuffer(nullptr, 0) { if (pool == nullptr) { pool = default_memory_pool(); } pool_ = pool; http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/buffer.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 449bb53..713d57a 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ 
-43,7 +43,7 @@ class Status; /// of bytes that where allocated for the buffer in total. /// /// The following invariant is always true: Size < Capacity -class ARROW_EXPORT Buffer : public std::enable_shared_from_this<Buffer> { +class ARROW_EXPORT Buffer { public: Buffer(const uint8_t* data, int64_t size) : is_mutable_(false), data_(data), size_(size), capacity_(size) {} @@ -58,8 +58,6 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this<Buffer> { /// we might add utility methods to help determine if a buffer satisfies this contract. Buffer(const std::shared_ptr<Buffer>& parent, int64_t offset, int64_t size); - std::shared_ptr<Buffer> get_shared_ptr() { return shared_from_this(); } - bool is_mutable() const { return is_mutable_; } /// Return true if both buffers are the same size and contain the same bytes @@ -111,9 +109,6 @@ class ARROW_EXPORT MutableBuffer : public Buffer { uint8_t* mutable_data() { return mutable_data_; } - /// Get a read-only view of this buffer - std::shared_ptr<Buffer> GetImmutableView(); - protected: MutableBuffer() : Buffer(nullptr, 0), mutable_data_(nullptr) {} http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/tensor-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/tensor-test.cc b/cpp/src/arrow/tensor-test.cc new file mode 100644 index 0000000..99a9493 --- /dev/null +++ b/cpp/src/arrow/tensor-test.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Unit tests for Tensor and NumericTensor + +#include <memory> +#include <string> +#include <vector> + +#include "gtest/gtest.h" + +#include "arrow/buffer.h" +#include "arrow/tensor.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { + +TEST(TestTensor, ZeroDim) { + const int64_t values = 1; + std::vector<int64_t> shape = {}; // empty shape: a 0-d tensor + + using T = int64_t; + + std::shared_ptr<MutableBuffer> buffer; + ASSERT_OK(AllocateBuffer(default_memory_pool(), values * sizeof(T), &buffer)); + + Int64Tensor t0(buffer, shape); + + ASSERT_EQ(1, t0.size()); // a 0-d tensor still reports exactly one value cell +} + +TEST(TestTensor, BasicCtors) { + const int64_t values = 24; + std::vector<int64_t> shape = {4, 6}; + std::vector<int64_t> strides = {48, 8}; // row-major byte strides of a 4x6 int64 tensor: 6 * 8 and 8 + std::vector<std::string> dim_names = {"foo", "bar"}; + + using T = int64_t; + + std::shared_ptr<MutableBuffer> buffer; + ASSERT_OK(AllocateBuffer(default_memory_pool(), values * sizeof(T), &buffer)); + + Int64Tensor t1(buffer, shape); + Int64Tensor t2(buffer, shape, strides); + Int64Tensor t3(buffer, shape, strides, dim_names); + + ASSERT_EQ(24, t1.size()); + ASSERT_TRUE(t1.is_mutable()); + ASSERT_FALSE(t1.has_dim_names()); + + ASSERT_EQ(strides, t1.strides()); // t1 was given no strides, so these are the computed ones + ASSERT_EQ(strides, t2.strides()); + + ASSERT_EQ("foo", t3.dim_name(0)); + ASSERT_EQ("bar", t3.dim_name(1)); +} + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/tensor.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc new file mode 
100644 index 0000000..c0d128f --- /dev/null +++ b/cpp/src/arrow/tensor.cc @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/tensor.h" + +#include <algorithm> +#include <cstdint> +#include <functional> +#include <memory> +#include <numeric> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/logging.h" + +namespace arrow { + +void ComputeRowMajorStrides(const FixedWidthType& type, const std::vector<int64_t>& shape, + std::vector<int64_t>* strides) { // appends one byte stride per dimension, in shape order + int64_t remaining = type.bit_width() / 8; // width of one element in bytes + for (int64_t dimsize : shape) { + remaining *= dimsize; // after the loop: total byte size of the tensor + } + + for (int64_t dimsize : shape) { + if (dimsize != 0) { remaining /= dimsize; } // a zero-length dimension must not be divided by (undefined behavior); remaining is already 0 then + strides->push_back(remaining); + } +} + +/// Constructor with strides and dimension names; row-major strides are computed when strides is empty and shape is not +Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape, const std::vector<int64_t>& strides, + const std::vector<std::string>& dim_names) + : type_(type), data_(data), shape_(shape), strides_(strides), dim_names_(dim_names) { + DCHECK(is_tensor_supported(type->type)); + if (shape.size() > 0 && 
strides.size() == 0) { + ComputeRowMajorStrides(static_cast<const FixedWidthType&>(*type_), shape, &strides_); + } +} + +Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape, const std::vector<int64_t>& strides) + : Tensor(type, data, shape, strides, {}) {} + +Tensor::Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape) + : Tensor(type, data, shape, {}, {}) {} + +const std::string& Tensor::dim_name(int i) const { + DCHECK_LT(i, static_cast<int>(dim_names_.size())); + return dim_names_[i]; +} + +int64_t Tensor::size() const { // product of all dimension sizes; 1 for a 0-d tensor + return std::accumulate( + shape_.begin(), shape_.end(), int64_t{1}, std::multiplies<int64_t>()); // the seed fixes the accumulator type: an int seed would truncate large products +} + +template <typename T> +NumericTensor<T>::NumericTensor(const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape, const std::vector<int64_t>& strides, + const std::vector<std::string>& dim_names) + : Tensor(TypeTraits<T>::type_singleton(), data, shape, strides, dim_names), + raw_data_(nullptr), + mutable_raw_data_(nullptr) { + if (data_) { // a null buffer leaves both raw pointers null + raw_data_ = reinterpret_cast<const value_type*>(data_->data()); + if (data_->is_mutable()) { // the writable pointer is only set for a mutable buffer + auto mut_buf = static_cast<MutableBuffer*>(data_.get()); + mutable_raw_data_ = reinterpret_cast<value_type*>(mut_buf->mutable_data()); + } + } +} + +template <typename T> +NumericTensor<T>::NumericTensor( + const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape) + : NumericTensor(data, shape, {}, {}) {} + +template <typename T> +NumericTensor<T>::NumericTensor(const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape, const std::vector<int64_t>& strides) + : NumericTensor(data, shape, strides, {}) {} + +template class NumericTensor<Int8Type>; +template class NumericTensor<UInt8Type>; +template class NumericTensor<Int16Type>; +template class NumericTensor<UInt16Type>; +template class NumericTensor<Int32Type>; +template class 
NumericTensor<UInt32Type>; +template class NumericTensor<Int64Type>; +template class NumericTensor<UInt64Type>; +template class NumericTensor<HalfFloatType>; +template class NumericTensor<FloatType>; +template class NumericTensor<DoubleType>; + +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/tensor.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h new file mode 100644 index 0000000..0059368 --- /dev/null +++ b/cpp/src/arrow/tensor.h @@ -0,0 +1,158 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TENSOR_H +#define ARROW_TENSOR_H + +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/buffer.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class MutableBuffer; +class Status; + +static inline bool is_tensor_supported(Type::type type_id) { // true only for the integer and floating point type ids listed below + switch (type_id) { + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::UINT64: + case Type::INT64: + case Type::HALF_FLOAT: + case Type::FLOAT: + case Type::DOUBLE: + return true; + default: + break; + } + return false; +} + +class ARROW_EXPORT Tensor { + public: + virtual ~Tensor() = default; + + /// Constructor with no dimension names or strides, data assumed to be row-major + Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape); + + /// Constructor with non-negative strides + Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape, const std::vector<int64_t>& strides); + + /// Constructor with strides and dimension names + Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data, + const std::vector<int64_t>& shape, const std::vector<int64_t>& strides, + const std::vector<std::string>& dim_names); + + std::shared_ptr<Buffer> data() const { return data_; } + const std::vector<int64_t>& shape() const { return shape_; } + const std::vector<int64_t>& strides() const { return strides_; } + + const std::string& dim_name(int i) const; + bool has_dim_names() const { return shape_.size() > 0 && dim_names_.size() > 0; } + + /// Total number of value cells in the tensor + int64_t size() const; + + /// Return true if the underlying data buffer is mutable; false when no buffer is set + bool is_mutable() const { return 
data_ != nullptr && data_->is_mutable(); } + + protected: + Tensor() {} + + std::shared_ptr<DataType> type_; + + std::shared_ptr<Buffer> data_; + + std::vector<int64_t> shape_; + std::vector<int64_t> strides_; + + /// These names are optional + std::vector<std::string> dim_names_; + + private: + DISALLOW_COPY_AND_ASSIGN(Tensor); +}; + +template <typename T> +class ARROW_EXPORT NumericTensor : public Tensor { + public: + using value_type = typename T::c_type; + + NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape); + + /// Constructor with non-negative strides + NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape, + const std::vector<int64_t>& strides); + + /// Constructor with strides and dimension names + NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape, + const std::vector<int64_t>& strides, const std::vector<std::string>& dim_names); + + const value_type* raw_data() const { return raw_data_; } + value_type* raw_data() { return mutable_raw_data_; } // null unless the buffer is mutable + + private: + const value_type* raw_data_; + value_type* mutable_raw_data_; +}; + +// ---------------------------------------------------------------------- +// extern templates and other details + +// gcc and clang disagree about how to handle template visibility when you have +// explicit specializations https://llvm.org/bugs/show_bug.cgi?id=24815 +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +// Only instantiate these templates once +extern template class ARROW_EXPORT NumericTensor<Int8Type>; +extern template class ARROW_EXPORT NumericTensor<UInt8Type>; +extern template class ARROW_EXPORT NumericTensor<Int16Type>; +extern template class ARROW_EXPORT NumericTensor<UInt16Type>; +extern template class ARROW_EXPORT NumericTensor<Int32Type>; +extern template class ARROW_EXPORT NumericTensor<UInt32Type>; +extern template class ARROW_EXPORT 
NumericTensor<Int64Type>; +extern template class ARROW_EXPORT NumericTensor<UInt64Type>; +extern template class ARROW_EXPORT NumericTensor<HalfFloatType>; +extern template class ARROW_EXPORT NumericTensor<FloatType>; +extern template class ARROW_EXPORT NumericTensor<DoubleType>; + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + +} // namespace arrow + +#endif // ARROW_TENSOR_H http://git-wip-us.apache.org/repos/asf/arrow/blob/d2d27555/cpp/src/arrow/type_fwd.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 201f4e9..04ddf7e 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -30,6 +30,7 @@ struct DataType; class Array; class ArrayBuilder; struct Field; +class Tensor; class Buffer; class MemoryPool; @@ -78,10 +79,14 @@ class NumericArray; template <typename TypeClass> class NumericBuilder; -#define _NUMERIC_TYPE_DECL(KLASS) \ - struct KLASS##Type; \ - using KLASS##Array = NumericArray<KLASS##Type>; \ - using KLASS##Builder = NumericBuilder<KLASS##Type>; +template <typename TypeClass> +class NumericTensor; + +#define _NUMERIC_TYPE_DECL(KLASS) \ + struct KLASS##Type; \ + using KLASS##Array = NumericArray<KLASS##Type>; \ + using KLASS##Builder = NumericBuilder<KLASS##Type>; \ + using KLASS##Tensor = NumericTensor<KLASS##Type>; _NUMERIC_TYPE_DECL(Int8); _NUMERIC_TYPE_DECL(Int16);