ARROW-655: [C++/Python] Implement DecimalArray. Adds Decimal support for C++ and Python.
TODOs: - [x] Tighten up some of the GIL acquisition. E.g., we may not need to hold it when importing the decimal module if we acquire it where we import the decimal module. - [x] Investigate FreeBSD issue (manifesting on OS X) where typeinfo symbols for `__int128_t` are not exported: https://bugs.llvm.org//show_bug.cgi?id=26156. - [x] See if there's a better way to visit scalar decimals, rather than keeping extra state on the class. Seems like an unacceptable hack. Author: Phillip Cloud <cpcl...@gmail.com> Closes #403 from cpcloud/decimal and squashes the following commits: e5470fd [Phillip Cloud] Remove unnecessary header in helpers.h 07713a7 [Phillip Cloud] Remove more boost leakage f764156 [Phillip Cloud] Revert "Transitively link static libs as well" a7109b2 [Phillip Cloud] Transitively link static libs as well bf2a7ea [Phillip Cloud] Move IsNegative to cc file cb2c1ac [Phillip Cloud] Do not link boost regex to jemalloc e63b766 [Phillip Cloud] Remove python extra cmake args 805bbac [Phillip Cloud] ARROW-655: [C++/Python] Implement DecimalArray Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/754bcce6 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/754bcce6 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/754bcce6 Branch: refs/heads/master Commit: 754bcce686ecf02e123dcf4801715bf155f15e1f Parents: 449f991 Author: Phillip Cloud <cpcl...@gmail.com> Authored: Sun Apr 9 15:19:53 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Sun Apr 9 15:19:53 2017 -0400 ---------------------------------------------------------------------- .travis.yml | 1 + cpp/CMakeLists.txt | 27 ++- cpp/cmake_modules/FindPythonLibsNew.cmake | 3 +- cpp/src/arrow/array-decimal-test.cc | 194 +++++++++++++++++++++- cpp/src/arrow/array.cc | 49 +++++- cpp/src/arrow/array.h | 31 +++- cpp/src/arrow/builder.cc | 88 +++++++++- cpp/src/arrow/builder.h | 29 +++- cpp/src/arrow/compare.cc | 40 ++++- 
cpp/src/arrow/ipc/CMakeLists.txt | 7 +- cpp/src/arrow/python/CMakeLists.txt | 3 +- cpp/src/arrow/python/builtin_convert.cc | 62 ++++++- cpp/src/arrow/python/builtin_convert.h | 2 +- cpp/src/arrow/python/common.h | 9 +- cpp/src/arrow/python/helpers.cc | 79 +++++++++ cpp/src/arrow/python/helpers.h | 26 ++- cpp/src/arrow/python/pandas_convert.cc | 176 +++++++++++++++++++- cpp/src/arrow/python/python-test.cc | 33 ++++ cpp/src/arrow/type.cc | 18 +- cpp/src/arrow/type.h | 26 ++- cpp/src/arrow/type_fwd.h | 2 + cpp/src/arrow/type_traits.h | 13 +- cpp/src/arrow/util/CMakeLists.txt | 2 + cpp/src/arrow/util/bit-util.h | 1 - cpp/src/arrow/util/decimal-test.cc | 161 ++++++++++++++++++ cpp/src/arrow/util/decimal.cc | 141 ++++++++++++++++ cpp/src/arrow/util/decimal.h | 144 ++++++++++++++++ cpp/src/arrow/visitor_inline.h | 2 +- format/Schema.fbs | 2 + python/pyarrow/__init__.py | 2 +- python/pyarrow/array.pxd | 4 + python/pyarrow/array.pyx | 5 + python/pyarrow/includes/common.pxd | 5 + python/pyarrow/includes/libarrow.pxd | 16 ++ python/pyarrow/scalar.pxd | 1 + python/pyarrow/scalar.pyx | 25 ++- python/pyarrow/schema.pxd | 10 +- python/pyarrow/schema.pyx | 28 +++- python/pyarrow/tests/test_convert_builtin.py | 40 +++++ python/pyarrow/tests/test_convert_pandas.py | 70 ++++++++ 40 files changed, 1497 insertions(+), 80 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/.travis.yml ---------------------------------------------------------------------- diff --git a/.travis.yml b/.travis.yml index b219b03..f74a3b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,6 +14,7 @@ addons: - valgrind - libboost-dev - libboost-filesystem-dev + - libboost-regex-dev - libboost-system-dev - libjemalloc-dev - gtk-doc-tools http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt 
b/cpp/CMakeLists.txt index 9947a34..5852fe5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -398,30 +398,36 @@ if (ARROW_BOOST_USE_SHARED) add_definitions(-DBOOST_ALL_DYN_LINK) endif() - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS system filesystem regex REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_shared) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + set(BOOST_REGEX_LIBRARY boost_regex_shared) else() # Find static boost headers and libs # TODO Differentiate here between release and debug builds set(Boost_USE_STATIC_LIBS ON) - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS system filesystem regex REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) + set(BOOST_REGEX_LIBRARY boost_regex_static) endif() message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIRS}) @@ -435,7 +441,11 @@ ADD_THIRDPARTY_LIB(boost_filesystem STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}" SHARED_LIB 
"${BOOST_SHARED_FILESYSTEM_LIBRARY}") -SET(ARROW_BOOST_LIBS boost_system boost_filesystem) +ADD_THIRDPARTY_LIB(boost_regex + STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}" + SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}") + +SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex) include_directories(SYSTEM ${Boost_INCLUDE_DIR}) @@ -695,14 +705,16 @@ endif() set(ARROW_MIN_TEST_LIBS arrow_static arrow_test_main - ${ARROW_BASE_LIBS}) + ${ARROW_BASE_LIBS} + ${BOOST_REGEX_LIBRARY}) set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) set(ARROW_BENCHMARK_LINK_LIBS arrow_static arrow_benchmark_main - ${ARROW_BASE_LIBS}) + ${ARROW_BASE_LIBS} + ${BOOST_REGEX_LIBRARY}) ############################################################ # "make ctags" target @@ -796,7 +808,7 @@ endif() ############################################################ set(ARROW_LINK_LIBS -) + ${BOOST_REGEX_LIBRARY}) set(ARROW_PRIVATE_LINK_LIBS ) @@ -816,6 +828,7 @@ set(ARROW_SRCS src/arrow/visitor.cc src/arrow/util/bit-util.cc + src/arrow/util/decimal.cc ) if(NOT APPLE AND NOT MSVC) @@ -825,9 +838,11 @@ if(NOT APPLE AND NOT MSVC) set(ARROW_SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/arrow/symbols.map") endif() + ADD_ARROW_LIB(arrow SOURCES ${ARROW_SRCS} SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} + SHARED_LINK_LIBS ${ARROW_LINK_LIBS} ) add_subdirectory(src/arrow) http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/cmake_modules/FindPythonLibsNew.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/FindPythonLibsNew.cmake b/cpp/cmake_modules/FindPythonLibsNew.cmake index dfe5661..d9cc4b3 100644 --- a/cpp/cmake_modules/FindPythonLibsNew.cmake +++ b/cpp/cmake_modules/FindPythonLibsNew.cmake @@ -175,7 +175,8 @@ else() find_library(PYTHON_LIBRARY NAMES "python${PYTHON_LIBRARY_SUFFIX}" PATHS ${_PYTHON_LIBS_SEARCH} - NO_SYSTEM_ENVIRONMENT_PATH) + NO_SYSTEM_ENVIRONMENT_PATH + NO_CMAKE_SYSTEM_PATH) message(STATUS "Found 
Python lib ${PYTHON_LIBRARY}") endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/array-decimal-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-decimal-test.cc b/cpp/src/arrow/array-decimal-test.cc index b64023b..4c01f92 100644 --- a/cpp/src/arrow/array-decimal-test.cc +++ b/cpp/src/arrow/array-decimal-test.cc @@ -15,13 +15,16 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/type.h" #include "gtest/gtest.h" -#include "arrow/type.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" +#include "arrow/util/decimal.h" namespace arrow { -TEST(TypesTest, TestDecimalType) { +TEST(TypesTest, TestDecimal32Type) { DecimalType t1(8, 4); ASSERT_EQ(t1.type, Type::DECIMAL); @@ -29,6 +32,193 @@ TEST(TypesTest, TestDecimalType) { ASSERT_EQ(t1.scale, 4); ASSERT_EQ(t1.ToString(), std::string("decimal(8, 4)")); + + // Test properties + ASSERT_EQ(t1.byte_width(), 4); + ASSERT_EQ(t1.bit_width(), 32); } +TEST(TypesTest, TestDecimal64Type) { + DecimalType t1(12, 5); + + ASSERT_EQ(t1.type, Type::DECIMAL); + ASSERT_EQ(t1.precision, 12); + ASSERT_EQ(t1.scale, 5); + + ASSERT_EQ(t1.ToString(), std::string("decimal(12, 5)")); + + // Test properties + ASSERT_EQ(t1.byte_width(), 8); + ASSERT_EQ(t1.bit_width(), 64); +} + +TEST(TypesTest, TestDecimal128Type) { + DecimalType t1(27, 7); + + ASSERT_EQ(t1.type, Type::DECIMAL); + ASSERT_EQ(t1.precision, 27); + ASSERT_EQ(t1.scale, 7); + + ASSERT_EQ(t1.ToString(), std::string("decimal(27, 7)")); + + // Test properties + ASSERT_EQ(t1.byte_width(), 16); + ASSERT_EQ(t1.bit_width(), 128); +} + +template <typename T> +class DecimalTestBase { + public: + virtual std::vector<uint8_t> data( + const std::vector<T>& input, size_t byte_width) const = 0; + + void test(int precision, const std::vector<T>& draw, + const std::vector<uint8_t>& valid_bytes, + const std::vector<uint8_t>& sign_bitmap = {}, int64_t offset = 
0) const { + auto type = std::make_shared<DecimalType>(precision, 4); + int byte_width = type->byte_width(); + auto pool = default_memory_pool(); + auto builder = std::make_shared<DecimalBuilder>(pool, type); + size_t null_count = 0; + + size_t size = draw.size(); + builder->Reserve(size); + + for (size_t i = 0; i < size; ++i) { + if (valid_bytes[i]) { + builder->Append(draw[i]); + } else { + builder->AppendNull(); + ++null_count; + } + } + + std::shared_ptr<Buffer> expected_sign_bitmap; + if (!sign_bitmap.empty()) { + BitUtil::BytesToBits(sign_bitmap, &expected_sign_bitmap); + } + + auto raw_bytes = data(draw, byte_width); + auto expected_data = std::make_shared<Buffer>(raw_bytes.data(), size * byte_width); + auto expected_null_bitmap = test::bytes_to_null_buffer(valid_bytes); + int64_t expected_null_count = test::null_count(valid_bytes); + auto expected = std::make_shared<DecimalArray>(type, size, expected_data, + expected_null_bitmap, expected_null_count, offset, expected_sign_bitmap); + + std::shared_ptr<Array> out; + ASSERT_OK(builder->Finish(&out)); + ASSERT_TRUE(out->Equals(*expected)); + } +}; + +template <typename T> +class DecimalTest : public DecimalTestBase<T> { + public: + std::vector<uint8_t> data( + const std::vector<T>& input, size_t byte_width) const override { + std::vector<uint8_t> result; + result.reserve(input.size() * byte_width); + // TODO(phillipc): There's probably a better way to do this + constexpr static const size_t bytes_per_element = sizeof(T); + for (size_t i = 0, j = 0; i < input.size(); ++i, j += bytes_per_element) { + *reinterpret_cast<typename T::value_type*>(&result[j]) = input[i].value; + } + return result; + } +}; + +template <> +class DecimalTest<Decimal128> : public DecimalTestBase<Decimal128> { + public: + std::vector<uint8_t> data( + const std::vector<Decimal128>& input, size_t byte_width) const override { + std::vector<uint8_t> result; + result.reserve(input.size() * byte_width); + constexpr static const size_t 
bytes_per_element = 16; + for (size_t i = 0; i < input.size(); ++i) { + uint8_t stack_bytes[bytes_per_element] = {0}; + uint8_t* bytes = stack_bytes; + bool is_negative; + ToBytes(input[i], &bytes, &is_negative); + + for (size_t i = 0; i < bytes_per_element; ++i) { + result.push_back(bytes[i]); + } + } + return result; + } +}; + +class Decimal32BuilderTest : public ::testing::TestWithParam<int>, + public DecimalTest<Decimal32> {}; + +class Decimal64BuilderTest : public ::testing::TestWithParam<int>, + public DecimalTest<Decimal64> {}; + +class Decimal128BuilderTest : public ::testing::TestWithParam<int>, + public DecimalTest<Decimal128> {}; + +TEST_P(Decimal32BuilderTest, NoNulls) { + int precision = GetParam(); + std::vector<Decimal32> draw = { + Decimal32(1), Decimal32(2), Decimal32(2389), Decimal32(4), Decimal32(-12348)}; + std::vector<uint8_t> valid_bytes = {true, true, true, true, true}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal64BuilderTest, NoNulls) { + int precision = GetParam(); + std::vector<Decimal64> draw = { + Decimal64(1), Decimal64(2), Decimal64(2389), Decimal64(4), Decimal64(-12348)}; + std::vector<uint8_t> valid_bytes = {true, true, true, true, true}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal128BuilderTest, NoNulls) { + int precision = GetParam(); + std::vector<Decimal128> draw = { + Decimal128(1), Decimal128(-2), Decimal128(2389), Decimal128(4), Decimal128(-12348)}; + std::vector<uint8_t> valid_bytes = {true, true, true, true, true}; + std::vector<uint8_t> sign_bitmap = {false, true, false, false, true}; + this->test(precision, draw, valid_bytes, sign_bitmap); +} + +TEST_P(Decimal32BuilderTest, WithNulls) { + int precision = GetParam(); + std::vector<Decimal32> draw = { + Decimal32(1), Decimal32(2), Decimal32(-1), Decimal32(4), Decimal32(-1)}; + std::vector<uint8_t> valid_bytes = {true, true, false, true, false}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal64BuilderTest, WithNulls) 
{ + int precision = GetParam(); + std::vector<Decimal64> draw = { + Decimal64(-1), Decimal64(2), Decimal64(-1), Decimal64(4), Decimal64(-1)}; + std::vector<uint8_t> valid_bytes = {true, true, false, true, false}; + this->test(precision, draw, valid_bytes); +} + +TEST_P(Decimal128BuilderTest, WithNulls) { + int precision = GetParam(); + std::vector<Decimal128> draw = {Decimal128(1), Decimal128(2), Decimal128(-1), + Decimal128(4), Decimal128(-1), Decimal128(1), Decimal128(2), + Decimal128("230342903942.234234"), Decimal128("-23049302932.235234")}; + std::vector<uint8_t> valid_bytes = { + true, true, false, true, false, true, true, true, true}; + std::vector<uint8_t> sign_bitmap = { + false, false, false, false, false, false, false, false, true}; + this->test(precision, draw, valid_bytes, sign_bitmap); +} + +INSTANTIATE_TEST_CASE_P(Decimal32BuilderTest, Decimal32BuilderTest, + ::testing::Range( + DecimalPrecision<int32_t>::minimum, DecimalPrecision<int32_t>::maximum)); +INSTANTIATE_TEST_CASE_P(Decimal64BuilderTest, Decimal64BuilderTest, + ::testing::Range( + DecimalPrecision<int64_t>::minimum, DecimalPrecision<int64_t>::maximum)); +INSTANTIATE_TEST_CASE_P(Decimal128BuilderTest, Decimal128BuilderTest, + ::testing::Range( + DecimalPrecision<int128_t>::minimum, DecimalPrecision<int128_t>::maximum)); + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index bd20654..4e73e71 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -27,6 +27,7 @@ #include "arrow/status.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/visitor.h" #include "arrow/visitor_inline.h" @@ -283,10 +284,8 @@ std::shared_ptr<Array> StringArray::Slice(int64_t offset, int64_t length) const 
FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length, const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count, int64_t offset) - : PrimitiveArray(type, length, data, null_bitmap, null_count, offset) { - DCHECK(type->type == Type::FIXED_SIZE_BINARY); - byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width(); -} + : PrimitiveArray(type, length, data, null_bitmap, null_count, offset), + byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()) {} std::shared_ptr<Array> FixedSizeBinaryArray::Slice(int64_t offset, int64_t length) const { ConformSliceParams(offset_, length_, &offset, &length); @@ -294,6 +293,48 @@ std::shared_ptr<Array> FixedSizeBinaryArray::Slice(int64_t offset, int64_t lengt type_, length, data_, null_bitmap_, kUnknownNullCount, offset); } +const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const { + return raw_data_ + (i + offset_) * byte_width_; +} + +// ---------------------------------------------------------------------- +// Decimal +DecimalArray::DecimalArray(const std::shared_ptr<DataType>& type, int64_t length, + const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap, + int64_t null_count, int64_t offset, const std::shared_ptr<Buffer>& sign_bitmap) + : FixedSizeBinaryArray(type, length, data, null_bitmap, null_count, offset), + sign_bitmap_(sign_bitmap), + sign_bitmap_data_(sign_bitmap != nullptr ? sign_bitmap->data() : nullptr) {} + +bool DecimalArray::IsNegative(int64_t i) const { + return sign_bitmap_data_ != nullptr ? 
BitUtil::GetBit(sign_bitmap_data_, i) : false; +} + +template <typename T> +ARROW_EXPORT Decimal<T> DecimalArray::Value(int64_t i) const { + Decimal<T> result; + FromBytes(GetValue(i), &result); + return result; +} + +template ARROW_EXPORT Decimal32 DecimalArray::Value(int64_t i) const; +template ARROW_EXPORT Decimal64 DecimalArray::Value(int64_t i) const; + +template <> +ARROW_EXPORT Decimal128 DecimalArray::Value(int64_t i) const { + Decimal128 result; + FromBytes(GetValue(i), IsNegative(i), &result); + return result; +} + +template ARROW_EXPORT Decimal128 DecimalArray::Value(int64_t i) const; + +std::shared_ptr<Array> DecimalArray::Slice(int64_t offset, int64_t length) const { + ConformSliceParams(offset_, length_, &offset, &length); + return std::make_shared<DecimalArray>( + type_, length, data_, null_bitmap_, kUnknownNullCount, offset, sign_bitmap_); +} + // ---------------------------------------------------------------------- // Struct http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 9f0e739..a4117fa 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -39,6 +39,9 @@ class MemoryPool; class MutableBuffer; class Status; +template <typename T> +struct Decimal; + /// Immutable data array with some logical type and some length. /// /// Any memory is owned by the respective Buffer instance (or its parents). 
@@ -356,9 +359,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0, int64_t offset = 0); - const uint8_t* GetValue(int64_t i) const { - return raw_data_ + (i + offset_) * byte_width_; - } + const uint8_t* GetValue(int64_t i) const; int32_t byte_width() const { return byte_width_; } @@ -371,6 +372,30 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray { }; // ---------------------------------------------------------------------- +// DecimalArray +class ARROW_EXPORT DecimalArray : public FixedSizeBinaryArray { + public: + using TypeClass = Type; + + DecimalArray(const std::shared_ptr<DataType>& type, int64_t length, + const std::shared_ptr<Buffer>& data, + const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0, + int64_t offset = 0, const std::shared_ptr<Buffer>& sign_bitmap = nullptr); + + bool IsNegative(int64_t i) const; + + template <typename T> + ARROW_EXPORT Decimal<T> Value(int64_t i) const; + + std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override; + + private: + /// Only needed for 128 bit Decimals + std::shared_ptr<Buffer> sign_bitmap_; + const uint8_t* sign_bitmap_data_; +}; + +// ---------------------------------------------------------------------- // Struct class ARROW_EXPORT StructArray : public Array { http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 40b81cf..a3677ef 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -27,6 +27,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" namespace arrow { @@ -324,6 +325,85 @@ Status BooleanBuilder::Append( } // 
---------------------------------------------------------------------- +// DecimalBuilder +DecimalBuilder::DecimalBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type) + : FixedSizeBinaryBuilder(pool, type), + sign_bitmap_(nullptr), + sign_bitmap_data_(nullptr) {} + +template <typename T> +ARROW_EXPORT Status DecimalBuilder::Append(const Decimal<T>& val) { + DCHECK_EQ(sign_bitmap_, nullptr) << "sign_bitmap_ is not null"; + DCHECK_EQ(sign_bitmap_data_, nullptr) << "sign_bitmap_data_ is not null"; + + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + return FixedSizeBinaryBuilder::Append(reinterpret_cast<const uint8_t*>(&val.value)); +} + +template ARROW_EXPORT Status DecimalBuilder::Append(const Decimal32& val); +template ARROW_EXPORT Status DecimalBuilder::Append(const Decimal64& val); + +template <> +ARROW_EXPORT Status DecimalBuilder::Append(const Decimal128& value) { + DCHECK_NE(sign_bitmap_, nullptr) << "sign_bitmap_ is null"; + DCHECK_NE(sign_bitmap_data_, nullptr) << "sign_bitmap_data_ is null"; + + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + uint8_t stack_bytes[16] = {0}; + uint8_t* bytes = stack_bytes; + bool is_negative; + ToBytes(value, &bytes, &is_negative); + RETURN_NOT_OK(FixedSizeBinaryBuilder::Append(bytes)); + + // TODO(phillipc): calculate the proper storage size here (do we have a function to do + // this)? 
+ // TODO(phillipc): Reserve number of elements + RETURN_NOT_OK(sign_bitmap_->Reserve(1)); + BitUtil::SetBitTo(sign_bitmap_data_, length_ - 1, is_negative); + return Status::OK(); +} + +template ARROW_EXPORT Status DecimalBuilder::Append(const Decimal128& val); + +Status DecimalBuilder::Init(int64_t capacity) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Init(capacity)); + if (byte_width_ == 16) { + AllocateResizableBuffer(pool_, null_bitmap_->size(), &sign_bitmap_); + sign_bitmap_data_ = sign_bitmap_->mutable_data(); + memset(sign_bitmap_data_, 0, static_cast<size_t>(sign_bitmap_->capacity())); + } + return Status::OK(); +} + +Status DecimalBuilder::Resize(int64_t capacity) { + int64_t old_bytes = null_bitmap_ != nullptr ? null_bitmap_->size() : 0; + if (sign_bitmap_ == nullptr) { return Init(capacity); } + RETURN_NOT_OK(FixedSizeBinaryBuilder::Resize(capacity)); + + if (byte_width_ == 16) { + RETURN_NOT_OK(sign_bitmap_->Resize(null_bitmap_->size())); + int64_t new_bytes = sign_bitmap_->size(); + sign_bitmap_data_ = sign_bitmap_->mutable_data(); + + // The buffer might be overpadded to deal with padding according to the spec + if (old_bytes < new_bytes) { + memset(sign_bitmap_data_ + old_bytes, 0, + static_cast<size_t>(sign_bitmap_->capacity() - old_bytes)); + } + } + return Status::OK(); +} + +Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) { + std::shared_ptr<Buffer> data = byte_builder_.Finish(); + + /// TODO(phillipc): not sure where to get the offset argument here + *out = std::make_shared<DecimalArray>( + type_, length_, data, null_bitmap_, null_count_, 0, sign_bitmap_); + return Status::OK(); +} + +// ---------------------------------------------------------------------- // ListBuilder ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> value_builder, @@ -440,10 +520,9 @@ Status StringBuilder::Finish(std::shared_ptr<Array>* out) { FixedSizeBinaryBuilder::FixedSizeBinaryBuilder( MemoryPool* pool, const std::shared_ptr<DataType>& 
type) - : ArrayBuilder(pool, type), byte_builder_(pool) { - DCHECK(type->type == Type::FIXED_SIZE_BINARY); - byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width(); -} + : ArrayBuilder(pool, type), + byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()), + byte_builder_(pool) {} Status FixedSizeBinaryBuilder::Append(const uint8_t* value) { RETURN_NOT_OK(Reserve(1)); @@ -543,6 +622,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, BUILDER_CASE(STRING, StringBuilder); BUILDER_CASE(BINARY, BinaryBuilder); BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); + BUILDER_CASE(DECIMAL, DecimalBuilder); case Type::LIST: { std::shared_ptr<ArrayBuilder> value_builder; std::shared_ptr<DataType> value_type = http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 60cdc4c..d42ab5b 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -37,6 +37,9 @@ namespace arrow { class Array; +template <typename T> +struct Decimal; + static constexpr int64_t kMinBuilderCapacity = 1 << 5; /// Base class for all data array builders. @@ -76,12 +79,12 @@ class ARROW_EXPORT ArrayBuilder { Status SetNotNull(int64_t length); /// Allocates initial capacity requirements for the builder. In most - /// cases subclasses should override and call there parent classes + /// cases subclasses should override and call their parent class's /// method as well. virtual Status Init(int64_t capacity); /// Resizes the null_bitmap array. In most - /// cases subclasses should override and call there parent classes + /// cases subclasses should override and call their parent class's /// method as well. 
virtual Status Resize(int64_t new_bits); @@ -275,9 +278,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { return Status::OK(); } - Status Append(uint8_t val) { - return Append(val != 0); - } + Status Append(uint8_t val) { return Append(val != 0); } /// Vector append /// @@ -415,6 +416,24 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { BufferBuilder byte_builder_; }; +class ARROW_EXPORT DecimalBuilder : public FixedSizeBinaryBuilder { + public: + explicit DecimalBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type); + + template <typename T> + ARROW_EXPORT Status Append(const Decimal<T>& val); + + Status Init(int64_t capacity) override; + Status Resize(int64_t capacity) override; + Status Finish(std::shared_ptr<Array>* out) override; + + private: + /// We only need these for 128 bit decimals, because boost stores the sign + /// separate from the underlying bytes. + std::shared_ptr<ResizableBuffer> sign_bitmap_; + uint8_t* sign_bitmap_data_; +}; + // ---------------------------------------------------------------------- // Struct http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/compare.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 7451439..2297e4b 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -29,6 +29,7 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -232,6 +233,41 @@ class RangeEqualsVisitor { return Status::OK(); } + Status Visit(const DecimalArray& left) { + const auto& right = static_cast<const DecimalArray&>(right_); + + int32_t width = left.byte_width(); + + const uint8_t* left_data = nullptr; + const uint8_t* right_data = nullptr; + + if (left.data()) { left_data = left.raw_data() + left.offset() * width; } + + if 
(right.data()) { right_data = right.raw_data() + right.offset() * width; } + + for (int64_t i = left_start_idx_, o_i = right_start_idx_; i < left_end_idx_; + ++i, ++o_i) { + if (left.IsNegative(i) != right.IsNegative(o_i)) { + result_ = false; + return Status::OK(); + } + + const bool is_null = left.IsNull(i); + if (is_null != right.IsNull(o_i)) { + result_ = false; + return Status::OK(); + } + if (is_null) continue; + + if (std::memcmp(left_data + width * i, right_data + width * o_i, width)) { + result_ = false; + return Status::OK(); + } + } + result_ = true; + return Status::OK(); + } + Status Visit(const NullArray& left) { UNUSED(left); result_ = true; @@ -244,10 +280,6 @@ class RangeEqualsVisitor { return CompareValues<T>(left); } - Status Visit(const DecimalArray& left) { - return Status::NotImplemented("Decimal type"); - } - Status Visit(const ListArray& left) { result_ = CompareLists(left); return Status::OK(); http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/ipc/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 57db033..c6880c5 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -27,7 +27,8 @@ set(ARROW_IPC_SHARED_LINK_LIBS set(ARROW_IPC_TEST_LINK_LIBS arrow_ipc_static arrow_io_static - arrow_static) + arrow_static + ${BOOST_REGEX_LIBRARY}) set(ARROW_IPC_SRCS feather.cc @@ -161,7 +162,8 @@ if(MSVC) arrow_io_static arrow_static ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_SYSTEM_LIBRARY}) + ${BOOST_SYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY}) else() set(UTIL_LINK_LIBS arrow_ipc_static @@ -169,6 +171,7 @@ else() arrow_static ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY} dl) endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git 
a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index c69d976..604527f 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -37,7 +37,8 @@ set(ARROW_PYTHON_MIN_TEST_LIBS arrow_python_static arrow_ipc_static arrow_io_static - arrow_static) + arrow_static + ${BOOST_REGEX_LIBRARY}) if(ARROW_BUILD_TESTS) ADD_THIRDPARTY_LIB(python http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/builtin_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index 25b32ee..189ecee 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -17,12 +17,16 @@ #include <Python.h> #include <datetime.h> + +#include <algorithm> #include <sstream> +#include <string> #include "arrow/python/builtin_convert.h" #include "arrow/api.h" #include "arrow/status.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/python/helpers.h" @@ -109,7 +113,6 @@ class ScalarVisitor { int64_t float_count_; int64_t binary_count_; int64_t unicode_count_; - // Place to accumulate errors // std::vector<Status> errors_; }; @@ -394,8 +397,7 @@ class BytesConverter : public TypedConverter<BinaryBuilder> { } else if (PyBytes_Check(item)) { bytes_obj = item; } else { - return Status::Invalid( - "Value that cannot be converted to bytes was encountered"); + return Status::Invalid("Value that cannot be converted to bytes was encountered"); } // No error checking length = PyBytes_GET_SIZE(bytes_obj); @@ -429,8 +431,7 @@ class FixedWidthBytesConverter : public TypedConverter<FixedSizeBinaryBuilder> { } else if (PyBytes_Check(item)) { bytes_obj = item; } else { - return Status::Invalid( - "Value that cannot be converted to bytes was encountered"); + return Status::Invalid("Value that cannot be converted to bytes was encountered"); } // No 
error checking RETURN_NOT_OK(CheckPythonBytesAreFixedLength(bytes_obj, expected_length)); @@ -495,6 +496,54 @@ class ListConverter : public TypedConverter<ListBuilder> { std::shared_ptr<SeqConverter> value_converter_; }; +#define DECIMAL_CONVERT_CASE(bit_width, item, builder) \ + case bit_width: { \ + arrow::Decimal##bit_width out; \ + RETURN_NOT_OK(PythonDecimalToArrowDecimal((item), &out)); \ + RETURN_NOT_OK((builder)->Append(out)); \ + break; \ + } + +class DecimalConverter : public TypedConverter<arrow::DecimalBuilder> { + public: + Status AppendData(PyObject* seq) override { + /// Ensure we've allocated enough space + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + + /// Can the compiler figure out that the case statement below isn't necessary + /// once we're running? + const int bit_width = + std::dynamic_pointer_cast<arrow::DecimalType>(typed_builder_->type()) + ->bit_width(); + + OwnedRef ref; + PyObject* item = nullptr; + for (int64_t i = 0; i < size; ++i) { + ref.reset(PySequence_GetItem(seq, i)); + item = ref.obj(); + + /// TODO(phillipc): Check for nan? 
+ if (item != Py_None) { + switch (bit_width) { + DECIMAL_CONVERT_CASE(32, item, typed_builder_) + DECIMAL_CONVERT_CASE(64, item, typed_builder_) + DECIMAL_CONVERT_CASE(128, item, typed_builder_) + default: + break; + } + RETURN_IF_PYERROR(); + } else { + RETURN_NOT_OK(typed_builder_->AppendNull()); + } + } + + return Status::OK(); + } +}; + +#undef DECIMAL_CONVERT_CASE + // Dynamic constructor for sequence converters std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type) { switch (type->type) { @@ -516,6 +565,9 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type return std::make_shared<UTF8Converter>(); case Type::LIST: return std::make_shared<ListConverter>(); + case Type::DECIMAL: { + return std::make_shared<DecimalConverter>(); + } case Type::STRUCT: default: return nullptr; http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/builtin_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h index 00ff0fd..3c2e350 100644 --- a/cpp/src/arrow/python/builtin_convert.h +++ b/cpp/src/arrow/python/builtin_convert.h @@ -25,7 +25,7 @@ #include <memory> -#include <arrow/type.h> +#include "arrow/type.h" #include "arrow/util/visibility.h" http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index 32bfa78..a6806ab 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -57,12 +57,13 @@ class OwnedRef { } void reset(PyObject* obj) { - if (obj_ != nullptr) { Py_XDECREF(obj_); } + /// TODO(phillipc): Should we acquire the GIL here? 
It definitely needs to be + /// acquired, + /// but callers have probably already acquired it + Py_XDECREF(obj_); obj_ = obj; } - void release() { obj_ = nullptr; } - PyObject* obj() const { return obj_; } private: @@ -72,6 +73,7 @@ class OwnedRef { struct PyObjectStringify { OwnedRef tmp_obj; const char* bytes; + Py_ssize_t size; explicit PyObjectStringify(PyObject* obj) { PyObject* bytes_obj; @@ -82,6 +84,7 @@ struct PyObjectStringify { bytes_obj = obj; } bytes = PyBytes_AsString(bytes_obj); + size = PyBytes_GET_SIZE(bytes_obj); } }; http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/helpers.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index be5f412..ffba7bb 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -16,6 +16,8 @@ // under the License. #include "arrow/python/helpers.h" +#include "arrow/python/common.h" +#include "arrow/util/decimal.h" #include <arrow/api.h> @@ -52,5 +54,82 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) { } } +Status ImportModule(const std::string& module_name, OwnedRef* ref) { + PyAcquireGIL lock; + PyObject* module = PyImport_ImportModule(module_name.c_str()); + RETURN_IF_PYERROR(); + ref->reset(module); + return Status::OK(); +} + +Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref) { + /// Assumes that ImportModule was called first + DCHECK_NE(module.obj(), nullptr) << "Cannot import from nullptr Python module"; + + PyAcquireGIL lock; + PyObject* attr = PyObject_GetAttrString(module.obj(), name.c_str()); + RETURN_IF_PYERROR(); + ref->reset(attr); + return Status::OK(); +} + +template <typename T> +Status PythonDecimalToArrowDecimal(PyObject* python_decimal, Decimal<T>* arrow_decimal) { + // Call Python's str(decimal_object) + OwnedRef str_obj(PyObject_Str(python_decimal)); + RETURN_IF_PYERROR(); + + 
PyObjectStringify str(str_obj.obj()); + RETURN_IF_PYERROR(); + + const char* bytes = str.bytes; + DCHECK_NE(bytes, nullptr); + + Py_ssize_t size = str.size; + + std::string c_string(bytes, size); + return FromString(c_string, arrow_decimal); +} + +template Status PythonDecimalToArrowDecimal( + PyObject* python_decimal, Decimal32* arrow_decimal); +template Status PythonDecimalToArrowDecimal( + PyObject* python_decimal, Decimal64* arrow_decimal); +template Status PythonDecimalToArrowDecimal( + PyObject* python_decimal, Decimal128* arrow_decimal); + +Status InferDecimalPrecisionAndScale( + PyObject* python_decimal, int* precision, int* scale) { + // Call Python's str(decimal_object) + OwnedRef str_obj(PyObject_Str(python_decimal)); + RETURN_IF_PYERROR(); + PyObjectStringify str(str_obj.obj()); + + const char* bytes = str.bytes; + DCHECK_NE(bytes, nullptr); + + auto size = str.size; + + std::string c_string(bytes, size); + return FromString(c_string, static_cast<Decimal32*>(nullptr), precision, scale); +} + +Status DecimalFromString( + PyObject* decimal_constructor, const std::string& decimal_string, PyObject** out) { + DCHECK_NE(decimal_constructor, nullptr); + DCHECK_NE(out, nullptr); + + auto string_size = decimal_string.size(); + DCHECK_GT(string_size, 0); + + auto string_bytes = decimal_string.c_str(); + DCHECK_NE(string_bytes, nullptr); + + *out = PyObject_CallFunction( + decimal_constructor, const_cast<char*>("s#"), string_bytes, string_size); + RETURN_IF_PYERROR(); + return Status::OK(); +} + } // namespace py } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/helpers.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 611e814..a19b25f 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -18,16 +18,38 @@ #ifndef PYARROW_HELPERS_H #define PYARROW_HELPERS_H +#include <Python.h> + 
#include <memory> +#include <string> +#include <utility> #include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { + +template <typename T> +struct Decimal; + namespace py { -ARROW_EXPORT -std::shared_ptr<DataType> GetPrimitiveType(Type::type type); +class OwnedRef; + +ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type); + +Status ImportModule(const std::string& module_name, OwnedRef* ref); +Status ImportFromModule( + const OwnedRef& module, const std::string& module_name, OwnedRef* ref); + +template <typename T> +Status PythonDecimalToArrowDecimal(PyObject* python_decimal, Decimal<T>* arrow_decimal); + +Status InferDecimalPrecisionAndScale( + PyObject* python_decimal, int* precision = nullptr, int* scale = nullptr); + +Status DecimalFromString( + PyObject* decimal_constructor, const std::string& decimal_string, PyObject** out); } // namespace py } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 48d3489..f6e627e 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -41,12 +41,14 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" #include "arrow/python/config.h" +#include "arrow/python/helpers.h" #include "arrow/python/numpy-internal.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/type_traits.h" @@ -375,6 +377,7 @@ class PandasConverter : public TypeVisitor { Status ConvertDates(); Status ConvertLists(const std::shared_ptr<DataType>& type); Status ConvertObjects(); + Status ConvertDecimals(); protected: MemoryPool* 
pool_; @@ -468,15 +471,14 @@ Status InvalidConversion(PyObject* obj, const std::string& expected_type_name) { RETURN_IF_PYERROR(); DCHECK_NE(type_name.obj(), nullptr); - OwnedRef bytes_obj(PyUnicode_AsUTF8String(type_name.obj())); + PyObjectStringify bytestring(type_name.obj()); RETURN_IF_PYERROR(); - DCHECK_NE(bytes_obj.obj(), nullptr); - - Py_ssize_t size = PyBytes_GET_SIZE(bytes_obj.obj()); - const char* bytes = PyBytes_AS_STRING(bytes_obj.obj()); + const char* bytes = bytestring.bytes; DCHECK_NE(bytes, nullptr) << "bytes from type(...).__name__ were null"; + Py_ssize_t size = bytestring.size; + std::string cpp_type_name(bytes, size); std::stringstream ss; @@ -517,6 +519,59 @@ Status PandasConverter::ConvertDates() { return date_builder.Finish(&out_); } +#define CONVERT_DECIMAL_CASE(bit_width, builder, object) \ + case bit_width: { \ + Decimal##bit_width d; \ + RETURN_NOT_OK(PythonDecimalToArrowDecimal((object), &d)); \ + RETURN_NOT_OK((builder).Append(d)); \ + break; \ + } + +Status PandasConverter::ConvertDecimals() { + PyAcquireGIL lock; + + // Import the decimal module and Decimal class + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(ImportModule("decimal", &decimal)); + RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + + PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + PyObject* object = objects[0]; + + int precision; + int scale; + + RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); + + type_ = std::make_shared<DecimalType>(precision, scale); + + const int bit_width = std::dynamic_pointer_cast<DecimalType>(type_)->bit_width(); + DecimalBuilder decimal_builder(pool_, type_); + + RETURN_NOT_OK(decimal_builder.Resize(length_)); + + for (int64_t i = 0; i < length_; ++i) { + object = objects[i]; + if (PyObject_IsInstance(object, Decimal.obj())) { + switch (bit_width) { + CONVERT_DECIMAL_CASE(32, decimal_builder, object) + CONVERT_DECIMAL_CASE(64, decimal_builder, object) + 
CONVERT_DECIMAL_CASE(128, decimal_builder, object) + default: + break; + } + } else if (PyObject_is_null(object)) { + decimal_builder.AppendNull(); + } else { + return InvalidConversion(object, "decimal.Decimal"); + } + } + return decimal_builder.Finish(&out_); +} + +#undef CONVERT_DECIMAL_CASE + Status PandasConverter::ConvertObjectStrings() { PyAcquireGIL lock; @@ -554,6 +609,90 @@ Status PandasConverter::ConvertObjectFixedWidthBytes( return Status::OK(); } +template <typename T> +Status validate_precision(int precision) { + constexpr static const int maximum_precision = DecimalPrecision<T>::maximum; + if (!(precision > 0 && precision <= maximum_precision)) { + std::stringstream ss; + ss << "Invalid precision: " << precision << ". Minimum is 1, maximum is " + << maximum_precision; + return Status::Invalid(ss.str()); + } + return Status::OK(); +} + +template <typename T> +Status RawDecimalToString( + const uint8_t* bytes, int precision, int scale, std::string* result) { + DCHECK_NE(bytes, nullptr); + DCHECK_NE(result, nullptr); + RETURN_NOT_OK(validate_precision<T>(precision)); + Decimal<T> decimal; + FromBytes(bytes, &decimal); + *result = ToString(decimal, precision, scale); + return Status::OK(); +} + +template Status RawDecimalToString<int32_t>( + const uint8_t*, int, int, std::string* result); +template Status RawDecimalToString<int64_t>( + const uint8_t*, int, int, std::string* result); + +Status RawDecimalToString(const uint8_t* bytes, int precision, int scale, + bool is_negative, std::string* result) { + DCHECK_NE(bytes, nullptr); + DCHECK_NE(result, nullptr); + RETURN_NOT_OK(validate_precision<int128_t>(precision)); + Decimal128 decimal; + FromBytes(bytes, is_negative, &decimal); + *result = ToString(decimal, precision, scale); + return Status::OK(); +} + +static Status ConvertDecimals(const ChunkedArray& data, PyObject** out_values) { + PyAcquireGIL lock; + OwnedRef decimal_ref; + OwnedRef Decimal_ref; + RETURN_NOT_OK(ImportModule("decimal", 
&decimal_ref)); + RETURN_NOT_OK(ImportFromModule(decimal_ref, "Decimal", &Decimal_ref)); + PyObject* Decimal = Decimal_ref.obj(); + + for (int c = 0; c < data.num_chunks(); c++) { + auto* arr(static_cast<arrow::DecimalArray*>(data.chunk(c).get())); + auto type(std::dynamic_pointer_cast<arrow::DecimalType>(arr->type())); + const int precision = type->precision; + const int scale = type->scale; + const int bit_width = type->bit_width(); + + for (int64_t i = 0; i < arr->length(); ++i) { + if (arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else { + const uint8_t* raw_value = arr->GetValue(i); + std::string s; + switch (bit_width) { + case 32: + RETURN_NOT_OK(RawDecimalToString<int32_t>(raw_value, precision, scale, &s)); + break; + case 64: + RETURN_NOT_OK(RawDecimalToString<int64_t>(raw_value, precision, scale, &s)); + break; + case 128: + RETURN_NOT_OK( + RawDecimalToString(raw_value, precision, scale, arr->IsNegative(i), &s)); + break; + default: + break; + } + RETURN_NOT_OK(DecimalFromString(Decimal, s, out_values++)); + } + } + } + + return Status::OK(); +} + Status PandasConverter::ConvertBooleans() { PyAcquireGIL lock; @@ -598,6 +737,7 @@ Status PandasConverter::ConvertObjects() { // // * Strings // * Booleans with nulls + // * decimal.Decimals // * Mixed type (not supported at the moment by arrow format) // // Additionally, nulls may be encoded either as np.nan or None. 
So we have to @@ -613,6 +753,7 @@ Status PandasConverter::ConvertObjects() { PyDateTime_IMPORT; } + // This means we received an explicit type from the user if (type_) { switch (type_->type) { case Type::STRING: @@ -627,10 +768,17 @@ Status PandasConverter::ConvertObjects() { const auto& list_field = static_cast<const ListType&>(*type_); return ConvertLists(list_field.value_field()->type); } + case Type::DECIMAL: + return ConvertDecimals(); default: return Status::TypeError("No known conversion to Arrow type"); } } else { + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(ImportModule("decimal", &decimal)); + RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + for (int64_t i = 0; i < length_; ++i) { if (PyObject_is_null(objects[i])) { continue; @@ -640,6 +788,8 @@ Status PandasConverter::ConvertObjects() { return ConvertBooleans(); } else if (PyDate_CheckExact(objects[i])) { return ConvertDates(); + } else if (PyObject_IsInstance(const_cast<PyObject*>(objects[i]), Decimal.obj())) { + return ConvertDecimals(); } else { return InvalidConversion( const_cast<PyObject*>(objects[i]), "string, bool, or date"); @@ -847,6 +997,7 @@ class PandasBlock { INT64, FLOAT, DOUBLE, + DECIMAL, BOOL, DATETIME, DATETIME_WITH_TZ, @@ -1193,6 +1344,8 @@ class ObjectBlock : public PandasBlock { RETURN_NOT_OK(ConvertBinaryLike<StringArray>(data, out_buffer)); } else if (type == Type::FIXED_SIZE_BINARY) { RETURN_NOT_OK(ConvertFixedSizeBinary(data, out_buffer)); + } else if (type == Type::DECIMAL) { + RETURN_NOT_OK(ConvertDecimals(data, out_buffer)); } else if (type == Type::LIST) { auto list_type = std::static_pointer_cast<ListType>(col->type()); switch (list_type->value_type()->type) { @@ -1519,6 +1672,7 @@ Status MakeBlock(PandasBlock::type type, int64_t num_rows, int num_columns, BLOCK_CASE(DOUBLE, Float64Block); BLOCK_CASE(BOOL, BoolBlock); BLOCK_CASE(DATETIME, DatetimeBlock); + BLOCK_CASE(DECIMAL, ObjectBlock); default: return Status::NotImplemented("Unsupported block 
type"); } @@ -1649,6 +1803,9 @@ class DataFrameBlockCreator { case Type::DICTIONARY: output_type = PandasBlock::CATEGORICAL; break; + case Type::DECIMAL: + output_type = PandasBlock::DECIMAL; + break; default: return Status::NotImplemented(col->type()->ToString()); } @@ -1892,6 +2049,7 @@ class ArrowDeserializer { CONVERT_CASE(TIMESTAMP); CONVERT_CASE(DICTIONARY); CONVERT_CASE(LIST); + CONVERT_CASE(DECIMAL); default: { std::stringstream ss; ss << "Arrow type reading not implemented for " << col_->type()->ToString(); @@ -1999,6 +2157,13 @@ class ArrowDeserializer { return ConvertFixedSizeBinary(data_, out_values); } + template <int TYPE> + inline typename std::enable_if<TYPE == Type::DECIMAL, Status>::type ConvertValues() { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); + return ConvertDecimals(data_, out_values); + } + #define CONVERTVALUES_LISTSLIKE_CASE(ArrowType, ArrowEnum) \ case Type::ArrowEnum: \ return ConvertListsLike<ArrowType>(col_, out_values); @@ -2021,6 +2186,7 @@ class ArrowDeserializer { CONVERTVALUES_LISTSLIKE_CASE(FloatType, FLOAT) CONVERTVALUES_LISTSLIKE_CASE(DoubleType, DOUBLE) CONVERTVALUES_LISTSLIKE_CASE(StringType, STRING) + CONVERTVALUES_LISTSLIKE_CASE(DecimalType, DECIMAL) default: { std::stringstream ss; ss << "Not implemented type for lists: " << list_type->value_type()->ToString(); http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/python/python-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index f269ebf..b63d2ff 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -28,8 +28,11 @@ #include "arrow/python/builtin_convert.h" #include "arrow/python/common.h" +#include "arrow/python/helpers.h" #include "arrow/python/pandas_convert.h" +#include "arrow/util/decimal.h" + namespace arrow { namespace 
py { @@ -37,6 +40,36 @@ TEST(PyBuffer, InvalidInputObject) { PyBuffer buffer(Py_None); } +TEST(DecimalTest, TestPythonDecimalToArrowDecimal128) { + PyAcquireGIL lock; + + OwnedRef decimal; + OwnedRef Decimal; + ASSERT_OK(ImportModule("decimal", &decimal)); + ASSERT_NE(decimal.obj(), nullptr); + + ASSERT_OK(ImportFromModule(decimal, "Decimal", &Decimal)); + ASSERT_NE(Decimal.obj(), nullptr); + + std::string decimal_string("-39402950693754869342983"); + const char* format = "s#"; + auto c_string = decimal_string.c_str(); + ASSERT_NE(c_string, nullptr); + + auto c_string_size = decimal_string.size(); + ASSERT_GT(c_string_size, 0); + OwnedRef pydecimal(PyObject_CallFunction( + Decimal.obj(), const_cast<char*>(format), c_string, c_string_size)); + ASSERT_NE(pydecimal.obj(), nullptr); + ASSERT_EQ(PyErr_Occurred(), nullptr); + + Decimal128 arrow_decimal; + int128_t boost_decimal(decimal_string); + PyObject* obj = pydecimal.obj(); + ASSERT_OK(PythonDecimalToArrowDecimal(obj, &arrow_decimal)); + ASSERT_EQ(boost_decimal, arrow_decimal.value); +} + TEST(PandasConversionTest, TestObjectBlockWriteFails) { StringBuilder builder(default_memory_pool()); const char value[] = {'\xf1', '\0'}; http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index abbb626..df4590f 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -17,6 +17,7 @@ #include "arrow/type.h" +#include <climits> #include <sstream> #include <string> @@ -91,7 +92,7 @@ std::string BinaryType::ToString() const { } int FixedSizeBinaryType::bit_width() const { - return 8 * byte_width(); + return CHAR_BIT * byte_width(); } std::string FixedSizeBinaryType::ToString() const { @@ -380,6 +381,10 @@ std::shared_ptr<Field> field( return std::make_shared<Field>(name, type, nullable); } +std::shared_ptr<DataType> decimal(int precision, int scale) { + return 
std::make_shared<DecimalType>(precision, scale); +} + static const BufferDescr kValidityBuffer(BufferType::VALIDITY, 1); static const BufferDescr kOffsetBuffer(BufferType::OFFSET, 32); static const BufferDescr kTypeBuffer(BufferType::TYPE, 32); @@ -402,7 +407,11 @@ std::vector<BufferDescr> BinaryType::GetBufferLayout() const { } std::vector<BufferDescr> FixedSizeBinaryType::GetBufferLayout() const { - return {kValidityBuffer, BufferDescr(BufferType::DATA, byte_width_ * 8)}; + return {kValidityBuffer, BufferDescr(BufferType::DATA, bit_width())}; +} + +std::vector<BufferDescr> DecimalType::GetBufferLayout() const { + return {kValidityBuffer, kBooleanBuffer, BufferDescr(BufferType::DATA, bit_width())}; } std::vector<BufferDescr> ListType::GetBufferLayout() const { @@ -427,9 +436,4 @@ std::string DecimalType::ToString() const { return s.str(); } -std::vector<BufferDescr> DecimalType::GetBufferLayout() const { - // TODO(wesm) - return {}; -} - } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/type.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 36ab9d8..3a35f56 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -360,6 +360,8 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType { explicit FixedSizeBinaryType(int32_t byte_width) : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {} + explicit FixedSizeBinaryType(int32_t byte_width, Type::type type_id) + : FixedWidthType(type_id), byte_width_(byte_width) {} Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; @@ -399,19 +401,31 @@ struct ARROW_EXPORT StructType : public NestedType { std::vector<BufferDescr> GetBufferLayout() const override; }; -struct ARROW_EXPORT DecimalType : public DataType { +static inline int decimal_byte_width(int precision) { + if (precision >= 0 && precision < 10) { + return 4; + } else if 
(precision >= 10 && precision < 19) { + return 8; + } else { + // TODO(phillipc): validate that we can't construct > 128 bit types + return 16; + } +} + +struct ARROW_EXPORT DecimalType : public FixedSizeBinaryType { static constexpr Type::type type_id = Type::DECIMAL; explicit DecimalType(int precision_, int scale_) - : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} - int precision; - int scale; - + : FixedSizeBinaryType(decimal_byte_width(precision_), Type::DECIMAL), + precision(precision_), + scale(scale_) {} + std::vector<BufferDescr> GetBufferLayout() const override; Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; static std::string name() { return "decimal"; } - std::vector<BufferDescr> GetBufferLayout() const override; + int precision; + int scale; }; enum class UnionMode : char { SPARSE, DENSE }; http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/type_fwd.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 2e27ce9..acf12c3 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -69,6 +69,7 @@ class StructBuilder; struct DecimalType; class DecimalArray; +class DecimalBuilder; struct UnionType; class UnionArray; @@ -146,6 +147,7 @@ std::shared_ptr<DataType> ARROW_EXPORT binary(); std::shared_ptr<DataType> ARROW_EXPORT date32(); std::shared_ptr<DataType> ARROW_EXPORT date64(); +std::shared_ptr<DataType> ARROW_EXPORT decimal(int precision, int scale); } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/type_traits.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 353b638..3e8ea23 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -229,6 +229,13 @@ struct TypeTraits<DoubleType> { }; template <> +struct 
TypeTraits<DecimalType> { + using ArrayType = DecimalArray; + using BuilderType = DecimalBuilder; + constexpr static bool is_parameter_free = false; +}; + +template <> struct TypeTraits<BooleanType> { using ArrayType = BooleanArray; using BuilderType = BooleanBuilder; @@ -289,12 +296,6 @@ struct TypeTraits<DictionaryType> { constexpr static bool is_parameter_free = false; }; -template <> -struct TypeTraits<DecimalType> { - // using ArrayType = DecimalArray; - constexpr static bool is_parameter_free = false; -}; - // Not all type classes have a c_type template <typename T> struct as_void { http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/util/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index c1b6877..054f110 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -22,6 +22,7 @@ # Headers: top level install(FILES bit-util.h + decimal.h logging.h macros.h random.h @@ -70,3 +71,4 @@ endif() ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(stl-util-test) +ADD_ARROW_TEST(decimal-test) http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/util/bit-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 42afd07..90a1c3e 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -149,7 +149,6 @@ int64_t ARROW_EXPORT CountSetBits( bool ARROW_EXPORT BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, int64_t right_offset, int64_t bit_length); - } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/util/decimal-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal-test.cc 
b/cpp/src/arrow/util/decimal-test.cc new file mode 100644 index 0000000..1e22643 --- /dev/null +++ b/cpp/src/arrow/util/decimal-test.cc @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// + +#include "arrow/util/decimal.h" + +#include "gtest/gtest.h" + +#include "arrow/test-util.h" + +namespace arrow { + +template <typename T> +class DecimalTest : public ::testing::Test { + public: + DecimalTest() : string_value("234.23445") { integer_value.value = 23423445; } + Decimal<T> integer_value; + std::string string_value; +}; + +typedef ::testing::Types<int32_t, int64_t, int128_t> DecimalTypes; +TYPED_TEST_CASE(DecimalTest, DecimalTypes); + +TYPED_TEST(DecimalTest, TestToString) { + Decimal<TypeParam> decimal(this->integer_value); + int precision = 8; + int scale = 5; + std::string result = ToString(decimal, precision, scale); + ASSERT_EQ(result, this->string_value); +} + +TYPED_TEST(DecimalTest, TestFromString) { + Decimal<TypeParam> expected(this->integer_value); + Decimal<TypeParam> result; + int precision, scale; + ASSERT_OK(FromString(this->string_value, &result, &precision, &scale)); + ASSERT_EQ(result.value, expected.value); + ASSERT_EQ(precision, 8); + ASSERT_EQ(scale, 5); +} + +TEST(DecimalTest, 
TestStringToInt32) { + int32_t value = 0; + StringToInteger("123", "456", 1, &value); + ASSERT_EQ(value, 123456); +} + +TEST(DecimalTest, TestStringToInt64) { + int64_t value = 0; + StringToInteger("123456789", "456", -1, &value); + ASSERT_EQ(value, -123456789456); +} + +TEST(DecimalTest, TestStringToInt128) { + int128_t value = 0; + StringToInteger("123456789", "456789123", 1, &value); + ASSERT_EQ(value, 123456789456789123); +} + +TEST(DecimalTest, TestFromString128) { + static const std::string string_value("-23049223942343532412"); + Decimal<int128_t> result(string_value); + int128_t expected = -230492239423435324; + ASSERT_EQ(result.value, expected * 100 - 12); + + // Sanity check that our number is actually using more than 64 bits + ASSERT_NE(result.value, static_cast<int64_t>(result.value)); +} + +TEST(DecimalTest, TestFromDecimalString128) { + static const std::string string_value("-23049223942343.532412"); + Decimal<int128_t> result(string_value); + int128_t expected = -230492239423435324; + ASSERT_EQ(result.value, expected * 100 - 12); + + // Sanity check that our number is actually using more than 64 bits + ASSERT_NE(result.value, static_cast<int64_t>(result.value)); +} + +TEST(DecimalTest, TestDecimal32Precision) { + auto min_precision = DecimalPrecision<int32_t>::minimum; + auto max_precision = DecimalPrecision<int32_t>::maximum; + ASSERT_EQ(min_precision, 1); + ASSERT_EQ(max_precision, 9); +} + +TEST(DecimalTest, TestDecimal64Precision) { + auto min_precision = DecimalPrecision<int64_t>::minimum; + auto max_precision = DecimalPrecision<int64_t>::maximum; + ASSERT_EQ(min_precision, 10); + ASSERT_EQ(max_precision, 18); +} + +TEST(DecimalTest, TestDecimal128Precision) { + auto min_precision = DecimalPrecision<int128_t>::minimum; + auto max_precision = DecimalPrecision<int128_t>::maximum; + ASSERT_EQ(min_precision, 19); + ASSERT_EQ(max_precision, 38); +} + +TEST(DecimalTest, TestDecimal32SignedRoundTrip) { + Decimal32 expected(std::string("-3402692")); + + 
uint8_t stack_bytes[4] = {0}; + uint8_t* bytes = stack_bytes; + ToBytes(expected, &bytes); + + Decimal32 result; + FromBytes(bytes, &result); + ASSERT_EQ(expected.value, result.value); +} + +TEST(DecimalTest, TestDecimal64SignedRoundTrip) { + Decimal64 expected(std::string("-34034293045.921")); + + uint8_t stack_bytes[8] = {0}; + uint8_t* bytes = stack_bytes; + ToBytes(expected, &bytes); + + Decimal64 result; + FromBytes(bytes, &result); + + ASSERT_EQ(expected.value, result.value); +} + +TEST(DecimalTest, TestDecimal128StringAndBytesRoundTrip) { + std::string string_value("-340282366920938463463374607431.711455"); + Decimal128 expected(string_value); + + std::string expected_string_value("-340282366920938463463374607431711455"); + int128_t expected_underlying_value(expected_string_value); + + ASSERT_EQ(expected.value, expected_underlying_value); + + uint8_t stack_bytes[16] = {0}; + uint8_t* bytes = stack_bytes; + bool is_negative; + ToBytes(expected, &bytes, &is_negative); + + ASSERT_TRUE(is_negative); + + Decimal128 result; + FromBytes(bytes, is_negative, &result); + + ASSERT_EQ(expected.value, result.value); +} +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/util/decimal.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc new file mode 100644 index 0000000..1ac3471 --- /dev/null +++ b/cpp/src/arrow/util/decimal.cc @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/decimal.h"
+
+#include <climits>
+#include <cstring>
+#include <sstream>
+#include <string>
+#include <type_traits>
+
+#include <boost/regex.hpp>
+
+namespace arrow {
+
+// Capture groups: 1 = optional sign, 2 = all digits before the decimal point,
+// 3 = leading zeros of the whole part, 4 = remaining whole digits,
+// 5 = "." followed by the fractional digits, 6 = the fractional digits alone.
+static const boost::regex DECIMAL_PATTERN("(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?");
+
+/// Parse a decimal string such as "-123.456" into its unscaled integer value.
+///
+/// \param s the text to parse
+/// \param out if non-null, receives the unscaled value (e.g. -123456)
+/// \param precision if non-null, receives the number of whole digits (leading
+///   zeros excluded) plus the number of fractional digits
+/// \param scale if non-null, receives the number of fractional digits
+/// \return Status::Invalid if s is empty, does not match DECIMAL_PATTERN, or
+///   contains no digit at all; Status::OK otherwise
+template <typename T>
+ARROW_EXPORT Status FromString(
+    const std::string& s, Decimal<T>* out, int* precision, int* scale) {
+  if (s.empty()) {
+    return Status::Invalid("Empty string cannot be converted to decimal");
+  }
+  boost::smatch match;
+  // Every group of DECIMAL_PATTERN is optional, so a bare sign ("+" or "-")
+  // matches without containing a single digit; reject that case as well.
+  if (!boost::regex_match(s, match, DECIMAL_PATTERN) ||
+      (match[2].str().empty() && match[6].str().empty())) {
+    std::stringstream ss;
+    ss << "String " << s << " is not a valid decimal string";
+    return Status::Invalid(ss.str());
+  }
+  const int8_t sign = match[1].str() == "-" ? -1 : 1;
+  std::string whole_part = match[4].str();
+  std::string fractional_part = match[6].str();
+  if (scale != nullptr) { *scale = static_cast<int>(fractional_part.size()); }
+  if (precision != nullptr) {
+    *precision =
+        static_cast<int>(whole_part.size()) + static_cast<int>(fractional_part.size());
+  }
+  if (out != nullptr) {
+    if (whole_part.empty() && fractional_part.empty()) {
+      // The input consisted solely of zeros ("0", "-000"): the leading-zero
+      // group swallowed every digit. StringToInteger requires at least one
+      // digit, so produce the value directly.
+      out->value = 0;
+    } else {
+      StringToInteger(whole_part, fractional_part, sign, &out->value);
+    }
+  }
+  return Status::OK();
+}
+
+template ARROW_EXPORT Status FromString(
+    const std::string& s, Decimal32* out, int* precision, int* scale);
+template ARROW_EXPORT Status FromString(
+    const std::string& s, Decimal64* out, int* precision, int* scale);
+template ARROW_EXPORT Status FromString(
+    const std::string& s, Decimal128* out, int* precision, int* scale);
+
+// Returns 10^exponent using integer arithmetic only, so the scaling factor
+// never depends on how pow() rounds and how the result truncates.
+template <typename T>
+static T PowerOfTen(size_t exponent) {
+  T result = 1;
+  for (size_t i = 0; i < exponent; ++i) {
+    result *= 10;
+  }
+  return result;
+}
+
+/// Combine the digit strings before and after the decimal point into a single
+/// signed 32-bit unscaled value: ("123", "456", 1) yields 123456.
+/// At least one of whole/fractional must be non-empty; sign must be -1 or 1.
+void StringToInteger(
+    const std::string& whole, const std::string& fractional, int8_t sign, int32_t* out) {
+  DCHECK(sign == -1 || sign == 1);
+  DCHECK_NE(out, nullptr);
+  DCHECK(!whole.empty() || !fractional.empty());
+  // Accumulate in a local that starts at zero so the result never depends on
+  // whatever *out held on entry (it used to leak in when whole was empty).
+  int32_t result = 0;
+  if (!whole.empty()) {
+    result = std::stoi(whole, nullptr, 10) * PowerOfTen<int32_t>(fractional.size());
+  }
+  if (!fractional.empty()) { result += std::stoi(fractional, nullptr, 10); }
+  *out = result * sign;
+}
+
+/// 64-bit counterpart of the overload above.
+void StringToInteger(
+    const std::string& whole, const std::string& fractional, int8_t sign, int64_t* out) {
+  DCHECK(sign == -1 || sign == 1);
+  DCHECK_NE(out, nullptr);
+  DCHECK(!whole.empty() || !fractional.empty());
+  // See the 32-bit overload: start from zero rather than from *out.
+  int64_t result = 0;
+  if (!whole.empty()) {
+    result = static_cast<int64_t>(std::stoll(whole, nullptr, 10)) *
+             PowerOfTen<int64_t>(fractional.size());
+  }
+  if (!fractional.empty()) { result += std::stoll(fractional, nullptr, 10); }
+  *out = result * sign;
+}
+
+/// 128-bit counterpart: the concatenated digits are handed to the int128_t
+/// string constructor, then the sign is applied.
+void StringToInteger(
+    const std::string& whole, const std::string& fractional, int8_t sign, int128_t* out) {
+  DCHECK(sign == -1 || sign == 1);
+  DCHECK_NE(out, nullptr);
+  DCHECK(!whole.empty() || !fractional.empty());
+  *out = int128_t(whole + fractional) * sign;
+}
+
+/// Read a Decimal32 from the first 4 bytes at `bytes` (native byte order).
+void FromBytes(const uint8_t* bytes, Decimal32* decimal) {
+  DCHECK_NE(bytes, nullptr);
+  DCHECK_NE(decimal, nullptr);
+  // memcpy instead of dereferencing a reinterpret_cast: a uint8_t buffer
+  // carries no alignment guarantee for int32_t.
+  std::memcpy(&decimal->value, bytes, sizeof(decimal->value));
+}
+
+/// Read a Decimal64 from the first 8 bytes at `bytes` (native byte order).
+void FromBytes(const uint8_t* bytes, Decimal64* decimal) {
+  DCHECK_NE(bytes, nullptr);
+  DCHECK_NE(decimal, nullptr);
+  std::memcpy(&decimal->value, bytes, sizeof(decimal->value));
+}
+
+constexpr static const size_t BYTES_IN_128_BITS = 128 / CHAR_BIT;
+constexpr static const size_t LIMB_SIZE =
+    sizeof(std::remove_pointer<int128_t::backend_type::limb_pointer>::type);
+// Number of limbs needed to hold 128 bits (previously misnamed BYTES_PER_LIMB).
+constexpr static const size_t LIMBS_IN_128_BITS = BYTES_IN_128_BITS / LIMB_SIZE;
+
+/// Rebuild a Decimal128 from the 16 magnitude bytes written by ToBytes; the
+/// sign travels separately in `is_negative`.
+void FromBytes(const uint8_t* bytes, bool is_negative, Decimal128* decimal) {
+  DCHECK_NE(bytes, nullptr);
+  DCHECK_NE(decimal, nullptr);
+
+  auto& decimal_value(decimal->value);
+  int128_t::backend_type& backend(decimal_value.backend());
+  backend.resize(LIMBS_IN_128_BITS, LIMBS_IN_128_BITS);
+  std::memcpy(backend.limbs(), bytes, BYTES_IN_128_BITS);
+  if (is_negative) { decimal->value = -decimal->value; }
+}
+
+/// Write the 4 bytes of `value` into the buffer `*bytes` points at.
+void ToBytes(const Decimal32& value, uint8_t** bytes) {
+  DCHECK_NE(bytes, nullptr);
+  DCHECK_NE(*bytes, nullptr);
+  std::memcpy(*bytes, &value.value, sizeof(value.value));
+}
+
+/// Write the 8 bytes of `value` into the buffer `*bytes` points at.
+void ToBytes(const Decimal64& value, uint8_t** bytes) {
+  DCHECK_NE(bytes, nullptr);
+  DCHECK_NE(*bytes, nullptr);
+  std::memcpy(*bytes, &value.value, sizeof(value.value));
+}
+
+/// Write the 16 limb bytes of `decimal` into the buffer `*bytes` points at and
+/// report the sign through `is_negative`.
+void ToBytes(const Decimal128& decimal, uint8_t** bytes, bool* is_negative) {
+  DCHECK_NE(bytes, nullptr);
+  DCHECK_NE(*bytes, nullptr);
+  DCHECK_NE(is_negative, nullptr);
+
+  /// TODO(phillipc): boost multiprecision is unreliable here, int128_t can't be
+  /// roundtripped
+  const auto& backend(decimal.value.backend());
+  auto boost_bytes = reinterpret_cast<const uint8_t*>(backend.limbs());
+  std::memcpy(*bytes, boost_bytes, BYTES_IN_128_BITS);
+  *is_negative = backend.isneg();
+}
+
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/util/decimal.h
----------------------------------------------------------------------
diff --git 
a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h
new file mode 100644
index 0000000..46883e3
--- /dev/null
+++ b/cpp/src/arrow/util/decimal.h
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_DECIMAL_H
+#define ARROW_DECIMAL_H
+
+#include <cmath>
+#include <cstdlib>
+#include <iterator>
+#include <regex>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/logging.h"
+
+#include <boost/multiprecision/cpp_int.hpp>
+
+namespace arrow {
+
+// 128-bit signed integer from Boost.Multiprecision, used as the storage type
+// of Decimal128.
+using boost::multiprecision::int128_t;
+
+template <typename T>
+struct ARROW_EXPORT Decimal;
+
+/// Combine the digits before (`whole`) and after (`fractional`) the decimal
+/// point into one unscaled integer and apply `sign` (-1 or 1), e.g.
+/// ("123", "456", 1) -> 123456. At least one of the two digit strings must be
+/// non-empty; this and the sign are checked with DCHECK only.
+ARROW_EXPORT void StringToInteger(
+    const std::string& whole, const std::string& fractional, int8_t sign, int32_t* out);
+ARROW_EXPORT void StringToInteger(
+    const std::string& whole, const std::string& fractional, int8_t sign, int64_t* out);
+ARROW_EXPORT void StringToInteger(
+    const std::string& whole, const std::string& fractional, int8_t sign, int128_t* out);
+
+/// Parse the decimal string `s`.
+///
+/// Each output pointer may be null, in which case that result is skipped.
+/// \param out receives the unscaled integer value
+/// \param precision receives the count of whole digits (leading zeros are not
+///   counted) plus fractional digits
+/// \param scale receives the count of fractional digits
+/// \return Status::Invalid when `s` is empty or is not a decimal literal
+template <typename T>
+ARROW_EXPORT Status FromString(const std::string& s, Decimal<T>* out,
+    int* precision = nullptr, int* scale = nullptr);
+
+/// Thin wrapper around the unscaled integer value of a decimal number.
+/// Precision and scale are not stored here; callers pass them separately
+/// (see ToString).
+template <typename T>
+struct ARROW_EXPORT Decimal {
+  Decimal() : value() {}
+  // NOTE: the Status returned by FromString is discarded, so a string that
+  // fails to parse leaves `value` at its value-initialized state (zero).
+  explicit Decimal(const std::string& s) : value() { FromString(s, this); }
+  explicit Decimal(const char* s) : Decimal(std::string(s)) {}
+  explicit Decimal(const T& value) : value(value) {}
+
+  using value_type = T;
+  value_type value;
+};
+
+using Decimal32 = Decimal<int32_t>;
+using Decimal64 = Decimal<int64_t>;
+using Decimal128 = Decimal<int128_t>;
+
+/// Smallest and largest precision (number of decimal digits) associated with
+/// each storage type. The primary template is intentionally empty; only the
+/// specializations below carry `minimum` / `maximum`.
+template <typename T>
+struct ARROW_EXPORT DecimalPrecision {};
+
+template <>
+struct ARROW_EXPORT DecimalPrecision<int32_t> {
+  constexpr static const int minimum = 1;
+  constexpr static const int maximum = 9;
+};
+
+template <>
+struct ARROW_EXPORT DecimalPrecision<int64_t> {
+  constexpr static const int minimum = 10;
+  constexpr static const int maximum = 18;
+};
+
+template <>
+struct ARROW_EXPORT DecimalPrecision<int128_t> {
+  constexpr static const int minimum = 19;
+  constexpr static const int maximum = 38;
+};
+
+/// Format `decimal_value` as text with `scale` digits after the decimal point.
+///
+/// `precision` determines how large the working buffer is; unused leading
+/// zeros are trimmed before returning. For example value 123456 with
+/// precision 6 and scale 3 yields "123.456", and value -5 with precision 3
+/// and scale 2 yields "-0.05".
+template <typename T>
+ARROW_EXPORT std::string ToString(
+    const Decimal<T>& decimal_value, int precision, int scale) {
+  T value = decimal_value.value;
+
+  // Decimal values are sent to clients as strings so in the interest of
+  // speed the string will be created without using a stringstream with the
+  // whole/fractional_part().
+  size_t last_char_idx = precision + (scale > 0)  // Add a space for decimal place
+                         + (scale == precision)   // Add a space for leading 0
+                         + (value < 0);           // Add a space for negative sign
+  std::string str = std::string(last_char_idx, '0');
+  // Start filling in the values in reverse order by taking the last digit
+  // of the value. Use a positive value and worry about the sign later. At this
+  // point the last_char_idx points to the string terminator.
+  T remaining_value = value;
+  // Index of the first character that may hold a digit; slot 0 is reserved
+  // for the '-' sign when the value is negative.
+  size_t first_digit_idx = 0;
+  if (value < 0) {
+    remaining_value = -value;
+    first_digit_idx = 1;
+  }
+  if (scale > 0) {
+    // Emit exactly `scale` fractional digits, right to left, then the point.
+    int remaining_scale = scale;
+    do {
+      str[--last_char_idx] = static_cast<char>(
+          (remaining_value % 10) + static_cast<T>('0'));  // Ascii offset
+      remaining_value /= 10;
+    } while (--remaining_scale > 0);
+    str[--last_char_idx] = '.';
+    DCHECK_GT(last_char_idx, first_digit_idx) << "Not enough space remaining";
+  }
+  // Emit the whole part, right to left; always writes at least one digit so
+  // that a zero whole part is rendered as "0".
+  do {
+    str[--last_char_idx] =
+        static_cast<char>((remaining_value % 10) + static_cast<T>('0'));  // Ascii offset
+    remaining_value /= 10;
+    if (remaining_value == 0) {
+      // Trim any extra leading 0's.
+      if (last_char_idx > first_digit_idx) str.erase(0, last_char_idx - first_digit_idx);
+      break;
+    }
+    // For safety, enforce string length independent of remaining_value.
+  } while (last_char_idx > first_digit_idx);
+  if (value < 0) str[0] = '-';
+  return str;
+}
+
+/// Conversion from raw bytes to a Decimal value
+/// The 128-bit overload takes the sign separately: `bytes` supplies the limb
+/// contents and the result is negated when `is_negative` is true.
+ARROW_EXPORT void FromBytes(const uint8_t* bytes, Decimal32* value);
+ARROW_EXPORT void FromBytes(const uint8_t* bytes, Decimal64* value);
+ARROW_EXPORT void FromBytes(const uint8_t* bytes, bool is_negative, Decimal128* decimal);
+
+/// Conversion from a Decimal value to raw bytes
+/// The output is written into the buffer that `*bytes` already points at; the
+/// pointer itself is not reassigned. The 128-bit overload additionally reports
+/// the sign through `is_negative`.
+ARROW_EXPORT void ToBytes(const Decimal32& value, uint8_t** bytes);
+ARROW_EXPORT void ToBytes(const Decimal64& value, uint8_t** bytes);
+ARROW_EXPORT void ToBytes(const Decimal128& decimal, uint8_t** bytes, bool* is_negative);
+
+}  // namespace arrow
+#endif  // ARROW_DECIMAL_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/cpp/src/arrow/visitor_inline.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index c61c9f5..29b3db6 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -93,7 +93,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) {
ARRAY_VISIT_INLINE(TimestampType); ARRAY_VISIT_INLINE(Time32Type); ARRAY_VISIT_INLINE(Time64Type); - // ARRAY_VISIT_INLINE(DecimalType); + ARRAY_VISIT_INLINE(DecimalType); ARRAY_VISIT_INLINE(ListType); ARRAY_VISIT_INLINE(StructType); ARRAY_VISIT_INLINE(UnionType); http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/format/Schema.fbs ---------------------------------------------------------------------- diff --git a/format/Schema.fbs b/format/Schema.fbs index ca9c8e6..badc7ea 100644 --- a/format/Schema.fbs +++ b/format/Schema.fbs @@ -77,7 +77,9 @@ table Bool { } table Decimal { + /// Total number of decimal digits precision: int; + /// Number of digits after the decimal point "." scale: int; } http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 8c52074..7b23cf6 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -71,7 +71,7 @@ from pyarrow.schema import (null, bool_, uint8, uint16, uint32, uint64, timestamp, date32, date64, float16, float32, float64, - binary, string, + binary, string, decimal, list_, struct, dictionary, field, DataType, FixedSizeBinaryType, Field, Schema, schema) http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/array.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index f6aaea2..3ba4871 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -116,6 +116,10 @@ cdef class FixedSizeBinaryArray(Array): pass +cdef class DecimalArray(FixedSizeBinaryArray): + pass + + cdef class ListArray(Array): pass http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx 
index 9f302e0..ee500e6 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -481,6 +481,10 @@ cdef class FixedSizeBinaryArray(Array): pass +cdef class DecimalArray(FixedSizeBinaryArray): + pass + + cdef class ListArray(Array): pass @@ -602,6 +606,7 @@ cdef dict _array_classes = { Type_STRING: StringArray, Type_DICTIONARY: DictionaryArray, Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, + Type_DECIMAL: DecimalArray, } cdef object box_array(const shared_ptr[CArray]& sp_array): http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/includes/common.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index ab38ff3..4860334 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -51,6 +51,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsTypeError() +cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: + cdef cppclass int128_t: + pass + + cdef inline object PyObject_to_object(PyObject* o): # Cast to "object" increments reference count cdef object result = <object> o http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2a0488f..73d96b2 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -39,6 +39,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type_FLOAT" arrow::Type::FLOAT" Type_DOUBLE" arrow::Type::DOUBLE" + Type_DECIMAL" arrow::Type::DECIMAL" + Type_DATE32" arrow::Type::DATE32" Type_DATE64" arrow::Type::DATE64" Type_TIMESTAMP" arrow::Type::TIMESTAMP" @@ -58,6 +60,11 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: TimeUnit_MICRO" arrow::TimeUnit::MICRO" TimeUnit_NANO" 
arrow::TimeUnit::NANO" + cdef cppclass Decimal[T]: + Decimal(const T&) + + cdef c_string ToString[T](const Decimal[T]&, int, int) + cdef cppclass CDataType" arrow::DataType": Type type @@ -144,6 +151,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryType" arrow::FixedSizeBinaryType"(CFixedWidthType): CFixedSizeBinaryType(int byte_width) int byte_width() + int bit_width() + + cdef cppclass CDecimalType" arrow::DecimalType"(CFixedSizeBinaryType): + int precision + int scale + CDecimalType(int precision, int scale) cdef cppclass CField" arrow::Field": c_string name @@ -212,6 +225,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CFixedSizeBinaryArray" arrow::FixedSizeBinaryArray"(CArray): const uint8_t* GetValue(int i) + cdef cppclass CDecimalArray" arrow::DecimalArray"(CFixedSizeBinaryArray): + Decimal[T] Value[T](int i) + cdef cppclass CListArray" arrow::ListArray"(CArray): const int32_t* raw_value_offsets() int32_t value_offset(int i) http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/scalar.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pxd b/python/pyarrow/scalar.pxd index d6c3b35..62a5664 100644 --- a/python/pyarrow/scalar.pxd +++ b/python/pyarrow/scalar.pxd @@ -20,6 +20,7 @@ from pyarrow.includes.libarrow cimport * from pyarrow.schema cimport DataType + cdef class Scalar: cdef readonly: DataType type http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 1c0790a..f3d9321 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -17,9 +17,10 @@ from pyarrow.schema cimport DataType, box_data_type +from pyarrow.includes.common cimport int128_t from pyarrow.compat import frombytes import pyarrow.schema as schema - +import 
decimal import datetime cimport cpython as cp @@ -64,7 +65,7 @@ cdef class ArrayValue(Scalar): if hasattr(self, 'as_py'): return repr(self.as_py()) else: - return Scalar.__repr__(self) + return super(Scalar, self).__repr__() cdef class BooleanValue(ArrayValue): @@ -199,6 +200,25 @@ cdef class DoubleValue(ArrayValue): return ap.Value(self.index) +cdef class DecimalValue(ArrayValue): + + def as_py(self): + cdef: + CDecimalArray* ap = <CDecimalArray*> self.sp_array.get() + CDecimalType* t = <CDecimalType*> ap.type().get() + int bit_width = t.bit_width() + int precision = t.precision + int scale = t.scale + c_string s + if bit_width == 32: + s = ToString[int32_t](ap.Value[int32_t](self.index), precision, scale) + elif bit_width == 64: + s = ToString[int64_t](ap.Value[int64_t](self.index), precision, scale) + elif bit_width == 128: + s = ToString[int128_t](ap.Value[int128_t](self.index), precision, scale) + return decimal.Decimal(s.decode('utf8')) + + cdef class StringValue(ArrayValue): def as_py(self): @@ -286,6 +306,7 @@ cdef dict _scalar_classes = { Type_BINARY: BinaryValue, Type_STRING: StringValue, Type_FIXED_SIZE_BINARY: FixedSizeBinaryValue, + Type_DECIMAL: DecimalValue, } cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array, http://git-wip-us.apache.org/repos/asf/arrow/blob/754bcce6/python/pyarrow/schema.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 94d65bf..eceedba 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -20,6 +20,7 @@ from pyarrow.includes.libarrow cimport (CDataType, CDictionaryType, CTimestampType, CFixedSizeBinaryType, + CDecimalType, CField, CSchema) cdef class DataType: @@ -27,7 +28,7 @@ cdef class DataType: shared_ptr[CDataType] sp_type CDataType* type - cdef init(self, const shared_ptr[CDataType]& type) + cdef void init(self, const shared_ptr[CDataType]& type) cdef class DictionaryType(DataType): @@ 
-45,6 +46,11 @@ cdef class FixedSizeBinaryType(DataType): const CFixedSizeBinaryType* fixed_size_binary_type +cdef class DecimalType(FixedSizeBinaryType): + cdef: + const CDecimalType* decimal_type + + cdef class Field: cdef: shared_ptr[CField] sp_field @@ -55,6 +61,7 @@ cdef class Field: cdef init(self, const shared_ptr[CField]& field) + cdef class Schema: cdef: shared_ptr[CSchema] sp_schema @@ -63,6 +70,7 @@ cdef class Schema: cdef init(self, const vector[shared_ptr[CField]]& fields) cdef init_schema(self, const shared_ptr[CSchema]& schema) + cdef DataType box_data_type(const shared_ptr[CDataType]& type) cdef Field box_field(const shared_ptr[CField]& field) cdef Schema box_schema(const shared_ptr[CSchema]& schema)