This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new b241eb6 ARROW-1839/ARROW-1871: [C++/Python] Add Decimal Parquet Read/Write Tests b241eb6 is described below commit b241eb699b6151d9b1c8809528be4c80cce26c7e Author: Phillip Cloud <cpcl...@gmail.com> AuthorDate: Mon Dec 4 22:39:44 2017 -0500 ARROW-1839/ARROW-1871: [C++/Python] Add Decimal Parquet Read/Write Tests Author: Phillip Cloud <cpcl...@gmail.com> Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1335 from cpcloud/ARROW-1839 and squashes the following commits: 496ec796 [Wes McKinney] Make RescaleDecimal instead an instance method of Decimal128 878d9a1d [Phillip Cloud] Fix namespace 816ff42a [Phillip Cloud] Use IsPyInteger 41c447fb [Wes McKinney] Add missing license header 3b91e550 [Wes McKinney] Rename Rescale to RescaleDecimal accc653d [Wes McKinney] Make AppendItem methods non-virtual for CRTP inlining bbc14cf4 [Phillip Cloud] int64_t for looping b6f8592f [Phillip Cloud] Generate a column for different scales afe4396a [Phillip Cloud] docstring ade030e5 [Phillip Cloud] Clarify randdecimal contract 752e905b [Phillip Cloud] Use Python C API instead of string parsing 115fbc96 [Phillip Cloud] Remove iostream include 1f6339d8 [Phillip Cloud] Clean up testing a4b1d6bb [Phillip Cloud] Add randdecimal to util 07c356ac [Phillip Cloud] Remove inference from python objects c7637f19 [Phillip Cloud] Refactor test dba8a0d9 [Phillip Cloud] Add docs 4296b5c9 [Phillip Cloud] Infer the scale by looking at every Python decimal object 98e319af [Phillip Cloud] Remove debugging 2d02b44f [Phillip Cloud] More style fixes f235ef42 [Phillip Cloud] Style fixes 97c0a6b5 [Phillip Cloud] Make sure we assign to our out variable d4a32361 [Phillip Cloud] Move rescaling to decimal.h/cc 891137fd [Phillip Cloud] Formatting deea4711 [Phillip Cloud] Refactor rescaling functionality 108e8914 [Phillip Cloud] Format c28e2c17 [Phillip Cloud] Remove test cbb2ed76 [Phillip Cloud] Rewrite infer precision and scale from python 7a2de887 [Phillip Cloud] Add rescaling b2ad3b7f [Phillip Cloud] Rename e0d6080a [Phillip Cloud] Formatting fc5412a3 [Phillip Cloud] Checkpoint [ci skip] cf3a5a26 [Phillip Cloud] Test multiple precisions and scales 853fced9 [Phillip Cloud] Make the parquet read/write from python test more robust ff6eccf8 [Phillip Cloud] Add c++ formatting test for small number and small scale/precision cd38abd4 [Phillip Cloud] Add test util 45eafe73 [Phillip Cloud] Add better test d7a0aab4 [Phillip Cloud] Make sure we do not override the inferred type eff7d2b5 [Phillip Cloud] Use array syntax 801626b4 [Phillip Cloud] Use FormatValue e519c25f [Phillip Cloud] Add Decimal parquet tests --- cpp/src/arrow/python/arrow_to_pandas.cc | 20 ++----- cpp/src/arrow/python/builtin_convert.cc | 76 ++++++++++++--------------- cpp/src/arrow/python/helpers.cc | 77 +++++++++++++++++++++++---- cpp/src/arrow/python/helpers.h | 11 +++- cpp/src/arrow/python/numpy-internal.h | 4 +- cpp/src/arrow/python/numpy_to_arrow.cc | 35 +++++++++---- cpp/src/arrow/python/python-test.cc | 81 ++++++++++++++++++++-------- cpp/src/arrow/util/decimal-test.cc | 9 ++++ cpp/src/arrow/util/decimal.cc | 64 +++++++++++++++++++++++ cpp/src/arrow/util/decimal.h | 3 ++ python/pyarrow/tests/test_parquet.py | 44 +++++++++++++++- python/pyarrow/tests/util.py | 93 +++++++++++++++++++++++++++++++++ 12 files changed, 411 insertions(+), 106 deletions(-) diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc index 096bbd5..b1825cb 100644 --- a/cpp/src/arrow/python/arrow_to_pandas.cc +++ b/cpp/src/arrow/python/arrow_to_pandas.cc @@ -620,13 +620,6 @@ static Status ConvertTimes(PandasOptions options, const ChunkedArray& data, return Status::OK(); } -static Status RawDecimalToString(const uint8_t* bytes, int scale, std::string* result) { - DCHECK_NE(result, nullptr); - Decimal128 decimal(bytes); - *result = decimal.ToString(scale); - return Status::OK(); -} - static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyObject** out_values) { PyAcquireGIL lock; @@ -637,19 +630,14 @@ static Status ConvertDecimals(PandasOptions options, const ChunkedArray& data, PyObject* Decimal = Decimal_ref.obj(); for (int c = 0; c < data.num_chunks(); c++) { - auto* arr(static_cast<arrow::Decimal128Array*>(data.chunk(c).get())); - auto type(std::dynamic_pointer_cast<arrow::Decimal128Type>(arr->type())); - const int scale = type->scale(); + const auto& arr(static_cast<const arrow::Decimal128Array&>(*data.chunk(c).get())); - for (int64_t i = 0; i < arr->length(); ++i) { - if (arr->IsNull(i)) { + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { Py_INCREF(Py_None); *out_values++ = Py_None; } else { - const uint8_t* raw_value = arr->GetValue(i); - std::string decimal_string; - RETURN_NOT_OK(RawDecimalToString(raw_value, scale, &decimal_string)); - *out_values++ = internal::DecimalFromString(Decimal, decimal_string); + *out_values++ = internal::DecimalFromString(Decimal, arr.FormatValue(i)); RETURN_IF_PYERROR(); } } diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc index c716c47..08cbae7 100644 --- a/cpp/src/arrow/python/builtin_convert.cc +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -37,14 +37,6 @@ namespace arrow { namespace py { -static inline bool IsPyInteger(PyObject* obj) { -#if PYARROW_IS_PY2 - return PyLong_Check(obj) || PyInt_Check(obj); -#else - return PyLong_Check(obj); -#endif -} - Status InvalidConversion(PyObject* obj, const std::string& expected_types, std::ostream* out) { OwnedRef type(PyObject_Type(obj)); @@ -91,7 +83,7 @@ class ScalarVisitor { ++bool_count_; } else if (PyFloat_Check(obj)) { ++float_count_; - } else if (IsPyInteger(obj)) { + } else if (internal::IsPyInteger(obj)) { ++int_count_; } else if (PyDate_CheckExact(obj)) { ++date_count_; @@ -390,28 +382,26 @@ class TypedConverterVisitor : public TypedConverter<BuilderType> { } return Status::OK(); } - - virtual Status AppendItem(const OwnedRef& item) = 0; }; class NullConverter : public TypedConverterVisitor<NullBuilder, NullConverter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { return Status::Invalid("NullConverter: passed non-None value"); } }; class BoolConverter : public TypedConverterVisitor<BooleanBuilder, BoolConverter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { return typed_builder_->Append(item.obj() == Py_True); } }; class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int8_t>::max() || val < std::numeric_limits<int8_t>::min())) { @@ -426,8 +416,8 @@ class Int8Converter : public TypedConverterVisitor<Int8Builder, Int8Converter> { class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int16_t>::max() || val < std::numeric_limits<int16_t>::min())) { @@ -442,8 +432,8 @@ class Int16Converter : public TypedConverterVisitor<Int16Builder, Int16Converter class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits<int32_t>::max() || val < std::numeric_limits<int32_t>::min())) { @@ -458,8 +448,8 @@ class Int32Converter : public TypedConverterVisitor<Int32Builder, Int32Converter class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); RETURN_IF_PYERROR(); return typed_builder_->Append(val); } @@ -467,8 +457,8 @@ class Int64Converter : public TypedConverterVisitor<Int64Builder, Int64Converter class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint8_t>::max())) { return Status::Invalid( @@ -482,8 +472,8 @@ class UInt8Converter : public TypedConverterVisitor<UInt8Builder, UInt8Converter class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint16_t>::max())) { return Status::Invalid( @@ -497,8 +487,8 @@ class UInt16Converter : public TypedConverterVisitor<UInt16Builder, UInt16Conver class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - uint64_t val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<uint64_t>(PyLong_AsLongLong(item.obj())); if (ARROW_PREDICT_FALSE(val > std::numeric_limits<uint32_t>::max())) { return Status::Invalid( @@ -512,8 +502,8 @@ class UInt32Converter : public TypedConverterVisitor<UInt32Builder, UInt32Conver class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Converter> { public: - inline Status AppendItem(const OwnedRef& item) { - int64_t val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); + Status AppendItem(const OwnedRef& item) { + const auto val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); RETURN_IF_PYERROR(); return typed_builder_->Append(val); } @@ -521,13 +511,13 @@ class UInt64Converter : public TypedConverterVisitor<UInt64Builder, UInt64Conver class Date32Converter : public TypedConverterVisitor<Date32Builder, Date32Converter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { int32_t t; if (PyDate_Check(item.obj())) { auto pydate = reinterpret_cast<PyDateTime_Date*>(item.obj()); t = static_cast<int32_t>(PyDate_to_s(pydate)); } else { - int64_t casted_val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); + const auto casted_val = static_cast<int64_t>(PyLong_AsLongLong(item.obj())); RETURN_IF_PYERROR(); if (casted_val > std::numeric_limits<int32_t>::max()) { return Status::Invalid("Integer as date32 larger than INT32_MAX"); @@ -540,7 +530,7 @@ class Date32Converter : public TypedConverterVisitor<Date32Builder, Date32Conver class Date64Converter : public TypedConverterVisitor<Date64Builder, Date64Converter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { int64_t t; if (PyDate_Check(item.obj())) { auto pydate = reinterpret_cast<PyDateTime_Date*>(item.obj()); @@ -558,7 +548,7 @@ class TimestampConverter public: explicit TimestampConverter(TimeUnit::type unit) : unit_(unit) {} - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { int64_t t; if (PyDateTime_Check(item.obj())) { auto pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj()); @@ -590,7 +580,7 @@ class TimestampConverter class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConverter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { double val = PyFloat_AsDouble(item.obj()); RETURN_IF_PYERROR(); return typed_builder_->Append(val); @@ -599,7 +589,7 @@ class DoubleConverter : public TypedConverterVisitor<DoubleBuilder, DoubleConver class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; const char* bytes; Py_ssize_t length; @@ -627,7 +617,7 @@ class BytesConverter : public TypedConverterVisitor<BinaryBuilder, BytesConverte class FixedWidthBytesConverter : public TypedConverterVisitor<FixedSizeBinaryBuilder, FixedWidthBytesConverter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; Py_ssize_t expected_length = @@ -654,7 +644,7 @@ class FixedWidthBytesConverter class UTF8Converter : public TypedConverterVisitor<StringBuilder, UTF8Converter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { PyObject* bytes_obj; OwnedRef tmp; const char* bytes; @@ -689,10 +679,10 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> { public: Status Init(ArrayBuilder* builder) override; - inline Status AppendItem(const OwnedRef& item) override { + Status AppendItem(const OwnedRef& item) { RETURN_NOT_OK(typed_builder_->Append()); PyObject* item_obj = item.obj(); - int64_t list_size = static_cast<int64_t>(PySequence_Size(item_obj)); + const auto list_size = static_cast<int64_t>(PySequence_Size(item_obj)); return value_converter_->AppendData(item_obj, list_size); } @@ -703,13 +693,11 @@ class ListConverter : public TypedConverterVisitor<ListBuilder, ListConverter> { class DecimalConverter : public TypedConverterVisitor<arrow::Decimal128Builder, DecimalConverter> { public: - inline Status AppendItem(const OwnedRef& item) { + Status AppendItem(const OwnedRef& item) { /// TODO(phillipc): Check for nan? - std::string string; - RETURN_NOT_OK(internal::PythonDecimalToString(item.obj(), &string)); - Decimal128 value; - RETURN_NOT_OK(Decimal128::FromString(string, &value)); + const auto& type = static_cast<const DecimalType&>(*typed_builder_->type()); + RETURN_NOT_OK(internal::DecimalFromPythonDecimal(item.obj(), type, &value)); return typed_builder_->Append(value); } }; diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc index 708d991..494f929 100644 --- a/cpp/src/arrow/python/helpers.cc +++ b/cpp/src/arrow/python/helpers.cc @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/python/helpers.h" +#include <sstream> + #include "arrow/python/common.h" +#include "arrow/python/helpers.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -91,20 +93,33 @@ Status PythonDecimalToString(PyObject* python_decimal, std::string* out) { return Status::OK(); } -Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision, - int* scale) { - // Call Python's str(decimal_object) - OwnedRef str_obj(PyObject_Str(python_decimal)); +Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision, + int32_t* scale) { + DCHECK_NE(python_decimal, NULLPTR); + DCHECK_NE(precision, NULLPTR); + DCHECK_NE(scale, NULLPTR); + + OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", "()")); RETURN_IF_PYERROR(); - PyObjectStringify str(str_obj.obj()); + DCHECK(PyTuple_Check(as_tuple.obj())); - const char* bytes = str.bytes; - DCHECK_NE(bytes, nullptr); + OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits")); + RETURN_IF_PYERROR(); + DCHECK(PyTuple_Check(digits.obj())); - auto size = str.size; + const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj())); + RETURN_IF_PYERROR(); - std::string c_string(bytes, size); - return Decimal128::FromString(c_string, nullptr, precision, scale); + OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent")); + RETURN_IF_PYERROR(); + DCHECK(IsPyInteger(py_exponent.obj())); + + const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj())); + RETURN_IF_PYERROR(); + + *precision = num_digits; + *scale = -exponent; + return Status::OK(); } PyObject* DecimalFromString(PyObject* decimal_constructor, @@ -121,6 +136,46 @@ PyObject* DecimalFromString(PyObject* decimal_constructor, string_size); } +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal128* out) { + DCHECK_NE(python_decimal, NULLPTR); + DCHECK_NE(out, NULLPTR); + + std::string string; + RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string)); + + int32_t inferred_precision; + int32_t inferred_scale; + + RETURN_NOT_OK( + Decimal128::FromString(string, out, &inferred_precision, &inferred_scale)); + + const int32_t precision = arrow_type.precision(); + const int32_t scale = arrow_type.scale(); + + if (ARROW_PREDICT_FALSE(inferred_precision > precision)) { + std::stringstream buf; + buf << "Decimal type with precision " << inferred_precision + << " does not fit into precision inferred from first array element: " + << precision; + return Status::Invalid(buf.str()); + } + + if (scale != inferred_scale) { + DCHECK_NE(out, NULLPTR); + RETURN_NOT_OK(out->Rescale(inferred_scale, scale, out)); + } + return Status::OK(); +} + +bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 719ed79..c82bdab 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -29,6 +29,9 @@ #include "arrow/util/visibility.h" namespace arrow { + +class Decimal128; + namespace py { class OwnedRef; @@ -44,11 +47,15 @@ Status ImportFromModule(const OwnedRef& module, const std::string& module_name, Status PythonDecimalToString(PyObject* python_decimal, std::string* out); -Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int* precision = NULLPTR, - int* scale = NULLPTR); +Status InferDecimalPrecisionAndScale(PyObject* python_decimal, + int32_t* precision = NULLPTR, + int32_t* scale = NULLPTR); PyObject* DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string); +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal128* out); +bool IsPyInteger(PyObject* obj); } // namespace internal } // namespace py diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index db34d24..6c9c871 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -56,8 +56,8 @@ class Ndarray1DIndexer { bool is_strided() const { return stride_ == 1; } - T& operator[](size_type index) { return *(data_ + index * stride_); } - T& operator[](size_type index) const { return *(data_ + index * stride_); } + T& operator[](size_type index) { return data_[index * stride_]; } + T& operator[](size_type index) const { return data_[index * stride_]; } private: PyArrayObject* arr_; diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 0b1124d..2316a79 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -682,24 +682,41 @@ Status NumPyConverter::ConvertDecimals() { Ndarray1DIndexer<PyObject*> objects(arr_); PyObject* object = objects[0]; - int precision; - int scale; + if (type_ == NULLPTR) { + int32_t precision; + int32_t desired_scale; - RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(object, &precision, &scale)); + int32_t tmp_precision; + int32_t tmp_scale; - type_ = std::make_shared<Decimal128Type>(precision, scale); + RETURN_NOT_OK( + internal::InferDecimalPrecisionAndScale(objects[0], &precision, &desired_scale)); + + for (int64_t i = 1; i < length_; ++i) { + RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(objects[i], &tmp_precision, + &tmp_scale)); + precision = std::max(precision, tmp_precision); + + if (std::abs(desired_scale) < std::abs(tmp_scale)) { + desired_scale = tmp_scale; + } + } + + type_ = ::arrow::decimal(precision, desired_scale); + } Decimal128Builder builder(type_, pool_); RETURN_NOT_OK(builder.Resize(length_)); + const auto& decimal_type = static_cast<const DecimalType&>(*type_); + PyObject* Decimal_type_object = Decimal.obj(); + for (int64_t i = 0; i < length_; ++i) { object = objects[i]; - if (PyObject_IsInstance(object, Decimal.obj())) { - std::string string; - RETURN_NOT_OK(internal::PythonDecimalToString(object, &string)); + if (PyObject_IsInstance(object, Decimal_type_object)) { Decimal128 value; - RETURN_NOT_OK(Decimal128::FromString(string, &value)); + RETURN_NOT_OK(internal::DecimalFromPythonDecimal(object, decimal_type, &value)); RETURN_NOT_OK(builder.Append(value)); } else if (PandasObjectIsNull(object)) { RETURN_NOT_OK(builder.AppendNull()); @@ -724,7 +741,7 @@ Status NumPyConverter::ConvertTimes() { Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_); RETURN_NOT_OK(builder.Resize(length_)); - PyObject* obj; + PyObject* obj = NULLPTR; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; if (PyTime_Check(obj)) { diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc index 3b7d7d8..d9919ee 100644 --- a/cpp/src/arrow/python/python-test.cc +++ b/cpp/src/arrow/python/python-test.cc @@ -35,34 +35,73 @@ namespace py { TEST(PyBuffer, InvalidInputObject) { PyBuffer buffer(Py_None); } -TEST(DecimalTest, TestPythonDecimalToString) { - PyAcquireGIL lock; +class DecimalTest : public ::testing::Test { + public: + DecimalTest() : lock_(), decimal_module_(), decimal_constructor_() { + auto s = internal::ImportModule("decimal", &decimal_module_); + DCHECK(s.ok()) << s.message(); + DCHECK_NE(decimal_module_.obj(), NULLPTR); - OwnedRef decimal; - OwnedRef Decimal; - ASSERT_OK(internal::ImportModule("decimal", &decimal)); - ASSERT_NE(decimal.obj(), nullptr); + s = internal::ImportFromModule(decimal_module_, "Decimal", &decimal_constructor_); + DCHECK(s.ok()) << s.message(); - ASSERT_OK(internal::ImportFromModule(decimal, "Decimal", &Decimal)); - ASSERT_NE(Decimal.obj(), nullptr); + DCHECK_NE(decimal_constructor_.obj(), NULLPTR); + } - std::string decimal_string("-39402950693754869342983"); - const char* format = "s#"; - auto c_string = decimal_string.c_str(); - ASSERT_NE(c_string, nullptr); + OwnedRef CreatePythonDecimal(const std::string& string_value) { + OwnedRef ref(internal::DecimalFromString(decimal_constructor_.obj(), string_value)); + return ref; + } - auto c_string_size = decimal_string.size(); - ASSERT_GT(c_string_size, 0); - OwnedRef pydecimal(PyObject_CallFunction(Decimal.obj(), const_cast<char*>(format), - c_string, c_string_size)); - ASSERT_NE(pydecimal.obj(), nullptr); - ASSERT_EQ(PyErr_Occurred(), nullptr); + private: + PyAcquireGIL lock_; + OwnedRef decimal_module_; + OwnedRef decimal_constructor_; +}; - PyObject* python_object = pydecimal.obj(); - ASSERT_NE(python_object, nullptr); +TEST_F(DecimalTest, TestPythonDecimalToString) { + std::string decimal_string("-39402950693754869342983"); + + OwnedRef python_object = this->CreatePythonDecimal(decimal_string); + ASSERT_NE(python_object.obj(), nullptr); std::string string_result; - ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result)); + ASSERT_OK(internal::PythonDecimalToString(python_object.obj(), &string_result)); +} + +TEST_F(DecimalTest, TestInferPrecisionAndScale) { + std::string decimal_string("-394029506937548693.42983"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + + int32_t precision; + int32_t scale; + + ASSERT_OK( + internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); + + const auto expected_precision = + static_cast<int32_t>(decimal_string.size() - 2); // 1 for -, 1 for . + const int32_t expected_scale = 5; + + ASSERT_EQ(expected_precision, precision); + ASSERT_EQ(expected_scale, scale); +} + +TEST_F(DecimalTest, TestInferPrecisionAndNegativeScale) { + std::string decimal_string("-3.94042983E+10"); + OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string)); + + int32_t precision; + int32_t scale; + + ASSERT_OK( + internal::InferDecimalPrecisionAndScale(python_decimal.obj(), &precision, &scale)); + + const auto expected_precision = 9; + const int32_t expected_scale = -2; + + ASSERT_EQ(expected_precision, precision); + ASSERT_EQ(expected_scale, scale); } TEST(PandasConversionTest, TestObjectBlockWriteFails) { diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 0d0c08c..e440674 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -366,4 +366,13 @@ TEST(Decimal128ParseTest, WithExponentAndNullptrScale) { ASSERT_EQ(expected_value, value); } +TEST(Decimal128Test, TestSmallNumberFormat) { + Decimal128 value("0.2"); + std::string expected("0.2"); + + const int32_t scale = 1; + std::string result = value.ToString(scale); + ASSERT_EQ(expected, result); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 447cae5..e999854 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -813,4 +813,68 @@ Decimal128 operator%(const Decimal128& left, const Decimal128& right) { return remainder; } +static const Decimal128 ScaleMultipliers[] = { + Decimal128(1), + Decimal128(10), + Decimal128(100), + Decimal128(1000), + Decimal128(10000), + Decimal128(100000), + Decimal128(1000000), + Decimal128(10000000), + Decimal128(100000000), + Decimal128(1000000000), + Decimal128(10000000000), + Decimal128(100000000000), + Decimal128(1000000000000), + Decimal128(10000000000000), + Decimal128(100000000000000), + Decimal128(1000000000000000), + Decimal128(10000000000000000), + Decimal128(100000000000000000), + Decimal128(1000000000000000000), + Decimal128("10000000000000000000"), + Decimal128("100000000000000000000"), + Decimal128("1000000000000000000000"), + Decimal128("10000000000000000000000"), + Decimal128("100000000000000000000000"), + Decimal128("1000000000000000000000000"), + Decimal128("10000000000000000000000000"), + Decimal128("100000000000000000000000000"), + Decimal128("1000000000000000000000000000"), + Decimal128("10000000000000000000000000000"), + Decimal128("100000000000000000000000000000"), + Decimal128("1000000000000000000000000000000"), + Decimal128("10000000000000000000000000000000"), + Decimal128("100000000000000000000000000000000"), + Decimal128("1000000000000000000000000000000000"), + Decimal128("10000000000000000000000000000000000"), + Decimal128("100000000000000000000000000000000000"), + Decimal128("1000000000000000000000000000000000000"), + Decimal128("10000000000000000000000000000000000000"), + Decimal128("100000000000000000000000000000000000000")}; + +Status Decimal128::Rescale(int32_t original_scale, int32_t new_scale, + Decimal128* out) const { + DCHECK_NE(out, NULLPTR); + DCHECK_NE(original_scale, new_scale); + const int32_t delta_scale = original_scale - new_scale; + const int32_t abs_delta_scale = std::abs(delta_scale); + DCHECK_GE(abs_delta_scale, 1); + DCHECK_LE(abs_delta_scale, 38); + + const Decimal128 scale_multiplier = ScaleMultipliers[abs_delta_scale]; + const Decimal128 result = *this * scale_multiplier; + + if (ARROW_PREDICT_FALSE(result < *this)) { + std::stringstream buf; + buf << "Rescaling decimal value from original scale " << original_scale + << " to new scale " << new_scale << " would cause overflow"; + return Status::Invalid(buf.str()); + } + + *out = result; + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h index a0423e9..1594090 100644 --- a/cpp/src/arrow/util/decimal.h +++ b/cpp/src/arrow/util/decimal.h @@ -126,6 +126,9 @@ class ARROW_EXPORT Decimal128 { static Status FromString(const std::string& s, Decimal128* out, int* precision = NULLPTR, int* scale = NULLPTR); + /// \brief Convert Decimal128 from one scale to another + Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) const; + private: int64_t high_bits_; uint64_t low_bits_; diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 9004fc0..d17d89e 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -16,13 +16,17 @@ # under the License. from os.path import join as pjoin + import datetime +import decimal import io -import os import json +import os + import pytest from pyarrow.compat import guid, u, BytesIO, unichar, frombytes +from pyarrow.tests import util from pyarrow.filesystem import LocalFileSystem import pyarrow as pa from .pandas_examples import dataframe_with_arrays, dataframe_with_lists @@ -1564,3 +1568,41 @@ carat cut color clarity depth table price x y z t = _read_table(path) result = t.to_pandas() tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize('precision', range(1, 39)) +def test_decimal_roundtrip(tmpdir, precision): + num_values = 10 + + columns = {} + + for scale in range(0, precision + 1): + with util.random_seed(0): + random_decimal_values = [ + util.randdecimal(precision, scale) for _ in range(num_values) + ] + column_name = 'dec_precision_{:d}_scale_{:d}'.format(precision, scale) + columns[column_name] = random_decimal_values + + expected = pd.DataFrame(columns) + filename = tmpdir.join('decimals.parquet') + string_filename = str(filename) + t = pa.Table.from_pandas(expected) + _write_table(t, string_filename) + result_table = _read_table(string_filename) + result = result_table.to_pandas() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail( + raises=pa.ArrowException, reason='Parquet does not support negative scale' +) +def test_decimal_roundtrip_negative_scale(tmpdir): + expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]}) + filename = tmpdir.join('decimals.parquet') + string_filename = str(filename) + t = pa.Table.from_pandas(expected) + _write_table(t, string_filename) + result_table = _read_table(string_filename) + result = result_table.to_pandas() + tm.assert_frame_equal(result, expected) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py new file mode 100644 index 0000000..a3ba900 --- /dev/null +++ b/python/pyarrow/tests/util.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Utility functions for testing +""" + +import decimal +import random +import contextlib + + +def randsign(): + """Randomly choose either 1 or -1. + + Returns + ------- + sign : int + """ + return random.choice((-1, 1)) + + +@contextlib.contextmanager +def random_seed(seed): + """Set the random seed inside of a context manager. + + Parameters + ---------- + seed : int + The seed to set + + Notes + ----- + This function is useful when you want to set a random seed but not affect + the random state of other functions using the random module. + """ + original_state = random.getstate() + random.seed(seed) + try: + yield + finally: + random.setstate(original_state) + + +def randdecimal(precision, scale): + """Generate a random decimal value with specified precision and scale. + + Parameters + ---------- + precision : int + The maximum number of digits to generate. Must be an integer between 1 + and 38 inclusive. + scale : int + The maximum number of digits following the decimal point. Must be an + integer greater than or equal to 0. + + Returns + ------- + decimal_value : decimal.Decimal + A random decimal.Decimal object with the specifed precision and scale. + """ + assert 1 <= precision <= 38, 'precision must be between 1 and 38 inclusive' + if scale < 0: + raise ValueError( + 'randdecimal does not yet support generating decimals with ' + 'negative scale' + ) + max_whole_value = 10 ** (precision - scale) - 1 + whole = random.randint(-max_whole_value, max_whole_value) + + if not scale: + return decimal.Decimal(whole) + + max_fractional_value = 10 ** scale - 1 + fractional = random.randint(0, max_fractional_value) + + return decimal.Decimal( + '{}.{}'.format(whole, str(fractional).rjust(scale, '0')) + ) -- To stop receiving notification emails like this one, please contact ['"commits@arrow.apache.org" <commits@arrow.apache.org>'].