[jira] [Commented] (ARROW-2153) [C++/Python] Decimal conversion not working for exponential notation

ASF GitHub Bot (JIRA) Fri, 23 Feb 2018 09:51:37 -0800

    [ 
https://issues.apache.org/jira/browse/ARROW-2153?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16374731#comment-16374731
 ]


ASF GitHub Bot commented on ARROW-2153:
---------------------------------------

cpcloud closed pull request #1618: ARROW-2153/ARROW-2160: [C++/Python]  Fix 
decimal precision inference
URL: https://github.com/apache/arrow/pull/1618
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 42c1ec8f0..f5cdec59a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -577,11 +577,13 @@ set(ARROW_LINK_LIBS
 
 set(ARROW_SHARED_PRIVATE_LINK_LIBS
   ${BOOST_SYSTEM_LIBRARY}
-  ${BOOST_FILESYSTEM_LIBRARY})
+  ${BOOST_FILESYSTEM_LIBRARY}
+  ${BOOST_REGEX_LIBRARY})
 
 set(ARROW_STATIC_PRIVATE_LINK_LIBS
   ${BOOST_SYSTEM_LIBRARY}
-  ${BOOST_FILESYSTEM_LIBRARY})
+  ${BOOST_FILESYSTEM_LIBRARY}
+  ${BOOST_REGEX_LIBRARY})
 
 if (NOT MSVC)
   set(ARROW_LINK_LIBS
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 3511d40d4..22eabde08 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -157,8 +157,11 @@ if (ARROW_BOOST_VENDORED)
     
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}")
   set(BOOST_STATIC_FILESYSTEM_LIBRARY
     
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}")
+  set(BOOST_STATIC_REGEX_LIBRARY
+          
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}")
   set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}")
   set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}")
+  set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}")
   if (ARROW_BOOST_HEADER_ONLY)
     set(BOOST_BUILD_PRODUCTS)
     set(BOOST_CONFIGURE_COMMAND "")
@@ -166,7 +169,8 @@ if (ARROW_BOOST_VENDORED)
   else()
     set(BOOST_BUILD_PRODUCTS
       ${BOOST_SYSTEM_LIBRARY}
-      ${BOOST_FILESYSTEM_LIBRARY})
+      ${BOOST_FILESYSTEM_LIBRARY}
+      ${BOOST_REGEX_LIBRARY})
     set(BOOST_CONFIGURE_COMMAND
       "./bootstrap.sh"
       "--prefix=${BOOST_PREFIX}"
@@ -210,16 +214,19 @@ else()
     if (ARROW_BOOST_HEADER_ONLY)
       find_package(Boost REQUIRED)
     else()
-      find_package(Boost COMPONENTS system filesystem REQUIRED)
+      find_package(Boost COMPONENTS system filesystem regex REQUIRED)
       if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
         set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
         set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
+        set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
       else()
         set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
         set(BOOST_SHARED_FILESYSTEM_LIBRARY 
${Boost_FILESYSTEM_LIBRARY_RELEASE})
+        set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
       endif()
       set(BOOST_SYSTEM_LIBRARY boost_system_shared)
       set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared)
+      set(BOOST_REGEX_LIBRARY boost_regex_shared)
     endif()
   else()
     # Find static boost headers and libs
@@ -228,16 +235,19 @@ else()
     if (ARROW_BOOST_HEADER_ONLY)
       find_package(Boost REQUIRED)
     else()
-      find_package(Boost COMPONENTS system filesystem REQUIRED)
+      find_package(Boost COMPONENTS system filesystem regex REQUIRED)
       if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
         set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
         set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
+        set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
       else()
         set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
         set(BOOST_STATIC_FILESYSTEM_LIBRARY 
${Boost_FILESYSTEM_LIBRARY_RELEASE})
+        set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
       endif()
       set(BOOST_SYSTEM_LIBRARY boost_system_static)
       set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static)
+      set(BOOST_REGEX_LIBRARY boost_regex_static)
     endif()
   endif()
 endif()
@@ -254,7 +264,11 @@ if (NOT ARROW_BOOST_HEADER_ONLY)
       STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}"
       SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}")
 
-  SET(ARROW_BOOST_LIBS boost_system boost_filesystem)
+  ADD_THIRDPARTY_LIB(boost_regex
+      STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}"
+      SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}")
+
+  SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex)
 endif()
 
 include_directories(SYSTEM ${Boost_INCLUDE_DIR})
diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc
index df1db9991..75fb7d8c7 100644
--- a/cpp/src/arrow/python/helpers.cc
+++ b/cpp/src/arrow/python/helpers.cc
@@ -99,7 +99,8 @@ Status InferDecimalPrecisionAndScale(PyObject* 
python_decimal, int32_t* precisio
   DCHECK_NE(precision, NULLPTR);
   DCHECK_NE(scale, NULLPTR);
 
-  OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", "()"));
+  // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a 
DCHECK
+  OwnedRef as_tuple(PyObject_CallMethod(python_decimal, "as_tuple", ""));
   RETURN_IF_PYERROR();
   DCHECK(PyTuple_Check(as_tuple.obj()));
 
@@ -117,7 +118,21 @@ Status InferDecimalPrecisionAndScale(PyObject* 
python_decimal, int32_t* precisio
   const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
   RETURN_IF_PYERROR();
 
-  *precision = num_digits;
+  const int32_t abs_exponent = std::abs(exponent);
+
+  int32_t num_additional_zeros;
+
+  if (num_digits < abs_exponent) {
+    DCHECK_NE(exponent, 0) << "exponent should never be zero here";
+
+    // we have leading/trailing zeros, leading if exponent is negative
+    num_additional_zeros = exponent < 0 ? abs_exponent - num_digits : exponent;
+  } else {
+    // we can use the number of digits as the precision
+    num_additional_zeros = 0;
+  }
+
+  *precision = num_digits + num_additional_zeros;
   *scale = -exponent;
   return Status::OK();
 }
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc
index 23418ad92..a7104832e 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -766,10 +766,7 @@ Status NumPyConverter::ConvertDecimals() {
       RETURN_NOT_OK(internal::InferDecimalPrecisionAndScale(objects[i], 
&tmp_precision,
                                                             &tmp_scale));
       precision = std::max(precision, tmp_precision);
-
-      if (std::abs(desired_scale) < std::abs(tmp_scale)) {
-        desired_scale = tmp_scale;
-      }
+      desired_scale = std::max(desired_scale, tmp_scale);
     }
 
     type_ = ::arrow::decimal(precision, desired_scale);
diff --git a/cpp/src/arrow/python/python-test.cc 
b/cpp/src/arrow/python/python-test.cc
index b76caaece..6ffcbb131 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -145,6 +145,57 @@ TEST_F(DecimalTest, TestInferPrecisionAndNegativeScale) {
   ASSERT_EQ(expected_scale, scale);
 }
 
+TEST_F(DecimalTest, TestInferAllLeadingZeros) {
+  std::string decimal_string("0.001");
+  OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string));
+
+  int32_t precision;
+  int32_t scale;
+
+  ASSERT_OK(
+      internal::InferDecimalPrecisionAndScale(python_decimal.obj(), 
&precision, &scale));
+
+  const int32_t expected_precision = 3;
+  const int32_t expected_scale = 3;
+
+  ASSERT_EQ(expected_precision, precision);
+  ASSERT_EQ(expected_scale, scale);
+}
+
+TEST_F(DecimalTest, TestInferAllLeadingZerosExponentialNotationPositive) {
+  std::string decimal_string("0.01E5");
+  OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string));
+
+  int32_t precision;
+  int32_t scale;
+
+  ASSERT_OK(
+      internal::InferDecimalPrecisionAndScale(python_decimal.obj(), 
&precision, &scale));
+
+  const int32_t expected_precision = 4;
+  const int32_t expected_scale = -3;
+
+  ASSERT_EQ(expected_precision, precision);
+  ASSERT_EQ(expected_scale, scale);
+}
+
+TEST_F(DecimalTest, TestInferAllLeadingZerosExponentialNotationNegative) {
+  std::string decimal_string("0.01E3");
+  OwnedRef python_decimal(this->CreatePythonDecimal(decimal_string));
+
+  int32_t precision;
+  int32_t scale;
+
+  ASSERT_OK(
+      internal::InferDecimalPrecisionAndScale(python_decimal.obj(), 
&precision, &scale));
+
+  const int32_t expected_precision = 1;
+  const int32_t expected_scale = -1;
+
+  ASSERT_EQ(expected_precision, precision);
+  ASSERT_EQ(expected_scale, scale);
+}
+
 TEST(PandasConversionTest, TestObjectBlockWriteFails) {
   StringBuilder builder;
   const char value[] = {'\xf1', '\0'};
@@ -241,5 +292,43 @@ TEST_F(DecimalTest, TestOverflowFails) {
                                                             decimal_type, 
&value));
 }
 
+
+TEST_F(DecimalTest, SimpleInference) {
+  OwnedRef value(this->CreatePythonDecimal("0.01"));
+  ASSERT_NE(value.obj(), nullptr);
+  int32_t precision;
+  int32_t scale;
+  ASSERT_OK(internal::InferDecimalPrecisionAndScale(value.obj(), &precision, 
&scale));
+  ASSERT_EQ(2, precision);
+  ASSERT_EQ(2, scale);
+}
+
+
+TEST_F(DecimalTest, TestMixedPrecisionAndScaleSequenceConvert) {
+
+  PyAcquireGIL lock;
+  MemoryPool* pool = default_memory_pool();
+  std::shared_ptr<Array> arr;
+
+  OwnedRef list_ref(PyList_New(2));
+  PyObject* list = list_ref.obj();
+
+  ASSERT_NE(list, nullptr);
+
+  PyObject* value1 = this->CreatePythonDecimal("0.01").detach();
+  ASSERT_NE(value1, nullptr);
+
+  PyObject* value2 = this->CreatePythonDecimal("0.001").detach();
+  ASSERT_NE(value2, nullptr);
+
+  // This steals a reference to each object, so we don't need to decref them 
later
+  // just the list
+
+  ASSERT_EQ(PyList_SetItem(list, 0, value1), 0);
+  ASSERT_EQ(PyList_SetItem(list, 1, value2), 0);
+
+  ASSERT_OK(ConvertPySequence(list, pool, &arr));
+}
+
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/decimal-test.cc 
b/cpp/src/arrow/util/decimal-test.cc
index e4406747d..42812bd55 100644
--- a/cpp/src/arrow/util/decimal-test.cc
+++ b/cpp/src/arrow/util/decimal-test.cc
@@ -37,7 +37,7 @@ class DecimalTestFixture : public ::testing::Test {
 
 TEST_F(DecimalTestFixture, TestToString) {
   Decimal128 decimal(this->integer_value_);
-  int scale = 5;
+  int32_t scale = 5;
   std::string result = decimal.ToString(scale);
   ASSERT_EQ(result, this->string_value_);
 }
@@ -45,7 +45,7 @@ TEST_F(DecimalTestFixture, TestToString) {
 TEST_F(DecimalTestFixture, TestFromString) {
   Decimal128 expected(this->integer_value_);
   Decimal128 result;
-  int precision, scale;
+  int32_t precision, scale;
   ASSERT_OK(Decimal128::FromString(this->string_value_, &result, &precision, 
&scale));
   ASSERT_EQ(result, expected);
   ASSERT_EQ(precision, 8);
@@ -55,8 +55,8 @@ TEST_F(DecimalTestFixture, TestFromString) {
 TEST_F(DecimalTestFixture, TestStringStartingWithPlus) {
   std::string plus_value("+234.234");
   Decimal128 out;
-  int scale;
-  int precision;
+  int32_t scale;
+  int32_t precision;
   ASSERT_OK(Decimal128::FromString(plus_value, &out, &precision, &scale));
   ASSERT_EQ(234234, out);
   ASSERT_EQ(6, precision);
@@ -67,8 +67,8 @@ TEST_F(DecimalTestFixture, TestStringStartingWithPlus128) {
   std::string plus_value("+2342394230592.232349023094");
   Decimal128 expected_value("2342394230592232349023094");
   Decimal128 out;
-  int scale;
-  int precision;
+  int32_t scale;
+  int32_t precision;
   ASSERT_OK(Decimal128::FromString(plus_value, &out, &precision, &scale));
   ASSERT_EQ(expected_value, out);
   ASSERT_EQ(25, precision);
@@ -90,9 +90,7 @@ TEST(DecimalTest, TestFromDecimalString128) {
   Decimal128 result;
   ASSERT_OK(Decimal128::FromString(string_value, &result));
   Decimal128 expected(static_cast<int64_t>(-230492239423435324));
-  expected *= 100;
-  expected -= 12;
-  ASSERT_EQ(result, expected);
+  ASSERT_EQ(result, expected * 100 - 12);
 
   // Sanity check that our number is actually using more than 64 bits
   ASSERT_NE(result.high_bits(), 0);
@@ -194,36 +192,36 @@ TEST(DecimalTest, TestInvalidInputWithLeadingZeros) {
 TEST(DecimalZerosTest, LeadingZerosNoDecimalPoint) {
   std::string string_value("0000000");
   Decimal128 d;
-  int precision;
-  int scale;
+  int32_t precision;
+  int32_t scale;
   ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale));
-  ASSERT_EQ(precision, 7);
-  ASSERT_EQ(scale, 0);
-  ASSERT_EQ(d, 0);
+  ASSERT_EQ(0, precision);
+  ASSERT_EQ(0, scale);
+  ASSERT_EQ(0, d);
 }
 
 TEST(DecimalZerosTest, LeadingZerosDecimalPoint) {
   std::string string_value("000.0000");
   Decimal128 d;
-  int precision;
-  int scale;
+  int32_t precision;
+  int32_t scale;
   ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale));
   // We explicitly do not support this for now, otherwise this would be 
ASSERT_EQ
-  ASSERT_NE(precision, 7);
+  ASSERT_EQ(4, precision);
 
-  ASSERT_EQ(scale, 4);
-  ASSERT_EQ(d, 0);
+  ASSERT_EQ(4, scale);
+  ASSERT_EQ(0, d);
 }
 
 TEST(DecimalZerosTest, NoLeadingZerosDecimalPoint) {
   std::string string_value(".00000");
   Decimal128 d;
-  int precision;
-  int scale;
+  int32_t precision;
+  int32_t scale;
   ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale));
-  ASSERT_EQ(precision, 5);
-  ASSERT_EQ(scale, 5);
-  ASSERT_EQ(d, 0);
+  ASSERT_EQ(5, precision);
+  ASSERT_EQ(5, scale);
+  ASSERT_EQ(0, d);
 }
 
 template <typename T>
@@ -375,4 +373,14 @@ TEST(Decimal128Test, TestSmallNumberFormat) {
   ASSERT_EQ(expected, result);
 }
 
+TEST(Decimal128Test, TestNoDecimalPointExponential) {
+  Decimal128 value;
+  int32_t precision;
+  int32_t scale;
+  ASSERT_OK(Decimal128::FromString("1E1", &value, &precision, &scale));
+  ASSERT_EQ(1, value.low_bits());
+  ASSERT_EQ(1, precision);
+  ASSERT_EQ(-1, scale);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index a3c8cda76..c4d8dfb08 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -23,6 +23,8 @@
 #include <limits>
 #include <sstream>
 
+#include <boost/regex.hpp>
+
 #include "arrow/util/bit-util.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
@@ -187,8 +189,6 @@ static constexpr int64_t kPowersOfTen[kInt64DecimalDigits + 
1] = {1LL,
                                                                   
100000000000000000LL,
                                                                   
1000000000000000000LL};
 
-static inline bool isdigit(char value) { return std::isdigit(value) != 0; }
-
 static void StringToInteger(const std::string& str, Decimal128* out) {
   using std::size_t;
 
@@ -212,158 +212,112 @@ static void StringToInteger(const std::string& str, 
Decimal128* out) {
   }
 }
 
-Status Decimal128::FromString(const std::string& s, Decimal128* out, int* 
precision,
-                              int* scale) {
-  // Implements this regex: 
"(\\+?|-?)((0*)(\\d*))(\\.(\\d+))?((E|e)(\\+|-)?\\d+)?";
-  if (s.empty()) {
-    return Status::Invalid("Empty string cannot be converted to decimal");
-  }
+static const boost::regex DECIMAL_REGEX(
+    // sign of the number
+    "(?<SIGN>[-+]?)"
 
-  std::string::const_iterator charp = s.cbegin();
-  std::string::const_iterator end = s.cend();
+    // digits around the decimal point
+    
"(((?<LEFT_DIGITS>\\d+)\\.(?<FIRST_RIGHT_DIGITS>\\d*)|\\.(?<SECOND_RIGHT_DIGITS>\\d+)"
+    ")"
 
-  char first_char = *charp;
-  bool is_negative = false;
-  if (first_char == '+' || first_char == '-') {
-    is_negative = first_char == '-';
-    ++charp;
-  }
+    // optional exponent
+    "([eE](?<FIRST_EXP_VALUE>[-+]?\\d+))?"
 
-  if (charp == end) {
-    std::stringstream ss;
-    ss << "Single character: '" << first_char << "' is not a valid decimal 
value";
-    return Status::Invalid(ss.str());
-  }
+    // otherwise
+    "|"
 
-  std::string::const_iterator numeric_string_start = charp;
+    // we're just an integer
+    "(?<INTEGER>\\d+)"
 
-  DCHECK_LT(charp, end);
+    // or an integer with an exponent
+    "(?:[eE](?<SECOND_EXP_VALUE>[-+]?\\d+))?)");
 
-  // skip leading zeros
-  charp = std::find_if_not(charp, end, [](char value) { return value == '0'; 
});
+static inline bool is_zero_character(char c) { return c == '0'; }
 
-  // all zeros and no decimal point
-  if (charp == end) {
-    if (out != NULLPTR) {
-      *out = 0;
-    }
+Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* 
precision,
+                              int32_t* scale) {
+  if (s.empty()) {
+    return Status::Invalid("Empty string cannot be converted to decimal");
+  }
 
-    // Not sure what other libraries assign precision to for this case (this 
case of
-    // a string consisting only of one or more zeros)
+  // case of all zeros
+  if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) {
     if (precision != NULLPTR) {
-      *precision = static_cast<int>(charp - numeric_string_start);
+      *precision = 0;
     }
 
     if (scale != NULLPTR) {
       *scale = 0;
     }
 
+    *out = 0;
     return Status::OK();
   }
 
-  std::string::const_iterator whole_part_start = charp;
+  boost::smatch results;
+  const bool matches = boost::regex_match(s, results, DECIMAL_REGEX);
+
+  if (!matches) {
+    std::stringstream ss;
+    ss << s << " does not match";
+    return Status::Invalid(ss.str());
+  }
 
-  charp = std::find_if_not(charp, end, isdigit);
+  const std::string sign = results["SIGN"];
+  const std::string integer = results["INTEGER"];
 
-  std::string::const_iterator whole_part_end = charp;
-  std::string whole_part(whole_part_start, whole_part_end);
+  const std::string left_digits = results["LEFT_DIGITS"];
+  const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"];
 
-  if (charp != end && *charp == '.') {
-    ++charp;
+  const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"];
 
-    if (charp == end) {
-      return Status::Invalid(
-          "Decimal point must be followed by at least one base ten digit. 
Reached the "
-          "end of the string.");
-    }
+  const std::string first_exp_value = results["FIRST_EXP_VALUE"];
+  const std::string second_exp_value = results["SECOND_EXP_VALUE"];
 
-    if (std::isdigit(*charp) == 0) {
-      std::stringstream ss;
-      ss << "Decimal point must be followed by a base ten digit. Found '" << 
*charp
-         << "'";
-      return Status::Invalid(ss.str());
-    }
+  std::string whole_part;
+  std::string fractional_part;
+  std::string exponent_value;
+
+  if (!integer.empty()) {
+    whole_part = integer;
+  } else if (!left_digits.empty()) {
+    DCHECK(second_right_digits.empty()) << s << " " << second_right_digits;
+    whole_part = left_digits;
+    fractional_part = first_right_digits;
   } else {
-    if (charp != end) {
-      std::stringstream ss;
-      ss << "Expected base ten digit or decimal point but found '" << *charp
-         << "' instead.";
-      return Status::Invalid(ss.str());
-    }
+    DCHECK(first_right_digits.empty()) << s << " " << first_right_digits;
+    fractional_part = second_right_digits;
   }
 
-  std::string::const_iterator fractional_part_start = charp;
-
-  // The rest must be digits or an exponent
-  if (charp != end) {
-    charp = std::find_if_not(charp, end, isdigit);
+  // skip leading zeros before the decimal point
+  std::string::const_iterator without_leading_zeros =
+      std::find_if_not(whole_part.cbegin(), whole_part.cend(), 
is_zero_character);
+  whole_part = std::string(without_leading_zeros, whole_part.cend());
 
-    // The while loop has ended before the end of the string which means we've 
hit a
-    // character that isn't a base ten digit or "E" for exponent
-    if (charp != end && *charp != 'E' && *charp != 'e') {
-      std::stringstream ss;
-      ss << "Found non base ten digit character '" << *charp
-         << "' before the end of the string";
-      return Status::Invalid(ss.str());
-    }
+  if (!first_exp_value.empty()) {
+    exponent_value = first_exp_value;
+  } else {
+    exponent_value = second_exp_value;
   }
 
-  std::string::const_iterator fractional_part_end = charp;
-  std::string fractional_part(fractional_part_start, fractional_part_end);
-
   if (precision != NULLPTR) {
-    *precision = static_cast<int>(whole_part.size() + fractional_part.size());
+    *precision = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
   }
 
-  if (charp != end) {
-    // we must have an exponent, if this aborts then we have somehow not 
caught this and
-    // raised a proper error
-    DCHECK(*charp == 'E' || *charp == 'e');
-
-    ++charp;
-
-    const char value = *charp;
-    const bool starts_with_plus_or_minus = value == '+' || value == '-';
-
-    // we use this to construct the adjusted exponent integer later
-    std::string::const_iterator digit_start = charp;
-
-    // skip plus or minus
-    charp += starts_with_plus_or_minus;
-
-    // confirm that the rest of the characters are digits
-    charp = std::find_if_not(charp, end, isdigit);
-
-    if (charp != end) {
-      // we have something other than digits here
-      std::stringstream ss;
-      ss << "Found non decimal digit exponent value '" << *charp << "'";
-      return Status::Invalid(ss.str());
-    }
-
-    if (scale != NULLPTR) {
-      // compute the scale from the adjusted exponent
-      std::string adjusted_exponent_string(digit_start, end);
-      DCHECK(std::all_of(adjusted_exponent_string.cbegin() + 
starts_with_plus_or_minus,
-                         adjusted_exponent_string.cend(), isdigit))
-          << "Non decimal digit character found in " << 
adjusted_exponent_string;
-      const auto adjusted_exponent =
-          static_cast<int32_t>(std::stol(adjusted_exponent_string));
-      const auto len = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
-
+  if (scale != NULLPTR) {
+    if (!exponent_value.empty()) {
+      auto adjusted_exponent = static_cast<int32_t>(std::stol(exponent_value));
+      auto len = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
       *scale = -adjusted_exponent + len - 1;
-    }
-  } else {
-    if (scale != NULLPTR) {
-      *scale = static_cast<int>(fractional_part.size());
+    } else {
+      *scale = static_cast<int32_t>(fractional_part.size());
     }
   }
 
   if (out != NULLPTR) {
-    // zero out in case we've passed in a previously used value
     *out = 0;
     StringToInteger(whole_part + fractional_part, out);
-    if (is_negative) {
+    if (sign == "-") {
       out->Negate();
     }
   }
diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h
index 1594090a0..79a99ba6a 100644
--- a/cpp/src/arrow/util/decimal.h
+++ b/cpp/src/arrow/util/decimal.h
@@ -124,7 +124,7 @@ class ARROW_EXPORT Decimal128 {
   /// \brief Convert a decimal string to an Decimal128 value, optionally 
including
   /// precision and scale if they're passed in and not null.
   static Status FromString(const std::string& s, Decimal128* out,
-                           int* precision = NULLPTR, int* scale = NULLPTR);
+                           int32_t* precision = NULLPTR, int32_t* scale = 
NULLPTR);
 
   /// \brief Convert Decimal128 from one scale to another
   Status Rescale(int32_t original_scale, int32_t new_scale, Decimal128* out) 
const;
diff --git a/python/pyarrow/tests/test_convert_pandas.py 
b/python/pyarrow/tests/test_convert_pandas.py
index 6e68dd961..04c57ef9f 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1143,6 +1143,16 @@ def test_decimal_fails_with_truncation(self):
         with pytest.raises(pa.ArrowException):
             pa.array(data2, type=type2)
 
+    def test_decimal_with_different_precisions(self):
+        data = [
+            decimal.Decimal('0.01'),
+            decimal.Decimal('0.001'),
+        ]
+        series = pd.Series(data)
+        array = pa.array(series)
+        assert array.to_pylist() == data
+        assert array.type == pa.decimal128(3, 3)
+
 
 class TestListTypes(object):
     """


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [C++/Python] Decimal conversion not working for exponential notation
> --------------------------------------------------------------------
>
>                 Key: ARROW-2153
>                 URL: https://issues.apache.org/jira/browse/ARROW-2153
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Antony Mayi
>            Assignee: Phillip Cloud
>            Priority: Major
>              Labels: pull-request-available
>
> {code:java}
> import pyarrow as pa
> import pandas as pd
> import decimal
> pa.Table.from_pandas(pd.DataFrame({'a': [decimal.Decimal('1.1'), 
> decimal.Decimal('2E+1')]}))
> {code}
>  
> {code:java}
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "pyarrow/table.pxi", line 875, in pyarrow.lib.Table.from_pandas 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:44927)
>   File 
> "/home/skadlec/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", 
> line 350, in dataframe_to_arrays
>     convert_types)]
>   File 
> "/home/skadlec/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", 
> line 349, in <listcomp>
>     for c, t in zip(columns_to_convert,
>   File 
> "/home/skadlec/.local/lib/python3.6/site-packages/pyarrow/pandas_compat.py", 
> line 345, in convert_column
>     return pa.array(col, from_pandas=True, type=ty)
>   File "pyarrow/array.pxi", line 170, in pyarrow.lib.array 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:29224)
>   File "pyarrow/array.pxi", line 70, in pyarrow.lib._ndarray_to_array 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:28465)
>   File "pyarrow/error.pxi", line 77, in pyarrow.lib.check_status 
> (/arrow/python/build/temp.linux-x86_64-3.6/lib.cxx:8270)
> pyarrow.lib.ArrowInvalid: Expected base ten digit or decimal point but found 
> 'E' instead.
> {code}
> In manual cases clearly we can write {{decimal.Decimal('20')}} instead of 
> {{decimal.Decimal('2E+1')}} but during arithmetical operations inside an 
> application the exponential notation can be produced out of control (it is 
> actually the _normalized_ form of the decimal number) plus for some values 
> the exponential notation is the only form expressing the significance so this 
> should be accepted.
> The [documentation|https://docs.python.org/3/library/decimal.html] suggests 
> using following transformation but that's only possible when the significance 
> information doesn't need to be kept:
> {code:java}
> def remove_exponent(d):
>     return d.quantize(Decimal(1)) if d == d.to_integral() else d.normalize()
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

[jira] [Commented] (ARROW-2153) [C++/Python] Decimal conversion not working for exponential notation

Reply via email to