[GitHub] [arrow] MingyuZhong commented on a change in pull request #7945: ARROW-9710: [C++] Improve performance of Decimal128::ToString by 10x, and make the implementation reusable for Decimal256.

GitBox Thu, 13 Aug 2020 20:09:37 -0700


MingyuZhong commented on a change in pull request #7945:
URL: https://github.com/apache/arrow/pull/7945#discussion_r470384877




##########
File path: cpp/src/arrow/util/decimal.cc
##########
@@ -241,64 +284,43 @@ Decimal128::operator int64_t() const {
   return static_cast<int64_t>(low_bits());
 }
 
-static std::string ToStringNegativeScale(const std::string& str,
-                                         int32_t adjusted_exponent, bool 
is_negative) {
-  std::stringstream buf;
-
-  size_t offset = 0;
-  buf << str[offset++];
-
-  if (is_negative) {
-    buf << str[offset++];
-  }
-
-  buf << '.' << str.substr(offset, std::string::npos) << 'E' << std::showpos
-      << adjusted_exponent;
-  return buf.str();
-}
-
-std::string Decimal128::ToString(int32_t scale) const {
-  const std::string str(ToIntegerString());
-
+static void AdjustIntegerStringWithScale(int32_t scale, std::string* str) {
   if (scale == 0) {
-    return str;
+    return;
   }
 
-  const bool is_negative = *this < 0;
-
-  const auto len = static_cast<int32_t>(str.size());
+  const bool is_negative = str->front() == '-';
   const auto is_negative_offset = static_cast<int32_t>(is_negative);
-  const int32_t adjusted_exponent = -scale + (len - 1 - is_negative_offset);
+  const auto len = static_cast<int32_t>(str->size());
+  const int32_t num_digits = len - is_negative_offset;
+  const int32_t adjusted_exponent = num_digits - 1 - scale;
 
   /// Note that the -6 is taken from the Java BigDecimal documentation.
   if (scale < 0 || adjusted_exponent < -6) {
-    return ToStringNegativeScale(str, adjusted_exponent, is_negative);
+    str->insert(str->begin() + 1 + is_negative_offset, '.');
+    str->push_back('E');
+    // Use stringstream only for printing the exponent integer. It should be 
short
+    // enough to avoid memory allocations.
+    std::stringstream buf;
+    buf << std::showpos << adjusted_exponent;
+    *str += buf.str();
+    return;
   }
 
-  if (is_negative) {
-    if (len - 1 > scale) {
-      const auto n = static_cast<size_t>(len - scale);
-      return str.substr(0, n) + "." + str.substr(n, 
static_cast<size_t>(scale));
-    }
-
-    if (len - 1 == scale) {
-      return "-0." + str.substr(1, std::string::npos);
-    }
-
-    std::string result("-0." + std::string(static_cast<size_t>(scale - len + 
1), '0'));
-    return result + str.substr(1, std::string::npos);
-  }
-
-  if (len > scale) {
+  if (num_digits > scale) {
     const auto n = static_cast<size_t>(len - scale);
-    return str.substr(0, n) + "." + str.substr(n, static_cast<size_t>(scale));
+    str->insert(str->begin() + n, '.');
+    return;
   }
 
-  if (len == scale) {
-    return "0." + str;
-  }
+  str->insert(is_negative_offset, scale - num_digits + 2, '0');

Review comment:
       Added examples.

##########
File path: cpp/src/arrow/util/decimal_benchmark.cc
##########
@@ -26,14 +26,36 @@
 namespace arrow {
 namespace Decimal {
 
-static void FromString(benchmark::State& state) {  // NOLINT non-const 
reference
-  std::vector<std::string> values = {"0",
-                                     "1.23",
-                                     "12.345e6",
-                                     "-12.345e-6",
-                                     "123456789.123456789",
-                                     "1231234567890.451234567890"};
+static const std::vector<std::string>& GetValuesAsString() {
+  static const std::vector<std::string>* values =
+      new std::vector<std::string>{"0",

Review comment:
       Done. In my organization, we are disallowed to define non-POD static 
variables, so I got this habit.

##########
File path: cpp/src/arrow/util/decimal_benchmark.cc
##########
@@ -26,14 +26,36 @@
 namespace arrow {
 namespace Decimal {
 
-static void FromString(benchmark::State& state) {  // NOLINT non-const 
reference
-  std::vector<std::string> values = {"0",
-                                     "1.23",
-                                     "12.345e6",
-                                     "-12.345e-6",
-                                     "123456789.123456789",
-                                     "1231234567890.451234567890"};
+static const std::vector<std::string>& GetValuesAsString() {
+  static const std::vector<std::string>* values =
+      new std::vector<std::string>{"0",
+                                   "1.23",
+                                   "12.345e6",
+                                   "-12.345e-6",
+                                   "123456789.123456789",
+                                   "1231234567890.451234567890"};
+  return *values;
+}
+
+static std::vector<std::pair<Decimal128, int32_t>> 
BuildDecimalValuesAndScales() {

Review comment:
       Done.

##########
File path: cpp/src/arrow/util/decimal_benchmark.cc
##########
@@ -26,14 +26,36 @@
 namespace arrow {
 namespace Decimal {
 
-static void FromString(benchmark::State& state) {  // NOLINT non-const 
reference
-  std::vector<std::string> values = {"0",
-                                     "1.23",
-                                     "12.345e6",
-                                     "-12.345e-6",
-                                     "123456789.123456789",
-                                     "1231234567890.451234567890"};
+static const std::vector<std::string>& GetValuesAsString() {
+  static const std::vector<std::string>* values =
+      new std::vector<std::string>{"0",
+                                   "1.23",
+                                   "12.345e6",
+                                   "-12.345e-6",
+                                   "123456789.123456789",
+                                   "1231234567890.451234567890"};
+  return *values;
+}
+
+static std::vector<std::pair<Decimal128, int32_t>> 
BuildDecimalValuesAndScales() {
+  const std::vector<std::string>& value_strs = GetValuesAsString();
+  std::vector<std::pair<Decimal128, int32_t>> result(value_strs.size());
+  for (size_t i = 0; i < value_strs.size(); ++i) {
+    int32_t precision;
+    Decimal128::FromString(value_strs[i], &result[i].first, &result[i].second,
+                           &precision);
+  }
+  return result;
+}
+
+static const std::vector<std::pair<Decimal128, int32_t>>& 
GetDecimalValuesAndScales() {
+  static const auto* values =
+      new std::vector<std::pair<Decimal128, 
int32_t>>(BuildDecimalValuesAndScales());

Review comment:
       Moved the caching to the static variable in ToString. Completely 
removing caching would add 1ns to the benchmark.

##########
File path: cpp/src/arrow/util/decimal_test.cc
##########
@@ -106,6 +99,33 @@ TEST(DecimalTest, TestFromDecimalString128) {
   ASSERT_NE(result.high_bits(), 0);
 }
 
+TEST(DecimalTest, TestStringRoundTrip) {
+  static constexpr uint64_t kTestBits[] = {
+      0,
+      1,
+      999,
+      1000,
+      std::numeric_limits<int32_t>::max(),
+      (1ull << 31),
+      std::numeric_limits<uint32_t>::max(),
+      (1ull << 32),
+      std::numeric_limits<int64_t>::max(),
+      (1ull << 63),
+      std::numeric_limits<uint64_t>::max(),
+  };
+  static constexpr int32_t kScales[] = {-10, -1, 0, 1, 10};
+  for (uint64_t high_bits : kTestBits) {

Review comment:
       It's already covered. Added a comment. Decimal128ToStringTest has more 
explicit tests on negative values.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] MingyuZhong commented on a change in pull request #7945: ARROW-9710: [C++] Improve performance of Decimal128::ToString by 10x, and make the implementation reusable for Decimal256.

Reply via email to