This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 4009b62  ARROW-2224: [C++] Remove boost-regex dependency
4009b62 is described below

commit 4009b62086dfa43a4fd8bfa714772716e6531c6f
Author: Antoine Pitrou <anto...@python.org>
AuthorDate: Wed Apr 11 17:53:51 2018 +0200

    ARROW-2224: [C++] Remove boost-regex dependency
    
    Use a hand-written decimal parser.
    
    Author: Antoine Pitrou <anto...@python.org>
    
    Closes #1880 from pitrou/ARROW-2224-remove-boost-regex-dep and squashes the 
following commits:
    
    c173ceb <Antoine Pitrou> Mark result unused
    0277855 <Antoine Pitrou> Add C++ decimal benchmark
    f19dac2 <Antoine Pitrou> Address review comments
    ba0b62e <Antoine Pitrou> ARROW-2224:  Remove boost-regex dependency
---
 cpp/README.md                               |   1 -
 cpp/cmake_modules/ThirdpartyToolchain.cmake |  24 +---
 cpp/src/arrow/util/CMakeLists.txt           |   1 +
 cpp/src/arrow/util/decimal-benchmark.cc     |  45 ++++++++
 cpp/src/arrow/util/decimal-test.cc          |   2 -
 cpp/src/arrow/util/decimal.cc               | 164 +++++++++++++++-------------
 6 files changed, 139 insertions(+), 98 deletions(-)

diff --git a/cpp/README.md b/cpp/README.md
index 8018efd..daeeade 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -35,7 +35,6 @@ On Ubuntu/Debian you can install the requirements with:
 ```shell
 sudo apt-get install cmake \
      libboost-dev \
-     libboost-regex-dev \
      libboost-filesystem-dev \
      libboost-system-dev
 ```
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake 
b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index 129174c..020e0ed 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -157,11 +157,8 @@ if (ARROW_BOOST_VENDORED)
     
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}")
   set(BOOST_STATIC_FILESYSTEM_LIBRARY
     
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}")
-  set(BOOST_STATIC_REGEX_LIBRARY
-    
"${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}")
   set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}")
   set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}")
-  set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}")
   if (ARROW_BOOST_HEADER_ONLY)
     set(BOOST_BUILD_PRODUCTS)
     set(BOOST_CONFIGURE_COMMAND "")
@@ -169,12 +166,11 @@ if (ARROW_BOOST_VENDORED)
   else()
     set(BOOST_BUILD_PRODUCTS
       ${BOOST_SYSTEM_LIBRARY}
-      ${BOOST_FILESYSTEM_LIBRARY}
-      ${BOOST_REGEX_LIBRARY})
+      ${BOOST_FILESYSTEM_LIBRARY})
     set(BOOST_CONFIGURE_COMMAND
       "./bootstrap.sh"
       "--prefix=${BOOST_PREFIX}"
-      "--with-libraries=filesystem,system,regex")
+      "--with-libraries=filesystem,system")
     if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
       set(BOOST_BUILD_VARIANT "debug")
     else()
@@ -214,19 +210,16 @@ else()
     if (ARROW_BOOST_HEADER_ONLY)
       find_package(Boost REQUIRED)
     else()
-      find_package(Boost COMPONENTS system filesystem regex REQUIRED)
+      find_package(Boost COMPONENTS system filesystem REQUIRED)
       if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
         set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
         set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
-        set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
       else()
         set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
         set(BOOST_SHARED_FILESYSTEM_LIBRARY 
${Boost_FILESYSTEM_LIBRARY_RELEASE})
-        set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
       endif()
       set(BOOST_SYSTEM_LIBRARY boost_system_shared)
       set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared)
-      set(BOOST_REGEX_LIBRARY boost_regex_shared)
     endif()
   else()
     # Find static boost headers and libs
@@ -235,19 +228,16 @@ else()
     if (ARROW_BOOST_HEADER_ONLY)
       find_package(Boost REQUIRED)
     else()
-      find_package(Boost COMPONENTS system filesystem regex REQUIRED)
+      find_package(Boost COMPONENTS system filesystem REQUIRED)
       if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
         set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG})
         set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG})
-        set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG})
       else()
         set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE})
         set(BOOST_STATIC_FILESYSTEM_LIBRARY 
${Boost_FILESYSTEM_LIBRARY_RELEASE})
-        set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE})
       endif()
       set(BOOST_SYSTEM_LIBRARY boost_system_static)
       set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static)
-      set(BOOST_REGEX_LIBRARY boost_regex_static)
     endif()
   endif()
 endif()
@@ -264,11 +254,7 @@ if (NOT ARROW_BOOST_HEADER_ONLY)
       STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}"
       SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}")
 
-  ADD_THIRDPARTY_LIB(boost_regex
-      STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}"
-      SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}")
-
-  SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex)
+  SET(ARROW_BOOST_LIBS boost_system boost_filesystem)
 endif()
 
 include_directories(SYSTEM ${Boost_INCLUDE_DIR})
diff --git a/cpp/src/arrow/util/CMakeLists.txt 
b/cpp/src/arrow/util/CMakeLists.txt
index 41c27a5..8969527 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -60,5 +60,6 @@ ADD_ARROW_TEST(rle-encoding-test)
 ADD_ARROW_TEST(stl-util-test)
 
 ADD_ARROW_BENCHMARK(bit-util-benchmark)
+ADD_ARROW_BENCHMARK(decimal-benchmark)
 
 add_subdirectory(variant)
diff --git a/cpp/src/arrow/util/decimal-benchmark.cc 
b/cpp/src/arrow/util/decimal-benchmark.cc
new file mode 100644
index 0000000..3129536
--- /dev/null
+++ b/cpp/src/arrow/util/decimal-benchmark.cc
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "benchmark/benchmark.h"
+
+#include <string>
+#include <vector>
+
+#include "arrow/util/decimal.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace Decimal {
+
+static void BM_FromString(benchmark::State& state) {  // NOLINT non-const 
reference
+  std::vector<std::string> values = {"0", "1.23", "12.345e6", "-12.345e-6"};
+
+  while (state.KeepRunning()) {
+    for (const auto& value : values) {
+      Decimal128 dec;
+      int32_t scale, precision;
+      ARROW_UNUSED(Decimal128::FromString(value, &dec, &scale, &precision));
+    }
+  }
+  state.SetItemsProcessed(state.iterations() * values.size());
+}
+
+BENCHMARK(BM_FromString)->Repetitions(3)->Unit(benchmark::kMicrosecond);
+
+}  // namespace Decimal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/decimal-test.cc 
b/cpp/src/arrow/util/decimal-test.cc
index 6db46d4..2829e4a 100644
--- a/cpp/src/arrow/util/decimal-test.cc
+++ b/cpp/src/arrow/util/decimal-test.cc
@@ -205,9 +205,7 @@ TEST(DecimalZerosTest, LeadingZerosDecimalPoint) {
   int32_t precision;
   int32_t scale;
   ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale));
-  // We explicitly do not support this for now, otherwise this would be 
ASSERT_EQ
   ASSERT_EQ(4, precision);
-
   ASSERT_EQ(4, scale);
   ASSERT_EQ(0, d);
 }
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index 48380a9..9e5e3dd 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -23,8 +23,6 @@
 #include <limits>
 #include <sstream>
 
-#include <boost/regex.hpp>
-
 #include "arrow/util/bit-util.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
@@ -253,112 +251,126 @@ static void StringToInteger(const std::string& str, 
Decimal128* out) {
   }
 }
 
-static const boost::regex DECIMAL_REGEX(
-    // sign of the number
-    "(?<SIGN>[-+]?)"
-
-    // digits around the decimal point
-    
"(((?<LEFT_DIGITS>\\d+)\\.(?<FIRST_RIGHT_DIGITS>\\d*)|\\.(?<SECOND_RIGHT_DIGITS>\\d+)"
-    ")"
+namespace {
 
-    // optional exponent
-    "([eE](?<FIRST_EXP_VALUE>[-+]?\\d+))?"
+struct DecimalComponents {
+  std::string sign;
+  std::string whole_digits;
+  std::string fractional_digits;
+  std::string exponent_sign;
+  std::string exponent_digits;
+};
 
-    // otherwise
-    "|"
+inline bool IsSign(char c) { return c == '-' || c == '+'; }
 
-    // we're just an integer
-    "(?<INTEGER>\\d+)"
+inline bool IsDot(char c) { return c == '.'; }
 
-    // or an integer with an exponent
-    "(?:[eE](?<SECOND_EXP_VALUE>[-+]?\\d+))?)");
+inline bool IsDigit(char c) { return c >= '0' && c <= '9'; }
 
-static inline bool is_zero_character(char c) { return c == '0'; }
+inline bool StartsExponent(char c) { return c == 'e' || c == 'E'; }
 
-Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* 
precision,
-                              int32_t* scale) {
-  if (s.empty()) {
-    return Status::Invalid("Empty string cannot be converted to decimal");
+inline size_t ParseDigitsRun(const char* s, size_t start, size_t size, 
std::string* out) {
+  size_t pos;
+  for (pos = start; pos < size; ++pos) {
+    if (!IsDigit(s[pos])) {
+      break;
+    }
   }
+  *out = std::string(s + start, pos - start);
+  return pos;
+}
 
-  // case of all zeros
-  if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) {
-    if (precision != nullptr) {
-      *precision = 0;
-    }
+bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* 
out) {
+  size_t pos = 0;
 
-    if (scale != nullptr) {
-      *scale = 0;
+  if (size == 0) {
+    return false;
+  }
+  // Sign of the number
+  if (IsSign(s[pos])) {
+    out->sign = std::string(s + pos, 1);
+    ++pos;
+  }
+  // First run of digits
+  pos = ParseDigitsRun(s, pos, size, &out->whole_digits);
+  if (pos == size) {
+    return !out->whole_digits.empty();
+  }
+  // Optional dot (if given in fractional form)
+  bool has_dot = IsDot(s[pos]);
+  if (has_dot) {
+    // Second run of digits
+    ++pos;
+    pos = ParseDigitsRun(s, pos, size, &out->fractional_digits);
+  }
+  if (out->whole_digits.empty() && out->fractional_digits.empty()) {
+    // Need at least some digits (whole or fractional)
+    return false;
+  }
+  if (pos == size) {
+    return true;
+  }
+  // Optional exponent
+  if (StartsExponent(s[pos])) {
+    ++pos;
+    if (pos == size) {
+      return false;
+    }
+    // Optional exponent sign
+    if (IsSign(s[pos])) {
+      out->exponent_sign = std::string(s + pos, 1);
+      ++pos;
+    }
+    pos = ParseDigitsRun(s, pos, size, &out->exponent_digits);
+    if (out->exponent_digits.empty()) {
+      // Need some exponent digits
+      return false;
     }
-
-    *out = 0;
-    return Status::OK();
   }
+  return pos == size;
+}
 
-  boost::smatch results;
-  const bool matches = boost::regex_match(s, results, DECIMAL_REGEX);
+}  // namespace
 
-  if (!matches) {
-    std::stringstream ss;
-    ss << "The string " << s << " is not a valid decimal number";
-    return Status::Invalid(ss.str());
+Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* 
precision,
+                              int32_t* scale) {
+  if (s.empty()) {
+    return Status::Invalid("Empty string cannot be converted to decimal");
   }
 
-  const std::string sign = results["SIGN"];
-  const std::string integer = results["INTEGER"];
-
-  const std::string left_digits = results["LEFT_DIGITS"];
-  const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"];
-
-  const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"];
-
-  const std::string first_exp_value = results["FIRST_EXP_VALUE"];
-  const std::string second_exp_value = results["SECOND_EXP_VALUE"];
-
-  std::string whole_part;
-  std::string fractional_part;
-  std::string exponent_value;
-
-  if (!integer.empty()) {
-    whole_part = integer;
-  } else if (!left_digits.empty()) {
-    DCHECK(second_right_digits.empty()) << s << " " << second_right_digits;
-    whole_part = left_digits;
-    fractional_part = first_right_digits;
-  } else {
-    DCHECK(first_right_digits.empty()) << s << " " << first_right_digits;
-    fractional_part = second_right_digits;
+  DecimalComponents dec;
+  if (!ParseDecimalComponents(s.data(), s.size(), &dec)) {
+    std::stringstream ss;
+    ss << "The string '" << s << "' is not a valid decimal number";
+    return Status::Invalid(ss.str());
   }
+  std::string exponent_value = dec.exponent_sign + dec.exponent_digits;
 
-  // skip leading zeros before the decimal point
-  std::string::const_iterator without_leading_zeros =
-      std::find_if_not(whole_part.cbegin(), whole_part.cend(), 
is_zero_character);
-  whole_part = std::string(without_leading_zeros, whole_part.cend());
-
-  if (!first_exp_value.empty()) {
-    exponent_value = first_exp_value;
-  } else {
-    exponent_value = second_exp_value;
+  // Count number of significant digits (without leading zeros)
+  size_t first_non_zero = dec.whole_digits.find_first_not_of('0');
+  size_t significant_digits = dec.fractional_digits.size();
+  if (first_non_zero != std::string::npos) {
+    significant_digits += dec.whole_digits.size() - first_non_zero;
   }
 
   if (precision != nullptr) {
-    *precision = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
+    *precision = static_cast<int32_t>(significant_digits);
   }
 
   if (scale != nullptr) {
     if (!exponent_value.empty()) {
       auto adjusted_exponent = static_cast<int32_t>(std::stol(exponent_value));
-      auto len = static_cast<int32_t>(whole_part.size() + 
fractional_part.size());
+      auto len = static_cast<int32_t>(significant_digits);
       *scale = -adjusted_exponent + len - 1;
     } else {
-      *scale = static_cast<int32_t>(fractional_part.size());
+      *scale = static_cast<int32_t>(dec.fractional_digits.size());
     }
   }
 
   if (out != nullptr) {
     *out = 0;
-    StringToInteger(whole_part + fractional_part, out);
-    if (sign == "-") {
+    StringToInteger(dec.whole_digits + dec.fractional_digits, out);
+    if (dec.sign == "-") {
       out->Negate();
     }
 

-- 
To stop receiving notification emails like this one, please contact
apit...@apache.org.

Reply via email to