This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 4009b62 ARROW-2224: [C++] Remove boost-regex dependency 4009b62 is described below commit 4009b62086dfa43a4fd8bfa714772716e6531c6f Author: Antoine Pitrou <anto...@python.org> AuthorDate: Wed Apr 11 17:53:51 2018 +0200 ARROW-2224: [C++] Remove boost-regex dependency Use a hand-written decimal parser. Author: Antoine Pitrou <anto...@python.org> Closes #1880 from pitrou/ARROW-2224-remove-boost-regex-dep and squashes the following commits: c173ceb <Antoine Pitrou> Mark result unused 0277855 <Antoine Pitrou> Add C++ decimal benchmark f19dac2 <Antoine Pitrou> Address review comments ba0b62e <Antoine Pitrou> ARROW-2224: Remove boost-regex dependency --- cpp/README.md | 1 - cpp/cmake_modules/ThirdpartyToolchain.cmake | 24 +--- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/decimal-benchmark.cc | 45 ++++++++ cpp/src/arrow/util/decimal-test.cc | 2 - cpp/src/arrow/util/decimal.cc | 164 +++++++++++++++------------- 6 files changed, 139 insertions(+), 98 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index 8018efd..daeeade 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -35,7 +35,6 @@ On Ubuntu/Debian you can install the requirements with: ```shell sudo apt-get install cmake \ libboost-dev \ - libboost-regex-dev \ libboost-filesystem-dev \ libboost-system-dev ``` diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 129174c..020e0ed 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -157,11 +157,8 @@ if (ARROW_BOOST_VENDORED) "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BOOST_STATIC_FILESYSTEM_LIBRARY "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}") - set(BOOST_STATIC_REGEX_LIBRARY - "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}") set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}") - set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}") if (ARROW_BOOST_HEADER_ONLY) set(BOOST_BUILD_PRODUCTS) set(BOOST_CONFIGURE_COMMAND "") @@ -169,12 +166,11 @@ if (ARROW_BOOST_VENDORED) else() set(BOOST_BUILD_PRODUCTS ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY} - ${BOOST_REGEX_LIBRARY}) + ${BOOST_FILESYSTEM_LIBRARY}) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh" "--prefix=${BOOST_PREFIX}" - "--with-libraries=filesystem,system,regex") + "--with-libraries=filesystem,system") if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_BUILD_VARIANT "debug") else() @@ -214,19 +210,16 @@ else() if (ARROW_BOOST_HEADER_ONLY) find_package(Boost REQUIRED) else() - find_package(Boost COMPONENTS system filesystem regex REQUIRED) + find_package(Boost COMPONENTS system filesystem REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) - set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) - set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_shared) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) - set(BOOST_REGEX_LIBRARY boost_regex_shared) endif() else() # Find static boost headers and libs @@ -235,19 +228,16 @@ else() if (ARROW_BOOST_HEADER_ONLY) find_package(Boost REQUIRED) else() - find_package(Boost COMPONENTS system filesystem regex REQUIRED) + find_package(Boost COMPONENTS system filesystem REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) - set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) - set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) - set(BOOST_REGEX_LIBRARY boost_regex_static) endif() endif() endif() @@ -264,11 +254,7 @@ if (NOT ARROW_BOOST_HEADER_ONLY) STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}" SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}") - ADD_THIRDPARTY_LIB(boost_regex - STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}" - SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}") - - SET(ARROW_BOOST_LIBS boost_system boost_filesystem boost_regex) + SET(ARROW_BOOST_LIBS boost_system boost_filesystem) endif() include_directories(SYSTEM ${Boost_INCLUDE_DIR}) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 41c27a5..8969527 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -60,5 +60,6 @@ ADD_ARROW_TEST(rle-encoding-test) ADD_ARROW_TEST(stl-util-test) ADD_ARROW_BENCHMARK(bit-util-benchmark) +ADD_ARROW_BENCHMARK(decimal-benchmark) add_subdirectory(variant) diff --git a/cpp/src/arrow/util/decimal-benchmark.cc b/cpp/src/arrow/util/decimal-benchmark.cc new file mode 100644 index 0000000..3129536 --- /dev/null +++ b/cpp/src/arrow/util/decimal-benchmark.cc @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include <string> +#include <vector> + +#include "arrow/util/decimal.h" +#include "arrow/util/macros.h" + +namespace arrow { +namespace Decimal { + +static void BM_FromString(benchmark::State& state) { // NOLINT non-const reference + std::vector<std::string> values = {"0", "1.23", "12.345e6", "-12.345e-6"}; + + while (state.KeepRunning()) { + for (const auto& value : values) { + Decimal128 dec; + int32_t scale, precision; + ARROW_UNUSED(Decimal128::FromString(value, &dec, &scale, &precision)); + } + } + state.SetItemsProcessed(state.iterations() * values.size()); +} + +BENCHMARK(BM_FromString)->Repetitions(3)->Unit(benchmark::kMicrosecond); + +} // namespace Decimal +} // namespace arrow diff --git a/cpp/src/arrow/util/decimal-test.cc b/cpp/src/arrow/util/decimal-test.cc index 6db46d4..2829e4a 100644 --- a/cpp/src/arrow/util/decimal-test.cc +++ b/cpp/src/arrow/util/decimal-test.cc @@ -205,9 +205,7 @@ TEST(DecimalZerosTest, LeadingZerosDecimalPoint) { int32_t precision; int32_t scale; ASSERT_OK(Decimal128::FromString(string_value, &d, &precision, &scale)); - // We explicitly do not support this for now, otherwise this would be ASSERT_EQ ASSERT_EQ(4, precision); - ASSERT_EQ(4, scale); ASSERT_EQ(0, d); } diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 48380a9..9e5e3dd 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -23,8 +23,6 @@ #include <limits> #include <sstream> -#include <boost/regex.hpp> - #include "arrow/util/bit-util.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" @@ -253,112 +251,126 @@ static void StringToInteger(const std::string& str, Decimal128* out) { } } -static const boost::regex DECIMAL_REGEX( - // sign of the number - "(?<SIGN>[-+]?)" - - // digits around the decimal point - "(((?<LEFT_DIGITS>\\d+)\\.(?<FIRST_RIGHT_DIGITS>\\d*)|\\.(?<SECOND_RIGHT_DIGITS>\\d+)" - ")" +namespace { - // optional exponent - "([eE](?<FIRST_EXP_VALUE>[-+]?\\d+))?" +struct DecimalComponents { + std::string sign; + std::string whole_digits; + std::string fractional_digits; + std::string exponent_sign; + std::string exponent_digits; +}; - // otherwise - "|" +inline bool IsSign(char c) { return c == '-' || c == '+'; } - // we're just an integer - "(?<INTEGER>\\d+)" +inline bool IsDot(char c) { return c == '.'; } - // or an integer with an exponent - "(?:[eE](?<SECOND_EXP_VALUE>[-+]?\\d+))?)"); +inline bool IsDigit(char c) { return c >= '0' && c <= '9'; } -static inline bool is_zero_character(char c) { return c == '0'; } +inline bool StartsExponent(char c) { return c == 'e' || c == 'E'; } -Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, - int32_t* scale) { - if (s.empty()) { - return Status::Invalid("Empty string cannot be converted to decimal"); +inline size_t ParseDigitsRun(const char* s, size_t start, size_t size, std::string* out) { + size_t pos; + for (pos = start; pos < size; ++pos) { + if (!IsDigit(s[pos])) { + break; + } } + *out = std::string(s + start, pos - start); + return pos; +} - // case of all zeros - if (std::all_of(s.cbegin(), s.cend(), is_zero_character)) { - if (precision != nullptr) { - *precision = 0; - } +bool ParseDecimalComponents(const char* s, size_t size, DecimalComponents* out) { + size_t pos = 0; - if (scale != nullptr) { - *scale = 0; + if (size == 0) { + return false; + } + // Sign of the number + if (IsSign(s[pos])) { + out->sign = std::string(s + pos, 1); + ++pos; + } + // First run of digits + pos = ParseDigitsRun(s, pos, size, &out->whole_digits); + if (pos == size) { + return !out->whole_digits.empty(); + } + // Optional dot (if given in fractional form) + bool has_dot = IsDot(s[pos]); + if (has_dot) { + // Second run of digits + ++pos; + pos = ParseDigitsRun(s, pos, size, &out->fractional_digits); + } + if (out->whole_digits.empty() && out->fractional_digits.empty()) { + // Need at least some digits (whole or fractional) + return false; + } + if (pos == size) { + return true; + } + // Optional exponent + if (StartsExponent(s[pos])) { + ++pos; + if (pos == size) { + return false; + } + // Optional exponent sign + if (IsSign(s[pos])) { + out->exponent_sign = std::string(s + pos, 1); + ++pos; + } + pos = ParseDigitsRun(s, pos, size, &out->exponent_digits); + if (out->exponent_digits.empty()) { + // Need some exponent digits + return false; } - - *out = 0; - return Status::OK(); } + return pos == size; +} - boost::smatch results; - const bool matches = boost::regex_match(s, results, DECIMAL_REGEX); +} // namespace - if (!matches) { - std::stringstream ss; - ss << "The string " << s << " is not a valid decimal number"; - return Status::Invalid(ss.str()); +Status Decimal128::FromString(const std::string& s, Decimal128* out, int32_t* precision, + int32_t* scale) { + if (s.empty()) { + return Status::Invalid("Empty string cannot be converted to decimal"); } - const std::string sign = results["SIGN"]; - const std::string integer = results["INTEGER"]; - - const std::string left_digits = results["LEFT_DIGITS"]; - const std::string first_right_digits = results["FIRST_RIGHT_DIGITS"]; - - const std::string second_right_digits = results["SECOND_RIGHT_DIGITS"]; - - const std::string first_exp_value = results["FIRST_EXP_VALUE"]; - const std::string second_exp_value = results["SECOND_EXP_VALUE"]; - - std::string whole_part; - std::string fractional_part; - std::string exponent_value; - - if (!integer.empty()) { - whole_part = integer; - } else if (!left_digits.empty()) { - DCHECK(second_right_digits.empty()) << s << " " << second_right_digits; - whole_part = left_digits; - fractional_part = first_right_digits; - } else { - DCHECK(first_right_digits.empty()) << s << " " << first_right_digits; - fractional_part = second_right_digits; + DecimalComponents dec; + if (!ParseDecimalComponents(s.data(), s.size(), &dec)) { + std::stringstream ss; + ss << "The string '" << s << "' is not a valid decimal number"; + return Status::Invalid(ss.str()); } + std::string exponent_value = dec.exponent_sign + dec.exponent_digits; - // skip leading zeros before the decimal point - std::string::const_iterator without_leading_zeros = - std::find_if_not(whole_part.cbegin(), whole_part.cend(), is_zero_character); - whole_part = std::string(without_leading_zeros, whole_part.cend()); - - if (!first_exp_value.empty()) { - exponent_value = first_exp_value; - } else { - exponent_value = second_exp_value; + // Count number of significant digits (without leading zeros) + size_t first_non_zero = dec.whole_digits.find_first_not_of('0'); + size_t significant_digits = dec.fractional_digits.size(); + if (first_non_zero != std::string::npos) { + significant_digits += dec.whole_digits.size() - first_non_zero; } if (precision != nullptr) { - *precision = static_cast<int32_t>(whole_part.size() + fractional_part.size()); + *precision = static_cast<int32_t>(significant_digits); } if (scale != nullptr) { if (!exponent_value.empty()) { auto adjusted_exponent = static_cast<int32_t>(std::stol(exponent_value)); - auto len = static_cast<int32_t>(whole_part.size() + fractional_part.size()); + auto len = static_cast<int32_t>(significant_digits); *scale = -adjusted_exponent + len - 1; } else { - *scale = static_cast<int32_t>(fractional_part.size()); + *scale = static_cast<int32_t>(dec.fractional_digits.size()); } } if (out != nullptr) { *out = 0; - StringToInteger(whole_part + fractional_part, out); - if (sign == "-") { + StringToInteger(dec.whole_digits + dec.fractional_digits, out); + if (dec.sign == "-") { out->Negate(); } -- To stop receiving notification emails like this one, please contact apit...@apache.org.