This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8db4e10 ARROW-3060: [C++] Factor out string-to-X conversion routines
8db4e10 is described below
commit 8db4e1061c4049fa08043710a7f116c820204dea
Author: Antoine Pitrou <[email protected]>
AuthorDate: Fri Aug 17 20:09:21 2018 -0400
ARROW-3060: [C++] Factor out string-to-X conversion routines
Author: Antoine Pitrou <[email protected]>
Closes #2433 from pitrou/ARROW-3060-string-conversion and squashes the
following commits:
67e3315f <Antoine Pitrou> ARROW-3060: Factor out string-to-X conversion
routines
---
cpp/src/arrow/compute/kernels/cast.cc | 120 ++--------------
cpp/src/arrow/type_traits.h | 10 ++
cpp/src/arrow/util/CMakeLists.txt | 1 +
cpp/src/arrow/util/parsing-util-test.cc | 241 ++++++++++++++++++++++++++++++++
cpp/src/arrow/util/parsing.h | 184 ++++++++++++++++++++++++
5 files changed, 446 insertions(+), 110 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/cast.cc
b/cpp/src/arrow/compute/kernels/cast.cc
index 8b14b7b..1101ce7 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -17,12 +17,10 @@
#include "arrow/compute/kernels/cast.h"
-#include <cerrno>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
-#include <locale>
#include <memory>
#include <sstream>
#include <string>
@@ -40,6 +38,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
+#include "arrow/util/parsing.h"
#include "arrow/compute/context.h"
#include "arrow/compute/kernel.h"
@@ -732,72 +731,6 @@ struct CastFunctor<T, DictionaryType,
// ----------------------------------------------------------------------
// String to Number
-// Cast a string to a number. Returns true on success, false on error.
-// We rely on C++ istringstream for locale-independent parsing, which might
-// not be the fastest option.
-
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value,
- bool>::type static
CastStringToNumber(std::istringstream& ibuf,
- T* out) {
- ibuf >> *out;
- return !ibuf.fail() && ibuf.eof();
-}
-
-// For integers, not all integer widths are handled by the C++ stdlib, so
-// we check for limits outselves.
-
-template <typename T>
-typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
- bool>::type static
CastStringToNumber(std::istringstream& ibuf,
- T* out) {
- static constexpr bool need_long_long = sizeof(T) > sizeof(long); // NOLINT
- static constexpr T min_value = std::numeric_limits<T>::min();
- static constexpr T max_value = std::numeric_limits<T>::max();
-
- if (need_long_long) {
- long long res; // NOLINT
- ibuf >> res;
- *out = static_cast<T>(res); // may downcast
- if (res < min_value || res > max_value) {
- return false;
- }
- } else {
- long res; // NOLINT
- ibuf >> res;
- *out = static_cast<T>(res); // may downcast
- if (res < min_value || res > max_value) {
- return false;
- }
- }
- return !ibuf.fail() && ibuf.eof();
-}
-
-template <typename T>
-typename std::enable_if<std::is_integral<T>::value &&
std::is_unsigned<T>::value,
- bool>::type static
CastStringToNumber(std::istringstream& ibuf,
- T* out) {
- static constexpr bool need_long_long = sizeof(T) > sizeof(unsigned long);
// NOLINT
- static constexpr T max_value = std::numeric_limits<T>::max();
-
- if (need_long_long) {
- unsigned long long res; // NOLINT
- ibuf >> res;
- *out = static_cast<T>(res); // may downcast
- if (res > max_value) {
- return false;
- }
- } else {
- unsigned long res; // NOLINT
- ibuf >> res;
- *out = static_cast<T>(res); // may downcast
- if (res > max_value) {
- return false;
- }
- }
- return !ibuf.fail() && ibuf.eof();
-}
-
template <typename O>
struct CastFunctor<O, StringType, enable_if_number<O>> {
void operator()(FunctionContext* ctx, const CastOptions& options,
@@ -806,19 +739,17 @@ struct CastFunctor<O, StringType, enable_if_number<O>> {
StringArray input_array(input.Copy());
auto out_data = GetMutableValues<out_type>(output, 1);
- errno = 0;
- // Instantiate the stringstream outside of the loop
- std::istringstream ibuf;
- ibuf.imbue(std::locale::classic());
+ internal::StringConverter<O> converter;
for (int64_t i = 0; i < input.length; ++i, ++out_data) {
if (input_array.IsNull(i)) {
continue;
}
- auto str = input_array.GetString(i);
- ibuf.clear();
- ibuf.str(str);
- if (!CastStringToNumber(ibuf, out_data)) {
+
+ int32_t length = -1;
+ auto str = input_array.GetValue(i, &length);
+ if (!converter(reinterpret_cast<const char*>(str),
static_cast<size_t>(length),
+ out_data)) {
std::stringstream ss;
ss << "Failed to cast String '" << str << "' into " <<
output->type->ToString();
ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
@@ -831,38 +762,6 @@ struct CastFunctor<O, StringType, enable_if_number<O>> {
// ----------------------------------------------------------------------
// String to Boolean
-// Helper function to cast a C string to a boolean. Returns true on success,
-// false on error.
-
-static bool CastStringtoBoolean(const char* s, size_t length, bool* out) {
- if (length == 1) {
- // "0" or "1"?
- if (s[0] == '0') {
- *out = false;
- return true;
- }
- if (s[0] == '1') {
- *out = true;
- return true;
- }
- return false;
- }
- if (length == 4) {
- // "true"?
- *out = true;
- return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
- (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
- }
- if (length == 5) {
- // "false"?
- *out = false;
- return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
- (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
- (s[4] == 'e' || s[4] == 'E'));
- }
- return false;
-}
-
template <typename O>
struct CastFunctor<O, StringType,
typename std::enable_if<std::is_same<BooleanType,
O>::value>::type> {
@@ -871,6 +770,7 @@ struct CastFunctor<O, StringType,
StringArray input_array(input.Copy());
internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(),
output->offset, input.length);
+ internal::StringConverter<O> converter;
for (int64_t i = 0; i < input.length; ++i) {
if (input_array.IsNull(i)) {
@@ -881,8 +781,8 @@ struct CastFunctor<O, StringType,
int32_t length = -1;
auto str = input_array.GetValue(i, &length);
bool value;
- if (!CastStringtoBoolean(reinterpret_cast<const char*>(str),
- static_cast<size_t>(length), &value)) {
+ if (!converter(reinterpret_cast<const char*>(str),
static_cast<size_t>(length),
+ &value)) {
std::stringstream ss;
ss << "Failed to cast String '" << input_array.GetString(i) << "' into
"
<< output->type->ToString();
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index c270a16..da5cf25 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -337,6 +337,16 @@ using enable_if_integer =
typename std::enable_if<std::is_base_of<Integer, T>::value>::type;
template <typename T>
+using enable_if_signed_integer =
+ typename std::enable_if<std::is_base_of<Integer, T>::value &&
+ std::is_signed<typename T::c_type>::value>::type;
+
+template <typename T>
+using enable_if_unsigned_integer =
+ typename std::enable_if<std::is_base_of<Integer, T>::value &&
+ std::is_unsigned<typename T::c_type>::value>::type;
+
+template <typename T>
using enable_if_floating_point =
typename std::enable_if<std::is_base_of<FloatingPoint, T>::value>::type;
diff --git a/cpp/src/arrow/util/CMakeLists.txt
b/cpp/src/arrow/util/CMakeLists.txt
index 16fd442..9a4fceb 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -58,6 +58,7 @@ ADD_ARROW_TEST(compression-test)
ADD_ARROW_TEST(decimal-test)
ADD_ARROW_TEST(key-value-metadata-test)
ADD_ARROW_TEST(rle-encoding-test)
+ADD_ARROW_TEST(parsing-util-test)
ADD_ARROW_TEST(stl-util-test)
ADD_ARROW_TEST(thread-pool-test)
ADD_ARROW_TEST(lazy-test)
diff --git a/cpp/src/arrow/util/parsing-util-test.cc
b/cpp/src/arrow/util/parsing-util-test.cc
new file mode 100644
index 0000000..b126b82
--- /dev/null
+++ b/cpp/src/arrow/util/parsing-util-test.cc
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <locale>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/test-util.h"
+#include "arrow/util/parsing.h"
+
+namespace arrow {
+
+using internal::StringConverter;
+
+template <typename ConverterType, typename C_TYPE>
+void AssertConversion(ConverterType& converter, const std::string& s, C_TYPE
expected) {
+ typename ConverterType::value_type out;
+ ASSERT_TRUE(converter(s.data(), s.length(), &out))
+ << "Conversion failed for '" << s << "' (expected to return " <<
expected << ")";
+ ASSERT_EQ(out, expected);
+}
+
+template <typename ConverterType>
+void AssertConversionFails(ConverterType& converter, const std::string& s) {
+ typename ConverterType::value_type out;
+ ASSERT_FALSE(converter(s.data(), s.length(), &out))
+ << "Conversion should have failed for '" << s << "' (returned " << out
<< ")";
+}
+
+class LocaleGuard {
+ public:
+ explicit LocaleGuard(const char* new_locale) : global_locale_(std::locale())
{
+ try {
+ std::locale::global(std::locale(new_locale));
+ } catch (std::runtime_error) {
+ // Locale unavailable, ignore
+ }
+ }
+
+ ~LocaleGuard() { std::locale::global(global_locale_); }
+
+ protected:
+ std::locale global_locale_;
+};
+
+TEST(StringConversion, ToBoolean) {
+ StringConverter<BooleanType> converter;
+
+ AssertConversion(converter, "true", true);
+ AssertConversion(converter, "tRuE", true);
+ AssertConversion(converter, "FAlse", false);
+ AssertConversion(converter, "false", false);
+ AssertConversion(converter, "1", true);
+ AssertConversion(converter, "0", false);
+
+ AssertConversionFails(converter, "");
+}
+
+TEST(StringConversion, ToFloat) {
+ StringConverter<FloatType> converter;
+
+ AssertConversion(converter, "1.5", 1.5f);
+ AssertConversion(converter, "0", 0.0f);
+ // XXX ASSERT_EQ doesn't distinguish signed zeros
+ AssertConversion(converter, "-0.0", -0.0f);
+ AssertConversion(converter, "-1e20", -1e20f);
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToDouble) {
+ StringConverter<DoubleType> converter;
+
+ AssertConversion(converter, "1.5", 1.5);
+ AssertConversion(converter, "0", 0);
+ // XXX ASSERT_EQ doesn't distinguish signed zeros
+ AssertConversion(converter, "-0.0", -0.0);
+ AssertConversion(converter, "-1e100", -1e100);
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToFloatLocale) {
+ // French locale uses the comma as decimal point
+ LocaleGuard locale_guard("fr_FR.UTF-8");
+
+ StringConverter<FloatType> converter;
+ AssertConversion(converter, "1.5", 1.5f);
+}
+
+TEST(StringConversion, ToDoubleLocale) {
+ // French locale uses the comma as decimal point
+ LocaleGuard locale_guard("fr_FR.UTF-8");
+
+ StringConverter<DoubleType> converter;
+ AssertConversion(converter, "1.5", 1.5f);
+}
+
+TEST(StringConversion, ToInt8) {
+ StringConverter<Int8Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "127", 127);
+ AssertConversion(converter, "-128", -128);
+
+ // Non-representable values
+ AssertConversionFails(converter, "128");
+ AssertConversionFails(converter, "-129");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt8) {
+ StringConverter<UInt8Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "255", 255);
+
+ // Non-representable values
+ // AssertConversionFails(converter, "-1");
+ AssertConversionFails(converter, "256");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToInt16) {
+ StringConverter<Int16Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "32767", 32767);
+ AssertConversion(converter, "-32768", -32768);
+
+ // Non-representable values
+ AssertConversionFails(converter, "32768");
+ AssertConversionFails(converter, "-32769");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt16) {
+ StringConverter<UInt16Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "65535", 65535);
+
+ // Non-representable values
+ // AssertConversionFails(converter, "-1");
+ AssertConversionFails(converter, "65536");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToInt32) {
+ StringConverter<Int32Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "2147483647", 2147483647);
+ AssertConversion(converter, "-2147483648", -2147483648LL);
+
+ // Non-representable values
+ AssertConversionFails(converter, "2147483648");
+ AssertConversionFails(converter, "-2147483649");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt32) {
+ StringConverter<UInt32Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "4294967295", 4294967295UL);
+
+ // Non-representable values
+ // AssertConversionFails(converter, "-1");
+ AssertConversionFails(converter, "4294967296");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToInt64) {
+ StringConverter<Int64Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "9223372036854775807", 9223372036854775807LL);
+ AssertConversion(converter, "-9223372036854775808", -9223372036854775807LL -
1);
+
+ // Non-representable values
+ AssertConversionFails(converter, "9223372036854775808");
+ AssertConversionFails(converter, "-9223372036854775809");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt64) {
+ StringConverter<UInt64Type> converter;
+
+ AssertConversion(converter, "0", 0);
+ AssertConversion(converter, "18446744073709551615", 18446744073709551615ULL);
+
+ // Non-representable values
+ // AssertConversionFails(converter, "-1");
+ AssertConversionFails(converter, "18446744073709551616");
+
+ AssertConversionFails(converter, "");
+ AssertConversionFails(converter, "0.0");
+ AssertConversionFails(converter, "e");
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h
new file mode 100644
index 0000000..efe3162
--- /dev/null
+++ b/cpp/src/arrow/util/parsing.h
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_PARSING_H
+#define ARROW_UTIL_PARSING_H
+
+#include <limits>
+#include <locale>
+#include <sstream>
+#include <string>
+
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief A class providing conversion from strings to some Arrow data types
+///
+/// Conversion is triggered by calling operator(). It returns true on
+/// success, false on failure.
+///
+/// The class may have a non-trivial construction cost in some cases,
+/// so it's recommended to use a single instance many times, if doing bulk
+/// conversion.
+///
+template <typename ARROW_TYPE, typename Enable = void>
+class StringConverter;
+
+template <>
+class StringConverter<BooleanType> {
+ public:
+ using value_type = bool;
+
+ bool operator()(const char* s, size_t length, value_type* out) {
+ if (length == 1) {
+ // "0" or "1"?
+ if (s[0] == '0') {
+ *out = false;
+ return true;
+ }
+ if (s[0] == '1') {
+ *out = true;
+ return true;
+ }
+ return false;
+ }
+ if (length == 4) {
+ // "true"?
+ *out = true;
+ return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
+ (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
+ }
+ if (length == 5) {
+ // "false"?
+ *out = false;
+ return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
+ (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
+ (s[4] == 'e' || s[4] == 'E'));
+ }
+ return false;
+ }
+};
+
+template <class ARROW_TYPE>
+class StringToFloatConverterMixin {
+ public:
+ using value_type = typename ARROW_TYPE::c_type;
+
+ StringToFloatConverterMixin() { ibuf.imbue(std::locale::classic()); }
+
+ bool operator()(const char* s, size_t length, value_type* out) {
+ ibuf.clear();
+ ibuf.str(std::string(s, length));
+ ibuf >> *out;
+ // XXX Should we reset errno on failure?
+ return !ibuf.fail() && ibuf.eof();
+ }
+
+ protected:
+ std::istringstream ibuf;
+};
+
+template <>
+class StringConverter<FloatType> : public
StringToFloatConverterMixin<FloatType> {};
+
+template <>
+class StringConverter<DoubleType> : public
StringToFloatConverterMixin<DoubleType> {};
+
+// NOTE: HalfFloatType would require a half<->float conversion library
+
+template <class ARROW_TYPE>
+class StringConverter<ARROW_TYPE, enable_if_signed_integer<ARROW_TYPE>> {
+ public:
+ using value_type = typename ARROW_TYPE::c_type;
+
+ StringConverter() { ibuf.imbue(std::locale::classic()); }
+
+ bool operator()(const char* s, size_t length, value_type* out) {
+ static constexpr bool need_long_long = sizeof(value_type) > sizeof(long);
// NOLINT
+ static constexpr value_type min_value =
std::numeric_limits<value_type>::min();
+ static constexpr value_type max_value =
std::numeric_limits<value_type>::max();
+
+ ibuf.clear();
+ ibuf.str(std::string(s, length));
+ if (need_long_long) {
+ long long res; // NOLINT
+ ibuf >> res;
+ *out = static_cast<value_type>(res); // may downcast
+ if (res < min_value || res > max_value) {
+ return false;
+ }
+ } else {
+ long res; // NOLINT
+ ibuf >> res;
+ *out = static_cast<value_type>(res); // may downcast
+ if (res < min_value || res > max_value) {
+ return false;
+ }
+ }
+ // XXX Should we reset errno on failure?
+ return !ibuf.fail() && ibuf.eof();
+ }
+
+ protected:
+ std::istringstream ibuf;
+};
+
+template <class ARROW_TYPE>
+class StringConverter<ARROW_TYPE, enable_if_unsigned_integer<ARROW_TYPE>> {
+ public:
+ using value_type = typename ARROW_TYPE::c_type;
+
+ StringConverter() { ibuf.imbue(std::locale::classic()); }
+
+ bool operator()(const char* s, size_t length, value_type* out) {
+ static constexpr bool need_long_long =
+ sizeof(value_type) > sizeof(unsigned long); // NOLINT
+ static constexpr value_type max_value =
std::numeric_limits<value_type>::max();
+
+ ibuf.clear();
+ ibuf.str(std::string(s, length));
+ // XXX The following unfortunately allows negative input values
+ if (need_long_long) {
+ unsigned long long res; // NOLINT
+ ibuf >> res;
+ *out = static_cast<value_type>(res); // may downcast
+ if (res > max_value) {
+ return false;
+ }
+ } else {
+ unsigned long res; // NOLINT
+ ibuf >> res;
+ *out = static_cast<value_type>(res); // may downcast
+ if (res > max_value) {
+ return false;
+ }
+ }
+ // XXX Should we reset errno on failure?
+ return !ibuf.fail() && ibuf.eof();
+ }
+
+ protected:
+ std::istringstream ibuf;
+};
+
+} // namespace internal
+} // namespace arrow
+
+#endif // ARROW_UTIL_PARSING_H