This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8db4e10  ARROW-3060: [C++] Factor out string-to-X conversion routines
8db4e10 is described below

commit 8db4e1061c4049fa08043710a7f116c820204dea
Author: Antoine Pitrou <[email protected]>
AuthorDate: Fri Aug 17 20:09:21 2018 -0400

    ARROW-3060: [C++] Factor out string-to-X conversion routines
    
    Author: Antoine Pitrou <[email protected]>
    
    Closes #2433 from pitrou/ARROW-3060-string-conversion and squashes the 
following commits:
    
    67e3315f <Antoine Pitrou> ARROW-3060:  Factor out string-to-X conversion 
routines
---
 cpp/src/arrow/compute/kernels/cast.cc   | 120 ++--------------
 cpp/src/arrow/type_traits.h             |  10 ++
 cpp/src/arrow/util/CMakeLists.txt       |   1 +
 cpp/src/arrow/util/parsing-util-test.cc | 241 ++++++++++++++++++++++++++++++++
 cpp/src/arrow/util/parsing.h            | 184 ++++++++++++++++++++++++
 5 files changed, 446 insertions(+), 110 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/cast.cc 
b/cpp/src/arrow/compute/kernels/cast.cc
index 8b14b7b..1101ce7 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -17,12 +17,10 @@
 
 #include "arrow/compute/kernels/cast.h"
 
-#include <cerrno>
 #include <cstdint>
 #include <cstring>
 #include <functional>
 #include <limits>
-#include <locale>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -40,6 +38,7 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/parsing.h"
 
 #include "arrow/compute/context.h"
 #include "arrow/compute/kernel.h"
@@ -732,72 +731,6 @@ struct CastFunctor<T, DictionaryType,
 // ----------------------------------------------------------------------
 // String to Number
 
-// Cast a string to a number.  Returns true on success, false on error.
-// We rely on C++ istringstream for locale-independent parsing, which might
-// not be the fastest option.
-
-template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value,
-                        bool>::type static 
CastStringToNumber(std::istringstream& ibuf,
-                                                              T* out) {
-  ibuf >> *out;
-  return !ibuf.fail() && ibuf.eof();
-}
-
-// For integers, not all integer widths are handled by the C++ stdlib, so
-// we check for limits outselves.
-
-template <typename T>
-typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
-                        bool>::type static 
CastStringToNumber(std::istringstream& ibuf,
-                                                              T* out) {
-  static constexpr bool need_long_long = sizeof(T) > sizeof(long);  // NOLINT
-  static constexpr T min_value = std::numeric_limits<T>::min();
-  static constexpr T max_value = std::numeric_limits<T>::max();
-
-  if (need_long_long) {
-    long long res;  // NOLINT
-    ibuf >> res;
-    *out = static_cast<T>(res);  // may downcast
-    if (res < min_value || res > max_value) {
-      return false;
-    }
-  } else {
-    long res;  // NOLINT
-    ibuf >> res;
-    *out = static_cast<T>(res);  // may downcast
-    if (res < min_value || res > max_value) {
-      return false;
-    }
-  }
-  return !ibuf.fail() && ibuf.eof();
-}
-
-template <typename T>
-typename std::enable_if<std::is_integral<T>::value && 
std::is_unsigned<T>::value,
-                        bool>::type static 
CastStringToNumber(std::istringstream& ibuf,
-                                                              T* out) {
-  static constexpr bool need_long_long = sizeof(T) > sizeof(unsigned long);  
// NOLINT
-  static constexpr T max_value = std::numeric_limits<T>::max();
-
-  if (need_long_long) {
-    unsigned long long res;  // NOLINT
-    ibuf >> res;
-    *out = static_cast<T>(res);  // may downcast
-    if (res > max_value) {
-      return false;
-    }
-  } else {
-    unsigned long res;  // NOLINT
-    ibuf >> res;
-    *out = static_cast<T>(res);  // may downcast
-    if (res > max_value) {
-      return false;
-    }
-  }
-  return !ibuf.fail() && ibuf.eof();
-}
-
 template <typename O>
 struct CastFunctor<O, StringType, enable_if_number<O>> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
@@ -806,19 +739,17 @@ struct CastFunctor<O, StringType, enable_if_number<O>> {
 
     StringArray input_array(input.Copy());
     auto out_data = GetMutableValues<out_type>(output, 1);
-    errno = 0;
-    // Instantiate the stringstream outside of the loop
-    std::istringstream ibuf;
-    ibuf.imbue(std::locale::classic());
+    internal::StringConverter<O> converter;
 
     for (int64_t i = 0; i < input.length; ++i, ++out_data) {
       if (input_array.IsNull(i)) {
         continue;
       }
-      auto str = input_array.GetString(i);
-      ibuf.clear();
-      ibuf.str(str);
-      if (!CastStringToNumber(ibuf, out_data)) {
+
+      int32_t length = -1;
+      auto str = input_array.GetValue(i, &length);
+      if (!converter(reinterpret_cast<const char*>(str), 
static_cast<size_t>(length),
+                     out_data)) {
         std::stringstream ss;
         ss << "Failed to cast String '" << str << "' into " << 
output->type->ToString();
         ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
@@ -831,38 +762,6 @@ struct CastFunctor<O, StringType, enable_if_number<O>> {
 // ----------------------------------------------------------------------
 // String to Boolean
 
-// Helper function to cast a C string to a boolean.  Returns true on success,
-// false on error.
-
-static bool CastStringtoBoolean(const char* s, size_t length, bool* out) {
-  if (length == 1) {
-    // "0" or "1"?
-    if (s[0] == '0') {
-      *out = false;
-      return true;
-    }
-    if (s[0] == '1') {
-      *out = true;
-      return true;
-    }
-    return false;
-  }
-  if (length == 4) {
-    // "true"?
-    *out = true;
-    return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
-            (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
-  }
-  if (length == 5) {
-    // "false"?
-    *out = false;
-    return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
-            (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
-            (s[4] == 'e' || s[4] == 'E'));
-  }
-  return false;
-}
-
 template <typename O>
 struct CastFunctor<O, StringType,
                    typename std::enable_if<std::is_same<BooleanType, 
O>::value>::type> {
@@ -871,6 +770,7 @@ struct CastFunctor<O, StringType,
     StringArray input_array(input.Copy());
     internal::FirstTimeBitmapWriter writer(output->buffers[1]->mutable_data(),
                                            output->offset, input.length);
+    internal::StringConverter<O> converter;
 
     for (int64_t i = 0; i < input.length; ++i) {
       if (input_array.IsNull(i)) {
@@ -881,8 +781,8 @@ struct CastFunctor<O, StringType,
       int32_t length = -1;
       auto str = input_array.GetValue(i, &length);
       bool value;
-      if (!CastStringtoBoolean(reinterpret_cast<const char*>(str),
-                               static_cast<size_t>(length), &value)) {
+      if (!converter(reinterpret_cast<const char*>(str), 
static_cast<size_t>(length),
+                     &value)) {
         std::stringstream ss;
         ss << "Failed to cast String '" << input_array.GetString(i) << "' into 
"
            << output->type->ToString();
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index c270a16..da5cf25 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -337,6 +337,16 @@ using enable_if_integer =
     typename std::enable_if<std::is_base_of<Integer, T>::value>::type;
 
 template <typename T>
+using enable_if_signed_integer =
+    typename std::enable_if<std::is_base_of<Integer, T>::value &&
+                            std::is_signed<typename T::c_type>::value>::type;
+
+template <typename T>
+using enable_if_unsigned_integer =
+    typename std::enable_if<std::is_base_of<Integer, T>::value &&
+                            std::is_unsigned<typename T::c_type>::value>::type;
+
+template <typename T>
 using enable_if_floating_point =
     typename std::enable_if<std::is_base_of<FloatingPoint, T>::value>::type;
 
diff --git a/cpp/src/arrow/util/CMakeLists.txt 
b/cpp/src/arrow/util/CMakeLists.txt
index 16fd442..9a4fceb 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -58,6 +58,7 @@ ADD_ARROW_TEST(compression-test)
 ADD_ARROW_TEST(decimal-test)
 ADD_ARROW_TEST(key-value-metadata-test)
 ADD_ARROW_TEST(rle-encoding-test)
+ADD_ARROW_TEST(parsing-util-test)
 ADD_ARROW_TEST(stl-util-test)
 ADD_ARROW_TEST(thread-pool-test)
 ADD_ARROW_TEST(lazy-test)
diff --git a/cpp/src/arrow/util/parsing-util-test.cc 
b/cpp/src/arrow/util/parsing-util-test.cc
new file mode 100644
index 0000000..b126b82
--- /dev/null
+++ b/cpp/src/arrow/util/parsing-util-test.cc
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <locale>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/test-util.h"
+#include "arrow/util/parsing.h"
+
+namespace arrow {
+
+using internal::StringConverter;
+
+template <typename ConverterType, typename C_TYPE>
+void AssertConversion(ConverterType& converter, const std::string& s, C_TYPE 
expected) {
+  typename ConverterType::value_type out;
+  ASSERT_TRUE(converter(s.data(), s.length(), &out))
+      << "Conversion failed for '" << s << "' (expected to return " << 
expected << ")";
+  ASSERT_EQ(out, expected);
+}
+
+template <typename ConverterType>
+void AssertConversionFails(ConverterType& converter, const std::string& s) {
+  typename ConverterType::value_type out;
+  ASSERT_FALSE(converter(s.data(), s.length(), &out))
+      << "Conversion should have failed for '" << s << "' (returned " << out 
<< ")";
+}
+
+class LocaleGuard {
+ public:
+  explicit LocaleGuard(const char* new_locale) : global_locale_(std::locale()) 
{
+    try {
+      std::locale::global(std::locale(new_locale));
+    } catch (std::runtime_error) {
+      // Locale unavailable, ignore
+    }
+  }
+
+  ~LocaleGuard() { std::locale::global(global_locale_); }
+
+ protected:
+  std::locale global_locale_;
+};
+
+TEST(StringConversion, ToBoolean) {
+  StringConverter<BooleanType> converter;
+
+  AssertConversion(converter, "true", true);
+  AssertConversion(converter, "tRuE", true);
+  AssertConversion(converter, "FAlse", false);
+  AssertConversion(converter, "false", false);
+  AssertConversion(converter, "1", true);
+  AssertConversion(converter, "0", false);
+
+  AssertConversionFails(converter, "");
+}
+
+TEST(StringConversion, ToFloat) {
+  StringConverter<FloatType> converter;
+
+  AssertConversion(converter, "1.5", 1.5f);
+  AssertConversion(converter, "0", 0.0f);
+  // XXX ASSERT_EQ doesn't distinguish signed zeros
+  AssertConversion(converter, "-0.0", -0.0f);
+  AssertConversion(converter, "-1e20", -1e20f);
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToDouble) {
+  StringConverter<DoubleType> converter;
+
+  AssertConversion(converter, "1.5", 1.5);
+  AssertConversion(converter, "0", 0);
+  // XXX ASSERT_EQ doesn't distinguish signed zeros
+  AssertConversion(converter, "-0.0", -0.0);
+  AssertConversion(converter, "-1e100", -1e100);
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToFloatLocale) {
+  // French locale uses the comma as decimal point
+  LocaleGuard locale_guard("fr_FR.UTF-8");
+
+  StringConverter<FloatType> converter;
+  AssertConversion(converter, "1.5", 1.5f);
+}
+
+TEST(StringConversion, ToDoubleLocale) {
+  // French locale uses the comma as decimal point
+  LocaleGuard locale_guard("fr_FR.UTF-8");
+
+  StringConverter<DoubleType> converter;
+  AssertConversion(converter, "1.5", 1.5f);
+}
+
+TEST(StringConversion, ToInt8) {
+  StringConverter<Int8Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "127", 127);
+  AssertConversion(converter, "-128", -128);
+
+  // Non-representable values
+  AssertConversionFails(converter, "128");
+  AssertConversionFails(converter, "-129");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt8) {
+  StringConverter<UInt8Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "255", 255);
+
+  // Non-representable values
+  //   AssertConversionFails(converter, "-1");
+  AssertConversionFails(converter, "256");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToInt16) {
+  StringConverter<Int16Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "32767", 32767);
+  AssertConversion(converter, "-32768", -32768);
+
+  // Non-representable values
+  AssertConversionFails(converter, "32768");
+  AssertConversionFails(converter, "-32769");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt16) {
+  StringConverter<UInt16Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "65535", 65535);
+
+  // Non-representable values
+  //   AssertConversionFails(converter, "-1");
+  AssertConversionFails(converter, "65536");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToInt32) {
+  StringConverter<Int32Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "2147483647", 2147483647);
+  AssertConversion(converter, "-2147483648", -2147483648LL);
+
+  // Non-representable values
+  AssertConversionFails(converter, "2147483648");
+  AssertConversionFails(converter, "-2147483649");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt32) {
+  StringConverter<UInt32Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "4294967295", 4294967295UL);
+
+  // Non-representable values
+  //   AssertConversionFails(converter, "-1");
+  AssertConversionFails(converter, "4294967296");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToInt64) {
+  StringConverter<Int64Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "9223372036854775807", 9223372036854775807LL);
+  AssertConversion(converter, "-9223372036854775808", -9223372036854775807LL - 
1);
+
+  // Non-representable values
+  AssertConversionFails(converter, "9223372036854775808");
+  AssertConversionFails(converter, "-9223372036854775809");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+TEST(StringConversion, ToUInt64) {
+  StringConverter<UInt64Type> converter;
+
+  AssertConversion(converter, "0", 0);
+  AssertConversion(converter, "18446744073709551615", 18446744073709551615ULL);
+
+  // Non-representable values
+  //   AssertConversionFails(converter, "-1");
+  AssertConversionFails(converter, "18446744073709551616");
+
+  AssertConversionFails(converter, "");
+  AssertConversionFails(converter, "0.0");
+  AssertConversionFails(converter, "e");
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h
new file mode 100644
index 0000000..efe3162
--- /dev/null
+++ b/cpp/src/arrow/util/parsing.h
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_PARSING_H
+#define ARROW_UTIL_PARSING_H
+
+#include <limits>
+#include <locale>
+#include <sstream>
+#include <string>
+
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief A class providing conversion from strings to some Arrow data types
+///
+/// Conversion is triggered by calling operator().  It returns true on
+/// success, false on failure.
+///
+/// The class may have a non-trivial construction cost in some cases,
+/// so it's recommended to use a single instance many times, if doing bulk
+/// conversion.
+///
+template <typename ARROW_TYPE, typename Enable = void>
+class StringConverter;
+
+template <>
+class StringConverter<BooleanType> {
+ public:
+  using value_type = bool;
+
+  bool operator()(const char* s, size_t length, value_type* out) {
+    if (length == 1) {
+      // "0" or "1"?
+      if (s[0] == '0') {
+        *out = false;
+        return true;
+      }
+      if (s[0] == '1') {
+        *out = true;
+        return true;
+      }
+      return false;
+    }
+    if (length == 4) {
+      // "true"?
+      *out = true;
+      return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
+              (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
+    }
+    if (length == 5) {
+      // "false"?
+      *out = false;
+      return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
+              (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
+              (s[4] == 'e' || s[4] == 'E'));
+    }
+    return false;
+  }
+};
+
+template <class ARROW_TYPE>
+class StringToFloatConverterMixin {
+ public:
+  using value_type = typename ARROW_TYPE::c_type;
+
+  StringToFloatConverterMixin() { ibuf.imbue(std::locale::classic()); }
+
+  bool operator()(const char* s, size_t length, value_type* out) {
+    ibuf.clear();
+    ibuf.str(std::string(s, length));
+    ibuf >> *out;
+    // XXX Should we reset errno on failure?
+    return !ibuf.fail() && ibuf.eof();
+  }
+
+ protected:
+  std::istringstream ibuf;
+};
+
+template <>
+class StringConverter<FloatType> : public 
StringToFloatConverterMixin<FloatType> {};
+
+template <>
+class StringConverter<DoubleType> : public 
StringToFloatConverterMixin<DoubleType> {};
+
+// NOTE: HalfFloatType would require a half<->float conversion library
+
+template <class ARROW_TYPE>
+class StringConverter<ARROW_TYPE, enable_if_signed_integer<ARROW_TYPE>> {
+ public:
+  using value_type = typename ARROW_TYPE::c_type;
+
+  StringConverter() { ibuf.imbue(std::locale::classic()); }
+
+  bool operator()(const char* s, size_t length, value_type* out) {
+    static constexpr bool need_long_long = sizeof(value_type) > sizeof(long);  
// NOLINT
+    static constexpr value_type min_value = 
std::numeric_limits<value_type>::min();
+    static constexpr value_type max_value = 
std::numeric_limits<value_type>::max();
+
+    ibuf.clear();
+    ibuf.str(std::string(s, length));
+    if (need_long_long) {
+      long long res;  // NOLINT
+      ibuf >> res;
+      *out = static_cast<value_type>(res);  // may downcast
+      if (res < min_value || res > max_value) {
+        return false;
+      }
+    } else {
+      long res;  // NOLINT
+      ibuf >> res;
+      *out = static_cast<value_type>(res);  // may downcast
+      if (res < min_value || res > max_value) {
+        return false;
+      }
+    }
+    // XXX Should we reset errno on failure?
+    return !ibuf.fail() && ibuf.eof();
+  }
+
+ protected:
+  std::istringstream ibuf;
+};
+
+template <class ARROW_TYPE>
+class StringConverter<ARROW_TYPE, enable_if_unsigned_integer<ARROW_TYPE>> {
+ public:
+  using value_type = typename ARROW_TYPE::c_type;
+
+  StringConverter() { ibuf.imbue(std::locale::classic()); }
+
+  bool operator()(const char* s, size_t length, value_type* out) {
+    static constexpr bool need_long_long =
+        sizeof(value_type) > sizeof(unsigned long);  // NOLINT
+    static constexpr value_type max_value = 
std::numeric_limits<value_type>::max();
+
+    ibuf.clear();
+    ibuf.str(std::string(s, length));
+    // XXX The following unfortunately allows negative input values
+    if (need_long_long) {
+      unsigned long long res;  // NOLINT
+      ibuf >> res;
+      *out = static_cast<value_type>(res);  // may downcast
+      if (res > max_value) {
+        return false;
+      }
+    } else {
+      unsigned long res;  // NOLINT
+      ibuf >> res;
+      *out = static_cast<value_type>(res);  // may downcast
+      if (res > max_value) {
+        return false;
+      }
+    }
+    // XXX Should we reset errno on failure?
+    return !ibuf.fail() && ibuf.eof();
+  }
+
+ protected:
+  std::istringstream ibuf;
+};
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_PARSING_H

Reply via email to