lidavidm commented on a change in pull request #12464:
URL: https://github.com/apache/arrow/pull/12464#discussion_r829048674



##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1590,7 +1705,13 @@ const FunctionDoc strftime_doc{
      "does not exist on this system."),
     {"timestamps"},
     "StrftimeOptions"};
-
+const FunctionDoc strptime_doc(
+    "Parse timestamps",
+    ("For each string in `strings`, parse it as a timestamp.\n"
+     "The timestamp unit and the expected string pattern must be given\n"
+     "in StrptimeOptions.  Null inputs emit null.  If a non-null string\n"
+     "fails parsing, an error is returned by default."),

Review comment:
       Ditto here - what is the behavior if we don't raise errors and we get an 
invalid timestamp?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,117 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to 
timestamps
+
+static std::string GetZone(std::string format) {
+  // Check for use of %z or %Z
+  size_t cur = 0;
+  std::string zone = "";
+  while (cur < format.size() - 1) {
+    if (format[cur] == '%') {
+      if (format[cur + 1] == 'z') {
+        zone = "UTC";
+        break;
+      }
+      cur++;
+    }
+    cur++;
+  }
+  return zone;
+}
+
+template <typename Duration, typename InType>
+struct Strptime {
+  const std::shared_ptr<TimestampParser> parser;
+  const TimeUnit::type unit;
+  const std::string zone;
+  const bool raise_errors;
+
+  static Result<Strptime> Make(KernelContext* ctx, const DataType& type) {
+    const StrptimeOptions& options = StrptimeState::Get(ctx);
+
+    return Strptime{TimestampParser::MakeStrptime(options.format),
+                    std::move(options.unit), GetZone(options.format),
+                    options.raise_errors};
+  }
+
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type));
+
+    if (in.is_valid) {
+      auto s = internal::UnboxScalar<InType>::Unbox(in);
+      int64_t result;
+      if ((*self.parser)(s.data(), s.size(), self.unit, &result)) {
+        *checked_cast<TimestampScalar*>(out) =
+            TimestampScalar(result, timestamp(self.unit, self.zone));
+      } else {
+        if (self.raise_errors) {
+          return Status::Invalid("Failed to parse string: '", s.data(),
+                                 "' as a scalar of type ",
+                                 TimestampType(self.unit).ToString());
+        } else {
+          out->is_valid = false;
+        }
+      }
+    } else {
+      out->is_valid = false;
+    }
+    return Status::OK();
+  }
+
+  static Status Call(KernelContext* ctx, const ArrayData& in, ArrayData* out) {
+    ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type));
+
+    std::unique_ptr<ArrayBuilder> array_builder;

Review comment:
       Using a builder is probably the reason for why we can't 
`can_write_into_slices`? Since the builder allocates its own buffer. Presumably 
the kernel has preallocation enabled so you should instead be able to write 
directly into `out->array()->GetValues<int64_t>(1)`.

##########
File path: python/pyarrow/_compute.pyx
##########
@@ -1458,10 +1458,12 @@ class StrptimeOptions(_StrptimeOptions):
     unit : str
         Timestamp unit of the output.
         Accepted values are "s", "ms", "us", "ns".
+    raise_errors : boolean, default True
+        Raise on parsing errors.

Review comment:
       Same here - what's the behavior otherwise?

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -267,12 +267,17 @@ class ARROW_EXPORT StructFieldOptions : public 
FunctionOptions {
 
 class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
  public:
-  explicit StrptimeOptions(std::string format, TimeUnit::type unit);
+  explicit StrptimeOptions(std::string format, TimeUnit::type unit,
+                           bool raise_errors = true);
   StrptimeOptions();
   static constexpr char const kTypeName[] = "StrptimeOptions";
 
+  /// The desired format string.
   std::string format;
+  /// The desired time resolution
   TimeUnit::type unit;
+  /// Raise on parsing errors

Review comment:
       What is the behavior if we don't raise parse errors?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -27,6 +27,7 @@
 #include <utf8proc.h>
 #endif
 
+#include "arrow/compute/api.h"

Review comment:
       nit: is there a more precise header we can include

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,117 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to 
timestamps
+
+static std::string GetZone(std::string format) {

Review comment:
       `const std::string&`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -1842,12 +1843,24 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
 TYPED_TEST(TestStringKernels, Strptime) {
   std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
   std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
-  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
+  auto input_array = ArrayFromJSON(utf8(), input1);
+  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, false);
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
 
   input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
   options.format = "%m/%d/%Y %%z";
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
+
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("strptime", {input_array}, 
&options));
+
+  options.format = "%Y-%m-%d";
+  options.raise_errors = true;
+  ASSERT_RAISES(Invalid, CallFunction("strptime", {input_array}, &options));
+
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Failed to parse string: 
'5/1/202012/11/1900'"),
+      Strptime(input_array, options));

Review comment:
       Can we have a test of the behavior with raise_errors = false and an 
invalid string?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -1842,12 +1843,24 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
 TYPED_TEST(TestStringKernels, Strptime) {
   std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
   std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
-  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
+  auto input_array = ArrayFromJSON(utf8(), input1);
+  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, false);
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
 
   input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
   options.format = "%m/%d/%Y %%z";
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
+
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("strptime", {input_array}, 
&options));
+
+  options.format = "%Y-%m-%d";
+  options.raise_errors = true;
+  ASSERT_RAISES(Invalid, CallFunction("strptime", {input_array}, &options));
+
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Failed to parse string: 
'5/1/202012/11/1900'"),

Review comment:
       This error message seems suspect, why are both rows in the error message?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,117 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to 
timestamps
+
+static std::string GetZone(std::string format) {
+  // Check for use of %z or %Z
+  size_t cur = 0;
+  std::string zone = "";
+  while (cur < format.size() - 1) {
+    if (format[cur] == '%') {

Review comment:
       Do we want to respect `%%` for escaping?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,119 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to 
timestamps
+
+const std::string GetZone(const std::string& format) {
+  // Check for use of %z or %Z
+  size_t cur = 0;
+  std::string zone = "";
+  while (cur < format.size() - 1) {
+    if (format[cur] == '%') {
+      if (format[cur + 1] == 'z') {
+        zone = "UTC";
+        break;
+      }
+      cur++;
+    }
+    cur++;
+  }
+  return zone;
+}
+
+template <typename Duration, typename InType>
+struct Strptime {
+  const std::shared_ptr<TimestampParser> parser;
+  const TimeUnit::type unit;
+  const std::string zone;
+  const bool error_is_null;
+
+  static Result<Strptime> Make(KernelContext* ctx, const DataType& type) {
+    const StrptimeOptions& options = StrptimeState::Get(ctx);
+
+    return Strptime{TimestampParser::MakeStrptime(options.format),
+                    std::move(options.unit), GetZone(options.format),
+                    options.error_is_null};
+  }
+
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type));
+
+    if (in.is_valid) {
+      auto s = internal::UnboxScalar<InType>::Unbox(in);
+      int64_t result;
+      if ((*self.parser)(s.data(), s.size(), self.unit, &result)) {
+        *checked_cast<TimestampScalar*>(out) =
+            TimestampScalar(result, timestamp(self.unit, self.zone));
+      } else {
+        if (self.error_is_null) {
+          out->is_valid = false;
+        } else {
+          return Status::Invalid("Failed to parse string: '", s.data(),

Review comment:
       ah, this is probably why all the string data shows up in the test above 
- `s` isn't null terminated because it's a string_view and this is getting 
treated as a `char*`. That's probably undefined behavior, does the constructor 
not take `s` by itself?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -1840,14 +1840,26 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
 #endif
 
 TYPED_TEST(TestStringKernels, Strptime) {
-  std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
-  std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
-  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
+  std::string input1 = R"(["5/1/2020", null, "12/13/1900"])";
+  std::string output1 = R"(["2020-05-01", null, "1900-12-13"])";
+  std::string output2 = R"(["2020-01-05", null, null])";
+  auto input = ArrayFromJSON(this->type(), input1);
+
+  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, /*error_is_null=*/true);
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
 
-  input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
+  options.format = "%d/%m/%Y";
+  this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output2, 
&options);
+
+  input1 = R"(["5/1/2020 %z", null, "12/13/1900 %z"])";
   options.format = "%m/%d/%Y %%z";
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
+
+  options.error_is_null = false;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Failed to parse string: 
'5/1/202012/13/1900'"),

Review comment:
       I'm still curious why both rows get put into the string when it's 
invalid.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to