lidavidm commented on a change in pull request #12464:
URL: https://github.com/apache/arrow/pull/12464#discussion_r829966381



##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
##########
@@ -1143,6 +1145,119 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert string representations of timestamps in arbitrary format to 
timestamps
+
+const std::string GetZone(const std::string& format) {
+  // Check for use of %z or %Z
+  size_t cur = 0;
+  std::string zone = "";
+  while (cur < format.size() - 1) {
+    if (format[cur] == '%') {
+      if (format[cur + 1] == 'z') {
+        zone = "UTC";
+        break;
+      }
+      cur++;
+    }
+    cur++;
+  }
+  return zone;
+}
+
+template <typename Duration, typename InType>
+struct Strptime {
+  const std::shared_ptr<TimestampParser> parser;
+  const TimeUnit::type unit;
+  const std::string zone;
+  const bool error_is_null;
+
+  static Result<Strptime> Make(KernelContext* ctx, const DataType& type) {
+    const StrptimeOptions& options = StrptimeState::Get(ctx);
+
+    return Strptime{TimestampParser::MakeStrptime(options.format),
+                    std::move(options.unit), GetZone(options.format),
+                    options.error_is_null};
+  }
+
+  static Status Call(KernelContext* ctx, const Scalar& in, Scalar* out) {
+    ARROW_ASSIGN_OR_RAISE(auto self, Make(ctx, *in.type));
+
+    if (in.is_valid) {
+      auto s = internal::UnboxScalar<InType>::Unbox(in);
+      int64_t result;
+      if ((*self.parser)(s.data(), s.size(), self.unit, &result)) {
+        *checked_cast<TimestampScalar*>(out) =
+            TimestampScalar(result, timestamp(self.unit, self.zone));
+      } else {
+        if (self.error_is_null) {
+          out->is_valid = false;
+        } else {
+          return Status::Invalid("Failed to parse string: '", s.data(),

Review comment:
       ah, this is probably why all the string data shows up in the test above 
- `s` isn't null terminated because it's a string_view and this is getting 
treated as a `char*`. That's probably undefined behavior, does the constructor 
not take `s` by itself?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -1840,14 +1840,26 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
 #endif
 
 TYPED_TEST(TestStringKernels, Strptime) {
-  std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
-  std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
-  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
+  std::string input1 = R"(["5/1/2020", null, "12/13/1900"])";
+  std::string output1 = R"(["2020-05-01", null, "1900-12-13"])";
+  std::string output2 = R"(["2020-01-05", null, null])";
+  auto input = ArrayFromJSON(this->type(), input1);
+
+  StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, /*error_is_null=*/true);
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
 
-  input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
+  options.format = "%d/%m/%Y";
+  this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output2, 
&options);
+
+  input1 = R"(["5/1/2020 %z", null, "12/13/1900 %z"])";
   options.format = "%m/%d/%Y %%z";
   this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, 
&options);
+
+  options.error_is_null = false;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Invalid: Failed to parse string: 
'5/1/202012/13/1900'"),

Review comment:
       I'm still curious why both rows get put into the string when it's 
invalid.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to