[GitHub] [arrow] pitrou commented on a change in pull request #10397: ARROW-11930: [C++][Dataset][Compute] Use an ExecPlan for dataset scans

GitBox Wed, 23 Jun 2021 05:48:31 -0700


pitrou commented on a change in pull request #10397:
URL: https://github.com/apache/arrow/pull/10397#discussion_r657029167




##########
File path: cpp/src/arrow/compute/exec/doc/exec_node.md
##########
@@ -0,0 +1,147 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# ExecNodes and logical operators

Review comment:
       I'm not sure I understand the status of this document. If it is meant
       to be a persistent document, can it be part of the Sphinx development docs?

##########
File path: cpp/src/arrow/compute/exec/expression.cc
##########
@@ -61,13 +61,22 @@ Expression call(std::string function, std::vector<Expression> arguments,
   call.function_name = std::move(function);
   call.arguments = std::move(arguments);
   call.options = std::move(options);
+
+  call.hash = std::hash<std::string>{}(call.function_name);
+  for (const auto& arg : call.arguments) {
+    call.hash ^= arg.hash();

Review comment:
       Note that simple XORing is suboptimal (`a,b` will produce the same hash
       as `b,a`, for example).
       You may want to use something like `hash_combine` from `arrow/util/hash_util.h`.
   

##########
File path: cpp/src/arrow/compute/exec/plan_test.cc
##########
@@ -20,27 +20,46 @@
 #include <functional>
 #include <memory>
 
+#include "arrow/compute/exec.h"
 #include "arrow/compute/exec/exec_plan.h"
+#include "arrow/compute/exec/expression.h"
 #include "arrow/compute/exec/test_util.h"
 #include "arrow/record_batch.h"
 #include "arrow/testing/future_util.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/testing/random.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/thread_pool.h"
+#include "arrow/util/vector.h"
 
-namespace arrow {
+using testing::ElementsAre;
+using testing::HasSubstr;
+using testing::UnorderedElementsAreArray;
 
-using internal::Executor;
+namespace arrow {
 
 namespace compute {
 
-void AssertBatchesEqual(const RecordBatchVector& expected,
-                        const RecordBatchVector& actual) {
-  ASSERT_EQ(expected.size(), actual.size());
-  for (size_t i = 0; i < expected.size(); ++i) {
-    AssertBatchesEqual(*expected[i], *actual[i]);
+ExecBatch ExecBatchFromJSON(const std::vector<ValueDescr>& descrs,

Review comment:
       Is this something we want to expose in `arrow/testing` or 
`arrow/compute/test_util.h` perhaps?

##########
File path: cpp/src/arrow/type.cc
##########
@@ -1195,6 +1195,10 @@ std::string FieldRef::ToString() const {
 }
 
 std::vector<FieldPath> FieldRef::FindAll(const Schema& schema) const {
+  if (auto name = this->name()) {
+    return internal::MapVector([](int i) { return FieldPath{i}; },
+                               schema.GetAllFieldIndices(*name));
+  }

Review comment:
       Can you add a test for this?

##########
File path: cpp/src/arrow/util/vector.h
##########
@@ -84,27 +84,49 @@ std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
   return values;
 }
 
-/// \brief Like MapVector, but where the function can fail.
-template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
-          typename To = typename internal::call_traits::return_type<Fn>::ValueType>
-Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& src) {
+template <typename Fn, typename From,
+          typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
   std::vector<To> out;
-  out.reserve(src.size());
-  ARROW_RETURN_NOT_OK(MaybeTransform(src.begin(), src.end(), std::back_inserter(out),
-                                     std::forward<Fn>(map)));
-  return std::move(out);
+  out.reserve(source.size());
+  std::transform(source.begin(), source.end(), std::back_inserter(out),
+                 std::forward<Fn>(map));
+  return out;
 }
 
 template <typename Fn, typename From,
           typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
-std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
+std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
   std::vector<To> out;
   out.reserve(source.size());
-  std::transform(source.begin(), source.end(), std::back_inserter(out),
+  std::transform(std::make_move_iterator(source.begin()),
+                 std::make_move_iterator(source.end()), std::back_inserter(out),
                  std::forward<Fn>(map));
   return out;
 }
 
+/// \brief Like MapVector, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+          typename To = typename internal::call_traits::return_type<Fn>::ValueType>

Review comment:
       Why not use the `decltype(declval)` pattern here as well?

##########
File path: cpp/src/arrow/util/thread_pool_test.cc
##########
@@ -395,6 +395,22 @@ TEST_F(TestThreadPool, StressSpawn) {
   SpawnAdds(pool.get(), 1000, task_add<int>);
 }
 
+TEST_F(TestThreadPool, OwnsCurrentThread) {
+  auto pool = this->MakeThreadPool(30);
+  std::atomic<bool> one_failed{false};
+
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(pool->Spawn([&] {
+      if (pool->OwnsThisThread()) return;
+
+      one_failed = true;
+    }));
+  }
+
+  ASSERT_OK(pool->Shutdown());
+  ASSERT_FALSE(one_failed);

Review comment:
       Also call `pool->OwnsThisThread()` from here?

##########
File path: cpp/src/arrow/compute/exec.h
##########
@@ -28,6 +28,7 @@
 #include <vector>
 
 #include "arrow/array/data.h"
+#include "arrow/compute/exec/expression.h"

Review comment:
       Still a concern probably.

##########
File path: cpp/src/arrow/util/future_test.cc
##########
@@ -1704,5 +1704,45 @@ TEST(FnOnceTest, MoveOnlyDataType) {
   ASSERT_EQ(i0.moves, 0);
   ASSERT_EQ(i1.moves, 0);
 }
+
+TEST(FutureTest, MatcherExamples) {
+  EXPECT_THAT(Future<int>::MakeFinished(Status::Invalid("arbitrary error")),
+              Raises(StatusCode::Invalid));
+
+  EXPECT_THAT(Future<int>::MakeFinished(Status::Invalid("arbitrary error")),
+              Raises(StatusCode::Invalid, testing::HasSubstr("arbitrary")));

Review comment:
       I would find it slightly nicer if `Future` used separate matchers
       (e.g. `FinishesWith`, `Fails`...). But it's not a big deal either way.

##########
File path: cpp/src/arrow/testing/gtest_util.h
##########
@@ -28,6 +28,7 @@
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock-matchers.h>

Review comment:
       Usual concern: does it blow up compile times?
   (`gtest.h` is already a large inclusion that gets processed for every test 
file)

##########
File path: cpp/src/arrow/util/future.h
##########
@@ -976,4 +979,43 @@ Future<BreakValueType> Loop(Iterate iterate) {
   return break_fut;
 }
 
+template <typename T>
+struct EnsureFuture {
+  using type = Future<T>;
+};
+
+template <typename T>
+struct EnsureFuture<Result<T>> {
+  using type = Future<T>;
+};
+
+template <typename T>
+struct EnsureFuture<Future<T>> {
+  using type = Future<T>;
+};
+
+template <>
+struct EnsureFuture<Status> {
+  using type = Future<>;
+};

Review comment:
       Should it be simply:
   ```c++
   template <typename T>
   struct EnsureFuture {
     using type = decltype(ToFuture(std::declval<T>()));
   };
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #10397: ARROW-11930: [C++][Dataset][Compute] Use an ExecPlan for dataset scans

Reply via email to