pitrou commented on a change in pull request #8777:
URL: https://github.com/apache/arrow/pull/8777#discussion_r533972693



##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -1838,19 +1838,113 @@ Result<std::shared_ptr<Table>> FilterTable(const Table& table, const Datum& filt
   if (table.num_rows() != filter.length()) {
     return Status::Invalid("Filter inputs must all be the same length");
   }
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+
+  // Fetch filter chunks
+  const auto& filter_opts = *static_cast<const FilterOptions*>(options);
+  ArrayDataVector filter_chunks;
+  switch (filter.kind()) {
+    case Datum::ARRAY:
+      filter_chunks.push_back(filter.array());
+      break;
+    case Datum::CHUNKED_ARRAY: {
+      const auto& chunked_array = filter.chunked_array();
+      filter_chunks.reserve(chunked_array->num_chunks());
+      for (const auto& filter_chunk : chunked_array->chunks()) {
+        filter_chunks.push_back(filter_chunk->data());
+      }
+    } break;
+    default:
+      return Status::NotImplemented("Filter should be array-like");
+  }
 
-  // The selection vector "trick" cannot currently be easily applied on Table
-  // because either the filter or the columns may be ChunkedArray, so we use
-  // Filter recursively on the columns for now until a more efficient
-  // implementation of Take with chunked data is available.
-  auto new_columns = table.columns();
-  for (auto& column : new_columns) {
+  // Instead of filtering each column with the boolean filter
+  // (which would be slow if the table has a large number of columns: ARROW-10569),
+  // convert each filter slice to take indices.
+  // We just have to be careful to choose the right filter slice size to match
+  // the table chunking.
+
+  const int num_columns = table.num_columns();

Review comment:
       Very good suggestion, thank you.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to