jorisvandenbossche commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r685851827



##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,216 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+                         std::shared_ptr<arrow::BooleanArray>* out_array) {
+  auto bitmap_buffer = values.null_bitmap();
+  *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer, 
nullptr, 0,
+                                              values.offset());
+  return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool* 
memory_pool,
+                        std::shared_ptr<Array>* output_array) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+  return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool* 
memory_pool,
+                               std::shared_ptr<ChunkedArray>* output_array) {
+  std::vector<std::shared_ptr<Array>> new_chunks(1);  // Hard-coded 1 for now
+  ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+  *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+  return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>& 
values,
+                                             ExecContext* ctx) {
+  if (values->null_count() == 0) {
+    return values;
+  }
+  if (values->type()->Equals(arrow::null())) {
+    return std::make_shared<NullArray>(0);
+  }
+  std::shared_ptr<BooleanArray> drop_null_filter;
+  RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(), 
&drop_null_filter));
+
+  if (drop_null_filter->null_count() == drop_null_filter->length()) {
+    std::shared_ptr<Array> empty_array;
+    RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(), 
&empty_array));
+    return empty_array;
+  }
+  auto options = FilterOptions::Defaults();
+  ARROW_ASSIGN_OR_RAISE(
+      Datum result,
+      CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)}, 
&options,
+                   ctx));
+  return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray& 
values,
+                                                           ExecContext* ctx) {
+  if (values.null_count() == values.length()) {
+    std::shared_ptr<ChunkedArray> empty_array;
+    RETURN_NOT_OK(
+        CreateEmptyChunkedArray(values.type(), ctx->memory_pool(), 
&empty_array));
+    return empty_array;
+  }
+  std::vector<std::shared_ptr<Array>> new_chunks;
+  for (const auto& chunk : values.chunks()) {
+    ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+    if (new_chunk->length() > 0) {
+      new_chunks.push_back(new_chunk);
+    }
+  }
+  return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& 
batch,
+                                                         ExecContext* ctx) {
+  int64_t null_count = 0;
+  for (const auto& column : batch.columns()) {
+    null_count += column->null_count();
+  }
+  if (null_count == 0) {
+    return RecordBatch::Make(batch.schema(), batch.num_rows(), 
batch.columns());
+  }
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(batch.num_rows(), 
ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+  for (const auto& column : batch.columns()) {
+    if (column->type()->Equals(arrow::null())) {
+      BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+      break;
+    }
+    if (column->null_bitmap_data()) {
+      ::arrow::internal::BitmapAnd(column->null_bitmap_data(), 
column->offset(),
+                                   dst->data(), 0, column->length(), 0,
+                                   dst->mutable_data());
+    }
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+  if (drop_null_filter->null_count() == batch.num_rows()) {
+    std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+    for (int i = 0; i < batch.num_columns(); i++) {
+      RETURN_NOT_OK(
+          CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(), 
&empty_batch[i]));
+    }
+    return RecordBatch::Make(batch.schema(), 0, empty_batch);
+  }
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch), 
Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* 
ctx) {
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+  int64_t null_count = 0;
+  for (const auto& col : table.columns()) {
+    for (const auto& column_chunk : col->chunks()) {
+      null_count += column_chunk->null_count();
+    }
+  }
+  if (null_count == 0) {
+    return Table::Make(table.schema(), table.columns(), table.num_rows());
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(table.num_rows(), 
ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+  for (const auto& col : table.columns()) {
+    if (col->type()->Equals(arrow::null())) {
+      BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+      break;
+    }
+    std::vector<::arrow::internal::Bitmap> bitmaps;
+    std::transform(col->chunks().begin(), col->chunks().end(),
+                   std::back_inserter(bitmaps), [](const 
std::shared_ptr<Array>& array) {
+                     return 
::arrow::internal::Bitmap(array->null_bitmap_data(),
+                                                      array->offset(), 
array->length());
+                   });
+    int64_t global_offset = 0;
+    ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+                          AllocateEmptyBitmap(table.num_rows(), 
ctx->memory_pool()));
+    BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0, 
table.num_rows(), true);
+    for (auto bitmap : bitmaps) {
+      if (bitmap.buffer()->data()) {
+        ::arrow::internal::CopyBitmap(bitmap.buffer()->data(), bitmap.offset(),
+                                      bitmap.length(),
+                                      concatenated_bitmap->mutable_data(), 
global_offset);
+      }
+      global_offset += bitmap.length();
+    }
+    ::arrow::internal::BitmapAnd(concatenated_bitmap->data(), 0, dst->data(), 
0,
+                                 table.num_rows(), 0, dst->mutable_data());
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(table.num_rows(), dst, nullptr, 0, 0);
+  if (drop_null_filter->null_count() == table.num_rows()) {
+    std::vector<std::shared_ptr<ChunkedArray>> 
empty_table(table.num_columns());
+    for (int i = 0; i < table.num_columns(); i++) {
+      std::shared_ptr<Array> empty_array;
+      RETURN_NOT_OK(
+          CreateEmptyArray(table.column(i)->type(), ctx->memory_pool(), 
&empty_array));
+      empty_table[i] = 
std::make_shared<ChunkedArray>(ArrayVector{empty_array});
+    }
+    return Table::Make(table.schema(), empty_table, 0);
+  }
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(table), 
Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.table();
+}
+
+const FunctionDoc drop_null_doc(
+    "Drop Null kernel",
+    ("The output is populated with values from the input (Array, ChunkedArray, 
"
+     "RecordBatch, or Table) without the null values"),

Review comment:
       I think it would be good to explicitly describe the behaviour for the 
RecordBatch/Table cases, i.e. that it drops the full row if any column has a 
null in that row (there are multiple options here: it could also have dropped a 
row only when "all" of its values are null, instead of "any").

##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2273,27 +2311,20 @@ Result<std::shared_ptr<Table>> DropNullTable(const 
Table& table, ExecContext* ct
     }
     return Table::Make(table.schema(), empty_table, 0);
   }
-  std::shared_ptr<arrow::Array> indices;
-  arrow::NumericBuilder<arrow::Int32Type> builder(ctx->memory_pool());
-  RETURN_NOT_OK(
-      builder.Reserve(static_cast<int64_t>(table.num_rows() - 
notnull_indices.size())));
-  for (int64_t row_index = 0; row_index < table.num_rows(); ++row_index) {
-    if (notnull_indices.find(static_cast<int32_t>(row_index)) == 
notnull_indices.end()) {
-      builder.UnsafeAppend(static_cast<int32_t>(row_index));
-    }
-  }
-  RETURN_NOT_OK(builder.Finish(&indices));
-  return TakeTA(table, *indices, TakeOptions::Defaults(), ctx);
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(table), 
Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.table();
 }
 
-const FunctionDoc dropnull_doc(
-    "DropNull kernel",
-    ("The output is populated with values from the input without the null 
values"),
+const FunctionDoc drop_null_doc(
+    "Drop Null kernel",

Review comment:
   ```suggestion
       "Drop nulls from the input",
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to