westonpace commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r686288305



##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+                         std::shared_ptr<arrow::BooleanArray>* out_array) {
+  auto bitmap_buffer = values.null_bitmap();
+  *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer, nullptr, 0,
+                                              values.offset());
+  return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool* memory_pool,
+                        std::shared_ptr<Array>* output_array) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+  return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool* memory_pool,
+                               std::shared_ptr<ChunkedArray>* output_array) {
+  std::vector<std::shared_ptr<Array>> new_chunks(1);  // Hard-coded 1 for now
+  ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+  *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+  return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>& values,
+                                             ExecContext* ctx) {
+  if (values->null_count() == 0) {
+    return values;
+  }
+  if (values->type()->Equals(arrow::null())) {
+    return std::make_shared<NullArray>(0);
+  }
+  std::shared_ptr<BooleanArray> drop_null_filter;
+  RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(), &drop_null_filter));
+
+  if (drop_null_filter->null_count() == drop_null_filter->length()) {
+    std::shared_ptr<Array> empty_array;
+    RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(), &empty_array));
+    return empty_array;
+  }
+  auto options = FilterOptions::Defaults();
+  ARROW_ASSIGN_OR_RAISE(
+      Datum result,
+      CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)}, &options,
+                   ctx));
+  return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray& values,
+                                                           ExecContext* ctx) {
+  if (values.null_count() == values.length()) {
+    std::shared_ptr<ChunkedArray> empty_array;
+    RETURN_NOT_OK(
+        CreateEmptyChunkedArray(values.type(), ctx->memory_pool(), &empty_array));
+    return empty_array;
+  }
+  std::vector<std::shared_ptr<Array>> new_chunks;
+  for (const auto& chunk : values.chunks()) {
+    ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+    if (new_chunk->length() > 0) {
+      new_chunks.push_back(new_chunk);
+    }
+  }
+  return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& batch,
+                                                         ExecContext* ctx) {
+  int64_t null_count = 0;
+  for (const auto& column : batch.columns()) {
+    null_count += column->null_count();
+  }
+  if (null_count == 0) {
+    return RecordBatch::Make(batch.schema(), batch.num_rows(), batch.columns());
+  }
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(batch.num_rows(), ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+  for (const auto& column : batch.columns()) {
+    if (column->type()->Equals(arrow::null())) {
+      BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+      break;
+    }
+    if (column->null_bitmap_data()) {
+      ::arrow::internal::BitmapAnd(column->null_bitmap_data(), column->offset(),
+                                   dst->data(), 0, column->length(), 0,
+                                   dst->mutable_data());
+    }
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+  if (drop_null_filter->null_count() == batch.num_rows()) {
+    std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+    for (int i = 0; i < batch.num_columns(); i++) {
+      RETURN_NOT_OK(
+          CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(), &empty_batch[i]));
+    }
+    return RecordBatch::Make(batch.schema(), 0, empty_batch);
+  }
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch), Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* ctx) {
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+  int64_t null_count = 0;
+  for (const auto& col : table.columns()) {
+    for (const auto& column_chunk : col->chunks()) {
+      null_count += column_chunk->null_count();
+    }
+  }
+  if (null_count == 0) {
+    return Table::Make(table.schema(), table.columns(), table.num_rows());
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(table.num_rows(), ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+  for (const auto& col : table.columns()) {
+    if (col->type()->Equals(arrow::null())) {
+      BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+      break;
+    }
+    std::vector<::arrow::internal::Bitmap> bitmaps;
+    std::transform(col->chunks().begin(), col->chunks().end(),
+                   std::back_inserter(bitmaps), [](const std::shared_ptr<Array>& array) {
+                     return ::arrow::internal::Bitmap(array->null_bitmap_data(),
+                                                      array->offset(), array->length());
+                   });
+    int64_t global_offset = 0;
+    ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+                          AllocateEmptyBitmap(table.num_rows(), ctx->memory_pool()));
+    BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0, table.num_rows(), true);
+    for (auto bitmap : bitmaps) {
+      if (bitmap.buffer()->data()) {
+        ::arrow::internal::CopyBitmap(bitmap.buffer()->data(), bitmap.offset(),
+                                      bitmap.length(),
+                                      concatenated_bitmap->mutable_data(), global_offset);
+      }
+      global_offset += bitmap.length();
+    }
+    ::arrow::internal::BitmapAnd(concatenated_bitmap->data(), 0, dst->data(), 0,
+                                 table.num_rows(), 0, dst->mutable_data());
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(table.num_rows(), dst, nullptr, 0, 0);
+  if (drop_null_filter->null_count() == table.num_rows()) {
+    std::vector<std::shared_ptr<ChunkedArray>> empty_table(table.num_columns());
+    for (int i = 0; i < table.num_columns(); i++) {
+      std::shared_ptr<Array> empty_array;
+      RETURN_NOT_OK(
+          CreateEmptyArray(table.column(i)->type(), ctx->memory_pool(), &empty_array));
+      empty_table[i] = std::make_shared<ChunkedArray>(ArrayVector{empty_array});
+    }
+    return Table::Make(table.schema(), empty_table, 0);
+  }
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(table), Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.table();
+}
+
+const FunctionDoc drop_null_doc(
+    "Drop nulls from the input",
+    ("The output is populated with values from the input (Array, ChunkedArray, "
+     "RecordBatch, or Table) without the null values."
+     "Note that for the RecordBatch/Table cases, `drop_null` drops the full row if there "
+     "is any null"
+     "null."),

Review comment:
       ```suggestion
        "is any null."),
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to