westonpace commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r684345342
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+                         std::shared_ptr<arrow::BooleanArray>* out_array) {
+  auto bitmap_buffer = values.null_bitmap();
+  *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer, nullptr, 0,
+                                              values.offset());
+  return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool* memory_pool,
+                        std::shared_ptr<Array>* output_array) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+  return Status::OK();
+}
Review comment:
This seems like a very useful utility function. I wonder if we want to
put it somewhere more visible. There could be a CreateEmptyRecordBatch and
CreateEmptyTable too. Can be done in a follow-up PR.
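For illustration only, here is a rough sketch of what such helpers might look like, built on top of the CreateEmptyArray helper from this diff. The names CreateEmptyRecordBatch / CreateEmptyTable and their signatures are hypothetical, not existing Arrow APIs, and they assume CreateEmptyArray would be visible from wherever they end up living:

```cpp
// Hypothetical helpers sketched for discussion; not part of this PR.
#include <memory>
#include <utility>
#include <vector>

#include "arrow/array.h"
#include "arrow/memory_pool.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"

namespace arrow {

// Build a zero-row RecordBatch whose columns are empty arrays of the schema's types.
Result<std::shared_ptr<RecordBatch>> CreateEmptyRecordBatch(
    std::shared_ptr<Schema> schema, MemoryPool* memory_pool) {
  std::vector<std::shared_ptr<Array>> columns(schema->num_fields());
  for (int i = 0; i < schema->num_fields(); ++i) {
    // CreateEmptyArray is the helper introduced in this PR.
    RETURN_NOT_OK(CreateEmptyArray(schema->field(i)->type(), memory_pool, &columns[i]));
  }
  return RecordBatch::Make(std::move(schema), /*num_rows=*/0, std::move(columns));
}

// Build a zero-row Table; each column is a single-chunk, empty ChunkedArray.
Result<std::shared_ptr<Table>> CreateEmptyTable(std::shared_ptr<Schema> schema,
                                                MemoryPool* memory_pool) {
  std::vector<std::shared_ptr<ChunkedArray>> columns(schema->num_fields());
  for (int i = 0; i < schema->num_fields(); ++i) {
    std::shared_ptr<Array> empty_array;
    RETURN_NOT_OK(CreateEmptyArray(schema->field(i)->type(), memory_pool, &empty_array));
    columns[i] = std::make_shared<ChunkedArray>(ArrayVector{std::move(empty_array)});
  }
  return Table::Make(std::move(schema), std::move(columns), /*num_rows=*/0);
}

}  // namespace arrow
```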
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+                         std::shared_ptr<arrow::BooleanArray>* out_array) {
+  auto bitmap_buffer = values.null_bitmap();
+  *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer, nullptr, 0,
+                                              values.offset());
+  return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool* memory_pool,
+                        std::shared_ptr<Array>* output_array) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+  return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>& values,
+                                             ExecContext* ctx) {
+  if (values->null_count() == 0) {
+    return values;
+  }
+  if (values->type()->Equals(arrow::null())) {
+    return std::make_shared<NullArray>(0);
+  }
+  std::shared_ptr<BooleanArray> drop_null_filter;
+  RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(), &drop_null_filter));
+
+  if (drop_null_filter->null_count() == drop_null_filter->length()) {
+    std::shared_ptr<Array> empty_array;
+    RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(), &empty_array));
+    return empty_array;
+  }
+  auto options = FilterOptions::Defaults();
+  ARROW_ASSIGN_OR_RAISE(
+      Datum result,
+      CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)}, &options,
+                   ctx));
+  return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray& values,
+                                                           ExecContext* ctx) {
+  auto num_chunks = values.num_chunks();
+  std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+  for (int i = 0; i < num_chunks; i++) {
+    ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+  }
+  return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& batch,
+                                                         ExecContext* ctx) {
+  int64_t null_count = 0;
+  for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+    const auto& column = batch.column(col_index);
+    null_count += column->null_count();
+  }
+  if (null_count == 0) {
+    return RecordBatch::Make(batch.schema(), batch.num_rows(), batch.columns());
+  }
+  if (null_count / batch.num_columns() == batch.num_rows()) {
+    std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+    for (int i = 0; i < batch.num_columns(); i++) {
+      RETURN_NOT_OK(
+          CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(), &empty_batch[i]));
+    }
+    return RecordBatch::Make(batch.schema(), 0, empty_batch);
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(batch.num_rows(), ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+
+  for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+    const auto& column = batch.column(col_index);
+    if (column->null_bitmap_data()) {
+      ::arrow::internal::BitmapAnd(column->null_bitmap_data(), column->offset(),
+                                   dst->data(), 0, column->length(), 0,
+                                   dst->mutable_data());
+    }
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch), Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.record_batch();
+}
+
+using ::arrow::internal::Bitmap;
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* ctx) {
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+  const int num_columns = table.num_columns();
+  int64_t null_count = 0;
+  for (int col_index = 0; col_index < num_columns; ++col_index) {
+    const ArrayVector& chunks = table.column(col_index)->chunks();
+    for (size_t chunk_index = 0; chunk_index < chunks.size(); ++chunk_index) {
+      const auto& column_chunk = chunks[chunk_index];
+      null_count += column_chunk->null_count();
+    }
+  }
+  if (null_count == 0) {
+    return Table::Make(table.schema(), table.columns(), table.num_rows());
+  }
+  if (null_count / table.num_columns() == table.num_rows()) {
+    std::vector<std::shared_ptr<ChunkedArray>> empty_table(table.num_columns());
+    for (int i = 0; i < table.num_columns(); i++) {
+      std::shared_ptr<Array> empty_array;
+      RETURN_NOT_OK(
+          CreateEmptyArray(table.column(i)->type(), ctx->memory_pool(), &empty_array));
+      empty_table[i] = std::make_shared<ChunkedArray>(ArrayVector{empty_array});
+    }
+    return Table::Make(table.schema(), empty_table, 0);
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(table.num_rows(), ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+  for (int col_index = 0; col_index < num_columns; ++col_index) {
+    const ArrayVector& chunks = table.column(col_index)->chunks();
+    std::vector<Bitmap> bitmaps(chunks.size());
+    for (size_t chunk_index = 0; chunk_index < chunks.size(); ++chunk_index) {
+      const auto& column_chunk = chunks[chunk_index];
+      bitmaps[chunk_index] = Bitmap(column_chunk->null_bitmap_data(),
+                                    column_chunk->offset(), column_chunk->length());
+    }
+    int64_t bitmap_offset = 0;
+    ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+                          AllocateEmptyBitmap(table.num_rows(), ctx->memory_pool()));
+    BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0, table.num_rows(), true);
+
+    for (auto bitmap : bitmaps) {
+      if (bitmap.buffer()->data()) {
+        ::arrow::internal::CopyBitmap(bitmap.buffer()->data(), bitmap.offset(),
+                                      bitmap.length(),
+                                      concatenated_bitmap->mutable_data(), bitmap_offset);
+      }
Review comment:
This can be done in a follow-up but I wonder if this copy could be
avoided by passing offsets to BitmapAnd?
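To sketch the idea (untested, and assuming BitmapAnd is safe to call with the output buffer aliasing its second input): the per-column inner loop in DropNullTable could AND each chunk's validity bitmap straight into `dst` at the chunk's row position by using BitmapAnd's offset parameters, skipping the intermediate concatenated buffer. Chunks with no validity buffer have no nulls and can be left alone, since `dst` starts out all-set. The variables `chunks` and `dst` here are the ones already in the function above:

```cpp
// Sketch only: fold each chunk's validity bitmap into `dst` in place,
// using BitmapAnd's left/right/out offsets instead of CopyBitmap.
int64_t row_offset = 0;
for (const auto& column_chunk : chunks) {
  if (column_chunk->null_bitmap_data() != nullptr) {
    ::arrow::internal::BitmapAnd(column_chunk->null_bitmap_data(), column_chunk->offset(),
                                 dst->data(), row_offset, column_chunk->length(),
                                 row_offset, dst->mutable_data());
  }
  row_offset += column_chunk->length();
}
```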
##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -974,6 +974,83 @@ def test_take_null_type():
assert len(table.take(indices).column(0)) == 4
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_drop_null(ty, values):
+    arr = pa.array(values, type=ty)
+    result = arr.drop_null()
+    result.validate()
+    indices = [i for i in range(len(arr)) if arr[i].is_valid]
+    expected = arr.take(pa.array(indices))
+    assert result.equals(expected)
+
+
+def test_drop_null_chunked_array():
+    arr = pa.chunked_array([["a", None], ["c", "d", None]])
+    expected_drop = pa.chunked_array([["a"], ["c", "d"]])
+    result = arr.drop_null()
+    assert result.equals(expected_drop)
+
+
+def test_drop_null_record_batch():
+    batch = pa.record_batch(
+        [pa.array(["a", None, "c", "d", None])], names=["a'"])
+    result = batch.drop_null()
+    expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
+    assert result.equals(expected)
+
+    batch = pa.record_batch(
+        [pa.array(["a", None, "c", "d", None]),
+         pa.array([None, None, "c", None, "e"])], names=["a'", "b'"])
+
+    result = batch.drop_null()
+    expected = pa.record_batch(
+        [pa.array(["c"]), pa.array(["c"])], names=["a'", "b'"])
+    print(result["a'"])
+    print(expected["a'"])
Review comment:
Remove these prints
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+                         std::shared_ptr<arrow::BooleanArray>* out_array) {
+  auto bitmap_buffer = values.null_bitmap();
+  *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer, nullptr, 0,
+                                              values.offset());
+  return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool* memory_pool,
+                        std::shared_ptr<Array>* output_array) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+  return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>& values,
+                                             ExecContext* ctx) {
+  if (values->null_count() == 0) {
+    return values;
+  }
+  if (values->type()->Equals(arrow::null())) {
+    return std::make_shared<NullArray>(0);
+  }
+  std::shared_ptr<BooleanArray> drop_null_filter;
+  RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(), &drop_null_filter));
+
+  if (drop_null_filter->null_count() == drop_null_filter->length()) {
+    std::shared_ptr<Array> empty_array;
+    RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(), &empty_array));
+    return empty_array;
+  }
+  auto options = FilterOptions::Defaults();
+  ARROW_ASSIGN_OR_RAISE(
+      Datum result,
+      CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)}, &options,
+                   ctx));
+  return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray& values,
+                                                           ExecContext* ctx) {
+  auto num_chunks = values.num_chunks();
+  std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+  for (int i = 0; i < num_chunks; i++) {
+    ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+  }
+  return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& batch,
+                                                         ExecContext* ctx) {
+  int64_t null_count = 0;
+  for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+    const auto& column = batch.column(col_index);
+    null_count += column->null_count();
+  }
+  if (null_count == 0) {
+    return RecordBatch::Make(batch.schema(), batch.num_rows(), batch.columns());
+  }
+  if (null_count / batch.num_columns() == batch.num_rows()) {
+    std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+    for (int i = 0; i < batch.num_columns(); i++) {
+      RETURN_NOT_OK(
+          CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(), &empty_batch[i]));
+    }
+    return RecordBatch::Make(batch.schema(), 0, empty_batch);
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(batch.num_rows(), ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+
+  for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+    const auto& column = batch.column(col_index);
+    if (column->null_bitmap_data()) {
+      ::arrow::internal::BitmapAnd(column->null_bitmap_data(), column->offset(),
+                                   dst->data(), 0, column->length(), 0,
+                                   dst->mutable_data());
+    }
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch), Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.record_batch();
+}
+
+using ::arrow::internal::Bitmap;
Review comment:
Nit: Put this in the method or at the top of the namespace declaration.
##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -974,6 +974,83 @@ def test_take_null_type():
assert len(table.take(indices).column(0)) == 4
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_drop_null(ty, values):
+    arr = pa.array(values, type=ty)
+    result = arr.drop_null()
+    result.validate()
+    indices = [i for i in range(len(arr)) if arr[i].is_valid]
+    expected = arr.take(pa.array(indices))
+    assert result.equals(expected)
+
+
+def test_drop_null_chunked_array():
+    arr = pa.chunked_array([["a", None], ["c", "d", None]])
+    expected_drop = pa.chunked_array([["a"], ["c", "d"]])
Review comment:
```suggestion
arr = pa.chunked_array([["a", None], ["c", "d", None], [None], []])
expected_drop = pa.chunked_array([["a"], ["c", "d"], [], []])
```