pitrou commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r689508738
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ int64_t null_count = 0;
+ for (const auto& col : table.columns()) {
+ for (const auto& column_chunk : col->chunks()) {
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
Review comment:
Same comment again as for the record batch variant.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ int64_t null_count = 0;
+ for (const auto& col : table.columns()) {
+ for (const auto& column_chunk : col->chunks()) {
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+ for (const auto& col : table.columns()) {
+ if (col->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+ break;
+ }
+ std::vector<::arrow::internal::Bitmap> bitmaps;
+ std::transform(col->chunks().begin(), col->chunks().end(),
+ std::back_inserter(bitmaps), [](const
std::shared_ptr<Array>& array) {
+ return
::arrow::internal::Bitmap(array->null_bitmap_data(),
+ array->offset(),
array->length());
+ });
+ int64_t global_offset = 0;
+ ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0,
table.num_rows(), true);
Review comment:
Ping on this?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]