pitrou commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r687866544
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
Review comment:
It is allowed to create a zero-chunk chunked array, so this shouldn't be
needed.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
Review comment:
Why don't you return `Result<...>` from these helper functions?
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
Review comment:
Here as well, you can use `column->type()->id()`.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
Review comment:
Returning the original batch will be faster.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
Review comment:
Also, if `values.null_count()` is 0, you can probably return the array
unchanged.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
Review comment:
This kind of check is faster when written as `values->type()->id() == Type::ID`.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ int64_t null_count = 0;
+ for (const auto& col : table.columns()) {
+ for (const auto& column_chunk : col->chunks()) {
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+ for (const auto& col : table.columns()) {
+ if (col->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+ break;
+ }
+ std::vector<::arrow::internal::Bitmap> bitmaps;
+ std::transform(col->chunks().begin(), col->chunks().end(),
+ std::back_inserter(bitmaps), [](const
std::shared_ptr<Array>& array) {
+ return
::arrow::internal::Bitmap(array->null_bitmap_data(),
+ array->offset(),
array->length());
+ });
+ int64_t global_offset = 0;
+ ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0,
table.num_rows(), true);
+ for (auto bitmap : bitmaps) {
+ if (bitmap.buffer()->data()) {
+ ::arrow::internal::CopyBitmap(bitmap.buffer()->data(), bitmap.offset(),
+ bitmap.length(),
+ concatenated_bitmap->mutable_data(),
global_offset);
+ }
+ global_offset += bitmap.length();
+ }
+ ::arrow::internal::BitmapAnd(concatenated_bitmap->data(), 0, dst->data(),
0,
+ table.num_rows(), 0, dst->mutable_data());
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(table.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == table.num_rows()) {
+ std::vector<std::shared_ptr<ChunkedArray>>
empty_table(table.num_columns());
+ for (int i = 0; i < table.num_columns(); i++) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyArray(table.column(i)->type(), ctx->memory_pool(),
&empty_array));
+ empty_table[i] =
std::make_shared<ChunkedArray>(ArrayVector{empty_array});
+ }
+ return Table::Make(table.schema(), empty_table, 0);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(table),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.table();
Review comment:
You could return a `Datum` from all these `drop_null` implementations,
instead of converting the Datum to a table and then back to a Datum.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
Review comment:
Why?
1) The filter array created by `GetDropNullFilter` doesn't have any nulls.
2) It's better to check this _before_ creating the filter, i.e. `if
(values->null_count() == values->length()) ...`
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
Review comment:
Nit: `std::move(empty_batch)`
##########
File path: python/pyarrow/compute.py
##########
@@ -590,6 +590,38 @@ def take(data, indices, *, boundscheck=True,
memory_pool=None):
return call_function('take', [data, indices], options, memory_pool)
+def drop_null(data, *, memory_pool=None):
Review comment:
This shouldn't be necessary unless you want to craft a special function
signature (which doesn't seem to be the case here).
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ int64_t null_count = 0;
+ for (const auto& col : table.columns()) {
+ for (const auto& column_chunk : col->chunks()) {
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
Review comment:
Better to return the original table, here and above.
##########
File path: python/pyarrow/table.pxi
##########
@@ -397,6 +397,13 @@ cdef class ChunkedArray(_PandasConvertible):
"""
return _pc().take(self, indices)
+ def drop_null(self):
+ """
+ Remove missing values from a chunked array.
+ See pyarrow.compute.drop_null for full usage.
Review comment:
"for full description" perhaps, because the function signature (hence
usage) is trivial?
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ int64_t null_count = 0;
+ for (const auto& col : table.columns()) {
+ for (const auto& column_chunk : col->chunks()) {
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+ for (const auto& col : table.columns()) {
+ if (col->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+ break;
+ }
+ std::vector<::arrow::internal::Bitmap> bitmaps;
+ std::transform(col->chunks().begin(), col->chunks().end(),
+ std::back_inserter(bitmaps), [](const
std::shared_ptr<Array>& array) {
+ return
::arrow::internal::Bitmap(array->null_bitmap_data(),
+ array->offset(),
array->length());
+ });
+ int64_t global_offset = 0;
+ ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0,
table.num_rows(), true);
Review comment:
Hmm... is this necessary? `BitmapAnd` allows you to pass arbitrary input
offsets, so you shouldn't need to concatenate all source bitmap chunks for the
column.
##########
File path: python/pyarrow/table.pxi
##########
@@ -956,6 +963,13 @@ cdef class RecordBatch(_PandasConvertible):
"""
return _pc().take(self, indices)
+ def drop_null(self):
+ """
+ Remove missing values from an RecordBatch.
Review comment:
I would say "a record batch" or "a RecordBatch".
##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1734,5 +1735,551 @@ TEST(TestTake, RandomFixedSizeBinary) {
TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(16));
}
+// ----------------------------------------------------------------------
+// DropNull tests
+
+void AssertDropNullArrays(const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Array>& expected) {
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> actual, DropNull(*values));
+ ValidateOutput(actual);
+ AssertArraysEqual(*expected, *actual, /*verbose=*/true);
+}
+
+Status DropNullJSON(const std::shared_ptr<DataType>& type, const std::string&
values,
+ std::shared_ptr<Array>* out) {
+ return DropNull(*ArrayFromJSON(type, values)).Value(out);
+}
+
+void CheckDropNull(const std::shared_ptr<DataType>& type, const std::string&
values,
+ const std::string& expected) {
+ std::shared_ptr<Array> actual;
+
+ ASSERT_OK(DropNullJSON(type, values, &actual));
+ ValidateOutput(actual);
+ AssertArraysEqual(*ArrayFromJSON(type, expected), *actual, /*verbose=*/true);
+}
+
+struct TestDropNullKernel : public ::testing::Test {
+ void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr<Array>&
values) {
+ ASSERT_EQ(values->null_count(), 0);
+ auto expected = (*DropNull(values)).make_array();
+
+ auto new_values = MakeArray(values->data()->Copy());
+ new_values->data()->buffers[0].reset();
+ new_values->data()->null_count = kUnknownNullCount;
+ auto result = (*DropNull(new_values)).make_array();
+ AssertArraysEqual(*expected, *result);
+ }
+
+ void TestNoValidityBitmapButUnknownNullCount(const
std::shared_ptr<DataType>& type,
+ const std::string& values) {
+ TestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values));
+ }
+};
+
+TEST_F(TestDropNullKernel, DropNull) {
+ CheckDropNull(null(), "[null, null, null]", "[]");
+ CheckDropNull(null(), "[null]", "[]");
+}
+
+TEST_F(TestDropNullKernel, DropNullBoolean) {
+ CheckDropNull(boolean(), "[true, false, true]", "[true, false, true]");
+ CheckDropNull(boolean(), "[null, false, true]", "[false, true]");
Review comment:
Ideally, should also check with an empty array and a nulls-only array.
##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -974,6 +974,82 @@ def test_take_null_type():
assert len(table.take(indices).column(0)) == 4
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_drop_null(ty, values):
+ arr = pa.array(values, type=ty)
+ result = arr.drop_null()
+ result.validate()
Review comment:
`result.validate(full=True)` is more thorough.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1734,5 +1735,551 @@ TEST(TestTake, RandomFixedSizeBinary) {
TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(16));
}
+// ----------------------------------------------------------------------
+// DropNull tests
+
+void AssertDropNullArrays(const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Array>& expected) {
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<Array> actual, DropNull(*values));
+ ValidateOutput(actual);
+ AssertArraysEqual(*expected, *actual, /*verbose=*/true);
+}
+
+Status DropNullJSON(const std::shared_ptr<DataType>& type, const std::string&
values,
+ std::shared_ptr<Array>* out) {
+ return DropNull(*ArrayFromJSON(type, values)).Value(out);
+}
+
+void CheckDropNull(const std::shared_ptr<DataType>& type, const std::string&
values,
+ const std::string& expected) {
+ std::shared_ptr<Array> actual;
+
+ ASSERT_OK(DropNullJSON(type, values, &actual));
+ ValidateOutput(actual);
+ AssertArraysEqual(*ArrayFromJSON(type, expected), *actual, /*verbose=*/true);
+}
+
+struct TestDropNullKernel : public ::testing::Test {
+ void TestNoValidityBitmapButUnknownNullCount(const std::shared_ptr<Array>&
values) {
+ ASSERT_EQ(values->null_count(), 0);
+ auto expected = (*DropNull(values)).make_array();
+
+ auto new_values = MakeArray(values->data()->Copy());
+ new_values->data()->buffers[0].reset();
+ new_values->data()->null_count = kUnknownNullCount;
+ auto result = (*DropNull(new_values)).make_array();
+ AssertArraysEqual(*expected, *result);
+ }
+
+ void TestNoValidityBitmapButUnknownNullCount(const
std::shared_ptr<DataType>& type,
+ const std::string& values) {
+ TestNoValidityBitmapButUnknownNullCount(ArrayFromJSON(type, values));
+ }
+};
+
+TEST_F(TestDropNullKernel, DropNull) {
+ CheckDropNull(null(), "[null, null, null]", "[]");
+ CheckDropNull(null(), "[null]", "[]");
+}
+
+TEST_F(TestDropNullKernel, DropNullBoolean) {
+ CheckDropNull(boolean(), "[true, false, true]", "[true, false, true]");
+ CheckDropNull(boolean(), "[null, false, true]", "[false, true]");
+
+ TestNoValidityBitmapButUnknownNullCount(boolean(), "[true, false, true]");
+}
+
+template <typename ArrowType>
+struct TestDropNullKernelTyped : public TestDropNullKernel {
+ TestDropNullKernelTyped() : rng_(seed_) {}
+
+ template <typename OffsetType>
+ std::vector<OffsetType> Offsets(int32_t length, int32_t slice_count) {
+ std::vector<OffsetType> offsets(static_cast<std::size_t>(slice_count + 1));
+ std::default_random_engine gen(seed_);
+ std::uniform_int_distribution<OffsetType> dist(0, length);
+ std::generate(offsets.begin(), offsets.end(), [&] { return dist(gen); });
+ std::sort(offsets.begin(), offsets.end());
+ return offsets;
+ }
Review comment:
Can probably reuse `RandomArrayGenerator::Offsets` instead.
##########
File path: python/pyarrow/tests/test_compute.py
##########
@@ -974,6 +974,82 @@ def test_take_null_type():
assert len(table.take(indices).column(0)) == 4
+@pytest.mark.parametrize(('ty', 'values'), all_array_types)
+def test_drop_null(ty, values):
+ arr = pa.array(values, type=ty)
+ result = arr.drop_null()
+ result.validate()
+ indices = [i for i in range(len(arr)) if arr[i].is_valid]
+ expected = arr.take(pa.array(indices))
+ assert result.equals(expected)
+
+
+def test_drop_null_chunked_array():
+ arr = pa.chunked_array([["a", None], ["c", "d", None], [None], []])
+ expected_drop = pa.chunked_array([["a"], ["c", "d"], [], []])
+
+ result = arr.drop_null()
+ assert result.equals(expected_drop)
+
+
+def test_drop_null_record_batch():
+ batch = pa.record_batch(
+ [pa.array(["a", None, "c", "d", None])], names=["a'"])
+ result = batch.drop_null()
+ expected = pa.record_batch([pa.array(["a", "c", "d"])], names=["a'"])
+ assert result.equals(expected)
+
+ batch = pa.record_batch(
+ [pa.array(["a", None, "c", "d", None]),
+ pa.array([None, None, "c", None, "e"])], names=["a'", "b'"])
+
+ result = batch.drop_null()
+ expected = pa.record_batch(
+ [pa.array(["c"]), pa.array(["c"])], names=["a'", "b'"])
+ assert result.equals(expected)
+
+
+def test_drop_null_table():
+ table = pa.table([pa.array(["a", None, "c", "d", None])], names=["a"])
+ expected = pa.table([pa.array(["a", "c", "d"])], names=["a"])
+ result = table.drop_null()
+ assert result.equals(expected)
+
+ table = pa.table([pa.chunked_array([["a", None], ["c", "d", None]]),
+ pa.chunked_array([["a", None], [None, "d", None]]),
+ pa.chunked_array([["a"], ["b"], [None], ["d", None]])],
+ names=["a", "b", "c"])
+ expected = pa.table([pa.array(["a", "d"]),
+ pa.array(["a", "d"]),
+ pa.array(["a", "d"])],
+ names=["a", "b", "c"])
+ result = table.drop_null()
+ assert result.equals(expected)
+
+ table = pa.table([pa.chunked_array([["a", "b"], ["c", "d", "e"]]),
+ pa.chunked_array([["a"], ["b"], [None], ["d", None]]),
+ pa.chunked_array([["a", None], ["c", "d", None]])],
+ names=["a", "b", "c"])
+ expected = pa.table([pa.array(["a", "d"]),
+ pa.array(["a", "d"]),
+ pa.array(["a", "d"])],
+ names=["a", "b", "c"])
Review comment:
Hmm, you could probably make the values different in each column, though
this is already tested on the C++ side, so probably not very important.
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<ChunkedArray>* output_array) {
+ std::vector<std::shared_ptr<Array>> new_chunks(1); // Hard-coded 1 for now
+ ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+ *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ if (values.null_count() == values.length()) {
+ std::shared_ptr<ChunkedArray> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyChunkedArray(values.type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ std::vector<std::shared_ptr<Array>> new_chunks;
+ for (const auto& chunk : values.chunks()) {
+ ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+ if (new_chunk->length() > 0) {
+ new_chunks.push_back(new_chunk);
+ }
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (const auto& column : batch.columns()) {
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+ for (const auto& column : batch.columns()) {
+ if (column->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+ break;
+ }
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ if (drop_null_filter->null_count() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ int64_t null_count = 0;
+ for (const auto& col : table.columns()) {
+ for (const auto& column_chunk : col->chunks()) {
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+ for (const auto& col : table.columns()) {
+ if (col->type()->Equals(arrow::null())) {
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+ break;
+ }
+ std::vector<::arrow::internal::Bitmap> bitmaps;
+ std::transform(col->chunks().begin(), col->chunks().end(),
+ std::back_inserter(bitmaps), [](const
std::shared_ptr<Array>& array) {
+ return
::arrow::internal::Bitmap(array->null_bitmap_data(),
+ array->offset(),
array->length());
+ });
+ int64_t global_offset = 0;
+ ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0,
table.num_rows(), true);
Review comment:
Another possibility entirely is to use `TableBatchReader` to get a
vector of record batches spanning the table, then call drop_null on each record
batch, and finally use `Table::FromRecordBatches` to build the resulting table.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]