nirandaperera commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r684311052
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ auto num_chunks = values.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ null_count += column->null_count();
+ }
Review comment:
nit:
```suggestion
for (const auto& column: batch.columns()) {
null_count += column->null_count();
}
```
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ auto num_chunks = values.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ if (null_count / batch.num_columns() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+using ::arrow::internal::Bitmap;
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ const int num_columns = table.num_columns();
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < num_columns; ++col_index) {
+ const ArrayVector& chunks = table.column(col_index)->chunks();
+ for (size_t chunk_index = 0; chunk_index < chunks.size(); ++chunk_index) {
+ const auto& column_chunk = chunks[chunk_index];
+ null_count += column_chunk->null_count();
+ }
+ }
Review comment:
nit: range loops are much cleaner, and aesthetically pleasing :wink:
```suggestion
for (const auto& col: table.columns()) {
for (const auto& column_chunk: col->chunks()) {
null_count += column_chunk->null_count();
}
}
```
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ auto num_chunks = values.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ if (null_count / batch.num_columns() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+using ::arrow::internal::Bitmap;
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ const int num_columns = table.num_columns();
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < num_columns; ++col_index) {
+ const ArrayVector& chunks = table.column(col_index)->chunks();
+ for (size_t chunk_index = 0; chunk_index < chunks.size(); ++chunk_index) {
+ const auto& column_chunk = chunks[chunk_index];
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
+ }
+ if (null_count / table.num_columns() == table.num_rows()) {
+ std::vector<std::shared_ptr<ChunkedArray>>
empty_table(table.num_columns());
+ for (int i = 0; i < table.num_columns(); i++) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyArray(table.column(i)->type(), ctx->memory_pool(),
&empty_array));
+ empty_table[i] =
std::make_shared<ChunkedArray>(ArrayVector{empty_array});
+ }
+ return Table::Make(table.schema(), empty_table, 0);
+ }
Review comment:
same comment as in the record batch
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ auto num_chunks = values.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ if (null_count / batch.num_columns() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(batch.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ if (column->null_bitmap_data()) {
+ ::arrow::internal::BitmapAnd(column->null_bitmap_data(),
column->offset(),
+ dst->data(), 0, column->length(), 0,
+ dst->mutable_data());
+ }
+ }
+ auto drop_null_filter =
+ std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+ ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch),
Datum(drop_null_filter),
+ FilterOptions::Defaults(), ctx));
+ return result.record_batch();
+}
+
+using ::arrow::internal::Bitmap;
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext*
ctx) {
+ if (table.num_rows() == 0) {
+ return Table::Make(table.schema(), table.columns(), 0);
+ }
+ const int num_columns = table.num_columns();
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < num_columns; ++col_index) {
+ const ArrayVector& chunks = table.column(col_index)->chunks();
+ for (size_t chunk_index = 0; chunk_index < chunks.size(); ++chunk_index) {
+ const auto& column_chunk = chunks[chunk_index];
+ null_count += column_chunk->null_count();
+ }
+ }
+ if (null_count == 0) {
+ return Table::Make(table.schema(), table.columns(), table.num_rows());
+ }
+ if (null_count / table.num_columns() == table.num_rows()) {
+ std::vector<std::shared_ptr<ChunkedArray>>
empty_table(table.num_columns());
+ for (int i = 0; i < table.num_columns(); i++) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(
+ CreateEmptyArray(table.column(i)->type(), ctx->memory_pool(),
&empty_array));
+ empty_table[i] =
std::make_shared<ChunkedArray>(ArrayVector{empty_array});
+ }
+ return Table::Make(table.schema(), empty_table, 0);
+ }
+
+ ARROW_ASSIGN_OR_RAISE(auto dst,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+ for (int col_index = 0; col_index < num_columns; ++col_index) {
+ const ArrayVector& chunks = table.column(col_index)->chunks();
+ std::vector<Bitmap> bitmaps(chunks.size());
+ for (size_t chunk_index = 0; chunk_index < chunks.size(); ++chunk_index) {
+ const auto& column_chunk = chunks[chunk_index];
+ bitmaps[chunk_index] = Bitmap(column_chunk->null_bitmap_data(),
+ column_chunk->offset(),
column_chunk->length());
+ }
+ int64_t bitmap_offset = 0;
+ ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+ AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
+ BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0,
table.num_rows(), true);
+
+ for (auto bitmap : bitmaps) {
+ if (bitmap.buffer()->data()) {
+ ::arrow::internal::CopyBitmap(bitmap.buffer()->data(), bitmap.offset(),
+ bitmap.length(),
+ concatenated_bitmap->mutable_data(),
bitmap_offset);
+ }
+ bitmap_offset += bitmap.length();
+ }
+ ::arrow::internal::BitmapAnd(concatenated_bitmap->data(), 0, dst->data(),
0,
+ table.num_rows(), 0, dst->mutable_data());
+ }
Review comment:
this can be simplified IMO
```suggestion
ARROW_ASSIGN_OR_RAISE(auto dst,
AllocateEmptyBitmap(table.num_rows(),
ctx->memory_pool()));
BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
for (const auto& col: table.columns()) {
int64_t global_offset = 0;
for (const auto& chunk: col.chunks()) {
if (chunk-> null_bitmap()){
// BitmapAnd only on the corresponding region of dst
::arrow::internal::BitmapAnd(dst->data(), global_offset,
chunk->data(), chunk->offset(),
chunk->length(), global_offset,
dst->mutable_data());
}
global_offset += chunk->length();
}
}
```
##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,203 @@ class TakeMetaFunction : public MetaFunction {
}
};
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+ std::shared_ptr<arrow::BooleanArray>* out_array) {
+ auto bitmap_buffer = values.null_bitmap();
+ *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer,
nullptr, 0,
+ values.offset());
+ return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool*
memory_pool,
+ std::shared_ptr<Array>* output_array) {
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+ RETURN_NOT_OK(builder->Resize(0));
+ ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+ return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>&
values,
+ ExecContext* ctx) {
+ if (values->null_count() == 0) {
+ return values;
+ }
+ if (values->type()->Equals(arrow::null())) {
+ return std::make_shared<NullArray>(0);
+ }
+ std::shared_ptr<BooleanArray> drop_null_filter;
+ RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(),
&drop_null_filter));
+
+ if (drop_null_filter->null_count() == drop_null_filter->length()) {
+ std::shared_ptr<Array> empty_array;
+ RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(),
&empty_array));
+ return empty_array;
+ }
+ auto options = FilterOptions::Defaults();
+ ARROW_ASSIGN_OR_RAISE(
+ Datum result,
+ CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)},
&options,
+ ctx));
+ return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray&
values,
+ ExecContext* ctx) {
+ auto num_chunks = values.num_chunks();
+ std::vector<std::shared_ptr<Array>> new_chunks(num_chunks);
+ for (int i = 0; i < num_chunks; i++) {
+ ARROW_ASSIGN_OR_RAISE(new_chunks[i], DropNullArray(values.chunk(i), ctx));
+ }
+ return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch&
batch,
+ ExecContext* ctx) {
+ int64_t null_count = 0;
+ for (int col_index = 0; col_index < batch.num_columns(); ++col_index) {
+ const auto& column = batch.column(col_index);
+ null_count += column->null_count();
+ }
+ if (null_count == 0) {
+ return RecordBatch::Make(batch.schema(), batch.num_rows(),
batch.columns());
+ }
+ if (null_count / batch.num_columns() == batch.num_rows()) {
+ std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+ for (int i = 0; i < batch.num_columns(); i++) {
+ RETURN_NOT_OK(
+ CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(),
&empty_batch[i]));
+ }
+ return RecordBatch::Make(batch.schema(), 0, empty_batch);
+ }
Review comment:
I wouldn't do this TBH. This doesn't cover all cases where we could return
an empty table/record batch.
I think the best approach would be to defer it until we have the `dst` array
populated.
If `dst->null_count() == dst->length()`, we could return an empty Table/
RecordBatch. And that would cover all cases where we need to return an empty
table. :-)
##########
File path: cpp/src/arrow/compute/kernels/vector_selection_test.cc
##########
@@ -1734,5 +1734,240 @@ TEST(TestTake, RandomFixedSizeBinary) {
TakeRandomTest<FixedSizeBinaryType>::Test(fixed_size_binary(16));
}
+// ----------------------------------------------------------------------
Review comment:
I think we need to add the following test cases:
1. zero-length inputs (to test the early termination code paths)
2. non-empty inputs whose values are all null (to test the early termination
code paths)
3. a larger test case with more random values (say 1000 or so), because
bitmap ops work on 8 elements at a time, so we need to test beyond 8 elements
in an array.
e.g.:
https://github.com/apache/arrow/blob/master/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc#L56
4. also, it would be nice to have a test with non-zero offsets in the inputs
(only for the array impl)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]