This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 255dbf990c GH-41684: [C++][Python] Add optional null_bitmap to
MapArray::FromArrays (#41757)
255dbf990c is described below
commit 255dbf990c3d3e5fb1270a2a11efe0af2be195ab
Author: Alenka Frim <[email protected]>
AuthorDate: Fri May 31 10:09:54 2024 +0200
GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays
(#41757)
### Rationale for this change
When constructing a `MapArray` with `FromArrays`, one cannot supply a
`null_bitmap`.
### What changes are included in this PR?
An optional `null_bitmap` argument is added to `MapArray::FromArrays`.
### Are these changes tested?
TODO (have them locally, need to clean them up and commit).
### Are there any user-facing changes?
No.
* GitHub Issue: #41684
Authored-by: AlenkaF <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
---
cpp/src/arrow/array/array_list_test.cc | 17 +++++++++++++
cpp/src/arrow/array/array_nested.cc | 45 +++++++++++++++++++++++-----------
cpp/src/arrow/array/array_nested.h | 9 ++++---
python/pyarrow/array.pxi | 11 ++++++---
python/pyarrow/includes/libarrow.pxd | 8 ++++--
python/pyarrow/tests/test_array.py | 34 +++++++++++++++++++++++++
6 files changed, 102 insertions(+), 22 deletions(-)
diff --git a/cpp/src/arrow/array/array_list_test.cc
b/cpp/src/arrow/array/array_list_test.cc
index e79ce6fe17..55f91dc341 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -1368,6 +1368,23 @@ TEST_F(TestMapArray, FromArrays) {
ASSERT_EQ(keys_with_null->length(), tmp_items->length());
ASSERT_RAISES(Invalid,
MapArray::FromArrays(offsets1, keys_with_null, tmp_items,
pool_));
+
+ // With null_bitmap
+ ASSERT_OK_AND_ASSIGN(auto map7, MapArray::FromArrays(offsets1, keys, items,
pool_,
+
offsets3->data()->buffers[0]));
+ ASSERT_OK(map7->Validate());
+ MapArray expected7(map_type, length, offsets1->data()->buffers[1], keys,
items,
+ offsets3->data()->buffers[0], 1);
+ AssertArraysEqual(expected7, *map7);
+
+ // Null bitmap together with offsets that contain nulls
+ ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets3, keys, items, pool_,
+ offsets3->data()->buffers[0]));
+
+ // Null bitmap together with a sliced offsets array (non-zero offset)
+ ASSERT_RAISES(NotImplemented,
+ MapArray::FromArrays(offsets3->Slice(2), keys, items, pool_,
+ offsets3->data()->buffers[0]));
}
TEST_F(TestMapArray, FromArraysEquality) {
diff --git a/cpp/src/arrow/array/array_nested.cc
b/cpp/src/arrow/array/array_nested.cc
index 67a499c2b8..bb5c6bf018 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -807,7 +807,7 @@ MapArray::MapArray(const std::shared_ptr<DataType>& type,
int64_t length,
Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
- MemoryPool* pool) {
+ MemoryPool* pool, const std::shared_ptr<Buffer>& null_bitmap) {
using offset_type = typename MapType::offset_type;
using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
@@ -827,6 +827,15 @@ Result<std::shared_ptr<Array>>
MapArray::FromArraysInternal(
return Status::Invalid("Map key and item arrays must be equal length");
}
+ if (null_bitmap != nullptr && offsets->null_count() > 0) {
+ return Status::Invalid(
+ "Ambiguous to specify both validity map and offsets with nulls");
+ }
+
+ if (null_bitmap != nullptr && offsets->offset() != 0) {
+ return Status::NotImplemented("Null bitmap with offsets slice not
supported.");
+ }
+
if (offsets->null_count() > 0) {
ARROW_ASSIGN_OR_RAISE(auto buffers,
CleanListOffsets<MapType>(NULLPTR, *offsets, pool));
@@ -836,24 +845,32 @@ Result<std::shared_ptr<Array>>
MapArray::FromArraysInternal(
using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
const auto& typed_offsets = checked_cast<const OffsetArrayType&>(*offsets);
- auto buffers = BufferVector({nullptr, typed_offsets.values()});
+
+ BufferVector buffers;
+ int64_t null_count;
+ if (null_bitmap != nullptr) {
+ buffers = BufferVector({std::move(null_bitmap), typed_offsets.values()});
+ null_count = null_bitmap->size();
+ } else {
+ buffers = BufferVector({null_bitmap, typed_offsets.values()});
+ null_count = 0;
+ }
return std::make_shared<MapArray>(type, offsets->length() - 1,
std::move(buffers), keys,
- items, /*null_count=*/0,
offsets->offset());
+ items, /*null_count=*/null_count,
offsets->offset());
}
-Result<std::shared_ptr<Array>> MapArray::FromArrays(const
std::shared_ptr<Array>& offsets,
- const
std::shared_ptr<Array>& keys,
- const
std::shared_ptr<Array>& items,
- MemoryPool* pool) {
+Result<std::shared_ptr<Array>> MapArray::FromArrays(
+ const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
+ const std::shared_ptr<Array>& items, MemoryPool* pool,
+ const std::shared_ptr<Buffer>& null_bitmap) {
return FromArraysInternal(std::make_shared<MapType>(keys->type(),
items->type()),
- offsets, keys, items, pool);
+ offsets, keys, items, pool, null_bitmap);
}
-Result<std::shared_ptr<Array>> MapArray::FromArrays(std::shared_ptr<DataType>
type,
- const
std::shared_ptr<Array>& offsets,
- const
std::shared_ptr<Array>& keys,
- const
std::shared_ptr<Array>& items,
- MemoryPool* pool) {
+Result<std::shared_ptr<Array>> MapArray::FromArrays(
+ std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
+ const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+ MemoryPool* pool, const std::shared_ptr<Buffer>& null_bitmap) {
if (type->id() != Type::MAP) {
return Status::TypeError("Expected map type, got ", type->ToString());
}
@@ -864,7 +881,7 @@ Result<std::shared_ptr<Array>>
MapArray::FromArrays(std::shared_ptr<DataType> ty
if (!map_type.item_type()->Equals(items->type())) {
return Status::TypeError("Mismatching map items type");
}
- return FromArraysInternal(std::move(type), offsets, keys, items, pool);
+ return FromArraysInternal(std::move(type), offsets, keys, items, pool,
null_bitmap);
}
Status MapArray::ValidateChildData(
diff --git a/cpp/src/arrow/array/array_nested.h
b/cpp/src/arrow/array/array_nested.h
index 5744f5fcad..f96b6bd3b1 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -532,15 +532,18 @@ class ARROW_EXPORT MapArray : public ListArray {
/// \param[in] keys Array containing key values
/// \param[in] items Array containing item values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
+ /// \param[in] null_bitmap Optional validity bitmap
static Result<std::shared_ptr<Array>> FromArrays(
const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>&
keys,
- const std::shared_ptr<Array>& items, MemoryPool* pool =
default_memory_pool());
+ const std::shared_ptr<Array>& items, MemoryPool* pool =
default_memory_pool(),
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR);
static Result<std::shared_ptr<Array>> FromArrays(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
- MemoryPool* pool = default_memory_pool());
+ MemoryPool* pool = default_memory_pool(),
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR);
const MapType* map_type() const { return map_type_; }
@@ -560,7 +563,7 @@ class ARROW_EXPORT MapArray : public ListArray {
static Result<std::shared_ptr<Array>> FromArraysInternal(
std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
- MemoryPool* pool);
+ MemoryPool* pool, const std::shared_ptr<Buffer>& null_bitmap = NULLPTR);
private:
const MapType* map_type_;
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 406830ad4d..3c26e85887 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -3060,7 +3060,7 @@ cdef class MapArray(ListArray):
"""
@staticmethod
- def from_arrays(offsets, keys, items, DataType type=None, MemoryPool
pool=None):
+ def from_arrays(offsets, keys, items, DataType type=None, MemoryPool
pool=None, mask=None):
"""
Construct MapArray from arrays of int32 offsets and key, item arrays.
@@ -3072,6 +3072,8 @@ cdef class MapArray(ListArray):
type : DataType, optional
If not specified, a default MapArray with the keys' and items'
type is used.
pool : MemoryPool
+ mask : Array (boolean type), optional
+ Indicate which values are null (True) or not null (False).
Returns
-------
@@ -3153,24 +3155,27 @@ cdef class MapArray(ListArray):
cdef:
Array _offsets, _keys, _items
shared_ptr[CArray] out
+ shared_ptr[CBuffer] c_mask
cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
_offsets = asarray(offsets, type='int32')
_keys = asarray(keys)
_items = asarray(items)
+ c_mask = c_mask_inverted_from_obj(mask, pool)
+
if type is not None:
with nogil:
out = GetResultValue(
CMapArray.FromArraysAndType(
type.sp_type, _offsets.sp_array,
- _keys.sp_array, _items.sp_array, cpool))
+ _keys.sp_array, _items.sp_array, cpool, c_mask))
else:
with nogil:
out = GetResultValue(
CMapArray.FromArrays(_offsets.sp_array,
_keys.sp_array,
- _items.sp_array, cpool))
+ _items.sp_array, cpool, c_mask))
cdef Array result = pyarrow_wrap_array(out)
result.validate()
return result
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index a66f584b83..0d63ec6be3 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -823,7 +823,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
const shared_ptr[CArray]& offsets,
const shared_ptr[CArray]& keys,
const shared_ptr[CArray]& items,
- CMemoryPool* pool)
+ CMemoryPool* pool,
+ const shared_ptr[CBuffer] null_bitmap,
+ )
@staticmethod
CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"(
@@ -831,7 +833,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
const shared_ptr[CArray]& offsets,
const shared_ptr[CArray]& keys,
const shared_ptr[CArray]& items,
- CMemoryPool* pool)
+ CMemoryPool* pool,
+ const shared_ptr[CBuffer] null_bitmap,
+ )
shared_ptr[CArray] keys()
shared_ptr[CArray] items()
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index b89e0ace15..49a00517fc 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1079,6 +1079,40 @@ def test_map_from_arrays():
pa.int64()
))
+ # pass in null bitmap with type
+ result = pa.MapArray.from_arrays([0, 2, 2, 6], keys, items, pa.map_(
+ keys.type,
+ items.type),
+ mask=pa.array([False, True, False], type=pa.bool_())
+ )
+ assert result.equals(expected)
+
+ # pass in null bitmap without the type
+ result = pa.MapArray.from_arrays([0, 2, 2, 6], keys, items,
+ mask=pa.array([False, True, False],
+ type=pa.bool_())
+ )
+ assert result.equals(expected)
+
+ # error if null bitmap and offsets with nulls passed
+ msg1 = 'Ambiguous to specify both validity map and offsets with nulls'
+ with pytest.raises(pa.ArrowInvalid, match=msg1):
+ pa.MapArray.from_arrays(offsets, keys, items, pa.map_(
+ keys.type,
+ items.type),
+ mask=pa.array([False, True, False], type=pa.bool_())
+ )
+
+ # error if null bitmap passed to sliced offset
+ msg2 = 'Null bitmap with offsets slice not supported.'
+ offsets = pa.array(offsets, pa.int32())
+ with pytest.raises(pa.ArrowNotImplementedError, match=msg2):
+ pa.MapArray.from_arrays(offsets.slice(2), keys, items, pa.map_(
+ keys.type,
+ items.type),
+ mask=pa.array([False, True, False], type=pa.bool_())
+ )
+
# check invalid usage
offsets = [0, 1, 3, 5]
keys = np.arange(5)