[
https://issues.apache.org/jira/browse/ARROW-507?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16215923#comment-16215923
]
ASF GitHub Bot commented on ARROW-507:
--------------------------------------
wesm closed pull request #1224: ARROW-507: [C++] Complete ListArray::FromArrays
implementation, add unit tests
URL: https://github.com/apache/arrow/pull/1224
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index ae9e9fd00..168ef1057 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1946,7 +1946,7 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
// ----------------------------------------------------------------------
// List tests
-class TestListBuilder : public TestBuilder {
+class TestListArray : public TestBuilder {
public:
void SetUp() {
TestBuilder::SetUp();
@@ -1973,7 +1973,7 @@ class TestListBuilder : public TestBuilder {
std::shared_ptr<ListArray> result_;
};
-TEST_F(TestListBuilder, Equality) {
+TEST_F(TestListArray, Equality) {
Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder());
std::shared_ptr<Array> array, equal_array, unequal_array;
@@ -2032,9 +2032,66 @@ TEST_F(TestListBuilder, Equality) {
ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
}
-TEST_F(TestListBuilder, TestResize) {}
+TEST_F(TestListArray, TestResize) {}
-TEST_F(TestListBuilder, TestAppendNull) {
+TEST_F(TestListArray, TestFromArrays) {
+ std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, values;
+
+ std::vector<bool> offsets_is_valid3 = {true, false, true, true};
+ std::vector<bool> offsets_is_valid4 = {true, true, false, true};
+
+ std::vector<bool> values_is_valid = {true, false, true, true, true, true};
+
+ std::vector<int32_t> offset1_values = {0, 2, 2, 6};
+ std::vector<int32_t> offset2_values = {0, 2, 6, 6};
+
+ std::vector<int8_t> values_values = {0, 1, 2, 3, 4, 5};
+ const int length = 3;
+
+ ArrayFromVector<Int32Type, int32_t>(offset1_values, &offsets1);
+ ArrayFromVector<Int32Type, int32_t>(offset2_values, &offsets2);
+
+ ArrayFromVector<Int32Type, int32_t>(offsets_is_valid3, offset1_values,
&offsets3);
+ ArrayFromVector<Int32Type, int32_t>(offsets_is_valid4, offset2_values,
&offsets4);
+
+ ArrayFromVector<Int8Type, int8_t>(values_is_valid, values_values, &values);
+
+ auto list_type = list(int8());
+
+ std::shared_ptr<Array> list1, list3, list4;
+ ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1));
+ ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3));
+ ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4));
+
+ ListArray expected1(list_type, length, offsets1->data()->buffers[1], values,
+ offsets1->data()->buffers[0], 0);
+ AssertArraysEqual(expected1, *list1);
+
+ // Use null bitmap from offsets3, but clean offsets from non-null version
+ ListArray expected3(list_type, length, offsets1->data()->buffers[1], values,
+ offsets3->data()->buffers[0], 1);
+ AssertArraysEqual(expected3, *list3);
+
+ // Check that the last offset bit is zero
+ ASSERT_TRUE(BitUtil::BitNotSet(list3->null_bitmap()->data(), length + 1));
+
+ ListArray expected4(list_type, length, offsets2->data()->buffers[1], values,
+ offsets4->data()->buffers[0], 1);
+ AssertArraysEqual(expected4, *list4);
+
+ // Test failure modes
+
+ std::shared_ptr<Array> tmp;
+
+ // Zero-length offsets
+ ASSERT_RAISES(Invalid,
+ ListArray::FromArrays(*offsets1->Slice(0, 0), *values, pool_,
&tmp));
+
+ // Offsets not int32
+ ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets1, pool_,
&tmp));
+}
+
+TEST_F(TestListArray, TestAppendNull) {
ASSERT_OK(builder_->AppendNull());
ASSERT_OK(builder_->AppendNull());
@@ -2076,7 +2133,7 @@ void ValidateBasicListArray(const ListArray* result,
const vector<int32_t>& valu
}
}
-TEST_F(TestListBuilder, TestBasics) {
+TEST_F(TestListArray, TestBasics) {
vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
vector<int> lengths = {3, 0, 4};
vector<uint8_t> is_valid = {1, 0, 1};
@@ -2098,7 +2155,7 @@ TEST_F(TestListBuilder, TestBasics) {
ValidateBasicListArray(result_.get(), values, is_valid);
}
-TEST_F(TestListBuilder, BulkAppend) {
+TEST_F(TestListArray, BulkAppend) {
vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
vector<int> lengths = {3, 0, 4};
vector<uint8_t> is_valid = {1, 0, 1};
@@ -2115,7 +2172,7 @@ TEST_F(TestListBuilder, BulkAppend) {
ValidateBasicListArray(result_.get(), values, is_valid);
}
-TEST_F(TestListBuilder, BulkAppendInvalid) {
+TEST_F(TestListArray, BulkAppendInvalid) {
vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
vector<int> lengths = {3, 0, 4};
vector<uint8_t> is_null = {0, 1, 0};
@@ -2135,7 +2192,7 @@ TEST_F(TestListBuilder, BulkAppendInvalid) {
ASSERT_RAISES(Invalid, ValidateArray(*result_));
}
-TEST_F(TestListBuilder, TestZeroLength) {
+TEST_F(TestListArray, TestZeroLength) {
// All buffers are null
Done();
ASSERT_OK(ValidateArray(*result_));
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index eaac187a3..fc4b96e1b 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -172,27 +172,55 @@ ListArray::ListArray(const std::shared_ptr<DataType>&
type, int64_t length,
SetData(internal_data);
}
-Status ListArray::FromArrays(const Array& offsets, const Array& values,
- MemoryPool* ARROW_ARG_UNUSED(pool),
+Status ListArray::FromArrays(const Array& offsets, const Array& values,
MemoryPool* pool,
std::shared_ptr<Array>* out) {
- if (ARROW_PREDICT_FALSE(offsets.length() == 0)) {
+ if (offsets.length() == 0) {
return Status::Invalid("List offsets must have non-zero length");
}
- if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) {
- return Status::Invalid("Null offsets in ListArray::FromArrays not yet
implemented");
- }
-
- if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) {
+ if (offsets.type_id() != Type::INT32) {
return Status::Invalid("List offsets must be signed int32");
}
- BufferVector buffers = {offsets.null_bitmap(),
- static_cast<const Int32Array&>(offsets).values()};
+ BufferVector buffers = {};
+
+ const auto& typed_offsets = static_cast<const Int32Array&>(offsets);
+
+ const int64_t num_offsets = offsets.length();
+
+ if (offsets.null_count() > 0) {
+ std::shared_ptr<Buffer> clean_offsets, clean_valid_bits;
+
+ RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t),
&clean_offsets));
+
+ // Copy valid bits, zero out the bit for the final offset
+ RETURN_NOT_OK(offsets.null_bitmap()->Copy(0,
BitUtil::BytesForBits(num_offsets - 1),
+ &clean_valid_bits));
+ BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets);
+ buffers.emplace_back(std::move(clean_valid_bits));
+
+ const int32_t* raw_offsets = typed_offsets.raw_values();
+ auto clean_raw_offsets =
reinterpret_cast<int32_t*>(clean_offsets->mutable_data());
+
+ // Must work backwards so we can tell how many values were in the last
non-null value
+ DCHECK(offsets.IsValid(num_offsets - 1));
+ int32_t current_offset = raw_offsets[num_offsets - 1];
+ for (int64_t i = num_offsets - 1; i >= 0; --i) {
+ if (offsets.IsValid(i)) {
+ current_offset = raw_offsets[i];
+ }
+ clean_raw_offsets[i] = current_offset;
+ }
+
+ buffers.emplace_back(std::move(clean_offsets));
+ } else {
+ buffers.emplace_back(offsets.null_bitmap());
+ buffers.emplace_back(typed_offsets.values());
+ }
auto list_type = list(values.type());
auto internal_data =
- std::make_shared<ArrayData>(list_type, offsets.length() - 1,
std::move(buffers),
+ std::make_shared<ArrayData>(list_type, num_offsets - 1,
std::move(buffers),
offsets.null_count(), offsets.offset());
internal_data->child_data.push_back(values.data());
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 75dda4a75..b5d253099 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -400,10 +400,13 @@ class ARROW_EXPORT ListArray : public Array {
/// \brief Construct ListArray from array of offsets and child value array
///
- /// Note: does not validate input beyond sanity checks. Use
- /// arrow::ValidateArray if you need stronger validation of inputs
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types, and will allocate a new offsets array if necessary (i.e. if
+ /// the offsets contain any nulls). If the offsets do not have nulls, they
+ /// are assumed to be well-formed
///
- /// \param[in] offsets Array containing n + 1 offsets encoding length and
size
+ /// \param[in] offsets Array containing n + 1 offsets encoding length and
+ /// size. Must be of int32 type
/// \param[in] values Array containing
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f402defc9..c5f28a951 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -173,6 +173,29 @@ def array(object obj, type=None, mask=None,
return _sequence_to_array(obj, size, type, pool)
+def asarray(values, type=None):
+ """
+ Convert to pyarrow.Array, inferring type if not provided. Attempt to cast
+ if indicated type is different
+
+ Parameters
+ ----------
+ values : array-like (sequence, numpy.ndarray, pyarrow.Array)
+ type : string or DataType
+
+ Returns
+ -------
+ arr : Array
+ """
+ if isinstance(values, Array):
+ if type is not None and not values.type.equals(type):
+ values = values.cast(type)
+
+ return values
+ else:
+ return array(values, type=type)
+
+
def _normalize_slice(object arrow_obj, slice key):
cdef Py_ssize_t n = len(arrow_obj)
@@ -574,7 +597,7 @@ cdef class DecimalArray(FixedSizeBinaryArray):
cdef class ListArray(Array):
@staticmethod
- def from_arrays(Array offsets, Array values, MemoryPool pool=None):
+ def from_arrays(offsets, values, MemoryPool pool=None):
"""
Construct ListArray from arrays of int32 offsets and values
@@ -587,11 +610,17 @@ cdef class ListArray(Array):
-------
list_array : ListArray
"""
- cdef shared_ptr[CArray] out
+ cdef:
+ Array _offsets, _values
+ shared_ptr[CArray] out
cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+
+ _offsets = asarray(offsets, type='int32')
+ _values = asarray(values)
+
with nogil:
- check_status(CListArray.FromArrays(
- deref(offsets.ap), deref(values.ap), cpool, &out))
+ check_status(CListArray.FromArrays(_offsets.ap[0], _values.ap[0],
+ cpool, &out))
return pyarrow_wrap_array(out)
diff --git a/python/pyarrow/tests/test_array.py
b/python/pyarrow/tests/test_array.py
index 414a268ce..418076f81 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -218,6 +218,22 @@ def test_list_from_arrays():
assert result.equals(expected)
+ # With nulls
+ offsets = [0, None, 2, 6]
+
+ values = ['a', 'b', 'c', 'd', 'e', 'f']
+
+ result = pa.ListArray.from_arrays(offsets, values)
+ expected = pa.array([values[:2], None, values[2:]])
+
+ assert result.equals(expected)
+
+ # Another edge case
+ offsets2 = [0, 2, None, 6]
+ result = pa.ListArray.from_arrays(offsets2, values)
+ expected = pa.array([values[:2], values[2:], None])
+ assert result.equals(expected)
+
def _check_cast_case(case, safe=True):
in_data, in_type, out_data, out_type = case
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0bef1aa60..3d838ba39 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -73,7 +73,27 @@ cdef class DataType:
return '{0.__class__.__name__}({0})'.format(self)
def __richcmp__(DataType self, object other, int op):
+ if op == cp.Py_EQ:
+ return self.equals(other)
+ elif op == cp.Py_NE:
+ return not self.equals(other)
+ else:
+ raise TypeError('Invalid comparison')
+
+ def equals(self, other):
+ """
+ Return true if type is equivalent to passed value
+
+ Parameters
+ ----------
+ other : DataType or string convertible to DataType
+
+ Returns
+ -------
+ is_equal : boolean
+ """
cdef DataType other_type
+
if not isinstance(other, DataType):
if not isinstance(other, six.string_types):
raise TypeError(other)
@@ -81,12 +101,7 @@ cdef class DataType:
else:
other_type = other
- if op == cp.Py_EQ:
- return self.type.Equals(deref(other_type.type))
- elif op == cp.Py_NE:
- return not self.type.Equals(deref(other_type.type))
- else:
- raise TypeError('Invalid comparison')
+ return self.type.Equals(deref(other_type.type))
def to_pandas_dtype(self):
"""
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [C++/Python] Construct List container from offsets and values subarrays
> -----------------------------------------------------------------------
>
> Key: ARROW-507
> URL: https://issues.apache.org/jira/browse/ARROW-507
> Project: Apache Arrow
> Issue Type: New Feature
> Components: C++, Python
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> This is the inverse operation from flattening a list type into its child
> values (dropping the offsets)
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)