This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 2b77b7c ARROW-507: [C++] Complete ListArray::FromArrays implementation, add unit tests 2b77b7c is described below commit 2b77b7ce7a9f46b8c0bd9d4bf751e7686cec0f95 Author: Wes McKinney <wes.mckin...@twosigma.com> AuthorDate: Mon Oct 23 17:57:13 2017 -0400 ARROW-507: [C++] Complete ListArray::FromArrays implementation, add unit tests In the event that the offsets array has nulls, this will backward-fill the offsets to compute the correct value sizes. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1224 from wesm/ARROW-507 and squashes the following commits: 9027c140 [Wes McKinney] Clean valid bits to remove trailing set bit 8d2cb512 [Wes McKinney] Implement / add tests for ListArray.from_arrays in Python 1c6a8702 [Wes McKinney] Complete C++ implementation, unit test for ListArray::FromArrays, handling of offsets with nulls --- cpp/src/arrow/array-test.cc | 73 +++++++++++++++++++++++++++++++++----- cpp/src/arrow/array.cc | 50 ++++++++++++++++++++------ cpp/src/arrow/array.h | 9 +++-- python/pyarrow/array.pxi | 37 ++++++++++++++++--- python/pyarrow/tests/test_array.py | 16 +++++++++ python/pyarrow/types.pxi | 27 ++++++++++---- 6 files changed, 180 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index ae9e9fd..168ef10 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1946,7 +1946,7 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) { // ---------------------------------------------------------------------- // List tests -class TestListBuilder : public TestBuilder { +class TestListArray : public TestBuilder { public: void SetUp() { TestBuilder::SetUp(); @@ -1973,7 +1973,7 @@ class TestListBuilder : public TestBuilder { std::shared_ptr<ListArray> result_; }; -TEST_F(TestListBuilder, Equality) { +TEST_F(TestListArray, Equality) { Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder()); std::shared_ptr<Array> array, equal_array, unequal_array; @@ -2032,9 +2032,66 @@ TEST_F(TestListBuilder, Equality) { ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice)); } -TEST_F(TestListBuilder, TestResize) {} +TEST_F(TestListArray, TestResize) {} -TEST_F(TestListBuilder, TestAppendNull) { +TEST_F(TestListArray, TestFromArrays) { + std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, values; + + std::vector<bool> offsets_is_valid3 = {true, false, true, true}; + std::vector<bool> offsets_is_valid4 = {true, true, false, true}; + + std::vector<bool> values_is_valid = {true, false, true, true, true, true}; + + std::vector<int32_t> offset1_values = {0, 2, 2, 6}; + std::vector<int32_t> offset2_values = {0, 2, 6, 6}; + + std::vector<int8_t> values_values = {0, 1, 2, 3, 4, 5}; + const int length = 3; + + ArrayFromVector<Int32Type, int32_t>(offset1_values, &offsets1); + ArrayFromVector<Int32Type, int32_t>(offset2_values, &offsets2); + + ArrayFromVector<Int32Type, int32_t>(offsets_is_valid3, offset1_values, &offsets3); + ArrayFromVector<Int32Type, int32_t>(offsets_is_valid4, offset2_values, &offsets4); + + ArrayFromVector<Int8Type, int8_t>(values_is_valid, values_values, &values); + + auto list_type = list(int8()); + + std::shared_ptr<Array> list1, list3, list4; + ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1)); + ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3)); + ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4)); + + ListArray expected1(list_type, length, offsets1->data()->buffers[1], values, + offsets1->data()->buffers[0], 0); + AssertArraysEqual(expected1, *list1); + + // Use null bitmap from offsets3, but clean offsets from non-null version + ListArray expected3(list_type, length, offsets1->data()->buffers[1], values, + offsets3->data()->buffers[0], 1); + AssertArraysEqual(expected3, *list3); + + // Check that the last offset bit is zero + ASSERT_TRUE(BitUtil::BitNotSet(list3->null_bitmap()->data(), length + 1)); + + ListArray expected4(list_type, length, offsets2->data()->buffers[1], values, + offsets4->data()->buffers[0], 1); + AssertArraysEqual(expected4, *list4); + + // Test failure modes + + std::shared_ptr<Array> tmp; + + // Zero-length offsets + ASSERT_RAISES(Invalid, + ListArray::FromArrays(*offsets1->Slice(0, 0), *values, pool_, &tmp)); + + // Offsets not int32 + ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets1, pool_, &tmp)); +} + +TEST_F(TestListArray, TestAppendNull) { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); @@ -2076,7 +2133,7 @@ void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& valu } } -TEST_F(TestListBuilder, TestBasics) { +TEST_F(TestListArray, TestBasics) { vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6}; vector<int> lengths = {3, 0, 4}; vector<uint8_t> is_valid = {1, 0, 1}; @@ -2098,7 +2155,7 @@ TEST_F(TestListBuilder, TestBasics) { ValidateBasicListArray(result_.get(), values, is_valid); } -TEST_F(TestListBuilder, BulkAppend) { +TEST_F(TestListArray, BulkAppend) { vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6}; vector<int> lengths = {3, 0, 4}; vector<uint8_t> is_valid = {1, 0, 1}; @@ -2115,7 +2172,7 @@ TEST_F(TestListBuilder, BulkAppend) { ValidateBasicListArray(result_.get(), values, is_valid); } -TEST_F(TestListBuilder, BulkAppendInvalid) { +TEST_F(TestListArray, BulkAppendInvalid) { vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6}; vector<int> lengths = {3, 0, 4}; vector<uint8_t> is_null = {0, 1, 0}; @@ -2135,7 +2192,7 @@ TEST_F(TestListBuilder, BulkAppendInvalid) { ASSERT_RAISES(Invalid, ValidateArray(*result_)); } -TEST_F(TestListBuilder, TestZeroLength) { +TEST_F(TestListArray, TestZeroLength) { // All buffers are null Done(); ASSERT_OK(ValidateArray(*result_)); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index eaac187..fc4b96e 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -172,27 +172,55 @@ ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length, SetData(internal_data); } -Status ListArray::FromArrays(const Array& offsets, const Array& values, - MemoryPool* ARROW_ARG_UNUSED(pool), +Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, std::shared_ptr<Array>* out) { - if (ARROW_PREDICT_FALSE(offsets.length() == 0)) { + if (offsets.length() == 0) { return Status::Invalid("List offsets must have non-zero length"); } - if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) { - return Status::Invalid("Null offsets in ListArray::FromArrays not yet implemented"); - } - - if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) { + if (offsets.type_id() != Type::INT32) { return Status::Invalid("List offsets must be signed int32"); } - BufferVector buffers = {offsets.null_bitmap(), - static_cast<const Int32Array&>(offsets).values()}; + BufferVector buffers = {}; + + const auto& typed_offsets = static_cast<const Int32Array&>(offsets); + + const int64_t num_offsets = offsets.length(); + + if (offsets.null_count() > 0) { + std::shared_ptr<Buffer> clean_offsets, clean_valid_bits; + + RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets)); + + // Copy valid bits, zero out the bit for the final offset + RETURN_NOT_OK(offsets.null_bitmap()->Copy(0, BitUtil::BytesForBits(num_offsets - 1), + &clean_valid_bits)); + BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets); + buffers.emplace_back(std::move(clean_valid_bits)); + + const int32_t* raw_offsets = typed_offsets.raw_values(); + auto clean_raw_offsets = reinterpret_cast<int32_t*>(clean_offsets->mutable_data()); + + // Must work backwards so we can tell how many values were in the last non-null value + DCHECK(offsets.IsValid(num_offsets - 1)); + int32_t current_offset = raw_offsets[num_offsets - 1]; + for (int64_t i = num_offsets - 1; i >= 0; --i) { + if (offsets.IsValid(i)) { + current_offset = raw_offsets[i]; + } + clean_raw_offsets[i] = current_offset; + } + + buffers.emplace_back(std::move(clean_offsets)); + } else { + buffers.emplace_back(offsets.null_bitmap()); + buffers.emplace_back(typed_offsets.values()); + } auto list_type = list(values.type()); auto internal_data = - std::make_shared<ArrayData>(list_type, offsets.length() - 1, std::move(buffers), + std::make_shared<ArrayData>(list_type, num_offsets - 1, std::move(buffers), offsets.null_count(), offsets.offset()); internal_data->child_data.push_back(values.data()); diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 75dda4a..b5d2530 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -400,10 +400,13 @@ class ARROW_EXPORT ListArray : public Array { /// \brief Construct ListArray from array of offsets and child value array /// - /// Note: does not validate input beyond sanity checks. Use - /// arrow::ValidateArray if you need stronger validation of inputs + /// This function does the bare minimum of validation of the offsets and + /// input types, and will allocate a new offsets array if necessary (i.e. if + /// the offsets contain any nulls). If the offsets do not have nulls, they + /// are assumed to be well-formed /// - /// \param[in] offsets Array containing n + 1 offsets encoding length and size + /// \param[in] offsets Array containing n + 1 offsets encoding length and + /// size. Must be of int32 type /// \param[in] values Array containing /// \param[in] pool MemoryPool in case new offsets array needs to be /// allocated because of null values diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f402def..c5f28a9 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -173,6 +173,29 @@ def array(object obj, type=None, mask=None, return _sequence_to_array(obj, size, type, pool) +def asarray(values, type=None): + """ + Convert to pyarrow.Array, inferring type if not provided. Attempt to cast + if indicated type is different + + Parameters + ---------- + values : array-like (sequence, numpy.ndarray, pyarrow.Array) + type : string or DataType + + Returns + ------- + arr : Array + """ + if isinstance(values, Array): + if type is not None and not values.type.equals(type): + values = values.cast(type) + + return values + else: + return array(values, type=type) + + def _normalize_slice(object arrow_obj, slice key): cdef Py_ssize_t n = len(arrow_obj) @@ -574,7 +597,7 @@ cdef class DecimalArray(FixedSizeBinaryArray): cdef class ListArray(Array): @staticmethod - def from_arrays(Array offsets, Array values, MemoryPool pool=None): + def from_arrays(offsets, values, MemoryPool pool=None): """ Construct ListArray from arrays of int32 offsets and values @@ -587,11 +610,17 @@ cdef class ListArray(Array): ------- list_array : ListArray """ - cdef shared_ptr[CArray] out + cdef: + Array _offsets, _values + shared_ptr[CArray] out cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + + _offsets = asarray(offsets, type='int32') + _values = asarray(values) + with nogil: - check_status(CListArray.FromArrays( - deref(offsets.ap), deref(values.ap), cpool, &out)) + check_status(CListArray.FromArrays(_offsets.ap[0], _values.ap[0], + cpool, &out)) return pyarrow_wrap_array(out) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 414a268..418076f 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -218,6 +218,22 @@ def test_list_from_arrays(): assert result.equals(expected) + # With nulls + offsets = [0, None, 2, 6] + + values = ['a', 'b', 'c', 'd', 'e', 'f'] + + result = pa.ListArray.from_arrays(offsets, values) + expected = pa.array([values[:2], None, values[2:]]) + + assert result.equals(expected) + + # Another edge case + offsets2 = [0, 2, None, 6] + result = pa.ListArray.from_arrays(offsets2, values) + expected = pa.array([values[:2], values[2:], None]) + assert result.equals(expected) + def _check_cast_case(case, safe=True): in_data, in_type, out_data, out_type = case diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0bef1aa..3d838ba 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -73,7 +73,27 @@ cdef class DataType: return '{0.__class__.__name__}({0})'.format(self) def __richcmp__(DataType self, object other, int op): + if op == cp.Py_EQ: + return self.equals(other) + elif op == cp.Py_NE: + return not self.equals(other) + else: + raise TypeError('Invalid comparison') + + def equals(self, other): + """ + Return true if type is equivalent to passed value + + Parameters + ---------- + other : DataType or string convertible to DataType + + Returns + ------- + is_equal : boolean + """ cdef DataType other_type + if not isinstance(other, DataType): if not isinstance(other, six.string_types): raise TypeError(other) @@ -81,12 +101,7 @@ cdef class DataType: else: other_type = other - if op == cp.Py_EQ: - return self.type.Equals(deref(other_type.type)) - elif op == cp.Py_NE: - return not self.type.Equals(deref(other_type.type)) - else: - raise TypeError('Invalid comparison') + return self.type.Equals(deref(other_type.type)) def to_pandas_dtype(self): """ -- To stop receiving notification emails like this one, please contact ['"commits@arrow.apache.org" <commits@arrow.apache.org>'].