This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b77b7c  ARROW-507: [C++] Complete ListArray::FromArrays 
implementation, add unit tests
2b77b7c is described below

commit 2b77b7ce7a9f46b8c0bd9d4bf751e7686cec0f95
Author: Wes McKinney <wes.mckin...@twosigma.com>
AuthorDate: Mon Oct 23 17:57:13 2017 -0400

    ARROW-507: [C++] Complete ListArray::FromArrays implementation, add unit 
tests
    
    In the event that the offsets array has nulls, this will backward-fill the 
offsets to compute the correct value sizes.
    
    Author: Wes McKinney <wes.mckin...@twosigma.com>
    
    Closes #1224 from wesm/ARROW-507 and squashes the following commits:
    
    9027c140 [Wes McKinney] Clean valid bits to remove trailing set bit
    8d2cb512 [Wes McKinney] Implement / add tests for ListArray.from_arrays in 
Python
    1c6a8702 [Wes McKinney] Complete C++ implementation, unit test for 
ListArray::FromArrays, handling of offsets with nulls
---
 cpp/src/arrow/array-test.cc        | 73 +++++++++++++++++++++++++++++++++-----
 cpp/src/arrow/array.cc             | 50 ++++++++++++++++++++------
 cpp/src/arrow/array.h              |  9 +++--
 python/pyarrow/array.pxi           | 37 ++++++++++++++++---
 python/pyarrow/tests/test_array.py | 16 +++++++++
 python/pyarrow/types.pxi           | 27 ++++++++++----
 6 files changed, 180 insertions(+), 32 deletions(-)

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index ae9e9fd..168ef10 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1946,7 +1946,7 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
 // ----------------------------------------------------------------------
 // List tests
 
-class TestListBuilder : public TestBuilder {
+class TestListArray : public TestBuilder {
  public:
   void SetUp() {
     TestBuilder::SetUp();
@@ -1973,7 +1973,7 @@ class TestListBuilder : public TestBuilder {
   std::shared_ptr<ListArray> result_;
 };
 
-TEST_F(TestListBuilder, Equality) {
+TEST_F(TestListArray, Equality) {
   Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder());
 
   std::shared_ptr<Array> array, equal_array, unequal_array;
@@ -2032,9 +2032,66 @@ TEST_F(TestListBuilder, Equality) {
   ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
 }
 
-TEST_F(TestListBuilder, TestResize) {}
+TEST_F(TestListArray, TestResize) {}
 
-TEST_F(TestListBuilder, TestAppendNull) {
+TEST_F(TestListArray, TestFromArrays) {
+  std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, values;
+
+  std::vector<bool> offsets_is_valid3 = {true, false, true, true};
+  std::vector<bool> offsets_is_valid4 = {true, true, false, true};
+
+  std::vector<bool> values_is_valid = {true, false, true, true, true, true};
+
+  std::vector<int32_t> offset1_values = {0, 2, 2, 6};
+  std::vector<int32_t> offset2_values = {0, 2, 6, 6};
+
+  std::vector<int8_t> values_values = {0, 1, 2, 3, 4, 5};
+  const int length = 3;
+
+  ArrayFromVector<Int32Type, int32_t>(offset1_values, &offsets1);
+  ArrayFromVector<Int32Type, int32_t>(offset2_values, &offsets2);
+
+  ArrayFromVector<Int32Type, int32_t>(offsets_is_valid3, offset1_values, 
&offsets3);
+  ArrayFromVector<Int32Type, int32_t>(offsets_is_valid4, offset2_values, 
&offsets4);
+
+  ArrayFromVector<Int8Type, int8_t>(values_is_valid, values_values, &values);
+
+  auto list_type = list(int8());
+
+  std::shared_ptr<Array> list1, list3, list4;
+  ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1));
+  ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3));
+  ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4));
+
+  ListArray expected1(list_type, length, offsets1->data()->buffers[1], values,
+                      offsets1->data()->buffers[0], 0);
+  AssertArraysEqual(expected1, *list1);
+
+  // Use null bitmap from offsets3, but clean offsets from non-null version
+  ListArray expected3(list_type, length, offsets1->data()->buffers[1], values,
+                      offsets3->data()->buffers[0], 1);
+  AssertArraysEqual(expected3, *list3);
+
+  // Check that the last offset bit is zero
+  ASSERT_TRUE(BitUtil::BitNotSet(list3->null_bitmap()->data(), length + 1));
+
+  ListArray expected4(list_type, length, offsets2->data()->buffers[1], values,
+                      offsets4->data()->buffers[0], 1);
+  AssertArraysEqual(expected4, *list4);
+
+  // Test failure modes
+
+  std::shared_ptr<Array> tmp;
+
+  // Zero-length offsets
+  ASSERT_RAISES(Invalid,
+                ListArray::FromArrays(*offsets1->Slice(0, 0), *values, pool_, 
&tmp));
+
+  // Offsets not int32
+  ASSERT_RAISES(Invalid, ListArray::FromArrays(*values, *offsets1, pool_, 
&tmp));
+}
+
+TEST_F(TestListArray, TestAppendNull) {
   ASSERT_OK(builder_->AppendNull());
   ASSERT_OK(builder_->AppendNull());
 
@@ -2076,7 +2133,7 @@ void ValidateBasicListArray(const ListArray* result, 
const vector<int32_t>& valu
   }
 }
 
-TEST_F(TestListBuilder, TestBasics) {
+TEST_F(TestListArray, TestBasics) {
   vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
   vector<int> lengths = {3, 0, 4};
   vector<uint8_t> is_valid = {1, 0, 1};
@@ -2098,7 +2155,7 @@ TEST_F(TestListBuilder, TestBasics) {
   ValidateBasicListArray(result_.get(), values, is_valid);
 }
 
-TEST_F(TestListBuilder, BulkAppend) {
+TEST_F(TestListArray, BulkAppend) {
   vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
   vector<int> lengths = {3, 0, 4};
   vector<uint8_t> is_valid = {1, 0, 1};
@@ -2115,7 +2172,7 @@ TEST_F(TestListBuilder, BulkAppend) {
   ValidateBasicListArray(result_.get(), values, is_valid);
 }
 
-TEST_F(TestListBuilder, BulkAppendInvalid) {
+TEST_F(TestListArray, BulkAppendInvalid) {
   vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
   vector<int> lengths = {3, 0, 4};
   vector<uint8_t> is_null = {0, 1, 0};
@@ -2135,7 +2192,7 @@ TEST_F(TestListBuilder, BulkAppendInvalid) {
   ASSERT_RAISES(Invalid, ValidateArray(*result_));
 }
 
-TEST_F(TestListBuilder, TestZeroLength) {
+TEST_F(TestListArray, TestZeroLength) {
   // All buffers are null
   Done();
   ASSERT_OK(ValidateArray(*result_));
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index eaac187..fc4b96e 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -172,27 +172,55 @@ ListArray::ListArray(const std::shared_ptr<DataType>& 
type, int64_t length,
   SetData(internal_data);
 }
 
-Status ListArray::FromArrays(const Array& offsets, const Array& values,
-                             MemoryPool* ARROW_ARG_UNUSED(pool),
+Status ListArray::FromArrays(const Array& offsets, const Array& values, 
MemoryPool* pool,
                              std::shared_ptr<Array>* out) {
-  if (ARROW_PREDICT_FALSE(offsets.length() == 0)) {
+  if (offsets.length() == 0) {
     return Status::Invalid("List offsets must have non-zero length");
   }
 
-  if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) {
-    return Status::Invalid("Null offsets in ListArray::FromArrays not yet 
implemented");
-  }
-
-  if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) {
+  if (offsets.type_id() != Type::INT32) {
     return Status::Invalid("List offsets must be signed int32");
   }
 
-  BufferVector buffers = {offsets.null_bitmap(),
-                          static_cast<const Int32Array&>(offsets).values()};
+  BufferVector buffers = {};
+
+  const auto& typed_offsets = static_cast<const Int32Array&>(offsets);
+
+  const int64_t num_offsets = offsets.length();
+
+  if (offsets.null_count() > 0) {
+    std::shared_ptr<Buffer> clean_offsets, clean_valid_bits;
+
+    RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), 
&clean_offsets));
+
+    // Copy valid bits, zero out the bit for the final offset
+    RETURN_NOT_OK(offsets.null_bitmap()->Copy(0, 
BitUtil::BytesForBits(num_offsets - 1),
+                                              &clean_valid_bits));
+    BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets);
+    buffers.emplace_back(std::move(clean_valid_bits));
+
+    const int32_t* raw_offsets = typed_offsets.raw_values();
+    auto clean_raw_offsets = 
reinterpret_cast<int32_t*>(clean_offsets->mutable_data());
+
+    // Must work backwards so we can tell how many values were in the last 
non-null value
+    DCHECK(offsets.IsValid(num_offsets - 1));
+    int32_t current_offset = raw_offsets[num_offsets - 1];
+    for (int64_t i = num_offsets - 1; i >= 0; --i) {
+      if (offsets.IsValid(i)) {
+        current_offset = raw_offsets[i];
+      }
+      clean_raw_offsets[i] = current_offset;
+    }
+
+    buffers.emplace_back(std::move(clean_offsets));
+  } else {
+    buffers.emplace_back(offsets.null_bitmap());
+    buffers.emplace_back(typed_offsets.values());
+  }
 
   auto list_type = list(values.type());
   auto internal_data =
-      std::make_shared<ArrayData>(list_type, offsets.length() - 1, 
std::move(buffers),
+      std::make_shared<ArrayData>(list_type, num_offsets - 1, 
std::move(buffers),
                                   offsets.null_count(), offsets.offset());
   internal_data->child_data.push_back(values.data());
 
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 75dda4a..b5d2530 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -400,10 +400,13 @@ class ARROW_EXPORT ListArray : public Array {
 
   /// \brief Construct ListArray from array of offsets and child value array
   ///
-  /// Note: does not validate input beyond sanity checks. Use
-  /// arrow::ValidateArray if you need stronger validation of inputs
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types, and will allocate a new offsets array if necessary (i.e. if
+  /// the offsets contain any nulls). If the offsets do not have nulls, they
+  /// are assumed to be well-formed
   ///
-  /// \param[in] offsets Array containing n + 1 offsets encoding length and 
size
+  /// \param[in] offsets Array containing n + 1 offsets encoding length and
+  /// size. Must be of int32 type
   /// \param[in] values Array containing
   /// \param[in] pool MemoryPool in case new offsets array needs to be
   /// allocated because of null values
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index f402def..c5f28a9 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -173,6 +173,29 @@ def array(object obj, type=None, mask=None,
         return _sequence_to_array(obj, size, type, pool)
 
 
+def asarray(values, type=None):
+    """
+    Convert to pyarrow.Array, inferring type if not provided. Attempt to cast
+    if indicated type is different
+
+    Parameters
+    ----------
+    values : array-like (sequence, numpy.ndarray, pyarrow.Array)
+    type : string or DataType
+
+    Returns
+    -------
+    arr : Array
+    """
+    if isinstance(values, Array):
+        if type is not None and not values.type.equals(type):
+            values = values.cast(type)
+
+        return values
+    else:
+        return array(values, type=type)
+
+
 def _normalize_slice(object arrow_obj, slice key):
     cdef Py_ssize_t n = len(arrow_obj)
 
@@ -574,7 +597,7 @@ cdef class DecimalArray(FixedSizeBinaryArray):
 cdef class ListArray(Array):
 
     @staticmethod
-    def from_arrays(Array offsets, Array values, MemoryPool pool=None):
+    def from_arrays(offsets, values, MemoryPool pool=None):
         """
         Construct ListArray from arrays of int32 offsets and values
 
@@ -587,11 +610,17 @@ cdef class ListArray(Array):
         -------
         list_array : ListArray
         """
-        cdef shared_ptr[CArray] out
+        cdef:
+            Array _offsets, _values
+            shared_ptr[CArray] out
         cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool)
+
+        _offsets = asarray(offsets, type='int32')
+        _values = asarray(values)
+
         with nogil:
-            check_status(CListArray.FromArrays(
-                deref(offsets.ap), deref(values.ap), cpool, &out))
+            check_status(CListArray.FromArrays(_offsets.ap[0], _values.ap[0],
+                                               cpool, &out))
         return pyarrow_wrap_array(out)
 
 
diff --git a/python/pyarrow/tests/test_array.py 
b/python/pyarrow/tests/test_array.py
index 414a268..418076f 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -218,6 +218,22 @@ def test_list_from_arrays():
 
     assert result.equals(expected)
 
+    # With nulls
+    offsets = [0, None, 2, 6]
+
+    values = ['a', 'b', 'c', 'd', 'e', 'f']
+
+    result = pa.ListArray.from_arrays(offsets, values)
+    expected = pa.array([values[:2], None, values[2:]])
+
+    assert result.equals(expected)
+
+    # Another edge case
+    offsets2 = [0, 2, None, 6]
+    result = pa.ListArray.from_arrays(offsets2, values)
+    expected = pa.array([values[:2], values[2:], None])
+    assert result.equals(expected)
+
 
 def _check_cast_case(case, safe=True):
     in_data, in_type, out_data, out_type = case
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 0bef1aa..3d838ba 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -73,7 +73,27 @@ cdef class DataType:
         return '{0.__class__.__name__}({0})'.format(self)
 
     def __richcmp__(DataType self, object other, int op):
+        if op == cp.Py_EQ:
+            return self.equals(other)
+        elif op == cp.Py_NE:
+            return not self.equals(other)
+        else:
+            raise TypeError('Invalid comparison')
+
+    def equals(self, other):
+        """
+        Return true if type is equivalent to passed value
+
+        Parameters
+        ----------
+        other : DataType or string convertible to DataType
+
+        Returns
+        -------
+        is_equal : boolean
+        """
         cdef DataType other_type
+
         if not isinstance(other, DataType):
             if not isinstance(other, six.string_types):
                 raise TypeError(other)
@@ -81,12 +101,7 @@ cdef class DataType:
         else:
             other_type = other
 
-        if op == cp.Py_EQ:
-            return self.type.Equals(deref(other_type.type))
-        elif op == cp.Py_NE:
-            return not self.type.Equals(deref(other_type.type))
-        else:
-            raise TypeError('Invalid comparison')
+        return self.type.Equals(deref(other_type.type))
 
     def to_pandas_dtype(self):
         """

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <commits@arrow.apache.org>'].

Reply via email to