[ 
https://issues.apache.org/jira/browse/ARROW-2311?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16400591#comment-16400591
 ] 

ASF GitHub Bot commented on ARROW-2311:
---------------------------------------

wesm closed pull request #1754: ARROW-2311: [Python/C++] Fix struct array 
slicing
URL: https://github.com/apache/arrow/pull/1754
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index ad2335fa1..2aa73a09a 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -2983,14 +2983,14 @@ TEST_F(TestStructBuilder, TestEquality) {
   std::shared_ptr<Array> unequal_bitmap_array, unequal_offsets_array,
       unequal_values_array;
 
-  vector<int32_t> int_values = {1, 2, 3, 4};
+  vector<int32_t> int_values = {101, 102, 103, 104};
   vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 
'k'};
   vector<int> list_lengths = {3, 0, 3, 4};
   vector<int> list_offsets = {0, 3, 3, 6};
   vector<uint8_t> list_is_valid = {1, 0, 1, 1};
   vector<uint8_t> struct_is_valid = {1, 1, 1, 1};
 
-  vector<int32_t> unequal_int_values = {4, 2, 3, 1};
+  vector<int32_t> unequal_int_values = {104, 102, 103, 101};
   vector<char> unequal_list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'l', 'u', 
'c', 'y'};
   vector<int> unequal_list_offsets = {0, 3, 4, 6};
   vector<uint8_t> unequal_list_is_valid = {1, 1, 1, 1};
@@ -3104,29 +3104,85 @@ TEST_F(TestStructBuilder, TestEquality) {
   EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array));
   EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array));
   EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array));
+}
 
-  // ARROW-33 Slice / equality
-  std::shared_ptr<Array> slice, slice2;
+TEST_F(TestStructBuilder, TestZeroLength) {
+  // All buffers are null
+  Done();
+  ASSERT_OK(ValidateArray(*result_));
+}
 
-  slice = array->Slice(2);
-  slice2 = array->Slice(2);
+TEST_F(TestStructBuilder, TestSlice) {
+  std::shared_ptr<Array> array, equal_array;
+  std::shared_ptr<Array> unequal_bitmap_array, unequal_offsets_array,
+      unequal_values_array;
+
+  vector<int32_t> int_values = {101, 102, 103, 104};
+  vector<char> list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 
'k'};
+  vector<int> list_lengths = {3, 0, 3, 4};
+  vector<int> list_offsets = {0, 3, 3, 6};
+  vector<uint8_t> list_is_valid = {1, 0, 1, 1};
+  vector<uint8_t> struct_is_valid = {1, 1, 1, 1};
+
+  ListBuilder* list_vb = static_cast<ListBuilder*>(builder_->field_builder(0));
+  Int8Builder* char_vb = static_cast<Int8Builder*>(list_vb->value_builder());
+  Int32Builder* int_vb = 
static_cast<Int32Builder*>(builder_->field_builder(1));
+  ASSERT_OK(builder_->Reserve(list_lengths.size()));
+  ASSERT_OK(char_vb->Reserve(list_values.size()));
+  ASSERT_OK(int_vb->Reserve(int_values.size()));
+
+  ASSERT_OK(builder_->Append(struct_is_valid.size(), struct_is_valid.data()));
+  ASSERT_OK(
+      list_vb->Append(list_offsets.data(), list_offsets.size(), 
list_is_valid.data()));
+  for (int8_t value : list_values) {
+    char_vb->UnsafeAppend(value);
+  }
+  for (int32_t value : int_values) {
+    int_vb->UnsafeAppend(value);
+  }
+  ASSERT_OK(builder_->Finish(&array));
+
+  std::shared_ptr<StructArray> slice, slice2;
+  std::shared_ptr<Int32Array> int_field;
+  std::shared_ptr<ListArray> list_field;
+
+  slice = std::dynamic_pointer_cast<StructArray>(array->Slice(2));
+  slice2 = std::dynamic_pointer_cast<StructArray>(array->Slice(2));
   ASSERT_EQ(array->length() - 2, slice->length());
 
   ASSERT_TRUE(slice->Equals(slice2));
   ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice));
 
-  slice = array->Slice(1, 2);
-  slice2 = array->Slice(1, 2);
+  int_field = std::dynamic_pointer_cast<Int32Array>(slice->field(1));
+  ASSERT_EQ(int_field->length(), slice->length());
+  ASSERT_EQ(int_field->Value(0), 103);
+  ASSERT_EQ(int_field->Value(1), 104);
+  ASSERT_EQ(int_field->null_count(), 0);
+  list_field = std::dynamic_pointer_cast<ListArray>(slice->field(0));
+  ASSERT_FALSE(list_field->IsNull(0));
+  ASSERT_FALSE(list_field->IsNull(1));
+  ASSERT_EQ(list_field->value_length(0), 3);
+  ASSERT_EQ(list_field->value_length(1), 4);
+  ASSERT_EQ(list_field->null_count(), 0);
+
+  slice = std::dynamic_pointer_cast<StructArray>(array->Slice(1, 2));
+  slice2 = std::dynamic_pointer_cast<StructArray>(array->Slice(1, 2));
   ASSERT_EQ(2, slice->length());
 
   ASSERT_TRUE(slice->Equals(slice2));
   ASSERT_TRUE(array->RangeEquals(1, 3, 0, slice));
-}
 
-TEST_F(TestStructBuilder, TestZeroLength) {
-  // All buffers are null
-  Done();
-  ASSERT_OK(ValidateArray(*result_));
+  int_field = std::dynamic_pointer_cast<Int32Array>(slice->field(1));
+  ASSERT_EQ(int_field->length(), slice->length());
+  ASSERT_EQ(int_field->Value(0), 102);
+  ASSERT_EQ(int_field->Value(1), 103);
+  ASSERT_EQ(int_field->null_count(), 0);
+  list_field = std::dynamic_pointer_cast<ListArray>(slice->field(0));
+  ASSERT_TRUE(list_field->IsNull(0));
+  ASSERT_FALSE(list_field->IsNull(1));
+  ASSERT_EQ(list_field->value_length(0), 0);
+  ASSERT_EQ(list_field->value_length(1), 3);
+  ASSERT_EQ(list_field->null_count(), 1);
 }
 
 // ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index bd2b40c1a..7e66999a3 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -353,7 +353,13 @@ StructArray::StructArray(const std::shared_ptr<DataType>& 
type, int64_t length,
 
 std::shared_ptr<Array> StructArray::field(int i) const {
   if (!boxed_fields_[i]) {
-    boxed_fields_[i] = MakeArray(data_->child_data[i]);
+    std::shared_ptr<ArrayData> field_data;
+    if (data_->offset != 0 || data_->child_data[i]->length != data_->length) {
+      field_data = SliceData(*data_->child_data[i].get(), data_->offset, 
data_->length);
+    } else {
+      field_data = data_->child_data[i];
+    }
+    boxed_fields_[i] = MakeArray(field_data);
   }
   DCHECK(boxed_fields_[i]);
   return boxed_fields_[i];
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 04bd01261..0a155af7e 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -149,6 +149,8 @@ struct ARROW_EXPORT ArrayData {
   std::shared_ptr<DataType> type;
   int64_t length;
   int64_t null_count;
+  // The logical start point into the physical buffers (in values, not bytes).
+  // Note that, for child data, this must be *added* to the child data's own 
offset.
   int64_t offset;
   std::vector<std::shared_ptr<Buffer>> buffers;
   std::vector<std::shared_ptr<ArrayData>> child_data;
@@ -599,7 +601,8 @@ class ARROW_EXPORT StructArray : public Array {
               int64_t offset = 0);
 
   // Return a shared pointer in case the requestor desires to share ownership
-  // with this array.
+  // with this array.  The returned array has its offset, length and null
+  // count adjusted.
   std::shared_ptr<Array> field(int pos) const;
 
  private:
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 69cacbfac..002d4b852 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -136,11 +136,7 @@ class RangeEqualsVisitor {
       for (int j = 0; j < left.num_fields(); ++j) {
         // TODO: really we should be comparing stretches of non-null data 
rather
         // than looking at one value at a time.
-        const int64_t left_abs_index = i + left.offset();
-        const int64_t right_abs_index = o_i + right.offset();
-
-        equal_fields = left.field(j)->RangeEquals(left_abs_index, 
left_abs_index + 1,
-                                                  right_abs_index, 
right.field(j));
+        equal_fields = left.field(j)->RangeEquals(i, i + 1, o_i, 
right.field(j));
         if (!equal_fields) {
           return false;
         }
@@ -467,9 +463,9 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
       return Status::OK();
     }
 
-    result_ =
-        left.values()->RangeEquals(left.value_offset(0), 
left.value_offset(left.length()),
-                                   right.value_offset(0), right.values());
+    result_ = left.values()->RangeEquals(
+        left.value_offset(0), left.value_offset(left.length()) - 
left.value_offset(0),
+        right.value_offset(0), right.values());
     return Status::OK();
   }
 
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 3debd767e..078efe1b0 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -376,10 +376,6 @@ class RecordBatchSerializer : public ArrayVisitor {
     --max_recursion_depth_;
     for (int i = 0; i < array.num_fields(); ++i) {
       std::shared_ptr<Array> field = array.field(i);
-      if (array.offset() != 0 || array.length() < field->length()) {
-        // If offset is non-zero, slice the child array
-        field = field->Slice(array.offset(), array.length());
-      }
       RETURN_NOT_OK(VisitArray(*field));
     }
     ++max_recursion_depth_;
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 994f528ea..38ccb7b50 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -233,7 +233,8 @@ class ArrayPrinter : public PrettyPrinter {
     Newline();
     Write("-- values: ");
     auto values =
-        array.values()->Slice(array.value_offset(0), 
array.value_offset(array.length()));
+        array.values()->Slice(array.value_offset(0),
+                              array.value_offset(array.length()) - 
array.value_offset(0));
     RETURN_NOT_OK(PrettyPrint(*values, indent_ + 2, sink_));
 
     return Status::OK();
@@ -264,7 +265,7 @@ class ArrayPrinter : public PrettyPrinter {
     for (int i = 0; i < array.num_fields(); ++i) {
       children.emplace_back(array.field(i));
     }
-    return PrintChildren(children, array.offset(), array.length());
+    return PrintChildren(children, 0, array.length());
   }
 
   Status Visit(const UnionArray& array) {
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 01a641896..589103635 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -388,7 +388,6 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
                      int64_t offset=0)
 
         shared_ptr[CArray] field(int pos)
-        const vector[shared_ptr[CArray]] fields()
 
     CStatus ValidateArray(const CArray& array)
 
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index a801acd69..2692ace40 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -342,6 +342,7 @@ cdef class UnionValue(ArrayValue):
     def as_py(self):
         return self.getitem(self.index).as_py()
 
+
 cdef class FixedSizeBinaryValue(ArrayValue):
 
     def as_py(self):
@@ -358,14 +359,16 @@ cdef class FixedSizeBinaryValue(ArrayValue):
 
 
 cdef class StructValue(ArrayValue):
+
     def as_py(self):
         cdef:
             CStructArray* ap
             vector[shared_ptr[CField]] child_fields = self.type.type.children()
+
         ap = <CStructArray*> self.sp_array.get()
-        wrapped_arrays = (pyarrow_wrap_array(ap.field(i))
-                          for i in range(ap.num_fields()))
-        child_names = (child.get().name() for child in child_fields)
+        wrapped_arrays = [pyarrow_wrap_array(ap.field(i))
+                          for i in range(ap.num_fields())]
+        child_names = [child.get().name() for child in child_fields]
         # Return the struct as a dict
         return {
             frombytes(name): child_array[self.index].as_py()
@@ -373,6 +376,7 @@ cdef class StructValue(ArrayValue):
             zip(child_names, wrapped_arrays)
         }
 
+
 cdef dict _scalar_classes = {
     _Type_BOOL: BooleanValue,
     _Type_UINT8: UInt8Value,
diff --git a/python/pyarrow/tests/test_array.py 
b/python/pyarrow/tests/test_array.py
index fea56862c..6faf4566b 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -145,6 +145,15 @@ def test_array_slice():
             assert arr[start:stop].to_pylist() == arr.to_pylist()[start:stop]
 
 
+def test_struct_array_slice():
+    # ARROW-2311: slicing nested arrays needs special care
+    ty = pa.struct([pa.field('a', pa.int8()),
+                    pa.field('b', pa.float32())])
+    arr = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty)
+    assert arr[1:].to_pylist() == [{'a': 3, 'b': 4.5},
+                                   {'a': 5, 'b': 6.5}]
+
+
 def test_array_factory_invalid_type():
     arr = np.array([datetime.timedelta(1), datetime.timedelta(2)])
     with pytest.raises(ValueError):


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


> [Python] Struct array slicing defective
> ---------------------------------------
>
>                 Key: ARROW-2311
>                 URL: https://issues.apache.org/jira/browse/ARROW-2311
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, Python
>    Affects Versions: 0.8.0
>            Reporter: Antoine Pitrou
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> {code:python}
> >>> arr = pa.array([(1, 2.0), (3, 4.0), (5, 6.0)], 
> >>> type=pa.struct([pa.field('x', pa.int16()), pa.field('y', pa.float32())]))
> >>> arr
> <pyarrow.lib.StructArray object at 0x7fdfbe7916d8>
> [
>   {'x': 1, 'y': 2.0},
>   {'x': 3, 'y': 4.0},
>   {'x': 5, 'y': 6.0}
> ]
> >>> arr[1:]
> <pyarrow.lib.StructArray object at 0x7fdfbe791f48>
> [
>   {'x': 1, 'y': 2.0},
>   {'x': 3, 'y': 4.0}
> ]
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to