danepitkin commented on code in PR #40482:
URL: https://github.com/apache/arrow/pull/40482#discussion_r1520727478


##########
cpp/src/arrow/array/array_nested.h:
##########
@@ -58,6 +58,16 @@ void SetListData(VarLengthListLikeArray<TYPE>* self,
                  const std::shared_ptr<ArrayData>& data,
                  Type::type expected_type_id = TYPE::type_id);
 
+// Private helper for [Large]List[View]Array::Flatten when IncludeNulls = true.
+// The primary use case is to avoid duplicating this logic in PyArrow when 
converting
+// to pandas.
+template <typename ListViewArrayT, bool HasNulls, bool IncludeNulls>
+Result<std::shared_ptr<Array>> FlattenListViewArray(const ListViewArrayT& 
list_view_array,
+                                                    MemoryPool* memory_pool);
+template <typename ListArrayT, bool IncludeNulls>
+Result<std::shared_ptr<Array>> FlattenListArray(const ListArrayT& list_array,
+                                                MemoryPool* memory_pool);
+

Review Comment:
   Should these be exposed through the public apis instead? e.g. 
`ListViewArray::Flatten(include_nulls = true)`



##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -351,122 +481,21 @@ Result<std::shared_ptr<Array>> 
FlattenListViewArray(const ListViewArrayT& list_v
   return Concatenate(slices, memory_pool);
 }
 
-std::shared_ptr<Array> BoxOffsets(const std::shared_ptr<DataType>& boxed_type,
-                                  const ArrayData& data) {
-  const int64_t num_offsets =
-      is_list_view(data.type->id()) ? data.length : data.length + 1;
-  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[1]};
-  auto offsets_data =
-      std::make_shared<ArrayData>(boxed_type, /*length=*/num_offsets, 
std::move(buffers),
-                                  /*null_count=*/0, data.offset);
-  return MakeArray(offsets_data);
-}
-
-std::shared_ptr<Array> BoxSizes(const std::shared_ptr<DataType>& boxed_type,
-                                const ArrayData& data) {
-  DCHECK(is_list_view(data.type->id()));
-  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, data.buffers[2]};
-  auto sizes_data =
-      std::make_shared<ArrayData>(boxed_type, data.length, std::move(buffers),
-                                  /*null_count=*/0, data.offset);
-  return MakeArray(sizes_data);
-}
-
-template <typename DestListViewType, typename SrcListType>
-Result<std::shared_ptr<ArrayData>> ListViewFromListImpl(
-    const std::shared_ptr<ArrayData>& list_data, MemoryPool* pool) {
-  static_assert(
-      std::is_same<typename SrcListType::offset_type,
-                   typename DestListViewType::offset_type>::value,
-      "Offset types between list type and list-view type are expected to 
match");
-  using offset_type = typename SrcListType::offset_type;
-  const auto& list_type = checked_cast<const SrcListType&>(*list_data->type);
-
-  // To re-use the validity and offsets buffers, a sizes buffer with enough
-  // padding on the beginning is allocated and filled with the sizes after
-  // list_data->offset.
-  const int64_t buffer_length = list_data->offset + list_data->length;
-  ARROW_ASSIGN_OR_RAISE(auto sizes_buffer,
-                        AllocateBuffer(buffer_length * sizeof(offset_type), 
pool));
-  const auto* offsets = list_data->template GetValues<offset_type>(1, 0);
-  auto* sizes = sizes_buffer->mutable_data_as<offset_type>();
-  // Zero the initial padding area to avoid leaking any data when buffers are
-  // sent over IPC or through the C Data interface.
-  memset(sizes, 0, list_data->offset * sizeof(offset_type));
-  for (int64_t i = list_data->offset; i < buffer_length; i++) {
-    sizes[i] = offsets[i + 1] - offsets[i];
-  }
-  BufferVector buffers = {list_data->buffers[0], list_data->buffers[1],
-                          std::move(sizes_buffer)};
-
-  return 
ArrayData::Make(std::make_shared<DestListViewType>(list_type.value_type()),
-                         list_data->length, std::move(buffers),
-                         {list_data->child_data[0]}, list_data->null_count,
-                         list_data->offset);
-}
-
-template <typename DestListType, typename SrcListViewType>
-Result<std::shared_ptr<ArrayData>> ListFromListViewImpl(
-    const std::shared_ptr<ArrayData>& list_view_data, MemoryPool* pool) {
-  static_assert(
-      std::is_same<typename SrcListViewType::offset_type,
-                   typename DestListType::offset_type>::value,
-      "Offset types between list type and list-view type are expected to 
match");
-  using offset_type = typename DestListType::offset_type;
-  using ListBuilderType = typename TypeTraits<DestListType>::BuilderType;
-
-  const auto& list_view_type =
-      checked_cast<const SrcListViewType&>(*list_view_data->type);
-  const auto& value_type = list_view_type.value_type();
-  const auto list_type = std::make_shared<DestListType>(value_type);
-
-  ARROW_ASSIGN_OR_RAISE(auto sum_of_list_view_sizes,
-                        
list_util::internal::SumOfLogicalListSizes(*list_view_data));
-  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<ArrayBuilder> value_builder,
-                        MakeBuilder(value_type, pool));
-  RETURN_NOT_OK(value_builder->Reserve(sum_of_list_view_sizes));
-  auto list_builder = std::make_shared<ListBuilderType>(pool, value_builder, 
list_type);
-  RETURN_NOT_OK(list_builder->Reserve(list_view_data->length));
-
-  ArraySpan values{*list_view_data->child_data[0]};
-  const auto* in_validity_bitmap = list_view_data->GetValues<uint8_t>(0);
-  const auto* in_offsets = list_view_data->GetValues<offset_type>(1);
-  const auto* in_sizes = list_view_data->GetValues<offset_type>(2);
-  for (int64_t i = 0; i < list_view_data->length; ++i) {
-    const bool is_valid =
-        !in_validity_bitmap ||
-        bit_util::GetBit(in_validity_bitmap, list_view_data->offset + i);
-    const int64_t size = is_valid ? in_sizes[i] : 0;
-    RETURN_NOT_OK(list_builder->Append(is_valid, size));
-    RETURN_NOT_OK(value_builder->AppendArraySlice(values, in_offsets[i], 
size));
-  }
-  std::shared_ptr<ArrayData> list_array_data;
-  RETURN_NOT_OK(list_builder->FinishInternal(&list_array_data));
-  return list_array_data;
-}
-
-}  // namespace
-
-namespace internal {
-
-template <typename TYPE>
-inline void SetListData(VarLengthListLikeArray<TYPE>* self,
-                        const std::shared_ptr<ArrayData>& data,
-                        Type::type expected_type_id) {
-  ARROW_CHECK_EQ(data->buffers.size(), is_list_view(TYPE::type_id) ? 3 : 2);
-  ARROW_CHECK_EQ(data->type->id(), expected_type_id);
-  ARROW_CHECK_EQ(data->child_data.size(), 1);
-
-  self->Array::SetData(data);
-
-  self->list_type_ = checked_cast<const TYPE*>(data->type.get());
-  self->raw_value_offsets_ =
-      data->GetValuesSafe<typename TYPE::offset_type>(1, /*offset=*/0);
-  // BaseListViewArray::SetData takes care of setting raw_value_sizes_.
-
-  ARROW_CHECK_EQ(self->list_type_->value_type()->id(), 
data->child_data[0]->type->id());
-  DCHECK(self->list_type_->value_type()->Equals(data->child_data[0]->type));
-  self->values_ = MakeArray(self->data_->child_data[0]);
+void InstantiateFlattenListArray() {
+  (void)FlattenListArray<ListArray, false>;
+  (void)FlattenListArray<ListArray, true>;
+  (void)FlattenListArray<LargeListArray, false>;
+  (void)FlattenListArray<LargeListArray, true>;
+  (void)FlattenListArray<FixedSizeListArray, false>;
+  (void)FlattenListArray<FixedSizeListArray, true>;
+  (void)FlattenListViewArray<ListViewArray, false, false>;
+  (void)FlattenListViewArray<ListViewArray, false, true>;
+  (void)FlattenListViewArray<ListViewArray, true, false>;
+  (void)FlattenListViewArray<ListViewArray, true, true>;
+  (void)FlattenListViewArray<LargeListViewArray, false, false>;
+  (void)FlattenListViewArray<LargeListViewArray, false, true>;
+  (void)FlattenListViewArray<LargeListViewArray, true, false>;
+  (void)FlattenListViewArray<LargeListViewArray, true, true>;

Review Comment:
   Some of these instantiations are required since only PyArrow would 
templatize the function with the last parameter (include_nulls) = true.



##########
cpp/src/arrow/array/array_nested.cc:
##########
@@ -263,7 +381,7 @@ Result<std::shared_ptr<Array>> FlattenListArray(const 
ListArrayT& list_array,
   return Concatenate(non_null_fragments, memory_pool);
 }
 
-template <typename ListViewArrayT, bool HasNulls>
+template <typename ListViewArrayT, bool HasNulls, bool IncludeNulls>

Review Comment:
   templatizing with an addtional 3rd arg (IncludeNulls) makes the code harder 
to reason about.



##########
python/pyarrow/src/arrow/python/arrow_to_pandas.cc:
##########
@@ -826,6 +796,145 @@ Status ConvertListsLike(PandasOptions options, const 
ChunkedArray& data,
   return Status::OK();
 }
 
+template <typename T>
+enable_if_list_view<T, Status> ConvertListsLikeChunks(PyObject* numpy_array,
+                                                      const ChunkedArray& data,
+                                                      PyObject** out_values) {
+  using ListArrayT = typename TypeTraits<T>::ArrayType;
+  const auto& arr = static_cast<const ListArrayT&>(*data.chunk(0));
+  const bool has_nulls = data.null_count() > 0;
+  for (int64_t i = 0; i < arr.length(); ++i) {
+    if (has_nulls && arr.IsNull(i)) {
+      Py_INCREF(Py_None);
+      *out_values = Py_None;
+    } else {
+      // Need to subtract value_offset(0) since the original chunk might be a 
slice
+      // into another array.
+      OwnedRef start(PyLong_FromLongLong(arr.value_offset(i)));
+      OwnedRef end(PyLong_FromLongLong(arr.value_offset(i) + 
arr.value_length(i)));
+      OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
+
+      if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
+        // Fall out of loop, will return from RETURN_IF_PYERROR
+        break;
+      }
+      *out_values = PyObject_GetItem(numpy_array, slice.obj());
+
+      if (*out_values == nullptr) {
+        // Fall out of loop, will return from RETURN_IF_PYERROR
+        break;
+      }
+    }
+    ++out_values;
+  }
+  RETURN_IF_PYERROR();
+
+  return Status::OK();
+}
+
+// template <typename T>
+// enable_if_list_like<T, Status> FlattenList(ArrayVector& value_arrays, const 
ChunkedArray& data, PandasOptions options) {
+//   using ListArrayT = typename TypeTraits<T>::ArrayType;
+//   for (int c = 0; c < data.num_chunks(); c++) {
+//     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+//     std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+//         arr.value_offset(0), arr.value_offset(arr.length()) - 
arr.value_offset(0));
+//     if (arr.value_type()->id() == Type::EXTENSION) {
+//       const auto& arr_ext = checked_cast<const 
ExtensionArray&>(*flattened_values);
+//       value_arrays.emplace_back(arr_ext.storage());
+//     } else {
+//       value_arrays.emplace_back(flattened_values);
+//     }
+//   }
+//   return Status::OK();
+// }
+
+// template <typename T>
+// enable_if_list_view<T, Status> FlattenList(ArrayVector& value_arrays, const 
ChunkedArray& data, PandasOptions options) {
+//   using ListArrayT = typename TypeTraits<T>::ArrayType;
+//   for (int c = 0; c < data.num_chunks(); c++) {
+//     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+//     std::vector<std::shared_ptr<Array>> slices;
+//     for (int i = 0; i < arr.length(); i++) {
+//       slices.push_back(arr.value_slice(i));
+//     }
+//     ARROW_ASSIGN_OR_RAISE(auto flattened_values, Concatenate(slices, 
options.pool));
+//     if (arr.value_type()->id() == Type::EXTENSION) {
+//       const auto& arr_ext = checked_cast<const 
ExtensionArray&>(*flattened_values);
+//       value_arrays.emplace_back(arr_ext.storage());
+//     } else {
+//       value_arrays.emplace_back(flattened_values);
+//     }
+//   }
+//   return Status::OK();
+// }

Review Comment:
   If we don't want to reuse the C++ Flatten APIs, this is what a duplicate 
implementation would look like in PyArrow.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to