kszucs commented on a change in pull request #10556:
URL: https://github.com/apache/arrow/pull/10556#discussion_r654446550



##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -738,6 +748,8 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
           RETURN_NOT_OK(value_builder->Append(values[i]));
         }
       }
+    } else if (!values.is_strided()) {

Review comment:
       This is unrelated to the fix, but quality of life improvement regarding 
the testing speed.

##########
File path: cpp/src/arrow/util/converter.h
##########
@@ -302,32 +305,59 @@ class Chunker {
     return status;
   }
 
-  // we could get bit smarter here since the whole batch of appendable values
-  // will be rejected if a capacity error is raised
-  Status Extend(InputType values, int64_t size) {
-    auto status = converter_->Extend(values, size);
-    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
-      if (converter_->builder()->length() == 0) {
+  Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    while (offset < size) {
+      auto length_before = converter_->builder()->length();
+      auto status = converter_->Extend(values, size, offset);
+      auto length_after = converter_->builder()->length();
+      auto num_converted = length_after - length_before;

Review comment:
       We can get out the number of converted items from the underlying builder 
object. Depending on the kind of the converter we may need to alter this value.

##########
File path: cpp/src/arrow/util/converter.h
##########
@@ -302,32 +305,59 @@ class Chunker {
     return status;
   }
 
-  // we could get bit smarter here since the whole batch of appendable values
-  // will be rejected if a capacity error is raised
-  Status Extend(InputType values, int64_t size) {
-    auto status = converter_->Extend(values, size);
-    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
-      if (converter_->builder()->length() == 0) {
+  Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    while (offset < size) {
+      auto length_before = converter_->builder()->length();
+      auto status = converter_->Extend(values, size, offset);
+      auto length_after = converter_->builder()->length();
+      auto num_converted = length_after - length_before;
+
+      offset += num_converted;
+      length_ += num_converted;
+
+      if (status.IsCapacityError()) {
+        if (converter_->builder()->length() == 0) {
+          // Builder length == 0 means the individual element is too large to 
append.
+          // In this case, no need to try again.
+          return status;
+        } else if (converter_->rewind_on_capacity_error()) {

Review comment:
       The list-like and binary-like conversion paths may raise capacity error, 
but there is a difference. 
   
   While the binary-like converters check the capacity before append/extend the 
list-like converters first append/extend the value(s) and checks just 
afterwards. Thus depending on the implementation semantics we may need to 
rewind (slice) the output chunk by one.

##########
File path: python/pyarrow/tests/test_convert_builtin.py
##########
@@ -2185,7 +2185,11 @@ def test_auto_chunking_list_like():
     assert arr.num_chunks == 2
     assert len(arr.chunk(0)) == 7
     assert len(arr.chunk(1)) == 1
-    assert arr.chunk(1)[0].as_py() == list(item)

Review comment:
       Converting to python object takes a lot of time, so assert on arrow 
objects instead.

##########
File path: cpp/src/arrow/util/converter.h
##########
@@ -302,32 +305,59 @@ class Chunker {
     return status;
   }
 
-  // we could get bit smarter here since the whole batch of appendable values
-  // will be rejected if a capacity error is raised
-  Status Extend(InputType values, int64_t size) {
-    auto status = converter_->Extend(values, size);
-    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
-      if (converter_->builder()->length() == 0) {
+  Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    while (offset < size) {
+      auto length_before = converter_->builder()->length();
+      auto status = converter_->Extend(values, size, offset);
+      auto length_after = converter_->builder()->length();
+      auto num_converted = length_after - length_before;
+
+      offset += num_converted;
+      length_ += num_converted;
+
+      if (status.IsCapacityError()) {
+        if (converter_->builder()->length() == 0) {
+          // Builder length == 0 means the individual element is too large to 
append.
+          // In this case, no need to try again.
+          return status;
+        } else if (converter_->rewind_on_capacity_error()) {

Review comment:
       Yes. I'm going to add comments based on these discussions.

##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -655,6 +633,8 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
     return ValidateBuilder(this->list_type_);
   }
 
+  bool rewind_on_capacity_error() const override { return true; }

Review comment:
       Yes, 
[ReserveData](https://github.com/apache/arrow/blob/1082c1c0501e534e333aa04cfb78f3091677a655/cpp/src/arrow/python/python_to_arrow.cc#L559)
 calls out to 
[ValidateOverflow](https://github.com/apache/arrow/blob/master/cpp/src/arrow/array/builder_binary.h#L310)
 before the `UnsafeAppend` call.

##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -655,6 +633,8 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
     return ValidateBuilder(this->list_type_);
   }
 
+  bool rewind_on_capacity_error() const override { return true; }

Review comment:
       Right, but this is belongs to the `PyListConverter` (expand the diff).

##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -655,6 +633,8 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
     return ValidateBuilder(this->list_type_);
   }
 
+  bool rewind_on_capacity_error() const override { return true; }

Review comment:
       Right, but this belongs to the `PyListConverter` (expand the diff).

##########
File path: cpp/src/arrow/python/python_to_arrow.cc
##########
@@ -655,6 +633,8 @@ class PyListConverter : public ListConverter<T, 
PyConverter, PyConverterTrait> {
     return ValidateBuilder(this->list_type_);
   }
 
+  bool rewind_on_capacity_error() const override { return true; }

Review comment:
       This fix is all about confusion :D

##########
File path: .github/workflows/python.yml
##########
@@ -36,6 +36,8 @@ concurrency:
   cancel-in-progress: true
 
 env:
+  PYARROW_TEST_SLOW: ON
+  PYARROW_TEST_LARGE_MEMORY: ON

Review comment:
       Just experimental to see whether GHA is able to execute these tests.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to