[ https://issues.apache.org/jira/browse/ARROW-2227?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16397626#comment-16397626 ]

ASF GitHub Bot commented on ARROW-2227:
---------------------------------------

wesm closed pull request #1740: ARROW-2227: [Python] Fix off-by-one error in chunked binary conversions
URL: https://github.com/apache/arrow/pull/1740

This is a PR merged from a forked repository. As GitHub hides the original
diff on merge, it is supplied below for the sake of provenance:

diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index ef4e7fde9..aa9f3ce42 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -1236,7 +1236,7 @@ Status ListBuilder::Append(const int32_t* offsets, int64_t length,
 
 Status ListBuilder::AppendNextOffset() {
   int64_t num_values = value_builder_->length();
-  if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) {
+  if (ARROW_PREDICT_FALSE(num_values > kListMaximumElements)) {
     std::stringstream ss;
     ss << "ListArray cannot contain more then INT32_MAX - 1 child elements,"
        << " have " << num_values;
@@ -1252,14 +1252,14 @@ Status ListBuilder::Append(bool is_valid) {
 }
 
 Status ListBuilder::Init(int64_t elements) {
-  DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(elements, kListMaximumElements);
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
   return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status ListBuilder::Resize(int64_t capacity) {
-  DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(capacity, kListMaximumElements);
   // one more then requested for offsets
   RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
@@ -1303,14 +1303,14 @@ BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool*
 BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {}
 
 Status BinaryBuilder::Init(int64_t elements) {
-  DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(elements, kListMaximumElements);
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
   // one more then requested for offsets
   return offsets_builder_.Resize((elements + 1) * sizeof(int32_t));
 }
 
 Status BinaryBuilder::Resize(int64_t capacity) {
-  DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
+  DCHECK_LE(capacity, kListMaximumElements);
   // one more then requested for offsets
   RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
   return ArrayBuilder::Resize(capacity);
@@ -1318,7 +1318,7 @@ Status BinaryBuilder::Resize(int64_t capacity) {
 
 Status BinaryBuilder::ReserveData(int64_t elements) {
   if (value_data_length() + elements > value_data_capacity()) {
-    if (value_data_length() + elements > std::numeric_limits<int32_t>::max()) {
+    if (value_data_length() + elements > kBinaryMemoryLimit) {
      return Status::Invalid("Cannot reserve capacity larger than 2^31 - 1 for binary");
     }
     RETURN_NOT_OK(value_data_builder_.Reserve(elements));
@@ -1328,9 +1328,9 @@ Status BinaryBuilder::ReserveData(int64_t elements) {
 
 Status BinaryBuilder::AppendNextOffset() {
   const int64_t num_bytes = value_data_builder_.length();
-  if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) {
+  if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
     std::stringstream ss;
-    ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " 
bytes, have "
+    ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " 
bytes, have "
        << num_bytes;
     return Status::Invalid(ss.str());
   }
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index dabfb7506..cdcee80be 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -41,13 +41,16 @@ namespace arrow {
 class Array;
 class Decimal128;
 
+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
+constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
+
 namespace internal {
 
 struct ArrayData;
 
 }  // namespace internal
 
-static constexpr int64_t kMinBuilderCapacity = 1 << 5;
+constexpr int64_t kMinBuilderCapacity = 1 << 5;
 
 /// Base class for all data array builders.
 //
@@ -702,8 +705,6 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
   TypedBufferBuilder<int32_t> offsets_builder_;
   TypedBufferBuilder<uint8_t> value_data_builder_;
 
-  static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1;
-
   Status AppendNextOffset();
   void Reset();
 };
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 4d91e5317..71bf69fc1 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -60,8 +60,6 @@ namespace py {
 
 using internal::NumPyTypeSize;
 
-constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max();
-
 // ----------------------------------------------------------------------
 // Conversion utilities
 
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index dff4e1068..d448de08b 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1048,9 +1048,12 @@ def test_bytes_to_binary(self):
 
     @pytest.mark.large_memory
     def test_bytes_exceed_2gb(self):
-        val = 'x' * (1 << 20)
+        v1 = b'x' * 100000000
+        v2 = b'x' * 147483646
+
+        # ARROW-2227, hit exactly 2GB on the nose
         df = pd.DataFrame({
-            'strings': np.array([val] * 4000, dtype=object)
+            'strings': [v1] * 20 + [v2] + ['x'] * 20
         })
         arr = pa.array(df['strings'])
         assert isinstance(arr, pa.ChunkedArray)
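
In short, the off-by-one was a one-byte disagreement between two limits: the
Python converter (numpy_to_arrow.cc) started a new chunk only beyond
INT32_MAX bytes, while the builder (builder.h) rejected anything above
INT32_MAX - 1, which is exactly the "have 2147483647" failure reported in the
issue below. A minimal sketch of the pre-fix arithmetic, using the constant
names from the diff above (illustrative Python, not the actual
implementation):

    INT32_MAX = 2**31 - 1  # 2147483647

    # Before the fix, the two files used different limits:
    builder_limit = INT32_MAX - 1   # builder.h: kMaximumCapacity
    converter_limit = INT32_MAX     # numpy_to_arrow.cc: old kBinaryMemoryLimit

    # A column holding exactly INT32_MAX bytes slipped past the converter's
    # chunking check but was rejected by the builder:
    num_bytes = INT32_MAX
    assert not (num_bytes > converter_limit)  # converter: no new chunk started
    assert num_bytes > builder_limit          # builder: raises ArrowInvalid

    # The fix hoists one shared constant, kBinaryMemoryLimit = INT32_MAX - 1,
    # into builder.h, so the converter now starts a new chunk one byte earlier.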


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] Table.from_pandas does not create chunked_arrays.
> ----------------------------------------------------------
>
>                 Key: ARROW-2227
>                 URL: https://issues.apache.org/jira/browse/ARROW-2227
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.8.0
>            Reporter: Chris Ellison
>            Assignee: Wes McKinney
>            Priority: Blocker
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>
> When creating a large enough array, pyarrow raises an exception:
> {code:python}
> import numpy as np
> import pandas as pd
> import pyarrow as pa
> x = list('1' * 2**31)
> y = pd.DataFrame({'x': x})
> t = pa.Table.from_pandas(y)
> # ArrowInvalid: BinaryArray cannot contain more than 2147483646 bytes, have 2147483647
> {code}
> The array should be chunked for the user. As is, data frames with more than
> 2 GiB of binary data in a single column will struggle to get into Arrow.
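
With the shared limit in place, the same conversion chunks transparently
instead of raising. A minimal sketch of the expected post-fix behavior,
mirroring the regression test in the PR above (note it needs several GiB of
free memory to run):

{code:python}
import pandas as pd
import pyarrow as pa

# ~2 GiB of binary data in one column, spread across many rows.
df = pd.DataFrame({'x': [b'x' * (1 << 20)] * 2048})

arr = pa.array(df['x'])
# The converter splits the values so no chunk exceeds 2^31 - 2 bytes.
assert isinstance(arr, pa.ChunkedArray)
assert arr.num_chunks > 1
{code}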



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
