This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c344446957 GH-40153: [Python][C++] Fix large file handling on 32-bit 
Python build (#40176)
c344446957 is described below

commit c3444469570eb33f32a6f960ffa1d2e446c271f3
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Feb 21 15:56:38 2024 +0100

    GH-40153: [Python][C++] Fix large file handling on 32-bit Python build 
(#40176)
    
    ### Rationale for this change
    
    Python large file tests fail on 32-bit platforms.
    
    ### What changes are included in this PR?
    
    1. Fix passing `int64_t` position to the Python file methods when a Python 
file object is wrapped in an Arrow `RandomAccessFile`
    2. Disallow creating a `MemoryMappedFile` spanning more than the `size_t` 
maximum, instead of silently truncating its length
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #40153
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/io/file.cc              | 22 ++++++++++++++--------
 python/pyarrow/src/arrow/python/io.cc | 15 +++++++++------
 python/pyarrow/tests/test_io.py       | 26 +++++++++++++++++++-------
 3 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 543fa90a86..3b18bb7b0f 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -36,6 +36,7 @@
 #include <cerrno>
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <memory>
 #include <mutex>
 #include <sstream>
@@ -560,17 +561,22 @@ class MemoryMappedFile::MemoryMap
       RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(), 
initial_size));
     }
 
-    size_t mmap_length = static_cast<size_t>(initial_size);
-    if (length > initial_size) {
-      return Status::Invalid("mapping length is beyond file size");
-    }
-    if (length >= 0 && length < initial_size) {
+    int64_t mmap_length = initial_size;
+    if (length >= 0) {
       // memory mapping a file region
-      mmap_length = static_cast<size_t>(length);
+      if (length > initial_size) {
+        return Status::Invalid("mapping length is beyond file size");
+      }
+      mmap_length = length;
+    }
+    if (static_cast<int64_t>(static_cast<size_t>(mmap_length)) != mmap_length) 
{
+      return Status::CapacityError("Requested memory map length ", mmap_length,
+                                   " does not fit in a C size_t "
+                                   "(are you using a 32-bit build of Arrow?");
     }
 
-    void* result = mmap(nullptr, mmap_length, prot_flags_, map_mode_, 
file_->fd(),
-                        static_cast<off_t>(offset));
+    void* result = mmap(nullptr, static_cast<size_t>(mmap_length), 
prot_flags_, map_mode_,
+                        file_->fd(), static_cast<off_t>(offset));
     if (result == MAP_FAILED) {
       return Status::IOError("Memory mapping file failed: ",
                              ::arrow::internal::ErrnoMessage(errno));
diff --git a/python/pyarrow/src/arrow/python/io.cc 
b/python/pyarrow/src/arrow/python/io.cc
index 43f8297c5a..197f8b9d39 100644
--- a/python/pyarrow/src/arrow/python/io.cc
+++ b/python/pyarrow/src/arrow/python/io.cc
@@ -92,9 +92,12 @@ class PythonFile {
   Status Seek(int64_t position, int whence) {
     RETURN_NOT_OK(CheckClosed());
 
+    // NOTE: `long long` is at least 64 bits in the C standard, the cast below 
is
+    // therefore safe.
+
     // whence: 0 for relative to start of file, 2 for end of file
-    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)",
-                                               
static_cast<Py_ssize_t>(position), whence);
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(Li)",
+                                               static_cast<long 
long>(position), whence);
     Py_XDECREF(result);
     PY_RETURN_IF_ERROR(StatusCode::IOError);
     return Status::OK();
@@ -103,16 +106,16 @@ class PythonFile {
   Status Read(int64_t nbytes, PyObject** out) {
     RETURN_NOT_OK(CheckClosed());
 
-    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)",
-                                               
static_cast<Py_ssize_t>(nbytes));
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(L)",
+                                               static_cast<long long>(nbytes));
     PY_RETURN_IF_ERROR(StatusCode::IOError);
     *out = result;
     return Status::OK();
   }
 
   Status ReadBuffer(int64_t nbytes, PyObject** out) {
-    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", 
"(n)",
-                                               
static_cast<Py_ssize_t>(nbytes));
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", 
"(L)",
+                                               static_cast<long long>(nbytes));
     PY_RETURN_IF_ERROR(StatusCode::IOError);
     *out = result;
     return Status::OK();
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 5a495aa80a..17eab871a2 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -36,7 +36,7 @@ from pyarrow import Codec
 import pyarrow as pa
 
 
-def check_large_seeks(file_factory):
+def check_large_seeks(file_factory, expected_error=None):
     if sys.platform in ('win32', 'darwin'):
         pytest.skip("need sparse file support")
     try:
@@ -45,11 +45,16 @@ def check_large_seeks(file_factory):
             f.truncate(2 ** 32 + 10)
             f.seek(2 ** 32 + 5)
             f.write(b'mark\n')
-        with file_factory(filename) as f:
-            assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
-            assert f.tell() == 2 ** 32 + 5
-            assert f.read(5) == b'mark\n'
-            assert f.tell() == 2 ** 32 + 10
+        if expected_error:
+            with expected_error:
+                file_factory(filename)
+        else:
+            with file_factory(filename) as f:
+                assert f.size() == 2 ** 32 + 10
+                assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
+                assert f.tell() == 2 ** 32 + 5
+                assert f.read(5) == b'mark\n'
+                assert f.tell() == 2 ** 32 + 10
     finally:
         os.unlink(filename)
 
@@ -1137,7 +1142,14 @@ def test_memory_zero_length(tmpdir):
 
 
 def test_memory_map_large_seeks():
-    check_large_seeks(pa.memory_map)
+    if sys.maxsize >= 2**32:
+        expected_error = None
+    else:
+        expected_error = pytest.raises(
+            pa.ArrowCapacityError,
+            match="Requested memory map length 4294967306 "
+                  "does not fit in a C size_t")
+    check_large_seeks(pa.memory_map, expected_error=expected_error)
 
 
 def test_memory_map_close_remove(tmpdir):

Reply via email to