This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c344446957 GH-40153: [Python][C++] Fix large file handling on 32-bit
Python build (#40176)
c344446957 is described below
commit c3444469570eb33f32a6f960ffa1d2e446c271f3
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Feb 21 15:56:38 2024 +0100
GH-40153: [Python][C++] Fix large file handling on 32-bit Python build
(#40176)
### Rationale for this change
Python large file tests fail on 32-bit platforms.
### What changes are included in this PR?
1. Fix passing `int64_t` position to the Python file methods when a Python
file object is wrapped in an Arrow `RandomAccessFile`
2. Disallow creating a `MemoryMappedFile` spanning more than the `size_t`
maximum, instead of silently truncating its length
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No.
* GitHub Issue: #40153
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/io/file.cc | 22 ++++++++++++++--------
python/pyarrow/src/arrow/python/io.cc | 15 +++++++++------
python/pyarrow/tests/test_io.py | 26 +++++++++++++++++++-------
3 files changed, 42 insertions(+), 21 deletions(-)
diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc
index 543fa90a86..3b18bb7b0f 100644
--- a/cpp/src/arrow/io/file.cc
+++ b/cpp/src/arrow/io/file.cc
@@ -36,6 +36,7 @@
#include <cerrno>
#include <cstdint>
#include <cstring>
+#include <limits>
#include <memory>
#include <mutex>
#include <sstream>
@@ -560,17 +561,22 @@ class MemoryMappedFile::MemoryMap
RETURN_NOT_OK(::arrow::internal::FileTruncate(file_->fd(),
initial_size));
}
- size_t mmap_length = static_cast<size_t>(initial_size);
- if (length > initial_size) {
- return Status::Invalid("mapping length is beyond file size");
- }
- if (length >= 0 && length < initial_size) {
+ int64_t mmap_length = initial_size;
+ if (length >= 0) {
// memory mapping a file region
- mmap_length = static_cast<size_t>(length);
+ if (length > initial_size) {
+ return Status::Invalid("mapping length is beyond file size");
+ }
+ mmap_length = length;
+ }
+ if (static_cast<int64_t>(static_cast<size_t>(mmap_length)) != mmap_length)
{
+ return Status::CapacityError("Requested memory map length ", mmap_length,
+ " does not fit in a C size_t "
+ "(are you using a 32-bit build of Arrow?");
}
- void* result = mmap(nullptr, mmap_length, prot_flags_, map_mode_,
file_->fd(),
- static_cast<off_t>(offset));
+ void* result = mmap(nullptr, static_cast<size_t>(mmap_length),
prot_flags_, map_mode_,
+ file_->fd(), static_cast<off_t>(offset));
if (result == MAP_FAILED) {
return Status::IOError("Memory mapping file failed: ",
::arrow::internal::ErrnoMessage(errno));
diff --git a/python/pyarrow/src/arrow/python/io.cc
b/python/pyarrow/src/arrow/python/io.cc
index 43f8297c5a..197f8b9d39 100644
--- a/python/pyarrow/src/arrow/python/io.cc
+++ b/python/pyarrow/src/arrow/python/io.cc
@@ -92,9 +92,12 @@ class PythonFile {
Status Seek(int64_t position, int whence) {
RETURN_NOT_OK(CheckClosed());
+ // NOTE: `long long` is at least 64 bits in the C standard, the cast below
is
+ // therefore safe.
+
// whence: 0 for relative to start of file, 2 for end of file
- PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)",
-
static_cast<Py_ssize_t>(position), whence);
+ PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(Li)",
+ static_cast<long
long>(position), whence);
Py_XDECREF(result);
PY_RETURN_IF_ERROR(StatusCode::IOError);
return Status::OK();
@@ -103,16 +106,16 @@ class PythonFile {
Status Read(int64_t nbytes, PyObject** out) {
RETURN_NOT_OK(CheckClosed());
- PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)",
-
static_cast<Py_ssize_t>(nbytes));
+ PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(L)",
+ static_cast<long long>(nbytes));
PY_RETURN_IF_ERROR(StatusCode::IOError);
*out = result;
return Status::OK();
}
Status ReadBuffer(int64_t nbytes, PyObject** out) {
- PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer",
"(n)",
-
static_cast<Py_ssize_t>(nbytes));
+ PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer",
"(L)",
+ static_cast<long long>(nbytes));
PY_RETURN_IF_ERROR(StatusCode::IOError);
*out = result;
return Status::OK();
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 5a495aa80a..17eab871a2 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -36,7 +36,7 @@ from pyarrow import Codec
import pyarrow as pa
-def check_large_seeks(file_factory):
+def check_large_seeks(file_factory, expected_error=None):
if sys.platform in ('win32', 'darwin'):
pytest.skip("need sparse file support")
try:
@@ -45,11 +45,16 @@ def check_large_seeks(file_factory):
f.truncate(2 ** 32 + 10)
f.seek(2 ** 32 + 5)
f.write(b'mark\n')
- with file_factory(filename) as f:
- assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
- assert f.tell() == 2 ** 32 + 5
- assert f.read(5) == b'mark\n'
- assert f.tell() == 2 ** 32 + 10
+ if expected_error:
+ with expected_error:
+ file_factory(filename)
+ else:
+ with file_factory(filename) as f:
+ assert f.size() == 2 ** 32 + 10
+ assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
+ assert f.tell() == 2 ** 32 + 5
+ assert f.read(5) == b'mark\n'
+ assert f.tell() == 2 ** 32 + 10
finally:
os.unlink(filename)
@@ -1137,7 +1142,14 @@ def test_memory_zero_length(tmpdir):
def test_memory_map_large_seeks():
- check_large_seeks(pa.memory_map)
+ if sys.maxsize >= 2**32:
+ expected_error = None
+ else:
+ expected_error = pytest.raises(
+ pa.ArrowCapacityError,
+ match="Requested memory map length 4294967306 "
+ "does not fit in a C size_t")
+ check_large_seeks(pa.memory_map, expected_error=expected_error)
def test_memory_map_close_remove(tmpdir):