Repository: arrow Updated Branches: refs/heads/master c7947dc2d -> 685ebf490
ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer WIP, but tests all pass Author: Jeff Knupp <[email protected]> Closes #410 from jeffknupp/master and squashes the following commits: bfba71d [Jeff Knupp] Fix typo in test 0a39f24 [Jeff Knupp] Fix some logical issues with initialization; add bytearray test fb2cfa3 [Jeff Knupp] Add proper reference counting regardless of if buf is memoryview 26f8b74 [Jeff Knupp] Need to investigate why test failed travis but not locally. For now, revert f7d21ac [Jeff Knupp] ARROW-626: [Python] Replace PyBytesBuffer with zero-copy, memoryview-based PyBuffer Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/685ebf49 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/685ebf49 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/685ebf49 Branch: refs/heads/master Commit: 685ebf49001ef02134e404dddae2bfd032dc4a65 Parents: c7947dc Author: Jeff Knupp <[email protected]> Authored: Sat Mar 25 12:15:02 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Sat Mar 25 12:15:02 2017 -0400 ---------------------------------------------------------------------- python/pyarrow/includes/pyarrow.pxd | 4 ++-- python/pyarrow/io.pyx | 19 ++++++++----------- python/pyarrow/tests/test_io.py | 20 +++++++++++++++----- python/src/pyarrow/common.cc | 24 +++++++++++++++--------- python/src/pyarrow/common.h | 10 +++++++--- python/src/pyarrow/io.cc | 6 +++--- python/src/pyarrow/io.h | 2 +- 7 files changed, 51 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/includes/pyarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 805950b..3fdbebc 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -55,8 +55,8 @@ cdef extern from "pyarrow/api.h" namespace "arrow::py" nogil: cdef extern from "pyarrow/common.h" namespace "arrow::py" nogil: - cdef cppclass PyBytesBuffer(CBuffer): - PyBytesBuffer(object o) + cdef cppclass PyBuffer(CBuffer): + PyBuffer(object o) cdef extern from "pyarrow/io.h" namespace "arrow::py" nogil: http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/io.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 72e0e0f..cb44ce8 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -501,16 +501,11 @@ cdef class BufferReader(NativeFile): Buffer buffer def __cinit__(self, object obj): - cdef shared_ptr[CBuffer] buf if isinstance(obj, Buffer): self.buffer = obj - elif isinstance(obj, bytes): - buf.reset(new pyarrow.PyBytesBuffer(obj)) - self.buffer = wrap_buffer(buf) else: - raise ValueError('Unable to convert value to buffer: {0}' - .format(type(obj))) + self.buffer = build_arrow_buffer(obj) self.rd_file.reset(new CBufferReader(self.buffer.buffer)) self.is_readable = 1 @@ -518,16 +513,18 @@ cdef class BufferReader(NativeFile): self.is_open = True -def buffer_from_bytes(object obj): +def build_arrow_buffer(object obj): """ Construct an Arrow buffer from a Python bytes object """ cdef shared_ptr[CBuffer] buf - if not isinstance(obj, bytes): - raise ValueError('Must pass bytes object') + try: + memoryview(obj) + buf.reset(new pyarrow.PyBuffer(obj)) + return wrap_buffer(buf) + except TypeError: + raise ValueError('Must pass object that implements buffer protocol') - buf.reset(new pyarrow.PyBytesBuffer(obj)) - return wrap_buffer(buf) cdef Buffer wrap_buffer(const shared_ptr[CBuffer]& buf): http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/pyarrow/tests/test_io.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index c6caba5..9cd15c4 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -82,7 +82,6 @@ def test_bytes_reader(): # Like a BytesIO, but zero-copy underneath for C++ consumers data = b'some sample data' f = io.BufferReader(data) - assert f.tell() == 0 assert f.size() == len(data) @@ -128,7 +127,7 @@ def test_bytes_reader_retains_parent_reference(): def test_buffer_bytes(): val = b'some data' - buf = io.buffer_from_bytes(val) + buf = io.build_arrow_buffer(val) assert isinstance(buf, io.Buffer) result = buf.to_pybytes() @@ -138,18 +137,29 @@ def test_buffer_bytes(): def test_buffer_memoryview(): val = b'some data' - buf = io.buffer_from_bytes(val) + buf = io.build_arrow_buffer(val) assert isinstance(buf, io.Buffer) result = memoryview(buf) assert result == val +def test_buffer_bytearray(): + val = bytearray(b'some data') + + + buf = io.build_arrow_buffer(val) + assert isinstance(buf, io.Buffer) + + result = bytearray(buf) + + assert result == val + def test_buffer_memoryview_is_immutable(): val = b'some data' - buf = io.buffer_from_bytes(val) + buf = io.build_arrow_buffer(val) assert isinstance(buf, io.Buffer) result = memoryview(buf) @@ -192,7 +202,7 @@ def test_buffer_protocol_ref_counting(): import gc def make_buffer(bytes_obj): - return bytearray(io.buffer_from_bytes(bytes_obj)) + return bytearray(io.build_arrow_buffer(bytes_obj)) buf = make_buffer(b'foo') gc.collect() http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/common.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc index c898f63..792aa47 100644 --- a/python/src/pyarrow/common.cc +++ b/python/src/pyarrow/common.cc @@ -45,18 +45,24 @@ MemoryPool* get_memory_pool() { } // ---------------------------------------------------------------------- -// PyBytesBuffer +// PyBuffer -PyBytesBuffer::PyBytesBuffer(PyObject* obj) - : Buffer(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), - PyBytes_GET_SIZE(obj)), - obj_(obj) { - Py_INCREF(obj_); +PyBuffer::PyBuffer(PyObject* obj) + : Buffer(nullptr, 0) { + if (PyObject_CheckBuffer(obj)) { + obj_ = PyMemoryView_FromObject(obj); + Py_buffer* buffer = PyMemoryView_GET_BUFFER(obj_); + data_ = reinterpret_cast<const uint8_t*>(buffer->buf); + size_ = buffer->len; + capacity_ = buffer->len; + is_mutable_ = false; + Py_INCREF(obj_); + } } -PyBytesBuffer::~PyBytesBuffer() { - PyAcquireGIL lock; - Py_DECREF(obj_); +PyBuffer::~PyBuffer() { + PyAcquireGIL lock; + Py_DECREF(obj_); } } // namespace py http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/common.h ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 0b4c6be..b4e4ea6 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -118,10 +118,14 @@ class ARROW_EXPORT NumPyBuffer : public Buffer { PyArrayObject* arr_; }; -class ARROW_EXPORT PyBytesBuffer : public Buffer { +class ARROW_EXPORT PyBuffer : public Buffer { public: - PyBytesBuffer(PyObject* obj); - ~PyBytesBuffer(); + /// Note that the GIL must be held when calling the PyBuffer constructor. + /// + /// While memoryview objects support multi-demensional buffers, PyBuffer only supports + /// one-dimensional byte buffers. + PyBuffer(PyObject* obj); + ~PyBuffer(); private: PyObject* obj_; http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/io.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index 0aa61dc..c66155b 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -156,7 +156,7 @@ Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr<Buffer>* out) { PyObject* bytes_obj; ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); - *out = std::make_shared<PyBytesBuffer>(bytes_obj); + *out = std::make_shared<PyBuffer>(bytes_obj); Py_DECREF(bytes_obj); return Status::OK(); @@ -210,10 +210,10 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { } // ---------------------------------------------------------------------- -// A readable file that is backed by a PyBytes +// A readable file that is backed by a PyBuffer PyBytesReader::PyBytesReader(PyObject* obj) - : io::BufferReader(std::make_shared<PyBytesBuffer>(obj)) {} + : io::BufferReader(std::make_shared<PyBuffer>(obj)) {} PyBytesReader::~PyBytesReader() {} http://git-wip-us.apache.org/repos/asf/arrow/blob/685ebf49/python/src/pyarrow/io.h ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h index e38cd81..89af609 100644 --- a/python/src/pyarrow/io.h +++ b/python/src/pyarrow/io.h @@ -84,7 +84,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream { std::unique_ptr<PythonFile> file_; }; -// A zero-copy reader backed by a PyBytes object +// A zero-copy reader backed by a PyBuffer object class ARROW_EXPORT PyBytesReader : public io::BufferReader { public: explicit PyBytesReader(PyObject* obj);
