Repository: arrow Updated Branches: refs/heads/master 543e50814 -> 7d3e2a3ab
ARROW-421: [Python] Retain parent reference in PyBytesReader Pass Buffer to BufferReader so that zero-copy slices retain reference to PyBytesBuffer, which prevents the bytes object from being garbage collected prematurely. Also added some helper tools for inspecting Arrow Buffer objects in Python. Close #278 Author: Wes McKinney <[email protected]> Closes #279 from wesm/ARROW-421 and squashes the following commits: acf730e [Wes McKinney] Rename method 50c195a [Wes McKinney] Fix accidental typo ef20185 [Wes McKinney] Pass Buffer to BufferReader so that zero-copy slices retain reference to PyBytesBuffer, which prevents the bytes object from being garbage collected prematurely Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7d3e2a3a Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7d3e2a3a Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7d3e2a3a Branch: refs/heads/master Commit: 7d3e2a3ab90324625b738e464a020758379f457a Parents: 543e508 Author: Wes McKinney <[email protected]> Authored: Wed Jan 11 09:33:29 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Wed Jan 11 09:33:29 2017 -0500 ---------------------------------------------------------------------- cpp/src/arrow/io/memory.h | 2 ++ python/pyarrow/_parquet.pxd | 2 +- python/pyarrow/_parquet.pyx | 8 +++--- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/io.pyx | 46 +++++++++++++++++++++++++++++-- python/pyarrow/tests/test_io.py | 14 ++++++++++ python/src/pyarrow/io.cc | 10 ++----- python/src/pyarrow/io.h | 5 ++-- 8 files changed, 69 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/cpp/src/arrow/io/memory.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 8428a12..2d3df42 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -79,6 +79,8 @@ class ARROW_EXPORT BufferReader : public ReadableFileInterface { bool supports_zero_copy() const override; + std::shared_ptr<Buffer> buffer() const { return buffer_; } + private: std::shared_ptr<Buffer> buffer_; const uint8_t* data_; http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/_parquet.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index faca845..7e49e9e 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -156,7 +156,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: int num_columns() int64_t num_rows() int num_row_groups() - int32_t version() + ParquetVersion version() const c_string created_by() int num_schema_elements() http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/_parquet.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index c0dc3eb..30e3de4 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -138,11 +138,11 @@ cdef class FileMetaData: property format_version: def __get__(self): - cdef int version = self.metadata.version() - if version == 2: - return '2.0' - elif version == 1: + cdef ParquetVersion version = self.metadata.version() + if version == ParquetVersion_V1: return '1.0' + if version == ParquetVersion_V2: + return '2.0' else: print('Unrecognized file version, assuming 1.0: {0}' .format(version)) http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b0f971d..d1970e5 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -66,6 +66,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CBuffer" arrow::Buffer": uint8_t* data() int64_t size() + shared_ptr[CBuffer] parent() cdef cppclass ResizableBuffer(CBuffer): CStatus Resize(int64_t nbytes) http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/io.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index cab6ccb..b62de6c 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -123,11 +123,17 @@ cdef class NativeFile: with nogil: check_status(self.wr_file.get().Write(buf, bufsize)) - def read(self, int64_t nbytes): + def read(self, nbytes=None): cdef: + int64_t c_nbytes int64_t bytes_read = 0 PyObject* obj + if nbytes is None: + c_nbytes = self.size() - self.tell() + else: + c_nbytes = nbytes + self._assert_readable() # Allocate empty write space @@ -135,17 +141,35 @@ cdef class NativeFile: cdef uint8_t* buf = <uint8_t*> cp.PyBytes_AS_STRING(<object> obj) with nogil: - check_status(self.rd_file.get().Read(nbytes, &bytes_read, buf)) + check_status(self.rd_file.get().Read(c_nbytes, &bytes_read, buf)) - if bytes_read < nbytes: + if bytes_read < c_nbytes: cp._PyBytes_Resize(&obj, <Py_ssize_t> bytes_read) return PyObject_to_object(obj) + def read_buffer(self, nbytes=None): + cdef: + int64_t c_nbytes + int64_t bytes_read = 0 + shared_ptr[CBuffer] output + self._assert_readable() + + if nbytes is None: + c_nbytes = self.size() - self.tell() + else: + c_nbytes = nbytes + + with nogil: + check_status(self.rd_file.get().ReadB(c_nbytes, &output)) + + return wrap_buffer(output) + # ---------------------------------------------------------------------- # Python file-like objects + cdef class PythonFileInterface(NativeFile): cdef: object handle @@ -199,6 +223,16 @@ cdef class Buffer: def __get__(self): return self.buffer.get().size() + property parent: + + def __get__(self): + cdef shared_ptr[CBuffer] parent_buf = self.buffer.get().parent() + + if parent_buf.get() == NULL: + return None + else: + return wrap_buffer(parent_buf) + def __getitem__(self, key): # TODO(wesm): buffer slicing raise NotImplementedError @@ -209,6 +243,12 @@ cdef class Buffer: self.buffer.get().size()) +cdef wrap_buffer(const shared_ptr[CBuffer]& buffer): + cdef Buffer result = Buffer() + result.buffer = buffer + return result + + cdef shared_ptr[PoolBuffer] allocate_buffer(): cdef shared_ptr[PoolBuffer] result result.reset(new PoolBuffer(pyarrow.get_memory_pool())) http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/pyarrow/tests/test_io.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index c10ed03..3e7a437 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -102,6 +102,20 @@ def test_bytes_reader_non_bytes(): io.BytesReader(u('some sample data')) +def test_bytes_reader_retains_parent_reference(): + import gc + + # ARROW-421 + def get_buffer(): + data = b'some sample data' * 1000 + reader = io.BytesReader(data) + reader.seek(5) + return reader.read_buffer(6) + + buf = get_buffer() + gc.collect() + assert buf.to_pybytes() == b'sample' + assert buf.parent is not None # ---------------------------------------------------------------------- # Buffers http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/src/pyarrow/io.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index ac1aa63..01f851d 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -203,14 +203,8 @@ Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { // A readable file that is backed by a PyBytes PyBytesReader::PyBytesReader(PyObject* obj) - : arrow::io::BufferReader(reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), - PyBytes_GET_SIZE(obj)), - obj_(obj) { - Py_INCREF(obj_); -} + : arrow::io::BufferReader(std::make_shared<PyBytesBuffer>(obj)) {} -PyBytesReader::~PyBytesReader() { - Py_DECREF(obj_); -} +PyBytesReader::~PyBytesReader() {} } // namespace pyarrow http://git-wip-us.apache.org/repos/asf/arrow/blob/7d3e2a3a/python/src/pyarrow/io.h ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h index fd3e7c0..4cb010f 100644 --- a/python/src/pyarrow/io.h +++ b/python/src/pyarrow/io.h @@ -22,6 +22,8 @@ #include "arrow/io/memory.h" #include "pyarrow/config.h" + +#include "pyarrow/common.h" #include "pyarrow/visibility.h" namespace arrow { @@ -87,9 +89,6 @@ class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader { public: explicit PyBytesReader(PyObject* obj); virtual ~PyBytesReader(); - - private: - PyObject* obj_; }; // TODO(wesm): seekable output files
