Repository: arrow Updated Branches: refs/heads/master 852ee4fbf -> c7839e9fa
ARROW-1017: [Python] Fix memory leaks in conversion to pandas.DataFrame Notes: * `PyList_Append` increments ref count, so new objects must be DECREF'd after being inserted * `PyArray_SimpleNewFromDescr` does not set `NPY_ARRAY_OWNDATA`, neither does `NewFromDescr` Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #685 from wesm/ARROW-1017 and squashes the following commits: 8459123 [Wes McKinney] Fix memory leak caused by list append ref count, lack of setting NPY_ARRAY_OWNDATA Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c7839e9f Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c7839e9f Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c7839e9f Branch: refs/heads/master Commit: c7839e9fab91e62cced9367f23e561afb6728652 Parents: 852ee4f Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Sun May 14 14:35:28 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Sun May 14 14:35:28 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/python/common.h | 7 +++++- cpp/src/arrow/python/pandas_convert.cc | 25 ++++++++++----------- python/scripts/test_leak.py | 35 +++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/c7839e9f/cpp/src/arrow/python/common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h index f6e706b..ec40d0e 100644 --- a/cpp/src/arrow/python/common.h +++ b/cpp/src/arrow/python/common.h @@ -69,7 +69,7 @@ class ARROW_EXPORT OwnedRef { ~OwnedRef() { PyAcquireGIL lock; - Py_XDECREF(obj_); + release(); } void reset(PyObject* obj) { @@ -80,6 +80,11 @@ class ARROW_EXPORT OwnedRef { obj_ = obj; } + void release() { + Py_XDECREF(obj_); + obj_ = nullptr; + } + PyObject* obj() const { return obj_; } private: http://git-wip-us.apache.org/repos/asf/arrow/blob/c7839e9f/cpp/src/arrow/python/pandas_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc index 264bed1..b6fb05e 100644 --- a/cpp/src/arrow/python/pandas_convert.cc +++ b/cpp/src/arrow/python/pandas_convert.cc @@ -1023,7 +1023,8 @@ static inline PyObject* NewArray1DFromType( } set_numpy_metadata(type, arrow_type, descr); - return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, 0, nullptr); + return PyArray_NewFromDescr(&PyArray_Type, descr, 1, dims, nullptr, data, + NPY_ARRAY_OWNDATA | NPY_ARRAY_CARRAY, nullptr); } class PandasBlock { @@ -1078,12 +1079,10 @@ class PandasBlock { PyObject* block_arr; if (ndim == 2) { npy_intp block_dims[2] = {num_columns_, num_rows_}; - block_arr = PyArray_NewFromDescr( - &PyArray_Type, descr, 2, block_dims, nullptr, nullptr, 0, nullptr); + block_arr = PyArray_SimpleNewFromDescr(2, block_dims, descr); } else { npy_intp block_dims[1] = {num_rows_}; - block_arr = PyArray_NewFromDescr( - &PyArray_Type, descr, 1, block_dims, nullptr, nullptr, 0, nullptr); + block_arr = PyArray_SimpleNewFromDescr(1, block_dims, descr); } if (block_arr == NULL) { @@ -1091,6 +1090,8 @@ class PandasBlock { return Status::OK(); } + PyArray_ENABLEFLAGS(reinterpret_cast<PyArrayObject*>(block_arr), NPY_ARRAY_OWNDATA); + npy_intp placement_dims[1] = {num_columns_}; PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); if (placement_arr == NULL) { @@ -1357,8 +1358,6 @@ inline void ConvertDatetimeNanos(const ChunkedArray& data, int64_t* out_values) class ObjectBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - virtual ~ObjectBlock() {} - Status Allocate() override { return AllocateNDArray(NPY_OBJECT); } Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement, @@ -1416,7 +1415,6 @@ template <int ARROW_TYPE, typename C_TYPE> class IntBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(arrow_traits<ARROW_TYPE>::npy_type); } @@ -1450,7 +1448,6 @@ using Int64Block = IntBlock<Type::INT64, int64_t>; class Float32Block : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(NPY_FLOAT32); } Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement, @@ -1470,7 +1467,6 @@ class Float32Block : public PandasBlock { class Float64Block : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(NPY_FLOAT64); } Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement, @@ -1523,7 +1519,6 @@ class Float64Block : public PandasBlock { class BoolBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status Allocate() override { return AllocateNDArray(NPY_BOOL); } Status Write(const std::shared_ptr<Column>& col, int64_t abs_placement, @@ -1544,7 +1539,6 @@ class BoolBlock : public PandasBlock { class DatetimeBlock : public PandasBlock { public: using PandasBlock::PandasBlock; - Status AllocateDatetime(int ndim) { RETURN_NOT_OK(AllocateNDArray(NPY_DATETIME, ndim)); @@ -1629,7 +1623,6 @@ template <int ARROW_INDEX_TYPE> class CategoricalBlock : public PandasBlock { public: explicit CategoricalBlock(int64_t num_rows) : PandasBlock(num_rows, 1) {} - Status Allocate() override { constexpr int npy_type = arrow_traits<ARROW_INDEX_TYPE>::npy_type; @@ -1960,6 +1953,9 @@ class DataFrameBlockCreator { PyObject* item; RETURN_NOT_OK(it.second->GetPyResult(&item)); if (PyList_Append(list, item) < 0) { RETURN_IF_PYERROR(); } + + // ARROW-1017; PyList_Append increments object refcount + Py_DECREF(item); } return Status::OK(); } @@ -2045,6 +2041,9 @@ class ArrowDeserializer { // Arrow data is immutable. PyArray_CLEARFLAGS(arr_, NPY_ARRAY_WRITEABLE); + // Arrow data is owned by another + PyArray_CLEARFLAGS(arr_, NPY_ARRAY_OWNDATA); + return Status::OK(); } http://git-wip-us.apache.org/repos/asf/arrow/blob/c7839e9f/python/scripts/test_leak.py ---------------------------------------------------------------------- diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py new file mode 100644 index 0000000..2b197b6 --- /dev/null +++ b/python/scripts/test_leak.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +import numpy as np +import memory_profiler +import gc + + +def leak(): + data = [pa.array(np.concatenate([np.random.randn(100000)] * 1000))] + table = pa.Table.from_arrays(data, ['foo']) + while True: + print('calling to_pandas') + print('memory_usage: {0}'.format(memory_profiler.memory_usage())) + table.to_pandas() + gc.collect() + +leak()