svn commit: r20494 - in /dev/arrow/apache-arrow-0.5.0-rc1: ./ apache-arrow-0.5.0.tar.gz apache-arrow-0.5.0.tar.gz.asc apache-arrow-0.5.0.tar.gz.md5 apache-arrow-0.5.0.tar.gz.sha
Author: wesm Date: Wed Jul 19 21:24:03 2017 New Revision: 20494 Log: Apache Arrow ${version} RC${rc} Added: dev/arrow/apache-arrow-0.5.0-rc1/ dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz (with props) dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.asc dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.md5 dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.sha Added: dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz == Binary file - no diff available. Propchange: dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz -- svn:mime-type = application/octet-stream Added: dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.asc == --- dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.asc (added) +++ dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.asc Wed Jul 19 21:24:03 2017 @@ -0,0 +1,17 @@ +-BEGIN PGP SIGNATURE- +Version: GnuPG v1 + +iQIcBAABCgAGBQJZb83rAAoJEPEFiDoXNWI9M20P/0HAOCvAczOs4FeYGMzwhz8a +lfV+2XT7KaY2d0VjkWKgsBbXGfQrh3h2Kz/sSZ8EiIuOcLpr6ZgjTXnFI51jjt7w +nlYfkLeWU5kw76jLw6+VJwLuLSnDGQc8t+TAhjDla/E1RrV0t0O3FEdmklVN8EPk +rwebRsWwdceGeOdtcxEiZwn02AH1iT4wvfuIlvypNZz6oBNkrIL8revyGHeFipqj +/3P8bCGIhqNnc7awgqZfLAzEiYgLnQMhWfmxhQj8/qAf1ewpGX3uvBGDH0xwjzVo +NyXlB1MW3a6bkEYduLx6jYiiCtRhD2WLWUe+BNK19k/TV4no8KmuZEJgBusWbSyU +Uq2iyx5aJ4BnP5Ohg53nMOAh5GfQWavfCpR3Q4Riv6/b7wVLeUp7Hsj2LUKgc2jL +kK1SwXZH33BHCayJBCEBGRo2KkjzG3nBWOeAD1Ejvy0jqWztxQtEUPZ8ynoBVqJP +wJmUIZ9V/Gfd2uoRUG6BWZZwYe2v4NniKGertsln5kcAgyEX4m1d3aXmw4lDANcg +mWE8/3SgAKDnE6fZ9gOVd5TLoThd40HW4AbwFS42jhbtL1IzXgoVm7VE8WCikPEt +8GOvHJeweLh+z4BpQeg81s+YNCrlDlvSsrsSsq6MSsht1P3KYi9c1eHYHWMv4L0Z +u0NSitCfJlxYGkMX7sp/ +=COkp +-END PGP SIGNATURE- Added: dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.md5 == --- dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.md5 (added) +++ dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.md5 Wed Jul 19 21:24:03 2017 @@ -0,0 +1 @@ +apache-arrow-0.5.0.tar.gz: A6 51 0C 29 2A EE 6D 
01 44 A3 C3 27 D5 7B 08 C2 Added: dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.sha == --- dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.sha (added) +++ dev/arrow/apache-arrow-0.5.0-rc1/apache-arrow-0.5.0.tar.gz.sha Wed Jul 19 21:24:03 2017 @@ -0,0 +1 @@ +2445770d9d30e1fad48fa578e58f61c96cf4a1d9 apache-arrow-0.5.0.tar.gz
[arrow] Git Push Summary
Repository: arrow Updated Tags: refs/tags/apache-arrow-0.5.0 [created] 76e89bd03
[arrow] Git Push Summary
Repository: arrow Updated Tags: refs/tags/apache-arrow-0.5.0 [deleted] e2d00bb75
[arrow] Git Push Summary
Repository: arrow Updated Tags: refs/tags/apache-arrow-0.5.0 [created] e2d00bb75
[arrow] Git Push Summary
Repository: arrow Updated Tags: refs/tags/apache-arrow-0.5.0 [deleted] 348ac62cd
[arrow] Git Push Summary
Repository: arrow Updated Tags: refs/tags/apache-arrow-0.5.0 [created] 348ac62cd
[3/3] arrow git commit: ARROW-1167: [Python] Support chunking string columns in Table.from_pandas
ARROW-1167: [Python] Support chunking string columns in Table.from_pandas This resolves the error with converting the dataset in ARROW-1167, which only takes up 4.5 GB in memory but has a single column with over 2GB in binary data. The unit test for this is not run in CI because of large memory allocation, but can be run with ``` py.test pyarrow --large_memory ``` cc @jeffknupp Author: Wes McKinney Closes #867 from wesm/ARROW-1167 and squashes the following commits: dae62326 [Wes McKinney] cpplint dcdec91a [Wes McKinney] Support ChunkedArray outputs of Array.from_pandas 150e9fc9 [Wes McKinney] Produced ChunkedArray when exceeding 2GB in a single BinaryArray column 707555f8 [Wes McKinney] Split up pandas_convert, make PandasObjectsToArrow return ChunkedArray to accommodate large string data Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/2c5b412c Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/2c5b412c Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/2c5b412c Branch: refs/heads/master Commit: 2c5b412c2866b6561d35ba3399036c22b646d699 Parents: 6999dbd Author: Wes McKinney Authored: Wed Jul 19 08:16:25 2017 -0400 Committer: Wes McKinney Committed: Wed Jul 19 08:16:25 2017 -0400 -- cpp/src/arrow/builder.h |6 + cpp/src/arrow/ipc/feather.cc|3 +- cpp/src/arrow/python/CMakeLists.txt |6 +- cpp/src/arrow/python/api.h |3 +- cpp/src/arrow/python/arrow_to_pandas.cc | 1627 ++ cpp/src/arrow/python/arrow_to_pandas.h | 67 + cpp/src/arrow/python/pandas_convert.cc | 2609 -- cpp/src/arrow/python/pandas_convert.h | 77 - cpp/src/arrow/python/pandas_to_arrow.cc | 1099 + cpp/src/arrow/python/pandas_to_arrow.h | 58 + cpp/src/arrow/python/python-test.cc |2 +- cpp/src/arrow/table.h |3 + python/pyarrow/array.pxi| 11 +- python/pyarrow/includes/libarrow.pxd|3 +- python/pyarrow/parquet.py | 21 +- python/pyarrow/public-api.pxi | 15 + python/pyarrow/table.pxi| 38 +- python/pyarrow/tests/conftest.py|5 +- 
python/pyarrow/tests/test_convert_pandas.py | 14 + 19 files changed, 2949 insertions(+), 2718 deletions(-) -- http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/builder.h -- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 6b54c9f..065e115 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -585,6 +585,9 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { Status Resize(int64_t capacity) override; Status Finish(std::shared_ptr* out) override; + /// \return size of values buffer so far + int64_t value_data_length() const { return value_data_builder_.length(); } + /// Temporary access to a value. /// /// This pointer becomes invalid on the next modifying operation. @@ -632,6 +635,9 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Resize(int64_t capacity) override; Status Finish(std::shared_ptr* out) override; + /// \return size of values buffer so far + int64_t value_data_length() const { return byte_builder_.length(); } + protected: int32_t byte_width_; BufferBuilder byte_builder_; http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/ipc/feather.cc -- diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index d5b938b..61b96e0 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -497,8 +497,7 @@ fbs::Type ToFlatbufferType(Type::type type) { return fbs::Type_MIN; } -static Status SanitizeUnsupportedTypes( -const Array& values, std::shared_ptr* out) { +static Status SanitizeUnsupportedTypes(const Array& values, std::shared_ptr* out) { if (values.type_id() == Type::NA) { // As long as R doesn't support NA, we write this as a StringColumn // to ensure stable roundtrips. 
http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/CMakeLists.txt -- diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index d5e980b..0fdf81e 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -42,6 +42,7 @@ set(ARROW_PYTHON_TEST_LINK_LIBS ${ARROW_PYTHON_MIN_TEST_LIBS}) #
[2/3] arrow git commit: ARROW-1167: [Python] Support chunking string columns in Table.from_pandas
http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/pandas_convert.cc -- diff --git a/cpp/src/arrow/python/pandas_convert.cc b/cpp/src/arrow/python/pandas_convert.cc deleted file mode 100644 index 282b3a9..000 --- a/cpp/src/arrow/python/pandas_convert.cc +++ /dev/null @@ -1,2609 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -// Functions for pandas conversion via NumPy - -#include "arrow/python/numpy_interop.h" - -#include "arrow/python/pandas_convert.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type_fwd.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/decimal.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/visitor_inline.h" - -#include "arrow/python/builtin_convert.h" -#include "arrow/python/common.h" -#include "arrow/python/config.h" -#include "arrow/python/helpers.h" -#include "arrow/python/numpy-internal.h" -#include "arrow/python/numpy_convert.h" -#include "arrow/python/type_traits.h" -#include "arrow/python/util/datetime.h" - -namespace arrow { -namespace py { - -// -- -// Utility code - -static inline bool PyFloat_isnan(const PyObject* obj) { - if (PyFloat_Check(obj)) { -double val = PyFloat_AS_DOUBLE(obj); -return val != val; - } else { -return false; - } -} -static inline bool PandasObjectIsNull(const PyObject* obj) { - return obj == Py_None || obj == numpy_nan || PyFloat_isnan(obj); -} - -static inline bool PyObject_is_string(const PyObject* obj) { -#if PY_MAJOR_VERSION >= 3 - return PyUnicode_Check(obj) || PyBytes_Check(obj); -#else - return PyString_Check(obj) || PyUnicode_Check(obj); -#endif -} - -static inline bool PyObject_is_float(const PyObject* obj) { - return PyFloat_Check(obj); -} - -static inline bool PyObject_is_integer(const PyObject* obj) { - return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); -} - -template -static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { - typedef npy_traits traits; - typedef typename traits::value_type T; - - int64_t null_count = 0; - - Ndarray1DIndexer values(arr); - - // TODO(wesm): striding - for (int i = 0; i < values.size(); ++i) { -if (traits::isnull(values[i])) 
{ - ++null_count; -} else { - BitUtil::SetBit(bitmap, i); -} - } - - return null_count; -} - -// Returns null count -static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { - int64_t null_count = 0; - - Ndarray1DIndexer mask_values(mask); - for (int i = 0; i < length; ++i) { -if (mask_values[i]) { - ++null_count; -} else { - BitUtil::SetBit(bitmap, i); -} - } - return null_count; -} - -template -static int64_t ValuesToValidBytes( -const void* data, int64_t length, uint8_t* valid_bytes) { - typedef npy_traits traits; - typedef typename traits::value_type T; - - int64_t null_count = 0; - const T* values = reinterpret_cast(data); - - // TODO(wesm): striding - for (int i = 0; i < length; ++i) { -valid_bytes[i] = !traits::isnull(values[i]); -if (traits::isnull(values[i])) null_count++; - } - - return null_count; -} - -Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) { - if (PyArray_NDIM(numpy_array) != 1) { -return Status::Invalid("only handle 1-dimensional arrays"); - } - - if (PyArray_DESCR(numpy_array)->type_num != np_type) { -return Status::Invalid("can only handle exact conversions"); - } - - npy_intp* astrides = PyArray_STRIDES(numpy_array); - if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) { -return Status::Invalid("No support for strided arrays in lists yet"); - } - return Status::OK(); -} - -static Status AppendObjectStrings( -PyArrayObject* arr, PyArrayObject* mask, StringBuilder* builder, bool* have_bytes) { - PyObject* obj; - - Ndarray1DIndexerobjects(arr); - Ndarray1DIndexer
[1/3] arrow git commit: ARROW-1167: [Python] Support chunking string columns in Table.from_pandas
Repository: arrow Updated Branches: refs/heads/master 6999dbd1e -> 2c5b412c2 http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/pandas_convert.h -- diff --git a/cpp/src/arrow/python/pandas_convert.h b/cpp/src/arrow/python/pandas_convert.h deleted file mode 100644 index 45c8a1a..000 --- a/cpp/src/arrow/python/pandas_convert.h +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Functions for converting between pandas's NumPy-based data representation -// and Arrow data structures - -#ifndef ARROW_PYTHON_ADAPTERS_PANDAS_H -#define ARROW_PYTHON_ADAPTERS_PANDAS_H - -#include "arrow/python/platform.h" - -#include -#include - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class Column; -class DataType; -class MemoryPool; -class Status; -class Table; - -namespace py { - -ARROW_EXPORT -Status ConvertArrayToPandas( -const std::shared_ptr& arr, PyObject* py_ref, PyObject** out); - -ARROW_EXPORT -Status ConvertColumnToPandas( -const std::shared_ptr& col, PyObject* py_ref, PyObject** out); - -struct PandasOptions { - bool strings_to_categorical; -}; - -// Convert a whole table as efficiently as possible to a pandas.DataFrame. 
-// -// The returned Python object is a list of tuples consisting of the exact 2D -// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. -// -// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) -ARROW_EXPORT -Status ConvertTableToPandas( -const std::shared_ptr& table, int nthreads, PyObject** out); - -ARROW_EXPORT -Status PandasToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, -const std::shared_ptr& type, std::shared_ptr* out); - -/// Convert dtype=object arrays. If target data type is not known, pass a type -/// with nullptr -ARROW_EXPORT -Status PandasObjectsToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, -const std::shared_ptr& type, std::shared_ptr* out); - -} // namespace py -} // namespace arrow - -#endif // ARROW_PYTHON_ADAPTERS_PANDAS_H http://git-wip-us.apache.org/repos/asf/arrow/blob/2c5b412c/cpp/src/arrow/python/pandas_to_arrow.cc -- diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc new file mode 100644 index 000..1368c36 --- /dev/null +++ b/cpp/src/arrow/python/pandas_to_arrow.cc @@ -0,0 +1,1099 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Functions for pandas conversion via NumPy + +#include "arrow/python/numpy_interop.h" + +#include "arrow/python/pandas_to_arrow.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/visitor_inline.h" + +#include "arrow/python/builtin_convert.h" +#include "arrow/python/common.h" +#include "arrow/python/config.h" +#include "arrow/python/helpers.h" +#include "arrow/python/numpy-internal.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/type_traits.h" +#include "arrow/python/util/datetime.h" + +namespace arrow { +namespace py { + +//