Repository: arrow Updated Branches: refs/heads/master ad0157547 -> 66f650cd3
ARROW-547: [Python] Add zero-copy slice methods to Array, RecordBatch Author: Wes McKinney <[email protected]> Closes #336 from wesm/ARROW-547 and squashes the following commits: 42037c2 [Wes McKinney] cpplint 2b91b5b [Wes McKinney] Tweak docstring 5f80d80 [Wes McKinney] Add slice methods to pyarrow.Array and RecordBatch. Fix bug in RecordBatch::Slice 20dc23f [Wes McKinney] Draft Array.slice implementation Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/66f650cd Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/66f650cd Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/66f650cd Branch: refs/heads/master Commit: 66f650cd359e13f3d5c3d4ef78d89f389d6bcecc Parents: ad01575 Author: Wes McKinney <[email protected]> Authored: Mon Feb 13 09:04:37 2017 -0500 Committer: Wes McKinney <[email protected]> Committed: Mon Feb 13 09:04:37 2017 -0500 ---------------------------------------------------------------------- cpp/src/arrow/ipc/adapter.cc | 2 +- cpp/src/arrow/table-test.cc | 2 ++ cpp/src/arrow/table.cc | 5 +++- python/pyarrow/array.pxd | 2 +- python/pyarrow/array.pyx | 42 +++++++++++++++++++++++++------ python/pyarrow/includes/libarrow.pxd | 6 +++++ python/pyarrow/scalar.pxd | 6 ++--- python/pyarrow/scalar.pyx | 7 +++--- python/pyarrow/table.pyx | 37 ++++++++++++++++++++++----- python/pyarrow/tests/test_array.py | 36 ++++++++++++++++++++++++++ python/pyarrow/tests/test_table.py | 32 +++++++++++++++++++++++ 11 files changed, 153 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/cpp/src/arrow/ipc/adapter.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index f36ff37..a24c007 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -727,7 +727,7 @@ class ArrayLoader : public TypeVisitor { RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &offsets)); } } - context_->buffer_index += type.mode == UnionMode::DENSE? 2 : 1; + context_->buffer_index += type.mode == UnionMode::DENSE ? 2 : 1; std::vector<std::shared_ptr<Array>> fields; RETURN_NOT_OK(LoadChildren(type.children(), &fields)); http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/cpp/src/arrow/table-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index e7c5d66..25f12c4 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -259,6 +259,8 @@ TEST_F(TestRecordBatch, Slice) { auto batch_slice = batch.Slice(2); auto batch_slice2 = batch.Slice(1, 5); + ASSERT_EQ(batch_slice->num_rows(), batch.num_rows() - 2); + for (int i = 0; i < batch.num_columns(); ++i) { ASSERT_EQ(2, batch_slice->column(i)->offset()); ASSERT_EQ(length - 2, batch_slice->column(i)->length()); http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/cpp/src/arrow/table.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index a9e0909..8ac06b8 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -17,6 +17,7 @@ #include "arrow/table.h" +#include <algorithm> #include <cstdlib> #include <memory> #include <sstream> @@ -70,7 +71,9 @@ std::shared_ptr<RecordBatch> RecordBatch::Slice(int32_t offset, int32_t length) for (const auto& field : columns_) { arrays.emplace_back(field->Slice(offset, length)); } - return std::make_shared<RecordBatch>(schema_, num_rows_, arrays); + + int32_t num_rows = std::min(num_rows_ - offset, length); + return std::make_shared<RecordBatch>(schema_, num_rows, arrays); } // ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/array.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index af10535..9e4d469 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -38,7 +38,7 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array) cdef getitem(self, int i) -cdef object box_arrow_array(const shared_ptr[CArray]& sp_array) +cdef object box_array(const shared_ptr[CArray]& sp_array) cdef class BooleanArray(Array): http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 9b34f56..11abf03 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -131,7 +131,7 @@ cdef class Array: check_status(pyarrow.PandasToArrow( pool, series_values, mask, c_field, &out)) - return box_arrow_array(out) + return box_array(out) @staticmethod def from_list(object list_obj, DataType type=None, MemoryPool memory_pool=None): @@ -156,7 +156,7 @@ cdef class Array: else: raise NotImplementedError() - return box_arrow_array(sp_array) + return box_array(sp_array) property null_count: @@ -201,9 +201,9 @@ cdef class Array: step = key.step or 1 if step != 1: - raise NotImplementedError + raise IndexError('only slices with step 1 supported') else: - return self.slice(start, stop) + return self.slice(start, stop - start) while key < 0: key += len(self) @@ -211,10 +211,36 @@ cdef class Array: return self.getitem(key) cdef getitem(self, int i): - return scalar.box_arrow_scalar(self.type, self.sp_array, i) + return scalar.box_scalar(self.type, self.sp_array, i) - def slice(self, start, end): - pass + def slice(self, offset=0, length=None): + """ + Compute zero-copy slice of this array + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of Array starting from + offset) + + Returns + ------- + sliced : RecordBatch + """ + cdef: + shared_ptr[CArray] result + + if offset < 0: + raise IndexError('Offset must be non-negative') + + if length is None: + result = self.ap.Slice(offset) + else: + result = self.ap.Slice(offset, length) + + return box_array(result) def to_pandas(self): """ @@ -390,7 +416,7 @@ cdef dict _array_classes = { Type_DICTIONARY: DictionaryArray } -cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): +cdef object box_array(const shared_ptr[CArray]& sp_array): if sp_array.get() == NULL: raise ValueError('Array was NULL') http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index ebfdc41..702acfb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -71,6 +71,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool Equals(const shared_ptr[CArray]& arr) c_bool IsNull(int i) + shared_ptr[CArray] Slice(int32_t offset) + shared_ptr[CArray] Slice(int32_t offset, int32_t length) + cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType): int bit_width() @@ -228,6 +231,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int num_columns() int32_t num_rows() + shared_ptr[CRecordBatch] Slice(int32_t offset) + shared_ptr[CRecordBatch] Slice(int32_t offset, int32_t length) + cdef cppclass CTable" arrow::Table": CTable(const c_string& name, const shared_ptr[CSchema]& schema, const vector[shared_ptr[CColumn]]& columns) http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/scalar.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pxd b/python/pyarrow/scalar.pxd index b068457..2d55757 100644 --- a/python/pyarrow/scalar.pxd +++ b/python/pyarrow/scalar.pxd @@ -61,6 +61,6 @@ cdef class ListValue(ArrayValue): cdef class StringValue(ArrayValue): pass -cdef object box_arrow_scalar(DataType type, - const shared_ptr[CArray]& sp_array, - int index) +cdef object box_scalar(DataType type, + const shared_ptr[CArray]& sp_array, + int index) http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 9d2b2b1..57a15ad 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -203,7 +203,7 @@ cdef class ListValue(ArrayValue): cdef getitem(self, int i): cdef int j = self.ap.value_offset(self.index) + i - return box_arrow_scalar(self.value_type, self.ap.values(), j) + return box_scalar(self.value_type, self.ap.values(), j) def as_py(self): cdef: @@ -235,9 +235,8 @@ cdef dict _scalar_classes = { Type_STRING: StringValue, } -cdef object box_arrow_scalar(DataType type, - const shared_ptr[CArray]& sp_array, - int index): +cdef object box_scalar(DataType type, const shared_ptr[CArray]& sp_array, + int index): cdef ArrayValue val if type.type.type == Type_NA: return NA http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/table.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 1707210..7d73362 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -27,7 +27,7 @@ cimport pyarrow.includes.pyarrow as pyarrow import pyarrow.config -from pyarrow.array cimport Array, box_arrow_array, wrap_array_output +from pyarrow.array cimport Array, box_array, wrap_array_output from pyarrow.error import ArrowException from pyarrow.error cimport check_status from pyarrow.schema cimport box_data_type, box_schema, Field @@ -109,8 +109,7 @@ cdef class ChunkedArray: pyarrow.array.Array """ self._check_nullptr() - return box_arrow_array(self.chunked_array.chunk(i)) - + return box_array(self.chunked_array.chunk(i)) def iterchunks(self): for i in range(self.num_chunks): @@ -387,9 +386,35 @@ cdef class RecordBatch: return self._schema def __getitem__(self, i): - cdef Array arr = Array() - arr.init(self.batch.column(i)) - return arr + return box_array(self.batch.column(i)) + + def slice(self, offset=0, length=None): + """ + Compute zero-copy slice of this RecordBatch + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : RecordBatch + """ + cdef shared_ptr[CRecordBatch] result + + if offset < 0: + raise IndexError('Offset must be non-negative') + + if length is None: + result = self.batch.Slice(offset) + else: + result = self.batch.Slice(offset, length) + + return batch_from_cbatch(result) def equals(self, RecordBatch other): cdef: http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/tests/test_array.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ead17db..d8b2e2f 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -17,6 +17,8 @@ import sys +import pytest + import pyarrow import pyarrow.formatting as fmt @@ -100,3 +102,37 @@ def test_to_pandas_zero_copy(): base_refcount = sys.getrefcount(np_arr.base) assert base_refcount == 2 np_arr.sum() + + +def test_array_slice(): + arr = pyarrow.from_pylist(range(10)) + + sliced = arr.slice(2) + expected = pyarrow.from_pylist(range(2, 10)) + assert sliced.equals(expected) + + sliced2 = arr.slice(2, 4) + expected2 = pyarrow.from_pylist(range(2, 6)) + assert sliced2.equals(expected2) + + # 0 offset + assert arr.slice(0).equals(arr) + + # Slice past end of array + assert len(arr.slice(len(arr))) == 0 + + with pytest.raises(IndexError): + arr.slice(-1) + + # Test slice notation + assert arr[2:].equals(arr.slice(2)) + + assert arr[2:5].equals(arr.slice(2, 3)) + + assert arr[-5:].equals(arr.slice(len(arr) - 5)) + + with pytest.raises(IndexError): + arr[::-1] + + with pytest.raises(IndexError): + arr[::2] http://git-wip-us.apache.org/repos/asf/arrow/blob/66f650cd/python/pyarrow/tests/test_table.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index d49b33c..67f1892 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -68,6 +68,38 @@ def test_recordbatch_basics(): ]) +def test_recordbatch_slice(): + data = [ + pa.from_pylist(range(5)), + pa.from_pylist([-10, -5, 0, 5, 10]) + ] + names = ['c0', 'c1'] + + batch = pa.RecordBatch.from_arrays(data, names) + + sliced = batch.slice(2) + + assert sliced.num_rows == 3 + + expected = pa.RecordBatch.from_arrays( + [x.slice(2) for x in data], names) + assert sliced.equals(expected) + + sliced2 = batch.slice(2, 2) + expected2 = pa.RecordBatch.from_arrays( + [x.slice(2, 2) for x in data], names) + assert sliced2.equals(expected2) + + # 0 offset + assert batch.slice(0).equals(batch) + + # Slice past end of array + assert len(batch.slice(len(batch))) == 0 + + with pytest.raises(IndexError): + batch.slice(-1) + + def test_recordbatch_from_to_pandas(): data = pd.DataFrame({ 'c1': np.array([1, 2, 3, 4, 5], dtype='int64'),
