Repository: arrow Updated Branches: refs/heads/master 4b72329fe -> 72f80d450
ARROW-409: [Python] Change record batches conversion to Table From discussion in ARROW-369, it is more consistent and flexible for the pyarrow.Table API to convert a RecordBatch list first to a Table, and then the Table to a pandas.DataFrame. For example: ``` table = pa.Table.from_batches(batches) df = table.to_pandas() ``` Also updated conversion to print schemas in exception message if not equal. Author: Bryan Cutler <[email protected]> Closes #229 from BryanCutler/pyarrow-table-from_batches-ARROW-409 and squashes the following commits: f5751e0 [Bryan Cutler] fixed schema check to print out if not equal 72ea875 [Bryan Cutler] changed batches conversion to Table instead Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/72f80d45 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/72f80d45 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/72f80d45 Branch: refs/heads/master Commit: 72f80d450e0e8e20812fd80571b0c1d18e88114a Parents: 4b72329 Author: Bryan Cutler <[email protected]> Authored: Wed Dec 7 15:00:18 2016 +0100 Committer: Uwe L. 
Korn <[email protected]> Committed: Wed Dec 7 15:00:18 2016 +0100 ---------------------------------------------------------------------- python/pyarrow/__init__.py | 3 +- python/pyarrow/table.pyx | 94 +++++++++++++++++---------------- python/pyarrow/tests/test_table.py | 5 +- 3 files changed, 52 insertions(+), 50 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index f366317..5af93fb 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -49,5 +49,4 @@ from pyarrow.schema import (null, bool_, list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.table import (Column, RecordBatch, dataframe_from_batches, Table, - from_pandas_dataframe) +from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/table.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 45cf7be..0a9805c 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -415,52 +415,6 @@ cdef class RecordBatch: return result -def dataframe_from_batches(batches): - """ - Convert a list of Arrow RecordBatches to a pandas.DataFrame - - Parameters - ---------- - - batches: list of RecordBatch - RecordBatch list to be converted, schemas must be equal - """ - - cdef: - vector[shared_ptr[CArray]] c_array_chunks - vector[shared_ptr[CColumn]] c_columns - shared_ptr[CTable] c_table - Array arr - Schema schema - - import pandas as pd - - schema = batches[0].schema - - # check schemas are equal - if any((not schema.equals(other.schema) for other in batches[1:])): - raise ArrowException("Error converting list of RecordBatches to " - 
"DataFrame, not all schemas are equal") - - cdef int K = batches[0].num_columns - - # create chunked columns from the batches - c_columns.resize(K) - for i in range(K): - for batch in batches: - arr = batch[i] - c_array_chunks.push_back(arr.sp_array) - c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i), - c_array_chunks)) - c_array_chunks.clear() - - # create a Table from columns and convert to DataFrame - c_table.reset(new CTable('', schema.sp_schema, c_columns)) - table = Table() - table.init(c_table) - return table.to_pandas() - - cdef class Table: """ A collection of top-level named, equal length Arrow arrays. @@ -567,6 +521,54 @@ cdef class Table: return result + @staticmethod + def from_batches(batches): + """ + Construct a Table from a list of Arrow RecordBatches + + Parameters + ---------- + + batches: list of RecordBatch + RecordBatch list to be converted, schemas must be equal + """ + + cdef: + vector[shared_ptr[CArray]] c_array_chunks + vector[shared_ptr[CColumn]] c_columns + shared_ptr[CTable] c_table + Array arr + Schema schema + + import pandas as pd + + schema = batches[0].schema + + # check schemas are equal + for other in batches[1:]: + if not schema.equals(other.schema): + raise ArrowException("Error converting list of RecordBatches " + "to DataFrame, not all schemas are equal: {%s} != {%s}" + % (str(schema), str(other.schema))) + + cdef int K = batches[0].num_columns + + # create chunked columns from the batches + c_columns.resize(K) + for i in range(K): + for batch in batches: + arr = batch[i] + c_array_chunks.push_back(arr.sp_array) + c_columns[i].reset(new CColumn(schema.sp_schema.get().field(i), + c_array_chunks)) + c_array_chunks.clear() + + # create a Table from columns and convert to DataFrame + c_table.reset(new CTable('', schema.sp_schema, c_columns)) + table = Table() + table.init(c_table) + return table + def to_pandas(self): """ Convert the arrow::Table to a pandas DataFrame 
http://git-wip-us.apache.org/repos/asf/arrow/blob/72f80d45/python/pyarrow/tests/test_table.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index dc4f37a..2546314 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -69,7 +69,8 @@ def test_recordbatchlist_to_pandas(): batch1 = pa.RecordBatch.from_pandas(data1) batch2 = pa.RecordBatch.from_pandas(data2) - result = pa.dataframe_from_batches([batch1, batch2]) + table = pa.Table.from_batches([batch1, batch2]) + result = table.to_pandas() data = pd.concat([data1, data2], ignore_index=True) assert_frame_equal(data, result) @@ -82,7 +83,7 @@ def test_recordbatchlist_schema_equals(): batch2 = pa.RecordBatch.from_pandas(data2) with pytest.raises(pa.ArrowException): - pa.dataframe_from_batches([batch1, batch2]) + pa.Table.from_batches([batch1, batch2]) def test_table_basics():
