This is an automated email from the ASF dual-hosted git repository. uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new c30a7e3 ARROW-1732: [Python] Permit creating record batches with no columns, test pandas roundtrips c30a7e3 is described below commit c30a7e30af2469dde1a00f74d8ba9631887825c4 Author: Wes McKinney <wes.mckin...@twosigma.com> AuthorDate: Thu Oct 26 14:13:50 2017 +0200 ARROW-1732: [Python] Permit creating record batches with no columns, test pandas roundtrips I ran into this rough edge today, invariably serialization code paths will need to send across a DataFrame with no columns, this will need to work even if `preserve_index=False` Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #1252 from wesm/ARROW-1732 and squashes the following commits: a240c05 [Wes McKinney] Permit creating record batches with no columns, test pandas roundtrips --- python/pyarrow/table.pxi | 20 ++++++++++---------- python/pyarrow/tests/test_convert_pandas.py | 25 ++++++++++++++++++------- python/pyarrow/tests/test_table.py | 8 ++++++++ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 694fe91..eb19115 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -308,8 +308,8 @@ cdef shared_ptr[const CKeyValueMetadata] unbox_metadata(dict metadata): make_shared[CKeyValueMetadata](unordered_metadata)) -cdef int _schema_from_arrays( - arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1: +cdef _schema_from_arrays(arrays, names, dict metadata, + shared_ptr[CSchema]* schema): cdef: Column col c_string c_name @@ -317,10 +317,11 @@ cdef int _schema_from_arrays( shared_ptr[CDataType] type_ Py_ssize_t K = len(arrays) - fields.resize(K) + if K == 0: + schema.reset(new CSchema(fields, unbox_metadata(metadata))) + return - if not K: - raise ValueError('Must pass at least one array') + fields.resize(K) if isinstance(arrays[0], Column): for i in range(K): @@ -346,7 +347,6 @@ cdef int _schema_from_arrays( fields[i].reset(new 
CField(c_name, type_, True)) schema.reset(new CSchema(fields, unbox_metadata(metadata))) - return 0 cdef class RecordBatch: @@ -613,10 +613,10 @@ cdef class RecordBatch: int64_t i int64_t number_of_arrays = len(arrays) - if not number_of_arrays: - raise ValueError('Record batch cannot contain no arrays (for now)') - - num_rows = len(arrays[0]) + if len(arrays) > 0: + num_rows = len(arrays[0]) + else: + num_rows = 0 _schema_from_arrays(arrays, names, metadata, &schema) c_arrays.reserve(len(arrays)) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 527466e..6d146f9 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -71,11 +71,11 @@ class TestPandasConversion(object): def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, expected_schema=None, check_dtype=True, schema=None, - check_index=False, + preserve_index=False, as_batch=False): klass = pa.RecordBatch if as_batch else pa.Table table = klass.from_pandas(df, schema=schema, - preserve_index=check_index, + preserve_index=preserve_index, nthreads=nthreads) result = table.to_pandas(nthreads=nthreads) @@ -83,7 +83,9 @@ class TestPandasConversion(object): assert table.schema.equals(expected_schema) if expected is None: expected = df - tm.assert_frame_equal(result, expected, check_dtype=check_dtype) + tm.assert_frame_equal(result, expected, check_dtype=check_dtype, + check_index_type=('equiv' if preserve_index + else False)) def _check_series_roundtrip(self, s, type_=None): arr = pa.array(s, from_pandas=True, type=type_) @@ -131,14 +133,14 @@ class TestPandasConversion(object): def test_column_index_names_are_preserved(self): df = pd.DataFrame({'data': [1, 2, 3]}) df.columns.names = ['a'] - self._check_pandas_roundtrip(df, check_index=True) + self._check_pandas_roundtrip(df, preserve_index=True) def test_multiindex_columns(self): columns = pd.MultiIndex.from_arrays([ ['one', 'two'], ['X', 'Y'] 
]) df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) - self._check_pandas_roundtrip(df, check_index=True) + self._check_pandas_roundtrip(df, preserve_index=True) def test_multiindex_columns_with_dtypes(self): columns = pd.MultiIndex.from_arrays( @@ -149,11 +151,11 @@ class TestPandasConversion(object): names=['level_1', 'level_2'], ) df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns) - self._check_pandas_roundtrip(df, check_index=True) + self._check_pandas_roundtrip(df, preserve_index=True) def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')]) - self._check_pandas_roundtrip(df, check_index=True) + self._check_pandas_roundtrip(df, preserve_index=True) def test_categorical_column_index(self): # I *really* hope no one uses category dtypes for single level column @@ -1095,6 +1097,15 @@ class TestPandasConversion(object): expected = pd.DataFrame({'strings': pd.Categorical(values)}) tm.assert_frame_equal(result, expected, check_dtype=True) + def test_table_batch_empty_dataframe(self): + df = pd.DataFrame({}) + self._check_pandas_roundtrip(df) + self._check_pandas_roundtrip(df, as_batch=True) + + df2 = pd.DataFrame({}, index=[0, 1, 2]) + self._check_pandas_roundtrip(df2, preserve_index=True) + self._check_pandas_roundtrip(df2, as_batch=True, preserve_index=True) + def test_array_from_pandas_date_with_mask(self): m = np.array([True, False, True]) data = pd.Series([ diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 50190f5..4282224 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -82,6 +82,14 @@ def test_recordbatch_basics(): batch[2] +def test_recordbatch_no_fields(): + batch = pa.RecordBatch.from_arrays([], []) + + assert len(batch) == 0 + assert batch.num_rows == 0 + assert batch.num_columns == 0 + + def test_recordbatch_from_arrays_invalid_names(): data = [ pa.array(range(5)), -- To stop receiving notification emails like 
this one, please contact commits@arrow.apache.org.