[
https://issues.apache.org/jira/browse/ARROW-1732?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16220363#comment-16220363
]
ASF GitHub Bot commented on ARROW-1732:
---------------------------------------
xhochy closed pull request #1252: ARROW-1732: [Python] Permit creating record
batches with no columns, test pandas roundtrips
URL: https://github.com/apache/arrow/pull/1252
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 694fe9190..eb1911592 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -308,8 +308,8 @@ cdef shared_ptr[const CKeyValueMetadata]
unbox_metadata(dict metadata):
make_shared[CKeyValueMetadata](unordered_metadata))
-cdef int _schema_from_arrays(
- arrays, names, dict metadata, shared_ptr[CSchema]* schema) except -1:
+cdef _schema_from_arrays(arrays, names, dict metadata,
+ shared_ptr[CSchema]* schema):
cdef:
Column col
c_string c_name
@@ -317,10 +317,11 @@ cdef int _schema_from_arrays(
shared_ptr[CDataType] type_
Py_ssize_t K = len(arrays)
- fields.resize(K)
+ if K == 0:
+ schema.reset(new CSchema(fields, unbox_metadata(metadata)))
+ return
- if not K:
- raise ValueError('Must pass at least one array')
+ fields.resize(K)
if isinstance(arrays[0], Column):
for i in range(K):
@@ -346,7 +347,6 @@ cdef int _schema_from_arrays(
fields[i].reset(new CField(c_name, type_, True))
schema.reset(new CSchema(fields, unbox_metadata(metadata)))
- return 0
cdef class RecordBatch:
@@ -613,10 +613,10 @@ cdef class RecordBatch:
int64_t i
int64_t number_of_arrays = len(arrays)
- if not number_of_arrays:
- raise ValueError('Record batch cannot contain no arrays (for now)')
-
- num_rows = len(arrays[0])
+ if len(arrays) > 0:
+ num_rows = len(arrays[0])
+ else:
+ num_rows = 0
_schema_from_arrays(arrays, names, metadata, &schema)
c_arrays.reserve(len(arrays))
diff --git a/python/pyarrow/tests/test_convert_pandas.py
b/python/pyarrow/tests/test_convert_pandas.py
index 527466e6e..6d146f977 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -71,11 +71,11 @@ def tearDown(self):
def _check_pandas_roundtrip(self, df, expected=None, nthreads=1,
expected_schema=None,
check_dtype=True, schema=None,
- check_index=False,
+ preserve_index=False,
as_batch=False):
klass = pa.RecordBatch if as_batch else pa.Table
table = klass.from_pandas(df, schema=schema,
- preserve_index=check_index,
+ preserve_index=preserve_index,
nthreads=nthreads)
result = table.to_pandas(nthreads=nthreads)
@@ -83,7 +83,9 @@ def _check_pandas_roundtrip(self, df, expected=None,
nthreads=1,
assert table.schema.equals(expected_schema)
if expected is None:
expected = df
- tm.assert_frame_equal(result, expected, check_dtype=check_dtype)
+ tm.assert_frame_equal(result, expected, check_dtype=check_dtype,
+ check_index_type=('equiv' if preserve_index
+ else False))
def _check_series_roundtrip(self, s, type_=None):
arr = pa.array(s, from_pandas=True, type=type_)
@@ -131,14 +133,14 @@ def test_non_string_columns(self):
def test_column_index_names_are_preserved(self):
df = pd.DataFrame({'data': [1, 2, 3]})
df.columns.names = ['a']
- self._check_pandas_roundtrip(df, check_index=True)
+ self._check_pandas_roundtrip(df, preserve_index=True)
def test_multiindex_columns(self):
columns = pd.MultiIndex.from_arrays([
['one', 'two'], ['X', 'Y']
])
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
- self._check_pandas_roundtrip(df, check_index=True)
+ self._check_pandas_roundtrip(df, preserve_index=True)
def test_multiindex_columns_with_dtypes(self):
columns = pd.MultiIndex.from_arrays(
@@ -149,11 +151,11 @@ def test_multiindex_columns_with_dtypes(self):
names=['level_1', 'level_2'],
)
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')], columns=columns)
- self._check_pandas_roundtrip(df, check_index=True)
+ self._check_pandas_roundtrip(df, preserve_index=True)
def test_integer_index_column(self):
df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])
- self._check_pandas_roundtrip(df, check_index=True)
+ self._check_pandas_roundtrip(df, preserve_index=True)
def test_categorical_column_index(self):
# I *really* hope no one uses category dtypes for single level column
@@ -1095,6 +1097,15 @@ def test_table_str_to_categorical(self):
expected = pd.DataFrame({'strings': pd.Categorical(values)})
tm.assert_frame_equal(result, expected, check_dtype=True)
+ def test_table_batch_empty_dataframe(self):
+ df = pd.DataFrame({})
+ self._check_pandas_roundtrip(df)
+ self._check_pandas_roundtrip(df, as_batch=True)
+
+ df2 = pd.DataFrame({}, index=[0, 1, 2])
+ self._check_pandas_roundtrip(df2, preserve_index=True)
+ self._check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)
+
def test_array_from_pandas_date_with_mask(self):
m = np.array([True, False, True])
data = pd.Series([
diff --git a/python/pyarrow/tests/test_table.py
b/python/pyarrow/tests/test_table.py
index 50190f597..428222466 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -82,6 +82,14 @@ def test_recordbatch_basics():
batch[2]
+def test_recordbatch_no_fields():
+ batch = pa.RecordBatch.from_arrays([], [])
+
+ assert len(batch) == 0
+ assert batch.num_rows == 0
+ assert batch.num_columns == 0
+
+
def test_recordbatch_from_arrays_invalid_names():
data = [
pa.array(range(5)),
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> [Python] RecordBatch.from_pandas fails on DataFrame with no columns when
> preserve_index=False
> ---------------------------------------------------------------------------------------------
>
> Key: ARROW-1732
> URL: https://issues.apache.org/jira/browse/ARROW-1732
> Project: Apache Arrow
> Issue Type: Bug
> Components: Python
> Reporter: Wes McKinney
> Assignee: Wes McKinney
> Labels: pull-request-available
> Fix For: 0.8.0
>
>
> I believe this should have well-defined behavior and not raise an error:
> {code}
> In [5]: pa.RecordBatch.from_pandas(pd.DataFrame({}), preserve_index=False)
> ---------------------------------------------------------------------------
> ValueError Traceback (most recent call last)
> <ipython-input-5-4dda72b47dbd> in <module>()
> ----> 1 pa.RecordBatch.from_pandas(pd.DataFrame({}), preserve_index=False)
> ~/code/arrow/python/pyarrow/table.pxi in pyarrow.lib.RecordBatch.from_pandas
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:39957)()
> 586 df, schema, preserve_index, nthreads=nthreads
> 587 )
> --> 588 return cls.from_arrays(arrays, names, metadata)
> 589
> 590 @staticmethod
> ~/code/arrow/python/pyarrow/table.pxi in pyarrow.lib.RecordBatch.from_arrays
> (/home/wesm/code/arrow/python/build/temp.linux-x86_64-3.5/lib.cxx:40130)()
> 615
> 616 if not number_of_arrays:
> --> 617 raise ValueError('Record batch cannot contain no arrays
> (for now)')
> 618
> 619 num_rows = len(arrays[0])
> ValueError: Record batch cannot contain no arrays (for now)
> {code}
--
This message was sent by Atlassian JIRA
(v6.4.14#64029)