This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c78c92a ARROW-2149: [Python] Reorganize test_convert_pandas.py
c78c92a is described below
commit c78c92a04f3db0bd7dd9a34e6f36743050871acc
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Feb 13 11:19:44 2018 -0500
ARROW-2149: [Python] Reorganize test_convert_pandas.py
This PR simply shuffles things around, reorganizing the test methods under
several classes.
Author: Antoine Pitrou <[email protected]>
Closes #1601 from pitrou/ARROW-2149-reorganize-test-convert-pandas and
squashes the following commits:
821c59be [Antoine Pitrou] ARROW-2149: [Python] Reorganize test_convert_pandas.py
---
python/pyarrow/tests/test_convert_pandas.py | 1447 ++++++++++++++-------------
1 file changed, 749 insertions(+), 698 deletions(-)
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 987ac23..5b6f6bc 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -117,41 +117,10 @@ def _check_array_from_pandas_roundtrip(np_array):
npt.assert_array_equal(result, np_array)
-class TestPandasConversion(object):
-
- type_pairs = [
- (np.int8, pa.int8()),
- (np.int16, pa.int16()),
- (np.int32, pa.int32()),
- (np.int64, pa.int64()),
- (np.uint8, pa.uint8()),
- (np.uint16, pa.uint16()),
- (np.uint32, pa.uint32()),
- (np.uint64, pa.uint64()),
- # (np.float16, pa.float16()), # XXX unsupported
- (np.float32, pa.float32()),
- (np.float64, pa.float64()),
- # XXX unsupported
- # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
- (np.object, pa.string()),
- # (np.object, pa.binary()), # XXX unsupported
- (np.object, pa.binary(10)),
- (np.object, pa.list_(pa.int64())),
- ]
-
- def test_all_none_objects(self):
- df = pd.DataFrame({'a': [None, None, None]})
- _check_pandas_roundtrip(df)
-
- def test_all_none_category(self):
- df = pd.DataFrame({'a': [None, None, None]})
- df['a'] = df['a'].astype('category')
- _check_pandas_roundtrip(df)
-
- def test_empty_arrays(self):
- for dtype, pa_type in self.type_pairs:
- arr = np.array([], dtype=dtype)
- _check_array_roundtrip(arr, type=pa_type)
+class TestConvertMetadata(object):
+ """
+ Conversion tests for Pandas metadata & indices.
+ """
def test_non_string_columns(self):
df = pd.DataFrame({0: [1, 2, 3]})
@@ -302,25 +271,6 @@ class TestPandasConversion(object):
_check_pandas_roundtrip(df, preserve_index=True)
- def test_float_no_nulls(self):
- data = {}
- fields = []
- dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
- num_values = 100
-
- for numpy_dtype, arrow_dtype in dtypes:
- values = np.random.randn(num_values)
- data[numpy_dtype] = values.astype(numpy_dtype)
- fields.append(pa.field(numpy_dtype, arrow_dtype))
-
- df = pd.DataFrame(data)
- schema = pa.schema(fields)
- _check_pandas_roundtrip(df, expected_schema=schema)
-
- def test_zero_copy_success(self):
- result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
- npt.assert_array_equal(result, [0, 1, 2])
-
def test_duplicate_column_names_does_not_crash(self):
df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa'))
with pytest.raises(ValueError):
@@ -341,44 +291,187 @@ class TestPandasConversion(object):
with pytest.raises(pa.ArrowException):
table.to_pandas()
- def test_zero_copy_dictionaries(self):
- arr = pa.DictionaryArray.from_arrays(
- np.array([0, 0]),
- np.array([5]))
+ def test_unicode_with_unicode_column_and_index(self):
+ df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
- result = arr.to_pandas(zero_copy_only=True)
- values = pd.Categorical([5, 5])
+ _check_pandas_roundtrip(df, preserve_index=True)
- tm.assert_series_equal(pd.Series(result), pd.Series(values),
- check_names=False)
+ def test_mixed_unicode_column_names(self):
+ df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
- def test_zero_copy_failure_on_object_types(self):
- with pytest.raises(pa.ArrowException):
- pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True)
+ # TODO(phillipc): Should this raise?
+ with pytest.raises(AssertionError):
+ _check_pandas_roundtrip(df, preserve_index=True)
- def test_zero_copy_failure_with_int_when_nulls(self):
- with pytest.raises(pa.ArrowException):
- pa.array([0, 1, None]).to_pandas(zero_copy_only=True)
+ def test_binary_column_name(self):
+ column_data = [u'い']
+ data = {u'あ'.encode('utf8'): column_data}
+ df = pd.DataFrame(data)
- def test_zero_copy_failure_with_float_when_nulls(self):
- with pytest.raises(pa.ArrowException):
- pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True)
+ # we can't use _check_pandas_roundtrip here because our metadata
+ # is always decoded as utf8: even if binary goes in, utf8 comes out
+ t = pa.Table.from_pandas(df, preserve_index=True)
+ df2 = t.to_pandas()
+ assert df.values[0] == df2.values[0]
+ assert df.index.values[0] == df2.index.values[0]
+ assert df.columns[0] == df2.columns[0].encode('utf8')
- def test_zero_copy_failure_on_bool_types(self):
- with pytest.raises(pa.ArrowException):
- pa.array([True, False]).to_pandas(zero_copy_only=True)
+ def test_multiindex_duplicate_values(self):
+ num_rows = 3
+ numbers = list(range(num_rows))
+ index = pd.MultiIndex.from_arrays(
+ [['foo', 'foo', 'bar'], numbers],
+ names=['foobar', 'some_numbers'],
+ )
- def test_zero_copy_failure_on_list_types(self):
- arr = np.array([[1, 2], [8, 9]], dtype=object)
+ df = pd.DataFrame({'numbers': numbers}, index=index)
- with pytest.raises(pa.ArrowException):
- pa.array(arr).to_pandas(zero_copy_only=True)
+ table = pa.Table.from_pandas(df)
+ result_df = table.to_pandas()
+ tm.assert_frame_equal(result_df, df)
- def test_zero_copy_failure_on_timestamp_types(self):
- arr = np.array(['2007-07-13'], dtype='datetime64[ns]')
+ def test_metadata_with_mixed_types(self):
+ df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
+ table = pa.Table.from_pandas(df)
+ metadata = table.schema.metadata
+ assert b'mixed' not in metadata[b'pandas']
- with pytest.raises(pa.ArrowException):
- pa.array(arr).to_pandas(zero_copy_only=True)
+ js = json.loads(metadata[b'pandas'].decode('utf8'))
+ data_column = js['columns'][0]
+ assert data_column['pandas_type'] == 'bytes'
+ assert data_column['numpy_type'] == 'object'
+
+ def test_list_metadata(self):
+ df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
+ schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
+ table = pa.Table.from_pandas(df, schema=schema)
+ metadata = table.schema.metadata
+ assert b'mixed' not in metadata[b'pandas']
+
+ js = json.loads(metadata[b'pandas'].decode('utf8'))
+ data_column = js['columns'][0]
+ assert data_column['pandas_type'] == 'list[int64]'
+ assert data_column['numpy_type'] == 'object'
+
+ def test_decimal_metadata(self):
+ expected = pd.DataFrame({
+ 'decimals': [
+ decimal.Decimal('394092382910493.12341234678'),
+ -decimal.Decimal('314292388910493.12343437128'),
+ ]
+ })
+ table = pa.Table.from_pandas(expected)
+ metadata = table.schema.metadata
+ assert b'mixed' not in metadata[b'pandas']
+
+ js = json.loads(metadata[b'pandas'].decode('utf8'))
+ data_column = js['columns'][0]
+ assert data_column['pandas_type'] == 'decimal'
+ assert data_column['numpy_type'] == 'object'
+ assert data_column['metadata'] == {'precision': 26, 'scale': 11}
+
+ def test_table_column_subset_metadata(self):
+ # ARROW-1883
+ df = pd.DataFrame({
+ 'a': [1, 2, 3],
+ 'b': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
+ table = pa.Table.from_pandas(df)
+
+ table_subset = table.remove_column(1)
+ result = table_subset.to_pandas()
+ tm.assert_frame_equal(result, df[['a']])
+
+ table_subset2 = table_subset.remove_column(1)
+ result = table_subset2.to_pandas()
+ tm.assert_frame_equal(result, df[['a']])
+
+ # non-default index
+ for index in [
+ pd.Index(['a', 'b', 'c'], name='index'),
+ pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]:
+ df = pd.DataFrame({'a': [1, 2, 3],
+ 'b': [.1, .2, .3]}, index=index)
+ table = pa.Table.from_pandas(df)
+
+ table_subset = table.remove_column(1)
+ result = table_subset.to_pandas()
+ tm.assert_frame_equal(result, df[['a']])
+
+ table_subset2 = table_subset.remove_column(1)
+ result = table_subset2.to_pandas()
+ tm.assert_frame_equal(result, df[['a']].reset_index(drop=True))
+
+ def test_empty_list_metadata(self):
+ # Create table with array of empty lists, forced to have type
+ # list(string) in pyarrow
+ c1 = [["test"], ["a", "b"], None]
+ c2 = [[], [], []]
+ arrays = OrderedDict([
+ ('c1', pa.array(c1, type=pa.list_(pa.string()))),
+ ('c2', pa.array(c2, type=pa.list_(pa.string()))),
+ ])
+ rb = pa.RecordBatch.from_arrays(
+ list(arrays.values()),
+ list(arrays.keys())
+ )
+ tbl = pa.Table.from_batches([rb])
+
+ # First roundtrip changes schema, because pandas cannot preserve the
+ # type of empty lists
+ df = tbl.to_pandas()
+ tbl2 = pa.Table.from_pandas(df, preserve_index=True)
+ md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
+
+ # Second roundtrip
+ df2 = tbl2.to_pandas()
+ expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))
+
+ tm.assert_frame_equal(df2, expected)
+
+ assert md2['columns'] == [
+ {
+ 'name': 'c1',
+ 'field_name': 'c1',
+ 'metadata': None,
+ 'numpy_type': 'object',
+ 'pandas_type': 'list[unicode]',
+ },
+ {
+ 'name': 'c2',
+ 'field_name': 'c2',
+ 'metadata': None,
+ 'numpy_type': 'object',
+ 'pandas_type': 'list[empty]',
+ },
+ {
+ 'name': None,
+ 'field_name': '__index_level_0__',
+ 'metadata': None,
+ 'numpy_type': 'int64',
+ 'pandas_type': 'int64',
+ }
+ ]
+
+
+class TestConvertPrimitiveTypes(object):
+ """
+ Conversion tests for primitive (e.g. numeric) types.
+ """
+
+ def test_float_no_nulls(self):
+ data = {}
+ fields = []
+ dtypes = [('f4', pa.float32()), ('f8', pa.float64())]
+ num_values = 100
+
+ for numpy_dtype, arrow_dtype in dtypes:
+ values = np.random.randn(num_values)
+ data[numpy_dtype] = values.astype(numpy_dtype)
+ fields.append(pa.field(numpy_dtype, arrow_dtype))
+
+ df = pd.DataFrame(data)
+ schema = pa.schema(fields)
+ _check_pandas_roundtrip(df, expected_schema=schema)
def test_float_nulls(self):
num_values = 100
@@ -408,24 +501,6 @@ class TestPandasConversion(object):
result = table.to_pandas()
tm.assert_frame_equal(result, ex_frame)
- def test_float_object_nulls(self):
- arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
- df = pd.DataFrame({'floats': arr})
- expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
- field = pa.field('floats', pa.float64())
- schema = pa.schema([field])
- _check_pandas_roundtrip(df, expected=expected,
- expected_schema=schema)
-
- def test_int_object_nulls(self):
- arr = np.array([None, 1, np.int64(3)] * 5, dtype=object)
- df = pd.DataFrame({'ints': arr})
- expected = pd.DataFrame({'ints': pd.to_numeric(arr)})
- field = pa.field('ints', pa.int64())
- schema = pa.schema([field])
- _check_pandas_roundtrip(df, expected=expected,
- expected_schema=schema)
-
def test_integer_no_nulls(self):
data = OrderedDict()
fields = []
@@ -522,6 +597,24 @@ class TestPandasConversion(object):
tm.assert_frame_equal(result, ex_frame)
+ def test_float_object_nulls(self):
+ arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
+ df = pd.DataFrame({'floats': arr})
+ expected = pd.DataFrame({'floats': pd.to_numeric(arr)})
+ field = pa.field('floats', pa.float64())
+ schema = pa.schema([field])
+ _check_pandas_roundtrip(df, expected=expected,
+ expected_schema=schema)
+
+ def test_int_object_nulls(self):
+ arr = np.array([None, 1, np.int64(3)] * 5, dtype=object)
+ df = pd.DataFrame({'ints': arr})
+ expected = pd.DataFrame({'ints': pd.to_numeric(arr)})
+ field = pa.field('ints', pa.int64())
+ schema = pa.schema([field])
+ _check_pandas_roundtrip(df, expected=expected,
+ expected_schema=schema)
+
def test_boolean_object_nulls(self):
arr = np.array([False, None, True] * 100, dtype=object)
df = pd.DataFrame({'bools': arr})
@@ -540,81 +633,11 @@ class TestPandasConversion(object):
_check_type(pa.int32())
_check_type(pa.float64())
- def test_unicode(self):
- repeats = 1000
- values = [u'foo', None, u'bar', u'mañana', np.nan]
- df = pd.DataFrame({'strings': values * repeats})
- field = pa.field('strings', pa.string())
- schema = pa.schema([field])
-
- _check_pandas_roundtrip(df, expected_schema=schema)
-
- def test_unicode_with_unicode_column_and_index(self):
- df = pd.DataFrame({u'あ': [u'い']}, index=[u'う'])
-
- _check_pandas_roundtrip(df, preserve_index=True)
-
- def test_mixed_unicode_column_names(self):
- df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う'])
-
- # TODO(phillipc): Should this raise?
- with pytest.raises(AssertionError):
- _check_pandas_roundtrip(df, preserve_index=True)
-
- def test_binary_column_name(self):
- column_data = [u'い']
- data = {u'あ'.encode('utf8'): column_data}
- df = pd.DataFrame(data)
-
- # we can't use _check_pandas_roundtrip here because our metdata
- # is always decoded as utf8: even if binary goes in, utf8 comes out
- t = pa.Table.from_pandas(df, preserve_index=True)
- df2 = t.to_pandas()
- assert df.values[0] == df2.values[0]
- assert df.index.values[0] == df2.index.values[0]
- assert df.columns[0] == df2.columns[0].encode('utf8')
-
- def test_bytes_to_binary(self):
- values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
- df = pd.DataFrame({'strings': values})
-
- table = pa.Table.from_pandas(df)
- assert table[0].type == pa.binary()
-
- values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
- expected = pd.DataFrame({'strings': values2})
- _check_pandas_roundtrip(df, expected)
-
- @pytest.mark.large_memory
- def test_bytes_exceed_2gb(self):
- val = 'x' * (1 << 20)
- df = pd.DataFrame({
- 'strings': np.array([val] * 4000, dtype=object)
- })
- arr = pa.array(df['strings'])
- assert isinstance(arr, pa.ChunkedArray)
- assert arr.num_chunks == 2
- arr = None
-
- table = pa.Table.from_pandas(df)
- assert table[0].data.num_chunks == 2
-
- def test_fixed_size_bytes(self):
- values = [b'foo', None, b'bar', None, None, b'hey']
- df = pd.DataFrame({'strings': values})
- schema = pa.schema([pa.field('strings', pa.binary(3))])
- table = pa.Table.from_pandas(df, schema=schema)
- assert table.schema[0].type == schema[0].type
- assert table.schema[0].name == schema[0].name
- result = table.to_pandas()
- tm.assert_frame_equal(result, df)
- def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
- values = [b'foo', None, b'ba', None, None, b'hey']
- df = pd.DataFrame({'strings': values})
- schema = pa.schema([pa.field('strings', pa.binary(3))])
- with pytest.raises(pa.ArrowInvalid):
- pa.Table.from_pandas(df, schema=schema)
+class TestConvertDateTimeLikeTypes(object):
+ """
+ Conversion tests for datetime- and timestamp-like types (date64, etc.).
+ """
def test_timestamps_notimezone_no_nulls(self):
df = pd.DataFrame({
@@ -788,205 +811,257 @@ class TestPandasConversion(object):
})
pa.Table.from_pandas(df)
- def test_column_of_arrays(self):
- df, schema = dataframe_with_arrays()
- _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
- table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
- assert table.schema.equals(schema)
+ def test_pytime_from_pandas(self):
+ pytimes = [time(1, 2, 3, 1356),
+ time(4, 5, 6, 1356)]
- for column in df.columns:
- field = schema.field_by_name(column)
- _check_array_roundtrip(df[column], type=field.type)
+ # microseconds
+ t1 = pa.time64('us')
- def test_column_of_arrays_to_py(self):
- # Test regression in ARROW-1199 not caught in above test
- dtype = 'i1'
- arr = np.array([
- np.arange(10, dtype=dtype),
- np.arange(5, dtype=dtype),
- None,
- np.arange(1, dtype=dtype)
- ])
- type_ = pa.list_(pa.int8())
- parr = pa.array(arr, type=type_)
+ aobjs = np.array(pytimes + [None], dtype=object)
+ parr = pa.array(aobjs)
+ assert parr.type == t1
+ assert parr[0].as_py() == pytimes[0]
+ assert parr[1].as_py() == pytimes[1]
+ assert parr[2] is pa.NA
- assert parr[0].as_py() == list(range(10))
- assert parr[1].as_py() == list(range(5))
- assert parr[2].as_py() is None
- assert parr[3].as_py() == [0]
+ # DataFrame
+ df = pd.DataFrame({'times': aobjs})
+ batch = pa.RecordBatch.from_pandas(df)
+ assert batch[0].equals(parr)
- def test_column_of_lists(self):
- df, schema = dataframe_with_lists()
- _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
- table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
- assert table.schema.equals(schema)
+ # Test ndarray of int64 values
+ arr = np.array([_pytime_to_micros(v) for v in pytimes],
+ dtype='int64')
- for column in df.columns:
- field = schema.field_by_name(column)
- _check_array_roundtrip(df[column], type=field.type)
+ a1 = pa.array(arr, type=pa.time64('us'))
+ assert a1[0].as_py() == pytimes[0]
- def test_column_of_lists_first_empty(self):
- # ARROW-2124
- num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]]
- series = pd.Series([np.array(s, dtype=float) for s in num_lists])
- arr = pa.array(series)
- result = pd.Series(arr.to_pandas())
- tm.assert_series_equal(result, series)
+ a2 = pa.array(arr * 1000, type=pa.time64('ns'))
+ assert a2[0].as_py() == pytimes[0]
- def test_column_of_lists_chunked(self):
- # ARROW-1357
- df = pd.DataFrame({
- 'lists': np.array([
- [1, 2],
- None,
- [2, 3],
- [4, 5],
- [6, 7],
- [8, 9]
- ], dtype=object)
- })
+ a3 = pa.array((arr / 1000).astype('i4'),
+ type=pa.time32('ms'))
+ assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)
- schema = pa.schema([
- pa.field('lists', pa.list_(pa.int64()))
- ])
+ a4 = pa.array((arr / 1000000).astype('i4'),
+ type=pa.time32('s'))
+ assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
- t1 = pa.Table.from_pandas(df[:2], schema=schema)
- t2 = pa.Table.from_pandas(df[2:], schema=schema)
+ def test_arrow_time_to_pandas(self):
+ pytimes = [time(1, 2, 3, 1356),
+ time(4, 5, 6, 1356),
+ time(0, 0, 0)]
- table = pa.concat_tables([t1, t2])
- result = table.to_pandas()
+ expected = np.array(pytimes[:2] + [None])
+ expected_ms = np.array([x.replace(microsecond=1000)
+ for x in pytimes[:2]] +
+ [None])
+ expected_s = np.array([x.replace(microsecond=0)
+ for x in pytimes[:2]] +
+ [None])
- tm.assert_frame_equal(result, df)
+ arr = np.array([_pytime_to_micros(v) for v in pytimes],
+ dtype='int64')
+ arr = np.array([_pytime_to_micros(v) for v in pytimes],
+ dtype='int64')
- def test_column_of_lists_chunked2(self):
- data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11],
- [12, 13], [14, 15], [16, 17]]
- data2 = [[8, 9], [18, 19]]
+ null_mask = np.array([False, False, True], dtype=bool)
- a1 = pa.array(data1)
- a2 = pa.array(data2)
+ a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
+ a2 = pa.array(arr * 1000, mask=null_mask,
+ type=pa.time64('ns'))
- t1 = pa.Table.from_arrays([a1], names=['a'])
- t2 = pa.Table.from_arrays([a2], names=['a'])
+ a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
+ type=pa.time32('ms'))
+ a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
+ type=pa.time32('s'))
- concatenated = pa.concat_tables([t1, t2])
+ names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
+ batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
+ arr = a1.to_pandas()
+ assert (arr == expected).all()
- result = concatenated.to_pandas()
- expected = pd.DataFrame({'a': data1 + data2})
+ arr = a2.to_pandas()
+ assert (arr == expected).all()
- tm.assert_frame_equal(result, expected)
+ arr = a3.to_pandas()
+ assert (arr == expected_ms).all()
- def test_column_of_lists_strided(self):
- df, schema = dataframe_with_lists()
- df = pd.concat([df] * 6, ignore_index=True)
+ arr = a4.to_pandas()
+ assert (arr == expected_s).all()
- arr = df['int64'].values[::3]
- assert arr.strides[0] != 8
+ df = batch.to_pandas()
+ expected_df = pd.DataFrame({'time64[us]': expected,
+ 'time64[ns]': expected,
+ 'time32[ms]': expected_ms,
+ 'time32[s]': expected_s},
+ columns=names)
- _check_array_roundtrip(arr)
+ tm.assert_frame_equal(df, expected_df)
- def test_nested_lists_all_none(self):
- data = np.array([[None, None], None], dtype=object)
+ def test_numpy_datetime64_columns(self):
+ datetime64_ns = np.array([
+ '2007-07-13T01:23:34.123456789',
+ None,
+ '2006-01-13T12:34:56.432539784',
+ '2010-08-13T05:46:57.437699912'],
+ dtype='datetime64[ns]')
+ _check_array_from_pandas_roundtrip(datetime64_ns)
- arr = pa.array(data)
- expected = pa.array(list(data))
- assert arr.equals(expected)
- assert arr.type == pa.list_(pa.null())
+ datetime64_us = np.array([
+ '2007-07-13T01:23:34.123456',
+ None,
+ '2006-01-13T12:34:56.432539',
+ '2010-08-13T05:46:57.437699'],
+ dtype='datetime64[us]')
+ _check_array_from_pandas_roundtrip(datetime64_us)
- data2 = np.array([None, None, [None, None],
- np.array([None, None], dtype=object)],
- dtype=object)
- arr = pa.array(data2)
- expected = pa.array([None, None, [None, None], [None, None]])
- assert arr.equals(expected)
+ datetime64_ms = np.array([
+ '2007-07-13T01:23:34.123',
+ None,
+ '2006-01-13T12:34:56.432',
+ '2010-08-13T05:46:57.437'],
+ dtype='datetime64[ms]')
+ _check_array_from_pandas_roundtrip(datetime64_ms)
- def test_nested_lists_all_empty(self):
- # ARROW-2128
- data = pd.Series([[], [], []])
- arr = pa.array(data)
- expected = pa.array(list(data))
- assert arr.equals(expected)
- assert arr.type == pa.list_(pa.null())
+ datetime64_s = np.array([
+ '2007-07-13T01:23:34',
+ None,
+ '2006-01-13T12:34:56',
+ '2010-08-13T05:46:57'],
+ dtype='datetime64[s]')
+ _check_array_from_pandas_roundtrip(datetime64_s)
- def test_threaded_conversion(self):
- df = _alltypes_example()
- _check_pandas_roundtrip(df, nthreads=2)
- _check_pandas_roundtrip(df, nthreads=2, as_batch=True)
+ def test_numpy_datetime64_day_unit(self):
+ datetime64_d = np.array([
+ '2007-07-13',
+ None,
+ '2006-01-15',
+ '2010-08-19'],
+ dtype='datetime64[D]')
+ _check_array_from_pandas_roundtrip(datetime64_d)
- def test_category(self):
- repeats = 5
- v1 = ['foo', None, 'bar', 'qux', np.nan]
- v2 = [4, 5, 6, 7, 8]
- v3 = [b'foo', None, b'bar', b'qux', np.nan]
- df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
- 'cat_ints': pd.Categorical(v2 * repeats),
- 'cat_binary': pd.Categorical(v3 * repeats),
- 'cat_strings_ordered': pd.Categorical(
- v1 * repeats, categories=['bar', 'qux', 'foo'],
- ordered=True),
- 'ints': v2 * repeats,
- 'ints2': v2 * repeats,
- 'strings': v1 * repeats,
- 'strings2': v1 * repeats,
- 'strings3': v3 * repeats})
- _check_pandas_roundtrip(df)
+ def test_array_from_pandas_date_with_mask(self):
+ m = np.array([True, False, True])
+ data = pd.Series([
+ date(1990, 1, 1),
+ date(1991, 1, 1),
+ date(1992, 1, 1)
+ ])
- arrays = [
- pd.Categorical(v1 * repeats),
- pd.Categorical(v2 * repeats),
- pd.Categorical(v3 * repeats)
- ]
- for values in arrays:
- _check_array_roundtrip(values)
+ result = pa.Array.from_pandas(data, mask=m)
- def test_mixed_types_fails(self):
- data = pd.DataFrame({'a': ['a', 1, 2.0]})
- with pytest.raises(pa.ArrowException):
- pa.Table.from_pandas(data)
+ expected = pd.Series([None, date(1991, 1, 1), None])
+ assert pa.Array.from_pandas(expected).equals(result)
- data = pd.DataFrame({'a': [1, True]})
- with pytest.raises(pa.ArrowException):
- pa.Table.from_pandas(data)
- def test_strided_data_import(self):
- cases = []
+class TestConvertStringLikeTypes(object):
+ """
+ Conversion tests for string and binary types.
+ """
- columns = ['a', 'b', 'c']
- N, K = 100, 3
- random_numbers = np.random.randn(N, K).copy() * 100
+ def test_unicode(self):
+ repeats = 1000
+ values = [u'foo', None, u'bar', u'mañana', np.nan]
+ df = pd.DataFrame({'strings': values * repeats})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
- numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
- 'f4', 'f8']
+ _check_pandas_roundtrip(df, expected_schema=schema)
- for type_name in numeric_dtypes:
- cases.append(random_numbers.astype(type_name))
+ def test_bytes_to_binary(self):
+ values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan]
+ df = pd.DataFrame({'strings': values})
- # strings
- cases.append(np.array([tm.rands(10) for i in range(N * K)],
- dtype=object)
- .reshape(N, K).copy())
+ table = pa.Table.from_pandas(df)
+ assert table[0].type == pa.binary()
- # booleans
- boolean_objects = (np.array([True, False, True] * N, dtype=object)
- .reshape(N, K).copy())
+ values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan]
+ expected = pd.DataFrame({'strings': values2})
+ _check_pandas_roundtrip(df, expected)
- # add some nulls, so dtype comes back as objects
- boolean_objects[5] = None
- cases.append(boolean_objects)
+ @pytest.mark.large_memory
+ def test_bytes_exceed_2gb(self):
+ val = 'x' * (1 << 20)
+ df = pd.DataFrame({
+ 'strings': np.array([val] * 4000, dtype=object)
+ })
+ arr = pa.array(df['strings'])
+ assert isinstance(arr, pa.ChunkedArray)
+ assert arr.num_chunks == 2
+ arr = None
- cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
- dtype='datetime64[ms]')
- .reshape(N, K).copy())
+ table = pa.Table.from_pandas(df)
+ assert table[0].data.num_chunks == 2
- strided_mask = (random_numbers > 0).astype(bool)[:, 0]
+ def test_fixed_size_bytes(self):
+ values = [b'foo', None, b'bar', None, None, b'hey']
+ df = pd.DataFrame({'strings': values})
+ schema = pa.schema([pa.field('strings', pa.binary(3))])
+ table = pa.Table.from_pandas(df, schema=schema)
+ assert table.schema[0].type == schema[0].type
+ assert table.schema[0].name == schema[0].name
+ result = table.to_pandas()
+ tm.assert_frame_equal(result, df)
- for case in cases:
- df = pd.DataFrame(case, columns=columns)
- col = df['a']
+ def test_fixed_size_bytes_does_not_accept_varying_lengths(self):
+ values = [b'foo', None, b'ba', None, None, b'hey']
+ df = pd.DataFrame({'strings': values})
+ schema = pa.schema([pa.field('strings', pa.binary(3))])
+ with pytest.raises(pa.ArrowInvalid):
+ pa.Table.from_pandas(df, schema=schema)
- _check_pandas_roundtrip(df)
- _check_array_roundtrip(col)
- _check_array_roundtrip(col, mask=strided_mask)
+ def test_table_empty_str(self):
+ values = ['', '', '', '', '']
+ df = pd.DataFrame({'strings': values})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
+ table = pa.Table.from_pandas(df, schema=schema)
+
+ result1 = table.to_pandas(strings_to_categorical=False)
+ expected1 = pd.DataFrame({'strings': values})
+ tm.assert_frame_equal(result1, expected1, check_dtype=True)
+
+ result2 = table.to_pandas(strings_to_categorical=True)
+ expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
+ tm.assert_frame_equal(result2, expected2, check_dtype=True)
+
+ def test_table_str_to_categorical_without_na(self):
+ values = ['a', 'a', 'b', 'b', 'c']
+ df = pd.DataFrame({'strings': values})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
+ table = pa.Table.from_pandas(df, schema=schema)
+
+ result = table.to_pandas(strings_to_categorical=True)
+ expected = pd.DataFrame({'strings': pd.Categorical(values)})
+ tm.assert_frame_equal(result, expected, check_dtype=True)
+
+ with pytest.raises(pa.ArrowInvalid):
+ table.to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
+
+ def test_table_str_to_categorical_with_na(self):
+ values = [None, 'a', 'b', np.nan]
+ df = pd.DataFrame({'strings': values})
+ field = pa.field('strings', pa.string())
+ schema = pa.schema([field])
+ table = pa.Table.from_pandas(df, schema=schema)
+
+ result = table.to_pandas(strings_to_categorical=True)
+ expected = pd.DataFrame({'strings': pd.Categorical(values)})
+ tm.assert_frame_equal(result, expected, check_dtype=True)
+
+ with pytest.raises(pa.ArrowInvalid):
+ table.to_pandas(strings_to_categorical=True,
+ zero_copy_only=True)
+
+
+class TestConvertDecimalTypes(object):
+ """
+ Conversion tests for decimal types.
+ """
def test_decimal_32_from_pandas(self):
expected = pd.DataFrame({
@@ -1057,201 +1132,131 @@ class TestPandasConversion(object):
df = converted.to_pandas()
tm.assert_frame_equal(df, expected)
- def test_pytime_from_pandas(self):
- pytimes = [time(1, 2, 3, 1356),
- time(4, 5, 6, 1356)]
-
- # microseconds
- t1 = pa.time64('us')
- aobjs = np.array(pytimes + [None], dtype=object)
- parr = pa.array(aobjs)
- assert parr.type == t1
- assert parr[0].as_py() == pytimes[0]
- assert parr[1].as_py() == pytimes[1]
- assert parr[2] is pa.NA
+class TestListTypes(object):
+ """
+ Conversion tests for list<> types.
+ """
- # DataFrame
- df = pd.DataFrame({'times': aobjs})
- batch = pa.RecordBatch.from_pandas(df)
- assert batch[0].equals(parr)
+ def test_column_of_arrays(self):
+ df, schema = dataframe_with_arrays()
+ _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
+ assert table.schema.equals(schema)
- # Test ndarray of int64 values
- arr = np.array([_pytime_to_micros(v) for v in pytimes],
- dtype='int64')
+ for column in df.columns:
+ field = schema.field_by_name(column)
+ _check_array_roundtrip(df[column], type=field.type)
- a1 = pa.array(arr, type=pa.time64('us'))
- assert a1[0].as_py() == pytimes[0]
+ def test_column_of_arrays_to_py(self):
+ # Test regression in ARROW-1199 not caught in above test
+ dtype = 'i1'
+ arr = np.array([
+ np.arange(10, dtype=dtype),
+ np.arange(5, dtype=dtype),
+ None,
+ np.arange(1, dtype=dtype)
+ ])
+ type_ = pa.list_(pa.int8())
+ parr = pa.array(arr, type=type_)
- a2 = pa.array(arr * 1000, type=pa.time64('ns'))
- assert a2[0].as_py() == pytimes[0]
+ assert parr[0].as_py() == list(range(10))
+ assert parr[1].as_py() == list(range(5))
+ assert parr[2].as_py() is None
+ assert parr[3].as_py() == [0]
- a3 = pa.array((arr / 1000).astype('i4'),
- type=pa.time32('ms'))
- assert a3[0].as_py() == pytimes[0].replace(microsecond=1000)
-
- a4 = pa.array((arr / 1000000).astype('i4'),
- type=pa.time32('s'))
- assert a4[0].as_py() == pytimes[0].replace(microsecond=0)
-
- def test_arrow_time_to_pandas(self):
- pytimes = [time(1, 2, 3, 1356),
- time(4, 5, 6, 1356),
- time(0, 0, 0)]
-
- expected = np.array(pytimes[:2] + [None])
- expected_ms = np.array([x.replace(microsecond=1000)
- for x in pytimes[:2]] +
- [None])
- expected_s = np.array([x.replace(microsecond=0)
- for x in pytimes[:2]] +
- [None])
-
- arr = np.array([_pytime_to_micros(v) for v in pytimes],
- dtype='int64')
- arr = np.array([_pytime_to_micros(v) for v in pytimes],
- dtype='int64')
-
- null_mask = np.array([False, False, True], dtype=bool)
-
- a1 = pa.array(arr, mask=null_mask, type=pa.time64('us'))
- a2 = pa.array(arr * 1000, mask=null_mask,
- type=pa.time64('ns'))
-
- a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask,
- type=pa.time32('ms'))
- a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask,
- type=pa.time32('s'))
-
- names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]']
- batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names)
- arr = a1.to_pandas()
- assert (arr == expected).all()
-
- arr = a2.to_pandas()
- assert (arr == expected).all()
-
- arr = a3.to_pandas()
- assert (arr == expected_ms).all()
-
- arr = a4.to_pandas()
- assert (arr == expected_s).all()
+ def test_column_of_lists(self):
+ df, schema = dataframe_with_lists()
+ _check_pandas_roundtrip(df, schema=schema, expected_schema=schema)
+ table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
+ assert table.schema.equals(schema)
- df = batch.to_pandas()
- expected_df = pd.DataFrame({'time64[us]': expected,
- 'time64[ns]': expected,
- 'time32[ms]': expected_ms,
- 'time32[s]': expected_s},
- columns=names)
+ for column in df.columns:
+ field = schema.field_by_name(column)
+ _check_array_roundtrip(df[column], type=field.type)
- tm.assert_frame_equal(df, expected_df)
+ def test_column_of_lists_first_empty(self):
+ # ARROW-2124
+ num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]]
+ series = pd.Series([np.array(s, dtype=float) for s in num_lists])
+ arr = pa.array(series)
+ result = pd.Series(arr.to_pandas())
+ tm.assert_series_equal(result, series)
- def test_numpy_datetime64_columns(self):
- datetime64_ns = np.array([
- '2007-07-13T01:23:34.123456789',
+ def test_column_of_lists_chunked(self):
+ # ARROW-1357
+ df = pd.DataFrame({
+ 'lists': np.array([
+ [1, 2],
None,
- '2006-01-13T12:34:56.432539784',
- '2010-08-13T05:46:57.437699912'],
- dtype='datetime64[ns]')
- _check_array_from_pandas_roundtrip(datetime64_ns)
+ [2, 3],
+ [4, 5],
+ [6, 7],
+ [8, 9]
+ ], dtype=object)
+ })
- datetime64_us = np.array([
- '2007-07-13T01:23:34.123456',
- None,
- '2006-01-13T12:34:56.432539',
- '2010-08-13T05:46:57.437699'],
- dtype='datetime64[us]')
- _check_array_from_pandas_roundtrip(datetime64_us)
+ schema = pa.schema([
+ pa.field('lists', pa.list_(pa.int64()))
+ ])
- datetime64_ms = np.array([
- '2007-07-13T01:23:34.123',
- None,
- '2006-01-13T12:34:56.432',
- '2010-08-13T05:46:57.437'],
- dtype='datetime64[ms]')
- _check_array_from_pandas_roundtrip(datetime64_ms)
+ t1 = pa.Table.from_pandas(df[:2], schema=schema)
+ t2 = pa.Table.from_pandas(df[2:], schema=schema)
- datetime64_s = np.array([
- '2007-07-13T01:23:34',
- None,
- '2006-01-13T12:34:56',
- '2010-08-13T05:46:57'],
- dtype='datetime64[s]')
- _check_array_from_pandas_roundtrip(datetime64_s)
+ table = pa.concat_tables([t1, t2])
+ result = table.to_pandas()
- def test_numpy_datetime64_day_unit(self):
- datetime64_d = np.array([
- '2007-07-13',
- None,
- '2006-01-15',
- '2010-08-19'],
- dtype='datetime64[D]')
- _check_array_from_pandas_roundtrip(datetime64_d)
+ tm.assert_frame_equal(result, df)
- def test_all_nones(self):
- def _check_series(s):
- converted = pa.array(s)
- assert isinstance(converted, pa.NullArray)
- assert len(converted) == 3
- assert converted.null_count == 3
- assert converted[0] is pa.NA
+ def test_column_of_lists_chunked2(self):
+ data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11],
+ [12, 13], [14, 15], [16, 17]]
+ data2 = [[8, 9], [18, 19]]
- _check_series(pd.Series([None] * 3, dtype=object))
- _check_series(pd.Series([np.nan] * 3, dtype=object))
- _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object))
+ a1 = pa.array(data1)
+ a2 = pa.array(data2)
- def test_multiindex_duplicate_values(self):
- num_rows = 3
- numbers = list(range(num_rows))
- index = pd.MultiIndex.from_arrays(
- [['foo', 'foo', 'bar'], numbers],
- names=['foobar', 'some_numbers'],
- )
+ t1 = pa.Table.from_arrays([a1], names=['a'])
+ t2 = pa.Table.from_arrays([a2], names=['a'])
- df = pd.DataFrame({'numbers': numbers}, index=index)
+ concatenated = pa.concat_tables([t1, t2])
- table = pa.Table.from_pandas(df)
- result_df = table.to_pandas()
- tm.assert_frame_equal(result_df, df)
+ result = concatenated.to_pandas()
+ expected = pd.DataFrame({'a': data1 + data2})
- def test_partial_schema(self):
- data = OrderedDict([
- ('a', [0, 1, 2, 3, 4]),
- ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
- ('c', [-10, -5, 0, 5, 10])
- ])
- df = pd.DataFrame(data)
+ tm.assert_frame_equal(result, expected)
- partial_schema = pa.schema([
- pa.field('a', pa.int64()),
- pa.field('b', pa.int32())
- ])
+ def test_column_of_lists_strided(self):
+ df, schema = dataframe_with_lists()
+ df = pd.concat([df] * 6, ignore_index=True)
- expected_schema = pa.schema([
- pa.field('a', pa.int64()),
- pa.field('b', pa.int32()),
- pa.field('c', pa.int64())
- ])
+ arr = df['int64'].values[::3]
+ assert arr.strides[0] != 8
- _check_pandas_roundtrip(df, schema=partial_schema,
- expected_schema=expected_schema)
+ _check_array_roundtrip(arr)
- def test_structarray(self):
- ints = pa.array([None, 2, 3], type=pa.int64())
- strs = pa.array([u'a', None, u'c'], type=pa.string())
- bools = pa.array([True, False, None], type=pa.bool_())
- arr = pa.StructArray.from_arrays(
- [ints, strs, bools],
- ['ints', 'strs', 'bools'])
+ def test_nested_lists_all_none(self):
+ data = np.array([[None, None], None], dtype=object)
- expected = pd.Series([
- {'ints': None, 'strs': u'a', 'bools': True},
- {'ints': 2, 'strs': None, 'bools': False},
- {'ints': 3, 'strs': u'c', 'bools': None},
- ])
+ arr = pa.array(data)
+ expected = pa.array(list(data))
+ assert arr.equals(expected)
+ assert arr.type == pa.list_(pa.null())
- series = pd.Series(arr.to_pandas())
- tm.assert_series_equal(series, expected)
+ data2 = np.array([None, None, [None, None],
+ np.array([None, None], dtype=object)],
+ dtype=object)
+ arr = pa.array(data2)
+ expected = pa.array([None, None, [None, None], [None, None]])
+ assert arr.equals(expected)
+
+ def test_nested_lists_all_empty(self):
+ # ARROW-2128
+ data = pd.Series([[], [], []])
+ arr = pa.array(data)
+ expected = pa.array(list(data))
+ assert arr.equals(expected)
+ assert arr.type == pa.list_(pa.null())
def test_infer_lists(self):
data = OrderedDict([
@@ -1285,242 +1290,288 @@ class TestPandasConversion(object):
_check_pandas_roundtrip(df, expected_schema=expected_schema)
- def test_metadata_with_mixed_types(self):
- df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']})
- table = pa.Table.from_pandas(df)
- metadata = table.schema.metadata
- assert b'mixed' not in metadata[b'pandas']
+ @pytest.mark.parametrize('t,data,expected', [
+ (
+ pa.int64,
+ [[1, 2], [3], None],
+ [None, [3], None]
+ ),
+ (
+ pa.string,
+ [[u'aaa', u'bb'], [u'c'], None],
+ [None, [u'c'], None]
+ ),
+ (
+ pa.null,
+ [[None, None], [None], None],
+ [None, [None], None]
+ )
+ ])
+ def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
+ m = np.array([True, False, True])
- js = json.loads(metadata[b'pandas'].decode('utf8'))
- data_column = js['columns'][0]
- assert data_column['pandas_type'] == 'bytes'
- assert data_column['numpy_type'] == 'object'
+ s = pd.Series(data)
+ result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
- def test_list_metadata(self):
- df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]})
- schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))])
- table = pa.Table.from_pandas(df, schema=schema)
- metadata = table.schema.metadata
- assert b'mixed' not in metadata[b'pandas']
+ assert pa.Array.from_pandas(expected,
+ type=pa.list_(t())).equals(result)
- js = json.loads(metadata[b'pandas'].decode('utf8'))
- data_column = js['columns'][0]
- assert data_column['pandas_type'] == 'list[int64]'
- assert data_column['numpy_type'] == 'object'
+ def test_empty_list_roundtrip(self):
+ empty_list_array = np.empty((3,), dtype=object)
+ empty_list_array.fill([])
- def test_decimal_metadata(self):
- expected = pd.DataFrame({
- 'decimals': [
- decimal.Decimal('394092382910493.12341234678'),
- -decimal.Decimal('314292388910493.12343437128'),
- ]
- })
- table = pa.Table.from_pandas(expected)
- metadata = table.schema.metadata
- assert b'mixed' not in metadata[b'pandas']
+ df = pd.DataFrame({'a': np.array(['1', '2', '3']),
+ 'b': empty_list_array})
+ tbl = pa.Table.from_pandas(df)
- js = json.loads(metadata[b'pandas'].decode('utf8'))
- data_column = js['columns'][0]
- assert data_column['pandas_type'] == 'decimal'
- assert data_column['numpy_type'] == 'object'
- assert data_column['metadata'] == {'precision': 26, 'scale': 11}
+ result = tbl.to_pandas()
- def test_table_empty_str(self):
- values = ['', '', '', '', '']
- df = pd.DataFrame({'strings': values})
- field = pa.field('strings', pa.string())
- schema = pa.schema([field])
- table = pa.Table.from_pandas(df, schema=schema)
+ tm.assert_frame_equal(result, df)
- result1 = table.to_pandas(strings_to_categorical=False)
- expected1 = pd.DataFrame({'strings': values})
- tm.assert_frame_equal(result1, expected1, check_dtype=True)
- result2 = table.to_pandas(strings_to_categorical=True)
- expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
- tm.assert_frame_equal(result2, expected2, check_dtype=True)
+class TestConvertStructTypes(object):
+ """
+ Conversion tests for struct types.
+ """
- def test_table_str_to_categorical_without_na(self):
- values = ['a', 'a', 'b', 'b', 'c']
- df = pd.DataFrame({'strings': values})
- field = pa.field('strings', pa.string())
- schema = pa.schema([field])
- table = pa.Table.from_pandas(df, schema=schema)
+ def test_structarray(self):
+ ints = pa.array([None, 2, 3], type=pa.int64())
+ strs = pa.array([u'a', None, u'c'], type=pa.string())
+ bools = pa.array([True, False, None], type=pa.bool_())
+ arr = pa.StructArray.from_arrays(
+ [ints, strs, bools],
+ ['ints', 'strs', 'bools'])
- result = table.to_pandas(strings_to_categorical=True)
- expected = pd.DataFrame({'strings': pd.Categorical(values)})
- tm.assert_frame_equal(result, expected, check_dtype=True)
+ expected = pd.Series([
+ {'ints': None, 'strs': u'a', 'bools': True},
+ {'ints': 2, 'strs': None, 'bools': False},
+ {'ints': 3, 'strs': u'c', 'bools': None},
+ ])
- with pytest.raises(pa.ArrowInvalid):
- table.to_pandas(strings_to_categorical=True,
- zero_copy_only=True)
+ series = pd.Series(arr.to_pandas())
+ tm.assert_series_equal(series, expected)
- def test_table_str_to_categorical_with_na(self):
- values = [None, 'a', 'b', np.nan]
- df = pd.DataFrame({'strings': values})
- field = pa.field('strings', pa.string())
- schema = pa.schema([field])
- table = pa.Table.from_pandas(df, schema=schema)
- result = table.to_pandas(strings_to_categorical=True)
- expected = pd.DataFrame({'strings': pd.Categorical(values)})
- tm.assert_frame_equal(result, expected, check_dtype=True)
+class TestZeroCopyConversion(object):
+ """
+ Tests that zero-copy conversion works with some types.
+ """
- with pytest.raises(pa.ArrowInvalid):
- table.to_pandas(strings_to_categorical=True,
- zero_copy_only=True)
+ def test_zero_copy_success(self):
+ result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True)
+ npt.assert_array_equal(result, [0, 1, 2])
- def test_table_batch_empty_dataframe(self):
- df = pd.DataFrame({})
+ def test_zero_copy_dictionaries(self):
+ arr = pa.DictionaryArray.from_arrays(
+ np.array([0, 0]),
+ np.array([5]))
+
+ result = arr.to_pandas(zero_copy_only=True)
+ values = pd.Categorical([5, 5])
+
+ tm.assert_series_equal(pd.Series(result), pd.Series(values),
+ check_names=False)
+
+ def test_zero_copy_failure_on_object_types(self):
+ with pytest.raises(pa.ArrowException):
+ pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True)
+
+ def test_zero_copy_failure_with_int_when_nulls(self):
+ with pytest.raises(pa.ArrowException):
+ pa.array([0, 1, None]).to_pandas(zero_copy_only=True)
+
+ def test_zero_copy_failure_with_float_when_nulls(self):
+ with pytest.raises(pa.ArrowException):
+ pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True)
+
+ def test_zero_copy_failure_on_bool_types(self):
+ with pytest.raises(pa.ArrowException):
+ pa.array([True, False]).to_pandas(zero_copy_only=True)
+
+ def test_zero_copy_failure_on_list_types(self):
+ arr = np.array([[1, 2], [8, 9]], dtype=object)
+
+ with pytest.raises(pa.ArrowException):
+ pa.array(arr).to_pandas(zero_copy_only=True)
+
+ def test_zero_copy_failure_on_timestamp_types(self):
+ arr = np.array(['2007-07-13'], dtype='datetime64[ns]')
+
+ with pytest.raises(pa.ArrowException):
+ pa.array(arr).to_pandas(zero_copy_only=True)
+
+
+class TestConvertMisc(object):
+ """
+ Miscellaneous conversion tests.
+ """
+
+ type_pairs = [
+ (np.int8, pa.int8()),
+ (np.int16, pa.int16()),
+ (np.int32, pa.int32()),
+ (np.int64, pa.int64()),
+ (np.uint8, pa.uint8()),
+ (np.uint16, pa.uint16()),
+ (np.uint32, pa.uint32()),
+ (np.uint64, pa.uint64()),
+ # (np.float16, pa.float16()), # XXX unsupported
+ (np.float32, pa.float32()),
+ (np.float64, pa.float64()),
+ # XXX unsupported
+ # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])),
+ (np.object, pa.string()),
+ # (np.object, pa.binary()), # XXX unsupported
+ (np.object, pa.binary(10)),
+ (np.object, pa.list_(pa.int64())),
+ ]
+
+ def test_all_none_objects(self):
+ df = pd.DataFrame({'a': [None, None, None]})
_check_pandas_roundtrip(df)
- _check_pandas_roundtrip(df, as_batch=True)
- df2 = pd.DataFrame({}, index=[0, 1, 2])
- _check_pandas_roundtrip(df2, preserve_index=True)
- _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)
+ def test_all_none_category(self):
+ df = pd.DataFrame({'a': [None, None, None]})
+ df['a'] = df['a'].astype('category')
+ _check_pandas_roundtrip(df)
- def test_convert_empty_table(self):
- arr = pa.array([], type=pa.int64())
- tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=np.int64))
- arr = pa.array([], type=pa.string())
- tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
- arr = pa.array([], type=pa.list_(pa.int64()))
- tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
- arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
- tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
+ def test_empty_arrays(self):
+ for dtype, pa_type in self.type_pairs:
+ arr = np.array([], dtype=dtype)
+ _check_array_roundtrip(arr, type=pa_type)
- def test_array_from_pandas_date_with_mask(self):
- m = np.array([True, False, True])
- data = pd.Series([
- date(1990, 1, 1),
- date(1991, 1, 1),
- date(1992, 1, 1)
- ])
+ def test_threaded_conversion(self):
+ df = _alltypes_example()
+ _check_pandas_roundtrip(df, nthreads=2)
+ _check_pandas_roundtrip(df, nthreads=2, as_batch=True)
- result = pa.Array.from_pandas(data, mask=m)
+ def test_category(self):
+ repeats = 5
+ v1 = ['foo', None, 'bar', 'qux', np.nan]
+ v2 = [4, 5, 6, 7, 8]
+ v3 = [b'foo', None, b'bar', b'qux', np.nan]
+ df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
+ 'cat_ints': pd.Categorical(v2 * repeats),
+ 'cat_binary': pd.Categorical(v3 * repeats),
+ 'cat_strings_ordered': pd.Categorical(
+ v1 * repeats, categories=['bar', 'qux', 'foo'],
+ ordered=True),
+ 'ints': v2 * repeats,
+ 'ints2': v2 * repeats,
+ 'strings': v1 * repeats,
+ 'strings2': v1 * repeats,
+ 'strings3': v3 * repeats})
+ _check_pandas_roundtrip(df)
- expected = pd.Series([None, date(1991, 1, 1), None])
- assert pa.Array.from_pandas(expected).equals(result)
+ arrays = [
+ pd.Categorical(v1 * repeats),
+ pd.Categorical(v2 * repeats),
+ pd.Categorical(v3 * repeats)
+ ]
+ for values in arrays:
+ _check_array_roundtrip(values)
- @pytest.mark.parametrize('t,data,expected', [
- (
- pa.int64,
- [[1, 2], [3], None],
- [None, [3], None]
- ),
- (
- pa.string,
- [[u'aaa', u'bb'], [u'c'], None],
- [None, [u'c'], None]
- ),
- (
- pa.null,
- [[None, None], [None], None],
- [None, [None], None]
- )
- ])
- def test_array_from_pandas_typed_array_with_mask(self, t, data, expected):
- m = np.array([True, False, True])
+ def test_mixed_types_fails(self):
+ data = pd.DataFrame({'a': ['a', 1, 2.0]})
+ with pytest.raises(pa.ArrowException):
+ pa.Table.from_pandas(data)
- s = pd.Series(data)
- result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t()))
+ data = pd.DataFrame({'a': [1, True]})
+ with pytest.raises(pa.ArrowException):
+ pa.Table.from_pandas(data)
- assert pa.Array.from_pandas(expected,
- type=pa.list_(t())).equals(result)
+ def test_strided_data_import(self):
+ cases = []
- def test_table_column_subset_metadata(self):
- # ARROW-1883
- df = pd.DataFrame({
- 'a': [1, 2, 3],
- 'b': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
- table = pa.Table.from_pandas(df)
+ columns = ['a', 'b', 'c']
+ N, K = 100, 3
+ random_numbers = np.random.randn(N, K).copy() * 100
- table_subset = table.remove_column(1)
- result = table_subset.to_pandas()
- tm.assert_frame_equal(result, df[['a']])
+ numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8',
+ 'f4', 'f8']
- table_subset2 = table_subset.remove_column(1)
- result = table_subset2.to_pandas()
- tm.assert_frame_equal(result, df[['a']])
+ for type_name in numeric_dtypes:
+ cases.append(random_numbers.astype(type_name))
- # non-default index
- for index in [
- pd.Index(['a', 'b', 'c'], name='index'),
- pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]:
- df = pd.DataFrame({'a': [1, 2, 3],
- 'b': [.1, .2, .3]}, index=index)
- table = pa.Table.from_pandas(df)
+ # strings
+ cases.append(np.array([tm.rands(10) for i in range(N * K)],
+ dtype=object)
+ .reshape(N, K).copy())
- table_subset = table.remove_column(1)
- result = table_subset.to_pandas()
- tm.assert_frame_equal(result, df[['a']])
+ # booleans
+ boolean_objects = (np.array([True, False, True] * N, dtype=object)
+ .reshape(N, K).copy())
- table_subset2 = table_subset.remove_column(1)
- result = table_subset2.to_pandas()
- tm.assert_frame_equal(result, df[['a']].reset_index(drop=True))
+ # add some nulls, so dtype comes back as objects
+ boolean_objects[5] = None
+ cases.append(boolean_objects)
- def test_empty_list_roundtrip(self):
- empty_list_array = np.empty((3,), dtype=object)
- empty_list_array.fill([])
+ cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
+ dtype='datetime64[ms]')
+ .reshape(N, K).copy())
- df = pd.DataFrame({'a': np.array(['1', '2', '3']),
- 'b': empty_list_array})
- tbl = pa.Table.from_pandas(df)
+ strided_mask = (random_numbers > 0).astype(bool)[:, 0]
- result = tbl.to_pandas()
+ for case in cases:
+ df = pd.DataFrame(case, columns=columns)
+ col = df['a']
- tm.assert_frame_equal(result, df)
+ _check_pandas_roundtrip(df)
+ _check_array_roundtrip(col)
+ _check_array_roundtrip(col, mask=strided_mask)
- def test_empty_list_metadata(self):
- # Create table with array of empty lists, forced to have type
- # list(string) in pyarrow
- c1 = [["test"], ["a", "b"], None]
- c2 = [[], [], []]
- arrays = OrderedDict([
- ('c1', pa.array(c1, type=pa.list_(pa.string()))),
- ('c2', pa.array(c2, type=pa.list_(pa.string()))),
+ def test_all_nones(self):
+ def _check_series(s):
+ converted = pa.array(s)
+ assert isinstance(converted, pa.NullArray)
+ assert len(converted) == 3
+ assert converted.null_count == 3
+ assert converted[0] is pa.NA
+
+ _check_series(pd.Series([None] * 3, dtype=object))
+ _check_series(pd.Series([np.nan] * 3, dtype=object))
+ _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object))
+
+ def test_partial_schema(self):
+ data = OrderedDict([
+ ('a', [0, 1, 2, 3, 4]),
+ ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)),
+ ('c', [-10, -5, 0, 5, 10])
])
- rb = pa.RecordBatch.from_arrays(
- list(arrays.values()),
- list(arrays.keys())
- )
- tbl = pa.Table.from_batches([rb])
+ df = pd.DataFrame(data)
- # First roundtrip changes schema, because pandas cannot preserve the
- # type of empty lists
- df = tbl.to_pandas()
- tbl2 = pa.Table.from_pandas(df, preserve_index=True)
- md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8'))
+ partial_schema = pa.schema([
+ pa.field('a', pa.int64()),
+ pa.field('b', pa.int32())
+ ])
- # Second roundtrip
- df2 = tbl2.to_pandas()
- expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)]))
+ expected_schema = pa.schema([
+ pa.field('a', pa.int64()),
+ pa.field('b', pa.int32()),
+ pa.field('c', pa.int64())
+ ])
- tm.assert_frame_equal(df2, expected)
+ _check_pandas_roundtrip(df, schema=partial_schema,
+ expected_schema=expected_schema)
- assert md2['columns'] == [
- {
- 'name': 'c1',
- 'field_name': 'c1',
- 'metadata': None,
- 'numpy_type': 'object',
- 'pandas_type': 'list[unicode]',
- },
- {
- 'name': 'c2',
- 'field_name': 'c2',
- 'metadata': None,
- 'numpy_type': 'object',
- 'pandas_type': 'list[empty]',
- },
- {
- 'name': None,
- 'field_name': '__index_level_0__',
- 'metadata': None,
- 'numpy_type': 'int64',
- 'pandas_type': 'int64',
- }
- ]
+ def test_table_batch_empty_dataframe(self):
+ df = pd.DataFrame({})
+ _check_pandas_roundtrip(df)
+ _check_pandas_roundtrip(df, as_batch=True)
+
+ df2 = pd.DataFrame({}, index=[0, 1, 2])
+ _check_pandas_roundtrip(df2, preserve_index=True)
+ _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True)
+
+ def test_convert_empty_table(self):
+ arr = pa.array([], type=pa.int64())
+ tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=np.int64))
+ arr = pa.array([], type=pa.string())
+ tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
+ arr = pa.array([], type=pa.list_(pa.int64()))
+ tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
+ arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())]))
+ tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object))
def _fully_loaded_dataframe_example():
--
To stop receiving notification emails like this one, please contact
[email protected].