This is an automated email from the ASF dual-hosted git repository. wesm pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new c78c92a ARROW-2149: [Python] Reorganize test_convert_pandas.py c78c92a is described below commit c78c92a04f3db0bd7dd9a34e6f36743050871acc Author: Antoine Pitrou <anto...@python.org> AuthorDate: Tue Feb 13 11:19:44 2018 -0500 ARROW-2149: [Python] Reorganize test_convert_pandas.py This PR simply shuffles things around, reorganizing the test methods under several classes. Author: Antoine Pitrou <anto...@python.org> Closes #1601 from pitrou/ARROW-2149-reorganize-test-convert-pandas and squashes the following commits: 821c59be [Antoine Pitrou] ARROW-2149: [Python] Reorganize test_convert_pandas.py --- python/pyarrow/tests/test_convert_pandas.py | 1447 ++++++++++++++------------- 1 file changed, 749 insertions(+), 698 deletions(-) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 987ac23..5b6f6bc 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -117,41 +117,10 @@ def _check_array_from_pandas_roundtrip(np_array): npt.assert_array_equal(result, np_array) -class TestPandasConversion(object): - - type_pairs = [ - (np.int8, pa.int8()), - (np.int16, pa.int16()), - (np.int32, pa.int32()), - (np.int64, pa.int64()), - (np.uint8, pa.uint8()), - (np.uint16, pa.uint16()), - (np.uint32, pa.uint32()), - (np.uint64, pa.uint64()), - # (np.float16, pa.float16()), # XXX unsupported - (np.float32, pa.float32()), - (np.float64, pa.float64()), - # XXX unsupported - # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])), - (np.object, pa.string()), - # (np.object, pa.binary()), # XXX unsupported - (np.object, pa.binary(10)), - (np.object, pa.list_(pa.int64())), - ] - - def test_all_none_objects(self): - df = pd.DataFrame({'a': [None, None, None]}) - _check_pandas_roundtrip(df) - - def test_all_none_category(self): - df = pd.DataFrame({'a': [None, None, None]}) - df['a'] = df['a'].astype('category') - _check_pandas_roundtrip(df) - - def test_empty_arrays(self): - for dtype, pa_type in self.type_pairs: - arr = np.array([], dtype=dtype) - _check_array_roundtrip(arr, type=pa_type) +class TestConvertMetadata(object): + """ + Conversion tests for Pandas metadata & indices. + """ def test_non_string_columns(self): df = pd.DataFrame({0: [1, 2, 3]}) @@ -302,25 +271,6 @@ class TestPandasConversion(object): _check_pandas_roundtrip(df, preserve_index=True) - def test_float_no_nulls(self): - data = {} - fields = [] - dtypes = [('f4', pa.float32()), ('f8', pa.float64())] - num_values = 100 - - for numpy_dtype, arrow_dtype in dtypes: - values = np.random.randn(num_values) - data[numpy_dtype] = values.astype(numpy_dtype) - fields.append(pa.field(numpy_dtype, arrow_dtype)) - - df = pd.DataFrame(data) - schema = pa.schema(fields) - _check_pandas_roundtrip(df, expected_schema=schema) - - def test_zero_copy_success(self): - result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True) - npt.assert_array_equal(result, [0, 1, 2]) - def test_duplicate_column_names_does_not_crash(self): df = pd.DataFrame([(1, 'a'), (2, 'b')], columns=list('aa')) with pytest.raises(ValueError): @@ -341,44 +291,187 @@ class TestPandasConversion(object): with pytest.raises(pa.ArrowException): table.to_pandas() - def test_zero_copy_dictionaries(self): - arr = pa.DictionaryArray.from_arrays( - np.array([0, 0]), - np.array([5])) + def test_unicode_with_unicode_column_and_index(self): + df = pd.DataFrame({u'あ': [u'い']}, index=[u'う']) - result = arr.to_pandas(zero_copy_only=True) - values = pd.Categorical([5, 5]) + _check_pandas_roundtrip(df, preserve_index=True) - tm.assert_series_equal(pd.Series(result), pd.Series(values), - check_names=False) + def test_mixed_unicode_column_names(self): + df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う']) - def test_zero_copy_failure_on_object_types(self): - with pytest.raises(pa.ArrowException): - pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True) + # TODO(phillipc): Should this raise? + with pytest.raises(AssertionError): + _check_pandas_roundtrip(df, preserve_index=True) - def test_zero_copy_failure_with_int_when_nulls(self): - with pytest.raises(pa.ArrowException): - pa.array([0, 1, None]).to_pandas(zero_copy_only=True) + def test_binary_column_name(self): + column_data = [u'い'] + data = {u'あ'.encode('utf8'): column_data} + df = pd.DataFrame(data) - def test_zero_copy_failure_with_float_when_nulls(self): - with pytest.raises(pa.ArrowException): - pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True) + # we can't use _check_pandas_roundtrip here because our metdata + # is always decoded as utf8: even if binary goes in, utf8 comes out + t = pa.Table.from_pandas(df, preserve_index=True) + df2 = t.to_pandas() + assert df.values[0] == df2.values[0] + assert df.index.values[0] == df2.index.values[0] + assert df.columns[0] == df2.columns[0].encode('utf8') - def test_zero_copy_failure_on_bool_types(self): - with pytest.raises(pa.ArrowException): - pa.array([True, False]).to_pandas(zero_copy_only=True) + def test_multiindex_duplicate_values(self): + num_rows = 3 + numbers = list(range(num_rows)) + index = pd.MultiIndex.from_arrays( + [['foo', 'foo', 'bar'], numbers], + names=['foobar', 'some_numbers'], + ) - def test_zero_copy_failure_on_list_types(self): - arr = np.array([[1, 2], [8, 9]], dtype=object) + df = pd.DataFrame({'numbers': numbers}, index=index) - with pytest.raises(pa.ArrowException): - pa.array(arr).to_pandas(zero_copy_only=True) + table = pa.Table.from_pandas(df) + result_df = table.to_pandas() + tm.assert_frame_equal(result_df, df) - def test_zero_copy_failure_on_timestamp_types(self): - arr = np.array(['2007-07-13'], dtype='datetime64[ns]') + def test_metadata_with_mixed_types(self): + df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']}) + table = pa.Table.from_pandas(df) + metadata = table.schema.metadata + assert b'mixed' not in metadata[b'pandas'] - with pytest.raises(pa.ArrowException): - pa.array(arr).to_pandas(zero_copy_only=True) + js = json.loads(metadata[b'pandas'].decode('utf8')) + data_column = js['columns'][0] + assert data_column['pandas_type'] == 'bytes' + assert data_column['numpy_type'] == 'object' + + def test_list_metadata(self): + df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]}) + schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))]) + table = pa.Table.from_pandas(df, schema=schema) + metadata = table.schema.metadata + assert b'mixed' not in metadata[b'pandas'] + + js = json.loads(metadata[b'pandas'].decode('utf8')) + data_column = js['columns'][0] + assert data_column['pandas_type'] == 'list[int64]' + assert data_column['numpy_type'] == 'object' + + def test_decimal_metadata(self): + expected = pd.DataFrame({ + 'decimals': [ + decimal.Decimal('394092382910493.12341234678'), + -decimal.Decimal('314292388910493.12343437128'), + ] + }) + table = pa.Table.from_pandas(expected) + metadata = table.schema.metadata + assert b'mixed' not in metadata[b'pandas'] + + js = json.loads(metadata[b'pandas'].decode('utf8')) + data_column = js['columns'][0] + assert data_column['pandas_type'] == 'decimal' + assert data_column['numpy_type'] == 'object' + assert data_column['metadata'] == {'precision': 26, 'scale': 11} + + def test_table_column_subset_metadata(self): + # ARROW-1883 + df = pd.DataFrame({ + 'a': [1, 2, 3], + 'b': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) + table = pa.Table.from_pandas(df) + + table_subset = table.remove_column(1) + result = table_subset.to_pandas() + tm.assert_frame_equal(result, df[['a']]) + + table_subset2 = table_subset.remove_column(1) + result = table_subset2.to_pandas() + tm.assert_frame_equal(result, df[['a']]) + + # non-default index + for index in [ + pd.Index(['a', 'b', 'c'], name='index'), + pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]: + df = pd.DataFrame({'a': [1, 2, 3], + 'b': [.1, .2, .3]}, index=index) + table = pa.Table.from_pandas(df) + + table_subset = table.remove_column(1) + result = table_subset.to_pandas() + tm.assert_frame_equal(result, df[['a']]) + + table_subset2 = table_subset.remove_column(1) + result = table_subset2.to_pandas() + tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) + + def test_empty_list_metadata(self): + # Create table with array of empty lists, forced to have type + # list(string) in pyarrow + c1 = [["test"], ["a", "b"], None] + c2 = [[], [], []] + arrays = OrderedDict([ + ('c1', pa.array(c1, type=pa.list_(pa.string()))), + ('c2', pa.array(c2, type=pa.list_(pa.string()))), + ]) + rb = pa.RecordBatch.from_arrays( + list(arrays.values()), + list(arrays.keys()) + ) + tbl = pa.Table.from_batches([rb]) + + # First roundtrip changes schema, because pandas cannot preserve the + # type of empty lists + df = tbl.to_pandas() + tbl2 = pa.Table.from_pandas(df, preserve_index=True) + md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8')) + + # Second roundtrip + df2 = tbl2.to_pandas() + expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)])) + + tm.assert_frame_equal(df2, expected) + + assert md2['columns'] == [ + { + 'name': 'c1', + 'field_name': 'c1', + 'metadata': None, + 'numpy_type': 'object', + 'pandas_type': 'list[unicode]', + }, + { + 'name': 'c2', + 'field_name': 'c2', + 'metadata': None, + 'numpy_type': 'object', + 'pandas_type': 'list[empty]', + }, + { + 'name': None, + 'field_name': '__index_level_0__', + 'metadata': None, + 'numpy_type': 'int64', + 'pandas_type': 'int64', + } + ] + + +class TestConvertPrimitiveTypes(object): + """ + Conversion tests for primitive (e.g. numeric) types. + """ + + def test_float_no_nulls(self): + data = {} + fields = [] + dtypes = [('f4', pa.float32()), ('f8', pa.float64())] + num_values = 100 + + for numpy_dtype, arrow_dtype in dtypes: + values = np.random.randn(num_values) + data[numpy_dtype] = values.astype(numpy_dtype) + fields.append(pa.field(numpy_dtype, arrow_dtype)) + + df = pd.DataFrame(data) + schema = pa.schema(fields) + _check_pandas_roundtrip(df, expected_schema=schema) def test_float_nulls(self): num_values = 100 @@ -408,24 +501,6 @@ class TestPandasConversion(object): result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) - def test_float_object_nulls(self): - arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) - df = pd.DataFrame({'floats': arr}) - expected = pd.DataFrame({'floats': pd.to_numeric(arr)}) - field = pa.field('floats', pa.float64()) - schema = pa.schema([field]) - _check_pandas_roundtrip(df, expected=expected, - expected_schema=schema) - - def test_int_object_nulls(self): - arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) - df = pd.DataFrame({'ints': arr}) - expected = pd.DataFrame({'ints': pd.to_numeric(arr)}) - field = pa.field('ints', pa.int64()) - schema = pa.schema([field]) - _check_pandas_roundtrip(df, expected=expected, - expected_schema=schema) - def test_integer_no_nulls(self): data = OrderedDict() fields = [] @@ -522,6 +597,24 @@ class TestPandasConversion(object): tm.assert_frame_equal(result, ex_frame) + def test_float_object_nulls(self): + arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object) + df = pd.DataFrame({'floats': arr}) + expected = pd.DataFrame({'floats': pd.to_numeric(arr)}) + field = pa.field('floats', pa.float64()) + schema = pa.schema([field]) + _check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) + + def test_int_object_nulls(self): + arr = np.array([None, 1, np.int64(3)] * 5, dtype=object) + df = pd.DataFrame({'ints': arr}) + expected = pd.DataFrame({'ints': pd.to_numeric(arr)}) + field = pa.field('ints', pa.int64()) + schema = pa.schema([field]) + _check_pandas_roundtrip(df, expected=expected, + expected_schema=schema) + def test_boolean_object_nulls(self): arr = np.array([False, None, True] * 100, dtype=object) df = pd.DataFrame({'bools': arr}) @@ -540,81 +633,11 @@ class TestPandasConversion(object): _check_type(pa.int32()) _check_type(pa.float64()) - def test_unicode(self): - repeats = 1000 - values = [u'foo', None, u'bar', u'mañana', np.nan] - df = pd.DataFrame({'strings': values * repeats}) - field = pa.field('strings', pa.string()) - schema = pa.schema([field]) - - _check_pandas_roundtrip(df, expected_schema=schema) - - def test_unicode_with_unicode_column_and_index(self): - df = pd.DataFrame({u'あ': [u'い']}, index=[u'う']) - - _check_pandas_roundtrip(df, preserve_index=True) - - def test_mixed_unicode_column_names(self): - df = pd.DataFrame({u'あ': [u'い'], b'a': 1}, index=[u'う']) - - # TODO(phillipc): Should this raise? - with pytest.raises(AssertionError): - _check_pandas_roundtrip(df, preserve_index=True) - - def test_binary_column_name(self): - column_data = [u'い'] - data = {u'あ'.encode('utf8'): column_data} - df = pd.DataFrame(data) - - # we can't use _check_pandas_roundtrip here because our metdata - # is always decoded as utf8: even if binary goes in, utf8 comes out - t = pa.Table.from_pandas(df, preserve_index=True) - df2 = t.to_pandas() - assert df.values[0] == df2.values[0] - assert df.index.values[0] == df2.index.values[0] - assert df.columns[0] == df2.columns[0].encode('utf8') - - def test_bytes_to_binary(self): - values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] - df = pd.DataFrame({'strings': values}) - - table = pa.Table.from_pandas(df) - assert table[0].type == pa.binary() - - values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan] - expected = pd.DataFrame({'strings': values2}) - _check_pandas_roundtrip(df, expected) - - @pytest.mark.large_memory - def test_bytes_exceed_2gb(self): - val = 'x' * (1 << 20) - df = pd.DataFrame({ - 'strings': np.array([val] * 4000, dtype=object) - }) - arr = pa.array(df['strings']) - assert isinstance(arr, pa.ChunkedArray) - assert arr.num_chunks == 2 - arr = None - - table = pa.Table.from_pandas(df) - assert table[0].data.num_chunks == 2 - - def test_fixed_size_bytes(self): - values = [b'foo', None, b'bar', None, None, b'hey'] - df = pd.DataFrame({'strings': values}) - schema = pa.schema([pa.field('strings', pa.binary(3))]) - table = pa.Table.from_pandas(df, schema=schema) - assert table.schema[0].type == schema[0].type - assert table.schema[0].name == schema[0].name - result = table.to_pandas() - tm.assert_frame_equal(result, df) - def test_fixed_size_bytes_does_not_accept_varying_lengths(self): - values = [b'foo', None, b'ba', None, None, b'hey'] - df = pd.DataFrame({'strings': values}) - schema = pa.schema([pa.field('strings', pa.binary(3))]) - with pytest.raises(pa.ArrowInvalid): - pa.Table.from_pandas(df, schema=schema) +class TestConvertDateTimeLikeTypes(object): + """ + Conversion tests for datetime- and timestamp-like types (date64, etc.). + """ def test_timestamps_notimezone_no_nulls(self): df = pd.DataFrame({ @@ -788,205 +811,257 @@ class TestPandasConversion(object): }) pa.Table.from_pandas(df) - def test_column_of_arrays(self): - df, schema = dataframe_with_arrays() - _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) - table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) - assert table.schema.equals(schema) + def test_pytime_from_pandas(self): + pytimes = [time(1, 2, 3, 1356), + time(4, 5, 6, 1356)] - for column in df.columns: - field = schema.field_by_name(column) - _check_array_roundtrip(df[column], type=field.type) + # microseconds + t1 = pa.time64('us') - def test_column_of_arrays_to_py(self): - # Test regression in ARROW-1199 not caught in above test - dtype = 'i1' - arr = np.array([ - np.arange(10, dtype=dtype), - np.arange(5, dtype=dtype), - None, - np.arange(1, dtype=dtype) - ]) - type_ = pa.list_(pa.int8()) - parr = pa.array(arr, type=type_) + aobjs = np.array(pytimes + [None], dtype=object) + parr = pa.array(aobjs) + assert parr.type == t1 + assert parr[0].as_py() == pytimes[0] + assert parr[1].as_py() == pytimes[1] + assert parr[2] is pa.NA - assert parr[0].as_py() == list(range(10)) - assert parr[1].as_py() == list(range(5)) - assert parr[2].as_py() is None - assert parr[3].as_py() == [0] + # DataFrame + df = pd.DataFrame({'times': aobjs}) + batch = pa.RecordBatch.from_pandas(df) + assert batch[0].equals(parr) - def test_column_of_lists(self): - df, schema = dataframe_with_lists() - _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) - table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) - assert table.schema.equals(schema) + # Test ndarray of int64 values + arr = np.array([_pytime_to_micros(v) for v in pytimes], + dtype='int64') - for column in df.columns: - field = schema.field_by_name(column) - _check_array_roundtrip(df[column], type=field.type) + a1 = pa.array(arr, type=pa.time64('us')) + assert a1[0].as_py() == pytimes[0] - def test_column_of_lists_first_empty(self): - # ARROW-2124 - num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]] - series = pd.Series([np.array(s, dtype=float) for s in num_lists]) - arr = pa.array(series) - result = pd.Series(arr.to_pandas()) - tm.assert_series_equal(result, series) + a2 = pa.array(arr * 1000, type=pa.time64('ns')) + assert a2[0].as_py() == pytimes[0] - def test_column_of_lists_chunked(self): - # ARROW-1357 - df = pd.DataFrame({ - 'lists': np.array([ - [1, 2], - None, - [2, 3], - [4, 5], - [6, 7], - [8, 9] - ], dtype=object) - }) + a3 = pa.array((arr / 1000).astype('i4'), + type=pa.time32('ms')) + assert a3[0].as_py() == pytimes[0].replace(microsecond=1000) - schema = pa.schema([ - pa.field('lists', pa.list_(pa.int64())) - ]) + a4 = pa.array((arr / 1000000).astype('i4'), + type=pa.time32('s')) + assert a4[0].as_py() == pytimes[0].replace(microsecond=0) - t1 = pa.Table.from_pandas(df[:2], schema=schema) - t2 = pa.Table.from_pandas(df[2:], schema=schema) + def test_arrow_time_to_pandas(self): + pytimes = [time(1, 2, 3, 1356), + time(4, 5, 6, 1356), + time(0, 0, 0)] - table = pa.concat_tables([t1, t2]) - result = table.to_pandas() + expected = np.array(pytimes[:2] + [None]) + expected_ms = np.array([x.replace(microsecond=1000) + for x in pytimes[:2]] + + [None]) + expected_s = np.array([x.replace(microsecond=0) + for x in pytimes[:2]] + + [None]) - tm.assert_frame_equal(result, df) + arr = np.array([_pytime_to_micros(v) for v in pytimes], + dtype='int64') + arr = np.array([_pytime_to_micros(v) for v in pytimes], + dtype='int64') - def test_column_of_lists_chunked2(self): - data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], - [12, 13], [14, 15], [16, 17]] - data2 = [[8, 9], [18, 19]] + null_mask = np.array([False, False, True], dtype=bool) - a1 = pa.array(data1) - a2 = pa.array(data2) + a1 = pa.array(arr, mask=null_mask, type=pa.time64('us')) + a2 = pa.array(arr * 1000, mask=null_mask, + type=pa.time64('ns')) - t1 = pa.Table.from_arrays([a1], names=['a']) - t2 = pa.Table.from_arrays([a2], names=['a']) + a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask, + type=pa.time32('ms')) + a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask, + type=pa.time32('s')) - concatenated = pa.concat_tables([t1, t2]) + names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]'] + batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names) + arr = a1.to_pandas() + assert (arr == expected).all() - result = concatenated.to_pandas() - expected = pd.DataFrame({'a': data1 + data2}) + arr = a2.to_pandas() + assert (arr == expected).all() - tm.assert_frame_equal(result, expected) + arr = a3.to_pandas() + assert (arr == expected_ms).all() - def test_column_of_lists_strided(self): - df, schema = dataframe_with_lists() - df = pd.concat([df] * 6, ignore_index=True) + arr = a4.to_pandas() + assert (arr == expected_s).all() - arr = df['int64'].values[::3] - assert arr.strides[0] != 8 + df = batch.to_pandas() + expected_df = pd.DataFrame({'time64[us]': expected, + 'time64[ns]': expected, + 'time32[ms]': expected_ms, + 'time32[s]': expected_s}, + columns=names) - _check_array_roundtrip(arr) + tm.assert_frame_equal(df, expected_df) - def test_nested_lists_all_none(self): - data = np.array([[None, None], None], dtype=object) + def test_numpy_datetime64_columns(self): + datetime64_ns = np.array([ + '2007-07-13T01:23:34.123456789', + None, + '2006-01-13T12:34:56.432539784', + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ns]') + _check_array_from_pandas_roundtrip(datetime64_ns) - arr = pa.array(data) - expected = pa.array(list(data)) - assert arr.equals(expected) - assert arr.type == pa.list_(pa.null()) + datetime64_us = np.array([ + '2007-07-13T01:23:34.123456', + None, + '2006-01-13T12:34:56.432539', + '2010-08-13T05:46:57.437699'], + dtype='datetime64[us]') + _check_array_from_pandas_roundtrip(datetime64_us) - data2 = np.array([None, None, [None, None], - np.array([None, None], dtype=object)], - dtype=object) - arr = pa.array(data2) - expected = pa.array([None, None, [None, None], [None, None]]) - assert arr.equals(expected) + datetime64_ms = np.array([ + '2007-07-13T01:23:34.123', + None, + '2006-01-13T12:34:56.432', + '2010-08-13T05:46:57.437'], + dtype='datetime64[ms]') + _check_array_from_pandas_roundtrip(datetime64_ms) - def test_nested_lists_all_empty(self): - # ARROW-2128 - data = pd.Series([[], [], []]) - arr = pa.array(data) - expected = pa.array(list(data)) - assert arr.equals(expected) - assert arr.type == pa.list_(pa.null()) + datetime64_s = np.array([ + '2007-07-13T01:23:34', + None, + '2006-01-13T12:34:56', + '2010-08-13T05:46:57'], + dtype='datetime64[s]') + _check_array_from_pandas_roundtrip(datetime64_s) - def test_threaded_conversion(self): - df = _alltypes_example() - _check_pandas_roundtrip(df, nthreads=2) - _check_pandas_roundtrip(df, nthreads=2, as_batch=True) + def test_numpy_datetime64_day_unit(self): + datetime64_d = np.array([ + '2007-07-13', + None, + '2006-01-15', + '2010-08-19'], + dtype='datetime64[D]') + _check_array_from_pandas_roundtrip(datetime64_d) - def test_category(self): - repeats = 5 - v1 = ['foo', None, 'bar', 'qux', np.nan] - v2 = [4, 5, 6, 7, 8] - v3 = [b'foo', None, b'bar', b'qux', np.nan] - df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats), - 'cat_ints': pd.Categorical(v2 * repeats), - 'cat_binary': pd.Categorical(v3 * repeats), - 'cat_strings_ordered': pd.Categorical( - v1 * repeats, categories=['bar', 'qux', 'foo'], - ordered=True), - 'ints': v2 * repeats, - 'ints2': v2 * repeats, - 'strings': v1 * repeats, - 'strings2': v1 * repeats, - 'strings3': v3 * repeats}) - _check_pandas_roundtrip(df) + def test_array_from_pandas_date_with_mask(self): + m = np.array([True, False, True]) + data = pd.Series([ + date(1990, 1, 1), + date(1991, 1, 1), + date(1992, 1, 1) + ]) - arrays = [ - pd.Categorical(v1 * repeats), - pd.Categorical(v2 * repeats), - pd.Categorical(v3 * repeats) - ] - for values in arrays: - _check_array_roundtrip(values) + result = pa.Array.from_pandas(data, mask=m) - def test_mixed_types_fails(self): - data = pd.DataFrame({'a': ['a', 1, 2.0]}) - with pytest.raises(pa.ArrowException): - pa.Table.from_pandas(data) + expected = pd.Series([None, date(1991, 1, 1), None]) + assert pa.Array.from_pandas(expected).equals(result) - data = pd.DataFrame({'a': [1, True]}) - with pytest.raises(pa.ArrowException): - pa.Table.from_pandas(data) - def test_strided_data_import(self): - cases = [] +class TestConvertStringLikeTypes(object): + """ + Conversion tests for string and binary types. + """ - columns = ['a', 'b', 'c'] - N, K = 100, 3 - random_numbers = np.random.randn(N, K).copy() * 100 + def test_unicode(self): + repeats = 1000 + values = [u'foo', None, u'bar', u'mañana', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) - numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', - 'f4', 'f8'] + _check_pandas_roundtrip(df, expected_schema=schema) - for type_name in numeric_dtypes: - cases.append(random_numbers.astype(type_name)) + def test_bytes_to_binary(self): + values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values}) - # strings - cases.append(np.array([tm.rands(10) for i in range(N * K)], - dtype=object) - .reshape(N, K).copy()) + table = pa.Table.from_pandas(df) + assert table[0].type == pa.binary() - # booleans - boolean_objects = (np.array([True, False, True] * N, dtype=object) - .reshape(N, K).copy()) + values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan] + expected = pd.DataFrame({'strings': values2}) + _check_pandas_roundtrip(df, expected) - # add some nulls, so dtype comes back as objects - boolean_objects[5] = None - cases.append(boolean_objects) + @pytest.mark.large_memory + def test_bytes_exceed_2gb(self): + val = 'x' * (1 << 20) + df = pd.DataFrame({ + 'strings': np.array([val] * 4000, dtype=object) + }) + arr = pa.array(df['strings']) + assert isinstance(arr, pa.ChunkedArray) + assert arr.num_chunks == 2 + arr = None - cases.append(np.arange("2016-01-01T00:00:00.001", N * K, - dtype='datetime64[ms]') - .reshape(N, K).copy()) + table = pa.Table.from_pandas(df) + assert table[0].data.num_chunks == 2 - strided_mask = (random_numbers > 0).astype(bool)[:, 0] + def test_fixed_size_bytes(self): + values = [b'foo', None, b'bar', None, None, b'hey'] + df = pd.DataFrame({'strings': values}) + schema = pa.schema([pa.field('strings', pa.binary(3))]) + table = pa.Table.from_pandas(df, schema=schema) + assert table.schema[0].type == schema[0].type + assert table.schema[0].name == schema[0].name + result = table.to_pandas() + tm.assert_frame_equal(result, df) - for case in cases: - df = pd.DataFrame(case, columns=columns) - col = df['a'] + def test_fixed_size_bytes_does_not_accept_varying_lengths(self): + values = [b'foo', None, b'ba', None, None, b'hey'] + df = pd.DataFrame({'strings': values}) + schema = pa.schema([pa.field('strings', pa.binary(3))]) + with pytest.raises(pa.ArrowInvalid): + pa.Table.from_pandas(df, schema=schema) - _check_pandas_roundtrip(df) - _check_array_roundtrip(col) - _check_array_roundtrip(col, mask=strided_mask) + def test_table_empty_str(self): + values = ['', '', '', '', ''] + df = pd.DataFrame({'strings': values}) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) + table = pa.Table.from_pandas(df, schema=schema) + + result1 = table.to_pandas(strings_to_categorical=False) + expected1 = pd.DataFrame({'strings': values}) + tm.assert_frame_equal(result1, expected1, check_dtype=True) + + result2 = table.to_pandas(strings_to_categorical=True) + expected2 = pd.DataFrame({'strings': pd.Categorical(values)}) + tm.assert_frame_equal(result2, expected2, check_dtype=True) + + def test_table_str_to_categorical_without_na(self): + values = ['a', 'a', 'b', 'b', 'c'] + df = pd.DataFrame({'strings': values}) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) + table = pa.Table.from_pandas(df, schema=schema) + + result = table.to_pandas(strings_to_categorical=True) + expected = pd.DataFrame({'strings': pd.Categorical(values)}) + tm.assert_frame_equal(result, expected, check_dtype=True) + + with pytest.raises(pa.ArrowInvalid): + table.to_pandas(strings_to_categorical=True, + zero_copy_only=True) + + def test_table_str_to_categorical_with_na(self): + values = [None, 'a', 'b', np.nan] + df = pd.DataFrame({'strings': values}) + field = pa.field('strings', pa.string()) + schema = pa.schema([field]) + table = pa.Table.from_pandas(df, schema=schema) + + result = table.to_pandas(strings_to_categorical=True) + expected = pd.DataFrame({'strings': pd.Categorical(values)}) + tm.assert_frame_equal(result, expected, check_dtype=True) + + with pytest.raises(pa.ArrowInvalid): + table.to_pandas(strings_to_categorical=True, + zero_copy_only=True) + + +class TestConvertDecimalTypes(object): + """ + Conversion test for decimal types. + """ def test_decimal_32_from_pandas(self): expected = pd.DataFrame({ @@ -1057,201 +1132,131 @@ class TestPandasConversion(object): df = converted.to_pandas() tm.assert_frame_equal(df, expected) - def test_pytime_from_pandas(self): - pytimes = [time(1, 2, 3, 1356), - time(4, 5, 6, 1356)] - - # microseconds - t1 = pa.time64('us') - aobjs = np.array(pytimes + [None], dtype=object) - parr = pa.array(aobjs) - assert parr.type == t1 - assert parr[0].as_py() == pytimes[0] - assert parr[1].as_py() == pytimes[1] - assert parr[2] is pa.NA +class TestListTypes(object): + """ + Conversion tests for list<> types. + """ - # DataFrame - df = pd.DataFrame({'times': aobjs}) - batch = pa.RecordBatch.from_pandas(df) - assert batch[0].equals(parr) + def test_column_of_arrays(self): + df, schema = dataframe_with_arrays() + _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) + table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) + assert table.schema.equals(schema) - # Test ndarray of int64 values - arr = np.array([_pytime_to_micros(v) for v in pytimes], - dtype='int64') + for column in df.columns: + field = schema.field_by_name(column) + _check_array_roundtrip(df[column], type=field.type) - a1 = pa.array(arr, type=pa.time64('us')) - assert a1[0].as_py() == pytimes[0] + def test_column_of_arrays_to_py(self): + # Test regression in ARROW-1199 not caught in above test + dtype = 'i1' + arr = np.array([ + np.arange(10, dtype=dtype), + np.arange(5, dtype=dtype), + None, + np.arange(1, dtype=dtype) + ]) + type_ = pa.list_(pa.int8()) + parr = pa.array(arr, type=type_) - a2 = pa.array(arr * 1000, type=pa.time64('ns')) - assert a2[0].as_py() == pytimes[0] + assert parr[0].as_py() == list(range(10)) + assert parr[1].as_py() == list(range(5)) + assert parr[2].as_py() is None + assert parr[3].as_py() == [0] - a3 = pa.array((arr / 1000).astype('i4'), - type=pa.time32('ms')) - assert a3[0].as_py() == pytimes[0].replace(microsecond=1000) - - a4 = pa.array((arr / 1000000).astype('i4'), - type=pa.time32('s')) - assert a4[0].as_py() == pytimes[0].replace(microsecond=0) - - def test_arrow_time_to_pandas(self): - pytimes = [time(1, 2, 3, 1356), - time(4, 5, 6, 1356), - time(0, 0, 0)] - - expected = np.array(pytimes[:2] + [None]) - expected_ms = np.array([x.replace(microsecond=1000) - for x in pytimes[:2]] + - [None]) - expected_s = np.array([x.replace(microsecond=0) - for x in pytimes[:2]] + - [None]) - - arr = np.array([_pytime_to_micros(v) for v in pytimes], - dtype='int64') - arr = np.array([_pytime_to_micros(v) for v in pytimes], - dtype='int64') - - null_mask = np.array([False, False, True], dtype=bool) - - a1 = pa.array(arr, mask=null_mask, type=pa.time64('us')) - a2 = pa.array(arr * 1000, mask=null_mask, - type=pa.time64('ns')) - - a3 = pa.array((arr / 1000).astype('i4'), mask=null_mask, - type=pa.time32('ms')) - a4 = pa.array((arr / 1000000).astype('i4'), mask=null_mask, - type=pa.time32('s')) - - names = ['time64[us]', 'time64[ns]', 'time32[ms]', 'time32[s]'] - batch = pa.RecordBatch.from_arrays([a1, a2, a3, a4], names) - arr = a1.to_pandas() - assert (arr == expected).all() - - arr = a2.to_pandas() - assert (arr == expected).all() - - arr = a3.to_pandas() - assert (arr == expected_ms).all() - - arr = a4.to_pandas() - assert (arr == expected_s).all() + def test_column_of_lists(self): + df, schema = dataframe_with_lists() + _check_pandas_roundtrip(df, schema=schema, expected_schema=schema) + table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) + assert table.schema.equals(schema) - df = batch.to_pandas() - expected_df = pd.DataFrame({'time64[us]': expected, - 'time64[ns]': expected, - 'time32[ms]': expected_ms, - 'time32[s]': expected_s}, - columns=names) + for column in df.columns: + field = schema.field_by_name(column) + _check_array_roundtrip(df[column], type=field.type) - tm.assert_frame_equal(df, expected_df) + def test_column_of_lists_first_empty(self): + # ARROW-2124 + num_lists = [[], [2, 3, 4], [3, 6, 7, 8], [], [2]] + series = pd.Series([np.array(s, dtype=float) for s in num_lists]) + arr = pa.array(series) + result = pd.Series(arr.to_pandas()) + tm.assert_series_equal(result, series) - def test_numpy_datetime64_columns(self): - datetime64_ns = np.array([ - '2007-07-13T01:23:34.123456789', + def test_column_of_lists_chunked(self): + # ARROW-1357 + df = pd.DataFrame({ + 'lists': np.array([ + [1, 2], None, - '2006-01-13T12:34:56.432539784', - '2010-08-13T05:46:57.437699912'], - dtype='datetime64[ns]') - _check_array_from_pandas_roundtrip(datetime64_ns) + [2, 3], + [4, 5], + [6, 7], + [8, 9] + ], dtype=object) + }) - datetime64_us = np.array([ - '2007-07-13T01:23:34.123456', - None, - '2006-01-13T12:34:56.432539', - '2010-08-13T05:46:57.437699'], - dtype='datetime64[us]') - _check_array_from_pandas_roundtrip(datetime64_us) + schema = pa.schema([ + pa.field('lists', pa.list_(pa.int64())) + ]) - datetime64_ms = np.array([ - '2007-07-13T01:23:34.123', - None, - '2006-01-13T12:34:56.432', - '2010-08-13T05:46:57.437'], - dtype='datetime64[ms]') - _check_array_from_pandas_roundtrip(datetime64_ms) + t1 = pa.Table.from_pandas(df[:2], schema=schema) + t2 = pa.Table.from_pandas(df[2:], schema=schema) - datetime64_s = np.array([ - '2007-07-13T01:23:34', - None, - '2006-01-13T12:34:56', - '2010-08-13T05:46:57'], - dtype='datetime64[s]') - _check_array_from_pandas_roundtrip(datetime64_s) + table = pa.concat_tables([t1, t2]) + result = table.to_pandas() - def test_numpy_datetime64_day_unit(self): - datetime64_d = np.array([ - '2007-07-13', - None, - '2006-01-15', - '2010-08-19'], - dtype='datetime64[D]') - _check_array_from_pandas_roundtrip(datetime64_d) + tm.assert_frame_equal(result, df) - def test_all_nones(self): - def _check_series(s): - converted = pa.array(s) - assert isinstance(converted, pa.NullArray) - assert len(converted) == 3 - assert converted.null_count == 3 - assert converted[0] is pa.NA + def test_column_of_lists_chunked2(self): + data1 = [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], + [12, 13], [14, 15], [16, 17]] + data2 = [[8, 9], [18, 19]] - _check_series(pd.Series([None] * 3, dtype=object)) - _check_series(pd.Series([np.nan] * 3, dtype=object)) - _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object)) + a1 = pa.array(data1) + a2 = pa.array(data2) - def test_multiindex_duplicate_values(self): - num_rows = 3 - numbers = list(range(num_rows)) - index = pd.MultiIndex.from_arrays( - [['foo', 'foo', 'bar'], numbers], - names=['foobar', 'some_numbers'], - ) + t1 = pa.Table.from_arrays([a1], names=['a']) + t2 = pa.Table.from_arrays([a2], names=['a']) - df = pd.DataFrame({'numbers': numbers}, index=index) + concatenated = pa.concat_tables([t1, t2]) - table = pa.Table.from_pandas(df) - result_df = table.to_pandas() - tm.assert_frame_equal(result_df, df) + result = concatenated.to_pandas() + expected = pd.DataFrame({'a': data1 + data2}) - def test_partial_schema(self): - data = OrderedDict([ - ('a', [0, 1, 2, 3, 4]), - ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)), - ('c', [-10, -5, 0, 5, 10]) - ]) - df = pd.DataFrame(data) + tm.assert_frame_equal(result, expected) - partial_schema = pa.schema([ - pa.field('a', pa.int64()), - pa.field('b', pa.int32()) - ]) + def test_column_of_lists_strided(self): + df, schema = dataframe_with_lists() + df = pd.concat([df] * 6, ignore_index=True) - expected_schema = pa.schema([ - pa.field('a', pa.int64()), - pa.field('b', pa.int32()), - pa.field('c', pa.int64()) - ]) + arr = df['int64'].values[::3] + assert arr.strides[0] != 8 - _check_pandas_roundtrip(df, schema=partial_schema, - expected_schema=expected_schema) + _check_array_roundtrip(arr) - def test_structarray(self): - ints = pa.array([None, 2, 3], type=pa.int64()) - strs = pa.array([u'a', None, u'c'], type=pa.string()) - bools = pa.array([True, False, None], type=pa.bool_()) - arr = pa.StructArray.from_arrays( - [ints, strs, bools], - ['ints', 'strs', 'bools']) + def test_nested_lists_all_none(self): + data = np.array([[None, None], None], dtype=object) - expected = pd.Series([ - {'ints': None, 'strs': u'a', 'bools': True}, - {'ints': 2, 'strs': None, 'bools': False}, - {'ints': 3, 'strs': u'c', 'bools': None}, - ]) + arr = pa.array(data) + expected = pa.array(list(data)) + assert arr.equals(expected) + assert arr.type == pa.list_(pa.null()) - series = pd.Series(arr.to_pandas()) - tm.assert_series_equal(series, expected) + data2 = np.array([None, None, [None, None], + np.array([None, None], dtype=object)], + dtype=object) + arr = pa.array(data2) + expected = pa.array([None, None, [None, None], [None, None]]) + assert arr.equals(expected) + + def test_nested_lists_all_empty(self): + # ARROW-2128 + data = pd.Series([[], [], []]) + arr = pa.array(data) + expected = pa.array(list(data)) + assert arr.equals(expected) + assert arr.type == pa.list_(pa.null()) def test_infer_lists(self): data = OrderedDict([ @@ -1285,242 +1290,288 @@ class TestPandasConversion(object): _check_pandas_roundtrip(df, expected_schema=expected_schema) - def test_metadata_with_mixed_types(self): - df = pd.DataFrame({'data': [b'some_bytes', u'some_unicode']}) - table = pa.Table.from_pandas(df) - metadata = table.schema.metadata - assert b'mixed' not in metadata[b'pandas'] + @pytest.mark.parametrize('t,data,expected', [ + ( + pa.int64, + [[1, 2], [3], None], + [None, [3], None] + ), + ( + pa.string, + [[u'aaa', u'bb'], [u'c'], None], + [None, [u'c'], None] + ), + ( + pa.null, + [[None, None], [None], None], + [None, [None], None] + ) + ]) + def test_array_from_pandas_typed_array_with_mask(self, t, data, expected): + m = np.array([True, False, True]) - js = json.loads(metadata[b'pandas'].decode('utf8')) - data_column = js['columns'][0] - assert data_column['pandas_type'] == 'bytes' - assert data_column['numpy_type'] == 'object' + s = pd.Series(data) + result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t())) - def test_list_metadata(self): - df = pd.DataFrame({'data': [[1], [2, 3, 4], [5] * 7]}) - schema = pa.schema([pa.field('data', type=pa.list_(pa.int64()))]) - table = pa.Table.from_pandas(df, schema=schema) - metadata = table.schema.metadata - assert b'mixed' not in metadata[b'pandas'] + assert pa.Array.from_pandas(expected, + type=pa.list_(t())).equals(result) - js = json.loads(metadata[b'pandas'].decode('utf8')) - data_column = js['columns'][0] - assert data_column['pandas_type'] == 'list[int64]' - assert data_column['numpy_type'] == 'object' + def test_empty_list_roundtrip(self): + empty_list_array = np.empty((3,), dtype=object) + empty_list_array.fill([]) - def test_decimal_metadata(self): - expected = pd.DataFrame({ - 'decimals': [ - decimal.Decimal('394092382910493.12341234678'), - -decimal.Decimal('314292388910493.12343437128'), - ] - }) - table = pa.Table.from_pandas(expected) - metadata = table.schema.metadata - assert b'mixed' not in metadata[b'pandas'] + df = pd.DataFrame({'a': np.array(['1', '2', '3']), + 'b': empty_list_array}) + tbl = pa.Table.from_pandas(df) - js = json.loads(metadata[b'pandas'].decode('utf8')) - data_column = js['columns'][0] - assert data_column['pandas_type'] == 'decimal' - assert data_column['numpy_type'] == 'object' - assert data_column['metadata'] == {'precision': 26, 'scale': 11} + result = tbl.to_pandas() - def test_table_empty_str(self): - values = ['', '', '', '', ''] - df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) - schema = pa.schema([field]) - table = pa.Table.from_pandas(df, schema=schema) + tm.assert_frame_equal(result, df) - result1 = table.to_pandas(strings_to_categorical=False) - expected1 = pd.DataFrame({'strings': values}) - tm.assert_frame_equal(result1, expected1, check_dtype=True) - result2 = table.to_pandas(strings_to_categorical=True) - expected2 = pd.DataFrame({'strings': pd.Categorical(values)}) - tm.assert_frame_equal(result2, expected2, check_dtype=True) +class TestConvertStructTypes(object): + """ + Conversion tests for struct types. + """ - def test_table_str_to_categorical_without_na(self): - values = ['a', 'a', 'b', 'b', 'c'] - df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) - schema = pa.schema([field]) - table = pa.Table.from_pandas(df, schema=schema) + def test_structarray(self): + ints = pa.array([None, 2, 3], type=pa.int64()) + strs = pa.array([u'a', None, u'c'], type=pa.string()) + bools = pa.array([True, False, None], type=pa.bool_()) + arr = pa.StructArray.from_arrays( + [ints, strs, bools], + ['ints', 'strs', 'bools']) - result = table.to_pandas(strings_to_categorical=True) - expected = pd.DataFrame({'strings': pd.Categorical(values)}) - tm.assert_frame_equal(result, expected, check_dtype=True) + expected = pd.Series([ + {'ints': None, 'strs': u'a', 'bools': True}, + {'ints': 2, 'strs': None, 'bools': False}, + {'ints': 3, 'strs': u'c', 'bools': None}, + ]) - with pytest.raises(pa.ArrowInvalid): - table.to_pandas(strings_to_categorical=True, - zero_copy_only=True) + series = pd.Series(arr.to_pandas()) + tm.assert_series_equal(series, expected) - def test_table_str_to_categorical_with_na(self): - values = [None, 'a', 'b', np.nan] - df = pd.DataFrame({'strings': values}) - field = pa.field('strings', pa.string()) - schema = pa.schema([field]) - table = pa.Table.from_pandas(df, schema=schema) - result = table.to_pandas(strings_to_categorical=True) - expected = pd.DataFrame({'strings': pd.Categorical(values)}) - tm.assert_frame_equal(result, expected, check_dtype=True) +class TestZeroCopyConversion(object): + """ + Tests that zero-copy conversion works with some types. + """ - with pytest.raises(pa.ArrowInvalid): - table.to_pandas(strings_to_categorical=True, - zero_copy_only=True) + def test_zero_copy_success(self): + result = pa.array([0, 1, 2]).to_pandas(zero_copy_only=True) + npt.assert_array_equal(result, [0, 1, 2]) - def test_table_batch_empty_dataframe(self): - df = pd.DataFrame({}) + def test_zero_copy_dictionaries(self): + arr = pa.DictionaryArray.from_arrays( + np.array([0, 0]), + np.array([5])) + + result = arr.to_pandas(zero_copy_only=True) + values = pd.Categorical([5, 5]) + + tm.assert_series_equal(pd.Series(result), pd.Series(values), + check_names=False) + + def test_zero_copy_failure_on_object_types(self): + with pytest.raises(pa.ArrowException): + pa.array(['A', 'B', 'C']).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_with_int_when_nulls(self): + with pytest.raises(pa.ArrowException): + pa.array([0, 1, None]).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_with_float_when_nulls(self): + with pytest.raises(pa.ArrowException): + pa.array([0.0, 1.0, None]).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_on_bool_types(self): + with pytest.raises(pa.ArrowException): + pa.array([True, False]).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_on_list_types(self): + arr = np.array([[1, 2], [8, 9]], dtype=object) + + with pytest.raises(pa.ArrowException): + pa.array(arr).to_pandas(zero_copy_only=True) + + def test_zero_copy_failure_on_timestamp_types(self): + arr = np.array(['2007-07-13'], dtype='datetime64[ns]') + + with pytest.raises(pa.ArrowException): + pa.array(arr).to_pandas(zero_copy_only=True) + + +class TestConvertMisc(object): + """ + Miscellaneous conversion tests. + """ + + type_pairs = [ + (np.int8, pa.int8()), + (np.int16, pa.int16()), + (np.int32, pa.int32()), + (np.int64, pa.int64()), + (np.uint8, pa.uint8()), + (np.uint16, pa.uint16()), + (np.uint32, pa.uint32()), + (np.uint64, pa.uint64()), + # (np.float16, pa.float16()), # XXX unsupported + (np.float32, pa.float32()), + (np.float64, pa.float64()), + # XXX unsupported + # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])), + (np.object, pa.string()), + # (np.object, pa.binary()), # XXX unsupported + (np.object, pa.binary(10)), + (np.object, pa.list_(pa.int64())), + ] + + def test_all_none_objects(self): + df = pd.DataFrame({'a': [None, None, None]}) _check_pandas_roundtrip(df) - _check_pandas_roundtrip(df, as_batch=True) - df2 = pd.DataFrame({}, index=[0, 1, 2]) - _check_pandas_roundtrip(df2, preserve_index=True) - _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True) + def test_all_none_category(self): + df = pd.DataFrame({'a': [None, None, None]}) + df['a'] = df['a'].astype('category') + _check_pandas_roundtrip(df) - def test_convert_empty_table(self): - arr = pa.array([], type=pa.int64()) - tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=np.int64)) - arr = pa.array([], type=pa.string()) - tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) - arr = pa.array([], type=pa.list_(pa.int64())) - tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) - arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())])) - tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) + def test_empty_arrays(self): + for dtype, pa_type in self.type_pairs: + arr = np.array([], dtype=dtype) + _check_array_roundtrip(arr, type=pa_type) - def test_array_from_pandas_date_with_mask(self): - m = np.array([True, False, True]) - data = pd.Series([ - date(1990, 1, 1), - date(1991, 1, 1), - date(1992, 1, 1) - ]) + def test_threaded_conversion(self): + df = _alltypes_example() + _check_pandas_roundtrip(df, nthreads=2) + _check_pandas_roundtrip(df, nthreads=2, as_batch=True) - result = pa.Array.from_pandas(data, mask=m) + def test_category(self): + repeats = 5 + v1 = ['foo', None, 'bar', 'qux', np.nan] + v2 = [4, 5, 6, 7, 8] + v3 = [b'foo', None, b'bar', b'qux', np.nan] + df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats), + 'cat_ints': pd.Categorical(v2 * repeats), + 'cat_binary': pd.Categorical(v3 * repeats), + 'cat_strings_ordered': pd.Categorical( + v1 * repeats, categories=['bar', 'qux', 'foo'], + ordered=True), + 'ints': v2 * repeats, + 'ints2': v2 * repeats, + 'strings': v1 * repeats, + 'strings2': v1 * repeats, + 'strings3': v3 * repeats}) + _check_pandas_roundtrip(df) - expected = pd.Series([None, date(1991, 1, 1), None]) - assert pa.Array.from_pandas(expected).equals(result) + arrays = [ + pd.Categorical(v1 * repeats), + pd.Categorical(v2 * repeats), + pd.Categorical(v3 * repeats) + ] + for values in arrays: + _check_array_roundtrip(values) - @pytest.mark.parametrize('t,data,expected', [ - ( - pa.int64, - [[1, 2], [3], None], - [None, [3], None] - ), - ( - pa.string, - [[u'aaa', u'bb'], [u'c'], None], - [None, [u'c'], None] - ), - ( - pa.null, - [[None, None], [None], None], - [None, [None], None] - ) - ]) - def test_array_from_pandas_typed_array_with_mask(self, t, data, expected): - m = np.array([True, False, True]) + def test_mixed_types_fails(self): + data = pd.DataFrame({'a': ['a', 1, 2.0]}) + with pytest.raises(pa.ArrowException): + pa.Table.from_pandas(data) - s = pd.Series(data) - result = pa.Array.from_pandas(s, mask=m, type=pa.list_(t())) + data = pd.DataFrame({'a': [1, True]}) + with pytest.raises(pa.ArrowException): + pa.Table.from_pandas(data) - assert pa.Array.from_pandas(expected, - type=pa.list_(t())).equals(result) + def test_strided_data_import(self): + cases = [] - def test_table_column_subset_metadata(self): - # ARROW-1883 - df = pd.DataFrame({ - 'a': [1, 2, 3], - 'b': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')}) - table = pa.Table.from_pandas(df) + columns = ['a', 'b', 'c'] + N, K = 100, 3 + random_numbers = np.random.randn(N, K).copy() * 100 - table_subset = table.remove_column(1) - result = table_subset.to_pandas() - tm.assert_frame_equal(result, df[['a']]) + numeric_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8', + 'f4', 'f8'] - table_subset2 = table_subset.remove_column(1) - result = table_subset2.to_pandas() - tm.assert_frame_equal(result, df[['a']]) + for type_name in numeric_dtypes: + cases.append(random_numbers.astype(type_name)) - # non-default index - for index in [ - pd.Index(['a', 'b', 'c'], name='index'), - pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')]: - df = pd.DataFrame({'a': [1, 2, 3], - 'b': [.1, .2, .3]}, index=index) - table = pa.Table.from_pandas(df) + # strings + cases.append(np.array([tm.rands(10) for i in range(N * K)], + dtype=object) + .reshape(N, K).copy()) - table_subset = table.remove_column(1) - result = table_subset.to_pandas() - tm.assert_frame_equal(result, df[['a']]) + # booleans + boolean_objects = (np.array([True, False, True] * N, dtype=object) + .reshape(N, K).copy()) - table_subset2 = table_subset.remove_column(1) - result = table_subset2.to_pandas() - tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) + # add some nulls, so dtype comes back as objects + boolean_objects[5] = None + cases.append(boolean_objects) - def test_empty_list_roundtrip(self): - empty_list_array = np.empty((3,), dtype=object) - empty_list_array.fill([]) + cases.append(np.arange("2016-01-01T00:00:00.001", N * K, + dtype='datetime64[ms]') + .reshape(N, K).copy()) - df = pd.DataFrame({'a': np.array(['1', '2', '3']), - 'b': empty_list_array}) - tbl = pa.Table.from_pandas(df) + strided_mask = (random_numbers > 0).astype(bool)[:, 0] - result = tbl.to_pandas() + for case in cases: + df = pd.DataFrame(case, columns=columns) + col = df['a'] - tm.assert_frame_equal(result, df) + _check_pandas_roundtrip(df) + _check_array_roundtrip(col) + _check_array_roundtrip(col, mask=strided_mask) - def test_empty_list_metadata(self): - # Create table with array of empty lists, forced to have type - # list(string) in pyarrow - c1 = [["test"], ["a", "b"], None] - c2 = [[], [], []] - arrays = OrderedDict([ - ('c1', pa.array(c1, type=pa.list_(pa.string()))), - ('c2', pa.array(c2, type=pa.list_(pa.string()))), + def test_all_nones(self): + def _check_series(s): + converted = pa.array(s) + assert isinstance(converted, pa.NullArray) + assert len(converted) == 3 + assert converted.null_count == 3 + assert converted[0] is pa.NA + + _check_series(pd.Series([None] * 3, dtype=object)) + _check_series(pd.Series([np.nan] * 3, dtype=object)) + _check_series(pd.Series([np.sqrt(-1)] * 3, dtype=object)) + + def test_partial_schema(self): + data = OrderedDict([ + ('a', [0, 1, 2, 3, 4]), + ('b', np.array([-10, -5, 0, 5, 10], dtype=np.int32)), + ('c', [-10, -5, 0, 5, 10]) ]) - rb = pa.RecordBatch.from_arrays( - list(arrays.values()), - list(arrays.keys()) - ) - tbl = pa.Table.from_batches([rb]) + df = pd.DataFrame(data) - # First roundtrip changes schema, because pandas cannot preserve the - # type of empty lists - df = tbl.to_pandas() - tbl2 = pa.Table.from_pandas(df, preserve_index=True) - md2 = json.loads(tbl2.schema.metadata[b'pandas'].decode('utf8')) + partial_schema = pa.schema([ + pa.field('a', pa.int64()), + pa.field('b', pa.int32()) + ]) - # Second roundtrip - df2 = tbl2.to_pandas() - expected = pd.DataFrame(OrderedDict([('c1', c1), ('c2', c2)])) + expected_schema = pa.schema([ + pa.field('a', pa.int64()), + pa.field('b', pa.int32()), + pa.field('c', pa.int64()) + ]) - tm.assert_frame_equal(df2, expected) + _check_pandas_roundtrip(df, schema=partial_schema, + expected_schema=expected_schema) - assert md2['columns'] == [ - { - 'name': 'c1', - 'field_name': 'c1', - 'metadata': None, - 'numpy_type': 'object', - 'pandas_type': 'list[unicode]', - }, - { - 'name': 'c2', - 'field_name': 'c2', - 'metadata': None, - 'numpy_type': 'object', - 'pandas_type': 'list[empty]', - }, - { - 'name': None, - 'field_name': '__index_level_0__', - 'metadata': None, - 'numpy_type': 'int64', - 'pandas_type': 'int64', - } - ] + def test_table_batch_empty_dataframe(self): + df = pd.DataFrame({}) + _check_pandas_roundtrip(df) + _check_pandas_roundtrip(df, as_batch=True) + + df2 = pd.DataFrame({}, index=[0, 1, 2]) + _check_pandas_roundtrip(df2, preserve_index=True) + _check_pandas_roundtrip(df2, as_batch=True, preserve_index=True) + + def test_convert_empty_table(self): + arr = pa.array([], type=pa.int64()) + tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=np.int64)) + arr = pa.array([], type=pa.string()) + tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) + arr = pa.array([], type=pa.list_(pa.int64())) + tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) + arr = pa.array([], type=pa.struct([pa.field('a', pa.int64())])) + tm.assert_almost_equal(arr.to_pandas(), np.array([], dtype=object)) def _fully_loaded_dataframe_example(): -- To stop receiving notification emails like this one, please contact w...@apache.org.