jorisvandenbossche commented on code in PR #14804: URL: https://github.com/apache/arrow/pull/14804#discussion_r1054202181
########## python/pyarrow/tests/interchange/test_extra.py: ########## @@ -0,0 +1,371 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from datetime import datetime as dt +import numpy as np +import pyarrow as pa +from pyarrow.vendored.version import Version +import pytest + +import pyarrow.interchange as pi +from pyarrow.interchange.column import ( + _PyArrowColumn, + ColumnNullType, + DtypeKind, +) +from pyarrow.interchange.from_dataframe import _from_dataframe + +try: + import pandas as pd + # import pandas.testing as tm +except ImportError: + pass + + +@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) +@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30']) +def test_datetime(unit, tz): + dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), None] + table = pa.table({"A": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))}) + col = table.__dataframe__().get_column_by_name("A") + + assert col.size() == 3 + assert col.offset == 0 + assert col.null_count == 1 + assert col.dtype[0] == DtypeKind.DATETIME + assert col.describe_null == (ColumnNullType.USE_BITMASK, 0) + + +@pytest.mark.parametrize( + ["test_data", "kind"], + [ + (["foo", "bar"], 21), + ([1.5, 2.5, 3.5], 2), + ([1, 2, 3, 4], 0), + ], +) +def 
test_array_to_pyarrowcolumn(test_data, kind): + arr = pa.array(test_data) + arr_column = _PyArrowColumn(arr) + + assert arr_column._col == arr + assert arr_column.size() == len(test_data) + assert arr_column.dtype[0] == kind + assert arr_column.num_chunks() == 1 + assert arr_column.null_count == 0 + assert arr_column.get_buffers()["validity"] is None + assert len(list(arr_column.get_chunks())) == 1 + + for chunk in arr_column.get_chunks(): + assert chunk == arr_column + + +def test_offset_of_sliced_array(): + arr = pa.array([1, 2, 3, 4]) + arr_sliced = arr.slice(2, 2) + + table = pa.table([arr], names=["arr"]) + table_sliced = pa.table([arr_sliced], names=["arr_sliced"]) + + col = table_sliced.__dataframe__().get_column(0) + assert col.offset == 2 + + result = _from_dataframe(table_sliced.__dataframe__()) + assert table_sliced.equals(result) + assert not table.equals(result) + + # pandas hardcodes offset to 0: + # https://github.com/pandas-dev/pandas/blob/5c66e65d7b9fef47ccb585ce2fd0b3ea18dc82ea/pandas/core/interchange/from_dataframe.py#L247 + # so conversion to pandas can't be tested currently + + # df = pandas_from_dataframe(table) + # df_sliced = pandas_from_dataframe(table_sliced) + + # tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"], + # check_index=False, check_names=False) + + +# Currently errors due to string conversion +# as col.size is called as a property not method in pandas
# see L255-L257 in pandas/core/interchange/from_dataframe.py +@pytest.mark.pandas +def test_categorical_roundtrip(): + pytest.skip("Bug in pandas implementation") + + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"] + table = pa.table( + {"weekday": pa.array(arr).dictionary_encode()} + ) + + pandas_df = table.to_pandas() + result = pi.from_dataframe(pandas_df) + + # Checking equality for the values + # As the dtype of the indices is changed from 
int32 in pa.Table + to int64 in pandas interchange protocol implementation + assert result[0].chunk(0).dictionary == table[0].chunk(0).dictionary + + table_protocol = table.__dataframe__() + result_protocol = result.__dataframe__() + + assert table_protocol.num_columns() == result_protocol.num_columns() + assert table_protocol.num_rows() == result_protocol.num_rows() + assert table_protocol.num_chunks() == result_protocol.num_chunks() + assert table_protocol.column_names() == result_protocol.column_names() + + col_table = table_protocol.get_column(0) + col_result = result_protocol.get_column(0) + + assert col_result.dtype[0] == DtypeKind.CATEGORICAL + assert col_result.dtype[0] == col_table.dtype[0] + assert col_result.size == col_table.size + assert col_result.offset == col_table.offset + + desc_cat_table = col_result.describe_categorical + desc_cat_result = col_result.describe_categorical + + assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"] + assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"] + assert isinstance(desc_cat_result["categories"]._col, pa.Array) + + +@pytest.mark.pandas +@pytest.mark.parametrize( + "uint", [pa.uint8(), pa.uint16(), pa.uint32()] +) +@pytest.mark.parametrize( + "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()] +) +@pytest.mark.parametrize( + "float, np_float", [ + # (pa.float16(), np.float16), #not supported by pandas + (pa.float32(), np.float32), + (pa.float64(), np.float64) + ] +) +def test_pandas_roundtrip(uint, int, float, np_float): + if Version(pd.__version__) < Version("1.5.0"): + pytest.skip("__dataframe__ added to pandas in 1.5.0") + + arr = [1, 2, 3] Review Comment: To ensure coverage of missing values, maybe add a separate test that just does pandas->pyarrow (so creating the pandas DataFrame, without doing the pyarrow->pandas part of the roundtrip, which currently fails) -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
