Repository: arrow
Updated Branches:
  refs/heads/master c5663c6d0 -> 5ffbda1b4
ARROW-479: Python: Test for expected schema in Pandas conversion Author: Uwe L. Korn <uw...@xhochy.com> Closes #281 from xhochy/ARROW-479 and squashes the following commits: acd9abd [Uwe L. Korn] Use arrow::timestamp() 43dba37 [Uwe L. Korn] Fix tests 7a3f5b8 [Uwe L. Korn] ARROW-479: Python: Test for expected schema in Pandas conversion Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5ffbda1b Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5ffbda1b Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5ffbda1b Branch: refs/heads/master Commit: 5ffbda1b408951cb5cf49008920f1054544148d3 Parents: c5663c6 Author: Uwe L. Korn <uw...@xhochy.com> Authored: Fri Jan 13 08:46:48 2017 -0500 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Fri Jan 13 08:46:48 2017 -0500 ---------------------------------------------------------------------- python/pyarrow/includes/libarrow.pxd | 2 + python/pyarrow/includes/pyarrow.pxd | 4 +- python/pyarrow/schema.pyx | 38 ++++++++++- python/pyarrow/tests/test_convert_builtin.py | 2 +- python/pyarrow/tests/test_convert_pandas.py | 77 ++++++++++++++++------- python/pyarrow/tests/test_parquet.py | 2 +- python/src/pyarrow/helpers.cc | 3 - 7 files changed, 97 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index d1970e5..8cfaaf7 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -60,6 +60,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string ToString() + shared_ptr[CDataType] timestamp(TimeUnit unit) + cdef cppclass MemoryPool" arrow::MemoryPool": int64_t bytes_allocated() 
http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/includes/pyarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index dc6ccd2..901e6c9 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -19,13 +19,15 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable, - CDataType, CStatus, Type, MemoryPool) + CDataType, CStatus, Type, MemoryPool, + TimeUnit) cimport pyarrow.includes.libarrow_io as arrow_io cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: shared_ptr[CDataType] GetPrimitiveType(Type type) + shared_ptr[CDataType] GetTimestampType(TimeUnit unit) CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out) CStatus PandasToArrow(MemoryPool* pool, object ao, http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/schema.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index d91ae7c..f6a1a10 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -23,8 +23,20 @@ # cython: embedsignature = True from pyarrow.compat import frombytes, tobytes -from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow cimport (CDataType, CStructType, CListType, + Type_NA, Type_BOOL, + Type_UINT8, Type_INT8, + Type_UINT16, Type_INT16, + Type_UINT32, Type_INT32, + Type_UINT64, Type_INT64, + Type_TIMESTAMP, Type_DATE, + Type_FLOAT, Type_DOUBLE, + Type_STRING, Type_BINARY, + TimeUnit_SECOND, TimeUnit_MILLI, + TimeUnit_MICRO, TimeUnit_NANO, + Type, TimeUnit) cimport pyarrow.includes.pyarrow as pyarrow +cimport pyarrow.includes.libarrow as libarrow cimport cpython @@ -197,8 +209,28 @@ def uint64(): def int64(): return primitive_type(Type_INT64) -def timestamp(): - return primitive_type(Type_TIMESTAMP) 
+cdef dict _timestamp_type_cache = {} + +def timestamp(unit_str): + cdef TimeUnit unit + if unit_str == "s": + unit = TimeUnit_SECOND + elif unit_str == 'ms': + unit = TimeUnit_MILLI + elif unit_str == 'us': + unit = TimeUnit_MICRO + elif unit_str == 'ns': + unit = TimeUnit_NANO + else: + raise TypeError('Invalid TimeUnit string') + + if unit in _timestamp_type_cache: + return _timestamp_type_cache[unit] + + cdef DataType out = DataType() + out.init(libarrow.timestamp(unit)) + _timestamp_type_cache[unit] = out + return out def date(): return primitive_type(Type_DATE) http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_convert_builtin.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 6116742..72e4389 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -112,7 +112,7 @@ class TestConvertList(unittest.TestCase): ] arr = pyarrow.from_pylist(data) assert len(arr) == 4 - assert arr.type == pyarrow.timestamp() + assert arr.type == pyarrow.timestamp('us') assert arr.null_count == 1 assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456) http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 12e7a08..261eaa8 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -60,65 +60,79 @@ class TestPandasConversion(unittest.TestCase): pass def _check_pandas_roundtrip(self, df, expected=None, nthreads=1, - timestamps_to_ms=False): + timestamps_to_ms=False, expected_schema=None): table = A.Table.from_pandas(df, timestamps_to_ms=timestamps_to_ms) result = 
table.to_pandas(nthreads=nthreads) + if expected_schema: + assert table.schema.equals(expected_schema) if expected is None: expected = df tm.assert_frame_equal(result, expected) def test_float_no_nulls(self): data = {} - numpy_dtypes = ['f4', 'f8'] + fields = [] + dtypes = [('f4', A.float_()), ('f8', A.double())] num_values = 100 - for dtype in numpy_dtypes: + for numpy_dtype, arrow_dtype in dtypes: values = np.random.randn(num_values) - data[dtype] = values.astype(dtype) + data[numpy_dtype] = values.astype(numpy_dtype) + fields.append(A.Field.from_py(numpy_dtype, arrow_dtype)) df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) + schema = A.Schema.from_fields(fields) + self._check_pandas_roundtrip(df, expected_schema=schema) def test_float_nulls(self): num_values = 100 null_mask = np.random.randint(0, 10, size=num_values) < 3 - dtypes = ['f4', 'f8'] + dtypes = [('f4', A.float_()), ('f8', A.double())] + names = ['f4', 'f8'] expected_cols = [] arrays = [] - for name in dtypes: + fields = [] + for name, arrow_dtype in dtypes: values = np.random.randn(num_values).astype(name) arr = A.from_pandas_series(values, null_mask) arrays.append(arr) - + fields.append(A.Field.from_py(name, arrow_dtype)) values[null_mask] = np.nan expected_cols.append(values) - ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), - columns=dtypes) + ex_frame = pd.DataFrame(dict(zip(names, expected_cols)), + columns=names) - table = A.Table.from_arrays(dtypes, arrays) + table = A.Table.from_arrays(names, arrays) + assert table.schema.equals(A.Schema.from_fields(fields)) result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) def test_integer_no_nulls(self): data = {} + fields = [] - numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + numpy_dtypes = [('i1', A.int8()), ('i2', A.int16()), + ('i4', A.int32()), ('i8', A.int64()), + ('u1', A.uint8()), ('u2', A.uint16()), + ('u4', A.uint32()), ('u8', A.uint64())] num_values = 100 - for dtype in numpy_dtypes: + for 
dtype, arrow_dtype in numpy_dtypes: info = np.iinfo(dtype) values = np.random.randint(info.min, min(info.max, np.iinfo('i8').max), size=num_values) data[dtype] = values.astype(dtype) + fields.append(A.Field.from_py(dtype, arrow_dtype)) df = pd.DataFrame(data) - self._check_pandas_roundtrip(df) + schema = A.Schema.from_fields(fields) + self._check_pandas_roundtrip(df, expected_schema=schema) def test_integer_with_nulls(self): # pandas requires upcast to float dtype @@ -155,7 +169,9 @@ class TestPandasConversion(unittest.TestCase): np.random.seed(0) df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) - self._check_pandas_roundtrip(df) + field = A.Field.from_py('bools', A.bool_()) + schema = A.Schema.from_fields([field]) + self._check_pandas_roundtrip(df, expected_schema=schema) def test_boolean_nulls(self): # pandas requires upcast to object dtype @@ -170,9 +186,12 @@ class TestPandasConversion(unittest.TestCase): expected = values.astype(object) expected[mask] = None + field = A.Field.from_py('bools', A.bool_()) + schema = A.Schema.from_fields([field]) ex_frame = pd.DataFrame({'bools': expected}) table = A.Table.from_arrays(['bools'], [arr]) + assert table.schema.equals(schema) result = table.to_pandas() tm.assert_frame_equal(result, ex_frame) @@ -180,14 +199,18 @@ class TestPandasConversion(unittest.TestCase): def test_boolean_object_nulls(self): arr = np.array([False, None, True] * 100, dtype=object) df = pd.DataFrame({'bools': arr}) - self._check_pandas_roundtrip(df) + field = A.Field.from_py('bools', A.bool_()) + schema = A.Schema.from_fields([field]) + self._check_pandas_roundtrip(df, expected_schema=schema) def test_unicode(self): repeats = 1000 values = [u'foo', None, u'bar', u'mañana', np.nan] df = pd.DataFrame({'strings': values * repeats}) + field = A.Field.from_py('strings', A.string()) + schema = A.Schema.from_fields([field]) - self._check_pandas_roundtrip(df) + self._check_pandas_roundtrip(df, expected_schema=schema) def 
test_bytes_to_binary(self): values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] @@ -208,7 +231,9 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - self._check_pandas_roundtrip(df, timestamps_to_ms=True) + field = A.Field.from_py('datetime64', A.timestamp('ms')) + schema = A.Schema.from_fields([field]) + self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema) df = pd.DataFrame({ 'datetime64': np.array([ @@ -217,7 +242,9 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - self._check_pandas_roundtrip(df, timestamps_to_ms=False) + field = A.Field.from_py('datetime64', A.timestamp('ns')) + schema = A.Schema.from_fields([field]) + self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema) def test_timestamps_notimezone_nulls(self): df = pd.DataFrame({ @@ -227,8 +254,9 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437'], dtype='datetime64[ms]') }) - df.info() - self._check_pandas_roundtrip(df, timestamps_to_ms=True) + field = A.Field.from_py('datetime64', A.timestamp('ms')) + schema = A.Schema.from_fields([field]) + self._check_pandas_roundtrip(df, timestamps_to_ms=True, expected_schema=schema) df = pd.DataFrame({ 'datetime64': np.array([ @@ -237,7 +265,9 @@ class TestPandasConversion(unittest.TestCase): '2010-08-13T05:46:57.437699912'], dtype='datetime64[ns]') }) - self._check_pandas_roundtrip(df, timestamps_to_ms=False) + field = A.Field.from_py('datetime64', A.timestamp('ns')) + schema = A.Schema.from_fields([field]) + self._check_pandas_roundtrip(df, timestamps_to_ms=False, expected_schema=schema) def test_date(self): df = pd.DataFrame({ @@ -246,6 +276,9 @@ class TestPandasConversion(unittest.TestCase): datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)]}) table = A.Table.from_pandas(df) + field = A.Field.from_py('date', A.date()) + schema = A.Schema.from_fields([field]) + 
assert table.schema.equals(schema) result = table.to_pandas() expected = df.copy() expected['date'] = pd.to_datetime(df['date']) http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ad4bc58..e157155 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -244,7 +244,7 @@ def test_parquet_metadata_api(): a_table = A.Table.from_pandas(df, timestamps_to_ms=True) buf = io.BytesIO() - pq.write_table(a_table, buf, compression='snappy', version='2.0') + pq.write_table(a_table, buf, compression='SNAPPY', version='2.0') buf.seek(0) fileh = pq.ParquetFile(buf) http://git-wip-us.apache.org/repos/asf/arrow/blob/5ffbda1b/python/src/pyarrow/helpers.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index 3f65032..78fad16 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -41,9 +41,6 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) { GET_PRIMITIVE_TYPE(UINT64, uint64); GET_PRIMITIVE_TYPE(INT64, int64); GET_PRIMITIVE_TYPE(DATE, date); - case Type::TIMESTAMP: - return arrow::timestamp(arrow::TimeUnit::MICRO); - break; GET_PRIMITIVE_TYPE(BOOL, boolean); GET_PRIMITIVE_TYPE(FLOAT, float32); GET_PRIMITIVE_TYPE(DOUBLE, float64);