Repository: arrow Updated Branches: refs/heads/master 0ae4d86e5 -> c05292faf
ARROW-523: Python: Account for changes in PARQUET-834 Author: Uwe L. Korn <uw...@xhochy.com> Closes #313 from xhochy/ARROW-523 and squashes the following commits: ff699ea [Uwe L. Korn] Use relative import e36dcc8 [Uwe L. Korn] ARROW-523: Python: Account for changes in PARQUET-834 Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c05292fa Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c05292fa Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c05292fa Branch: refs/heads/master Commit: c05292faf74111e826dbbafe9a1e108346eb10dc Parents: 0ae4d86 Author: Uwe L. Korn <uw...@xhochy.com> Authored: Thu Feb 2 16:13:40 2017 -0500 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Thu Feb 2 16:13:40 2017 -0500 ---------------------------------------------------------------------- python/pyarrow/_parquet.pxd | 8 +-- python/pyarrow/_parquet.pyx | 8 +-- python/pyarrow/tests/pandas_examples.py | 78 ++++++++++++++++++++++++ python/pyarrow/tests/test_convert_pandas.py | 47 +------------- python/pyarrow/tests/test_parquet.py | 11 ++++ 5 files changed, 100 insertions(+), 52 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/_parquet.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index fabee5d..6b9350a 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -211,9 +211,9 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef cppclass FileReader: FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader) - CStatus ReadFlatColumn(int i, shared_ptr[CArray]* out); - CStatus ReadFlatTable(shared_ptr[CTable]* out); - CStatus ReadFlatTable(const vector[int]& column_indices, + CStatus ReadColumn(int i, shared_ptr[CArray]* out); + CStatus 
ReadTable(shared_ptr[CTable]* out); + CStatus ReadTable(const vector[int]& column_indices, shared_ptr[CTable]* out); const ParquetFileReader* parquet_reader(); @@ -228,7 +228,7 @@ cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: - cdef CStatus WriteFlatTable( + cdef CStatus WriteTable( const CTable* table, MemoryPool* pool, const shared_ptr[OutputStream]& sink, int64_t chunk_size, http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/_parquet.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 3f847e9..fd4670a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -397,12 +397,12 @@ cdef class ParquetReader: with nogil: check_status(self.reader.get() - .ReadFlatTable(c_column_indices, &ctable)) + .ReadTable(c_column_indices, &ctable)) else: # Read all columns with nogil: check_status(self.reader.get() - .ReadFlatTable(&ctable)) + .ReadTable(&ctable)) table.init(ctable) return table @@ -442,7 +442,7 @@ cdef class ParquetReader: with nogil: check_status(self.reader.get() - .ReadFlatColumn(column_index, &carray)) + .ReadColumn(column_index, &carray)) array.init(carray) return array @@ -540,6 +540,6 @@ cdef class ParquetWriter: cdef int c_row_group_size = row_group_size with nogil: - check_status(WriteFlatTable(ctable, default_memory_pool(), + check_status(WriteTable(ctable, default_memory_pool(), self.sink, c_row_group_size, self.properties)) http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/tests/pandas_examples.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py new file mode 100644 index 0000000..63af423 --- /dev/null +++ b/python/pyarrow/tests/pandas_examples.py @@ -0,0 +1,78 @@ +# 
-*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from collections import OrderedDict + +import numpy as np +import pandas as pd +import pyarrow as pa + + +def dataframe_with_arrays(): + """ + Dataframe with numpy array columns of every possible primitive type. + + Returns + ------- + df: pandas.DataFrame + schema: pyarrow.Schema + Arrow schema definition that is in line with the constructed df. 
+ """ + dtypes = [('i1', pa.int8()), ('i2', pa.int16()), + ('i4', pa.int32()), ('i8', pa.int64()), + ('u1', pa.uint8()), ('u2', pa.uint16()), + ('u4', pa.uint32()), ('u8', pa.uint64()), + ('f4', pa.float_()), ('f8', pa.double())] + + arrays = OrderedDict() + fields = [] + for dtype, arrow_dtype in dtypes: + fields.append(pa.field(dtype, pa.list_(arrow_dtype))) + arrays[dtype] = [ + np.arange(10, dtype=dtype), + np.arange(5, dtype=dtype), + None, + np.arange(1, dtype=dtype) + ] + + fields.append(pa.field('str', pa.list_(pa.string()))) + arrays['str'] = [ + np.array([u"1", u"ä"], dtype="object"), + None, + np.array([u"1"], dtype="object"), + np.array([u"1", u"2", u"3"], dtype="object") + ] + + fields.append(pa.field('datetime64', pa.list_(pa.timestamp('ms')))) + arrays['datetime64'] = [ + np.array(['2007-07-13T01:23:34.123456789', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ms]'), + None, + None, + np.array(['2007-07-13T02', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ms]'), + ] + + df = pd.DataFrame(arrays) + schema = pa.Schema.from_fields(fields) + + return df, schema http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 674a436..ddbb02a 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -29,6 +29,8 @@ import pandas.util.testing as tm from pyarrow.compat import u import pyarrow as A +from .pandas_examples import dataframe_with_arrays + def _alltypes_example(size=100): return pd.DataFrame({ @@ -325,54 +327,11 @@ class TestPandasConversion(unittest.TestCase): tm.assert_frame_equal(result, expected) def test_column_of_lists(self): - dtypes = [('i1', A.int8()), ('i2', A.int16()), - ('i4', A.int32()), ('i8', A.int64()), - ('u1', A.uint8()), ('u2', 
A.uint16()), - ('u4', A.uint32()), ('u8', A.uint64()), - ('f4', A.float_()), ('f8', A.double())] - - arrays = OrderedDict() - fields = [] - for dtype, arrow_dtype in dtypes: - fields.append(A.field(dtype, A.list_(arrow_dtype))) - arrays[dtype] = [ - np.arange(10, dtype=dtype), - np.arange(5, dtype=dtype), - None, - np.arange(1, dtype=dtype) - ] - - fields.append(A.field('str', A.list_(A.string()))) - arrays['str'] = [ - np.array([u"1", u"ä"], dtype="object"), - None, - np.array([u"1"], dtype="object"), - np.array([u"1", u"2", u"3"], dtype="object") - ] - - fields.append(A.field('datetime64', A.list_(A.timestamp('ns')))) - arrays['datetime64'] = [ - np.array(['2007-07-13T01:23:34.123456789', - None, - '2010-08-13T05:46:57.437699912'], - dtype='datetime64[ns]'), - None, - None, - np.array(['2007-07-13T02', - None, - '2010-08-13T05:46:57.437699912'], - dtype='datetime64[ns]'), - ] - - df = pd.DataFrame(arrays) - schema = A.Schema.from_fields(fields) + df, schema = dataframe_with_arrays() self._check_pandas_roundtrip(df, schema=schema, expected_schema=schema) table = A.Table.from_pandas(df, schema=schema) assert table.schema.equals(schema) - # it works! 
- table.to_pandas(nthreads=1) - def test_threaded_conversion(self): df = _alltypes_example() self._check_pandas_roundtrip(df, nthreads=2, http://git-wip-us.apache.org/repos/asf/arrow/blob/c05292fa/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index d85f0e5..80a995f 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -23,6 +23,7 @@ import pytest from pyarrow.compat import guid import pyarrow as pa import pyarrow.io as paio +from .pandas_examples import dataframe_with_arrays import numpy as np import pandas as pd @@ -319,6 +320,16 @@ def test_compare_schemas(): assert fileh.schema[0].equals(fileh.schema[0]) assert not fileh.schema[0].equals(fileh.schema[1]) +@parquet +def test_column_of_lists(tmpdir): + df, schema = dataframe_with_arrays() + + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema) + pq.write_table(arrow_table, filename.strpath, version="2.0") + table_read = pq.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) @parquet def test_multithreaded_read():