Repository: arrow
Updated Branches:
  refs/heads/master 362e754b3 -> c5a89b7b4
ARROW-1120: Support for writing timestamp(ns) to Int96 cc @c-nichols Author: Uwe L. Korn <[email protected]> Author: Colin Nichols <[email protected]> Closes #865 from xhochy/ARROW-1120 and squashes the following commits: ff70832f [Uwe L. Korn] Use integer division 99f825d3 [Uwe L. Korn] Add flag for timestamp[ns] roundtrips 7c28835b [Colin Nichols] ARROW-1120 Support for writing timestamp(ns) to Int96 Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c5a89b7b Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c5a89b7b Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c5a89b7b Branch: refs/heads/master Commit: c5a89b7b4fb94c3988677c5d80405ff7e9cfbd18 Parents: 362e754 Author: Uwe L. Korn <[email protected]> Authored: Tue Jul 18 15:18:03 2017 -0400 Committer: Wes McKinney <[email protected]> Committed: Tue Jul 18 15:18:03 2017 -0400 ---------------------------------------------------------------------- python/pyarrow/_parquet.pxd | 9 +++++++++ python/pyarrow/_parquet.pyx | 17 ++++++++++++++-- python/pyarrow/parquet.py | 11 +++++++---- python/pyarrow/tests/test_parquet.py | 33 +++++++++++++++++++++++++++---- 4 files changed, 60 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/_parquet.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 3d2d0c8..b1cd5eb 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -247,8 +247,17 @@ cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: CStatus Open(const CSchema& schema, CMemoryPool* pool, const shared_ptr[OutputStream]& sink, const shared_ptr[WriterProperties]& properties, + const shared_ptr[ArrowWriterProperties]& arrow_properties, unique_ptr[FileWriter]* writer) CStatus 
WriteTable(const CTable& table, int64_t chunk_size) CStatus NewRowGroup(int64_t chunk_size) CStatus Close() + + cdef cppclass ArrowWriterProperties: + cppclass Builder: + Builder() + Builder* disable_deprecated_int96_timestamps() + Builder* enable_deprecated_int96_timestamps() + shared_ptr[ArrowWriterProperties] build() + c_bool support_deprecated_int96_timestamps() http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/_parquet.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 0e0d58e..bbe5203 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -545,13 +545,14 @@ cdef class ParquetWriter: cdef readonly: object use_dictionary + object use_deprecated_int96_timestamps object compression object version int row_group_size def __cinit__(self, where, Schema schema, use_dictionary=None, compression=None, version=None, - MemoryPool memory_pool=None): + MemoryPool memory_pool=None, use_deprecated_int96_timestamps=False): cdef: shared_ptr[FileOutputStream] filestream shared_ptr[OutputStream] sink @@ -566,6 +567,7 @@ cdef class ParquetWriter: self.use_dictionary = use_dictionary self.compression = compression self.version = version + self.use_deprecated_int96_timestamps = use_deprecated_int96_timestamps cdef WriterProperties.Builder properties_builder self._set_version(&properties_builder) @@ -573,10 +575,21 @@ cdef class ParquetWriter: self._set_dictionary_props(&properties_builder) properties = properties_builder.build() + cdef ArrowWriterProperties.Builder arrow_properties_builder + self._set_int96_support(&arrow_properties_builder) + arrow_properties = arrow_properties_builder.build() + check_status( FileWriter.Open(deref(schema.schema), maybe_unbox_memory_pool(memory_pool), - sink, properties, &self.writer)) + sink, properties, arrow_properties, + &self.writer)) + + cdef void _set_int96_support(self, 
ArrowWriterProperties.Builder* props): + if self.use_deprecated_int96_timestamps: + props.enable_deprecated_int96_timestamps() + else: + props.disable_deprecated_int96_timestamps() cdef void _set_version(self, WriterProperties.Builder* props): if self.version is not None: http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 06b3a3d..64cf330 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -743,7 +743,8 @@ def read_pandas(source, columns=None, nthreads=1, metadata=None): def write_table(table, where, row_group_size=None, version='1.0', - use_dictionary=True, compression='snappy', **kwargs): + use_dictionary=True, compression='snappy', + use_deprecated_int96_timestamps=False, **kwargs): """ Write a Table to Parquet format @@ -766,12 +767,13 @@ def write_table(table, where, row_group_size=None, version='1.0', writer = ParquetWriter(where, table.schema, use_dictionary=use_dictionary, compression=compression, - version=version) + version=version, + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) writer.write_table(table, row_group_size=row_group_size) writer.close() -def write_metadata(schema, where, version='1.0'): +def write_metadata(schema, where, version='1.0', use_deprecated_int96_timestamps=False): """ Write metadata-only Parquet file from schema @@ -782,5 +784,6 @@ def write_metadata(schema, where, version='1.0'): version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 """ - writer = ParquetWriter(where, schema, version=version) + writer = ParquetWriter(where, schema, version=version, + use_deprecated_int96_timestamps=use_deprecated_int96_timestamps) writer.close() http://git-wip-us.apache.org/repos/asf/arrow/blob/c5a89b7b/python/pyarrow/tests/test_parquet.py 
---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index d17eb14..40e44b3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -456,20 +456,45 @@ def test_date_time_types(): ex_t6 = pa.time32('ms') ex_a6 = pa.Array.from_pandas(data4 * 1000, type=ex_t6) - table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6], + t7 = pa.timestamp('ns') + start = pd.Timestamp('2001-01-01').value + data7 = np.array([start, start + 1, start + 2], dtype='int64') + a7 = pa.Array.from_pandas(data7, type=t7) + + t7_us = pa.timestamp('us') + start = pd.Timestamp('2001-01-01').value + data7_us = np.array([start, start + 1, start + 2], dtype='int64') // 1000 + a7_us = pa.Array.from_pandas(data7_us, type=t7_us) + + table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', - 'time32_from64[s]']) + 'time32_from64[s]', + 'timestamp[ns]']) # date64 as date32 # time32[s] to time32[ms] - expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6], + # 'timestamp[ns]' to 'timestamp[us]' + expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7_us], ['date32', 'date64', 'timestamp[us]', 'time32[s]', 'time64[us]', - 'time32_from64[s]']) + 'time32_from64[s]', + 'timestamp[ns]']) _check_roundtrip(table, expected=expected, version='2.0') + # date64 as date32 + # time32[s] to time32[ms] + # 'timestamp[ns]' is saved as INT96 timestamp + expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7], + ['date32', 'date64', 'timestamp[us]', + 'time32[s]', 'time64[us]', + 'time32_from64[s]', + 'timestamp[ns]']) + + _check_roundtrip(table, expected=expected, version='2.0', + use_deprecated_int96_timestamps=True) + # Unsupported stuff def _assert_unsupported(array): table = pa.Table.from_arrays([array], ['unsupported'])
