Repository: arrow
Updated Branches:
  refs/heads/master 9f5e17448 -> fd000964d


ARROW-723: [Python] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop

This should also be fixed in parquet-cpp, will open a JIRA.

Author: Wes McKinney <[email protected]>

Closes #468 from wesm/ARROW-723 and squashes the following commits:

f938703 [Wes McKinney] Raise if row group size is 0, use default if -1
5f83850 [Wes McKinney] Ensure that passing chunk_size=0 when writing Parquet file does not enter infinite loop


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fd000964
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fd000964
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fd000964

Branch: refs/heads/master
Commit: fd000964d218b355e725d8eced1d1301f36dc092
Parents: 9f5e174
Author: Wes McKinney <[email protected]>
Authored: Sat Apr 1 11:19:09 2017 -0400
Committer: Wes McKinney <[email protected]>
Committed: Sat Apr 1 11:19:09 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/_parquet.pyx          |  5 ++++-
 python/pyarrow/parquet.py            |  2 +-
 python/pyarrow/tests/test_parquet.py | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/_parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 8e67da9..c4cbd28 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -538,10 +538,13 @@ cdef class ParquetWriter:
     def write_table(self, Table table, row_group_size=None):
         cdef CTable* ctable = table.table
 
-        if row_group_size is None:
+        if row_group_size is None or row_group_size == -1:
             row_group_size = ctable.num_rows()
+        elif row_group_size == 0:
+            raise ValueError('Row group size cannot be 0')
 
         cdef int c_row_group_size = row_group_size
+
         with nogil:
             check_status(WriteTable(deref(ctable), self.allocator,
                                     self.sink, c_row_group_size,

http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index fa96f95..2985316 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -187,7 +187,7 @@ def write_table(table, sink, chunk_size=None, version='1.0',
     ----------
     table : pyarrow.Table
     sink: string or pyarrow.io.NativeFile
-    chunk_size : int
+    chunk_size : int, default None
         The maximum number of rows in each Parquet RowGroup. As a default,
         we will write a single RowGroup per file.
     version : {"1.0", "2.0"}, default "1.0"

http://git-wip-us.apache.org/repos/asf/arrow/blob/fd000964/python/pyarrow/tests/test_parquet.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index fc32b9f..b8b2800 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -366,6 +366,23 @@ def test_multithreaded_read():
 
 
 @parquet
+def test_min_chunksize():
+    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
+    table = pa.Table.from_pandas(data.reset_index())
+
+    buf = io.BytesIO()
+    pq.write_table(table, buf, chunk_size=-1)
+
+    buf.seek(0)
+    result = pq.read_table(buf)
+
+    assert result.equals(table)
+
+    with pytest.raises(ValueError):
+        pq.write_table(table, buf, chunk_size=0)
+
+
+@parquet
 def test_pass_separate_metadata():
     # ARROW-471
     df = alltypes_sample(size=10000)
