[
https://issues.apache.org/jira/browse/ARROW-2450?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16446823#comment-16446823
]
ASF GitHub Bot commented on ARROW-2450:
---
xhochy closed pull request #1891: ARROW-2450: [Python] Test for Parquet
roundtrip of null lists
URL: https://github.com/apache/arrow/pull/1891
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 325272fb18..5177a6c622 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -136,6 +136,6 @@ if [ "$ARROW_TRAVIS_PYTHON_BENCHMARKS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then
# Generate machine information (mandatory)
asv machine --yes
# Run benchmarks on the changeset being tested
- asv run --no-pull --show-stderr --quick ${TRAVIS_COMMIT}^!
+ asv run --no-pull --show-stderr --quick HEAD^!
popd
fi
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 101fcd165e..0d0fde8e84 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -221,6 +221,8 @@ cdef class ColumnChunkMetaData:
property statistics:
def __get__(self):
+if not self.metadata.is_stats_set():
+return None
statistics = RowGroupStatistics()
statistics.init(self.metadata.statistics())
return statistics
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 74e1147363..3fec0f7c4c 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -56,6 +56,39 @@ def _read_table(*args, **kwargs):
return pq.read_table(*args, **kwargs)
+def _roundtrip_table(table, **params):
+buf = io.BytesIO()
+_write_table(table, buf, **params)
+buf.seek(0)
+
+return _read_table(buf)
+
+
+def _check_roundtrip(table, expected=None, **params):
+if expected is None:
+expected = table
+
+result = _roundtrip_table(table, **params)
+if not result.equals(expected):
+print(expected)
+print(result)
+assert result.equals(expected)
+
+result = _roundtrip_table(result, **params)
+assert result.equals(expected)
+
+
+def _roundtrip_pandas_dataframe(df, write_kwargs):
+table = pa.Table.from_pandas(df)
+
+buf = io.BytesIO()
+_write_table(table, buf, **write_kwargs)
+
+buf.seek(0)
+table1 = _read_table(buf)
+return table1.to_pandas()
+
+
@parquet
def test_single_pylist_column_roundtrip(tmpdir):
for dtype in [int, float]:
@@ -76,7 +109,7 @@ def test_single_pylist_column_roundtrip(tmpdir):
def alltypes_sample(size=1, seed=0):
np.random.seed(seed)
-df = pd.DataFrame({
+arrays = {
'uint8': np.arange(size, dtype=np.uint8),
'uint16': np.arange(size, dtype=np.uint16),
'uint32': np.arange(size, dtype=np.uint32),
@@ -93,10 +126,12 @@ def alltypes_sample(size=1, seed=0):
'datetime': np.arange("2016-01-01T00:00:00.001", size,
dtype='datetime64[ms]'),
'str': [str(x) for x in range(size)],
+'empty_str': [''] * size,
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
-'empty_str': [''] * size
-})
-return df
+'null': [None] * size,
+'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
+}
+return pd.DataFrame(arrays)
@parquet
@@ -137,6 +172,23 @@ def test_chunked_table_write(tmpdir):
_check_roundtrip(table, version='2.0')
+@parquet
+def test_empty_table_roundtrip(tmpdir):
+df = alltypes_sample(size=10)
+# The nanosecond->us conversion is a nuisance, so we just avoid it here
+del df['datetime']
+
+# Create a non-empty table to infer the types correctly, then slice to 0
+table = pa.Table.from_pandas(df)
+table = pa.Table.from_arrays(
+[col.data.chunk(0)[:0] for col in table.itercolumns()],
+names=table.schema.names)
+
+assert table.schema.field_by_name('null').type == pa.null()
+assert table.schema.field_by_name('null_list').type == pa.list_(pa.null())
+_check_roundtrip(table, version='2.0')
+
+
@parquet
def test_pandas_parquet_datetime_tz():
import pyarrow.parquet as pq
@@ -765,17 +817,6 @@ def test_sanitized_spark_field_names():
assert result.schema[0].name == expected_name
-def _roundtrip_pandas_dataframe(df, write_kwargs):
-table = pa.Table.from_pandas(df)
-
-buf = io.BytesIO()
-_write_table(table, buf, **write_kwargs)
-
-buf.seek(0)
-table1 = _read_table(buf)
-return table1.to_pandas()
-
-
@parquet
def