Repository: arrow Updated Branches: refs/heads/master a81aefbd8 -> 8f2b44b89
ARROW-1051: [Python] Opt in to Parquet unit tests to avoid accidental suppression of dynamic linking errors Author: Wes McKinney <[email protected]> Closes #729 from wesm/ARROW-1051 and squashes the following commits: 019b9ec [Wes McKinney] Statically link boost in parquet-cpp 5103077 [Wes McKinney] See if updating conda helps 7eac948 [Wes McKinney] See if setting PATH solves problem e246e19 [Wes McKinney] Red herring, issue was runtime library loading 6bc0492 [Wes McKinney] Set PARQUET_ARROW_VERSION in Windows build a1f2d2b [Wes McKinney] Opt in to Parquet unit tests so that import errors from pyarrow.parquet bubble up Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8f2b44b8 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8f2b44b8 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8f2b44b8 Branch: refs/heads/master Commit: 8f2b44b897b7083ee2a296c70397dc2d7d21d95e Parents: a81aefb Author: Wes McKinney <[email protected]> Authored: Mon Jun 5 12:18:32 2017 +0200 Committer: Uwe L. Korn <[email protected]> Committed: Mon Jun 5 12:18:32 2017 +0200 ---------------------------------------------------------------------- ci/msvc-build.bat | 10 ++- ci/travis_script_python.sh | 2 +- python/pyarrow/tests/conftest.py | 2 +- python/pyarrow/tests/test_parquet.py | 132 +++++++++++++++++++----------- 4 files changed, 92 insertions(+), 54 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/ci/msvc-build.bat ---------------------------------------------------------------------- diff --git a/ci/msvc-build.bat b/ci/msvc-build.bat index d13c11f..263d4bc 100644 --- a/ci/msvc-build.bat +++ b/ci/msvc-build.bat @@ -17,6 +17,8 @@ @echo on +conda update --yes --quiet conda + conda create -n arrow -q -y python=%PYTHON% ^ six pytest setuptools numpy pandas cython conda install -n arrow -q -y -c conda-forge ^ @@ -43,7 +45,7 @@ cmake -G "%GENERATOR%" ^ cmake --build . --target INSTALL --config Release || exit /B @rem Needed so python-test.exe works -set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX% +set PYTHONPATH=%CONDA_PREFIX%\Lib;%CONDA_PREFIX%\Lib\site-packages;%CONDA_PREFIX%\python35.zip;%CONDA_PREFIX%\DLLs;%CONDA_PREFIX%;%PYTHONPATH% ctest -VV || exit /B popd @@ -59,15 +61,17 @@ set PARQUET_HOME=%CONDA_PREFIX%\Library cmake -G "%GENERATOR%" ^ -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ -DCMAKE_BUILD_TYPE=Release ^ + -DPARQUET_BOOST_USE_SHARED=OFF ^ -DPARQUET_ZLIB_VENDORED=off ^ -DPARQUET_BUILD_TESTS=off .. || exit /B cmake --build . --target INSTALL --config Release || exit /B popd @rem Build and import pyarrow -set PYTHONPATH= +@rem parquet-cpp has some additional runtime dependencies that we need to figure out +@rem see PARQUET-1018 pushd python python setup.py build_ext --inplace --with-parquet --bundle-arrow-cpp bdist_wheel || exit /B -py.test pyarrow -v -s || exit /B +py.test pyarrow -v -s --parquet || exit /B popd http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/ci/travis_script_python.sh ---------------------------------------------------------------------- diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index c3735cc..904db52 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -111,7 +111,7 @@ python_version_tests() { python -c "import pyarrow.parquet" python -c "import pyarrow._jemalloc" - python -m pytest -vv -r sxX pyarrow + python -m pytest -vv -r sxX pyarrow --parquet # Build documentation once if [[ "$PYTHON_VERSION" == "3.6" ]] http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/python/pyarrow/tests/conftest.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index d5b4b69..9b767fc 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -18,7 +18,7 @@ from pytest import skip -groups = ['hdfs'] +groups = ['hdfs', 'parquet'] def pytest_configure(config): http://git-wip-us.apache.org/repos/asf/arrow/blob/8f2b44b8/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 5f65f28..052d395 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -32,13 +32,20 @@ import pandas as pd import pandas.util.testing as tm -# Skip all parquet tests if we can't import pyarrow.parquet -pq = pytest.importorskip('pyarrow.parquet') - # Ignore these with pytest ... -m 'not parquet' parquet = pytest.mark.parquet +def _write_table(*args, **kwargs): + import pyarrow.parquet as pq + return pq.write_table(*args, **kwargs) + + +def _read_table(*args, **kwargs): + import pyarrow.parquet as pq + return pq.read_table(*args, **kwargs) + + @parquet def test_single_pylist_column_roundtrip(tmpdir): for dtype in [int, float]: @@ -46,8 +53,8 @@ def test_single_pylist_column_roundtrip(tmpdir): .format(dtype.__name__)) data = [pa.array(list(map(dtype, range(5))))] table = pa.Table.from_arrays(data, names=('a', 'b')) - pq.write_table(table, filename.strpath) - table_read = pq.read_table(filename.strpath) + _write_table(table, filename.strpath) + table_read = _read_table(filename.strpath) for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): assert col_written.name == col_read.name @@ -84,13 +91,14 @@ def alltypes_sample(size=10000, seed=0): @parquet def test_pandas_parquet_2_0_rountrip(tmpdir): + import pyarrow.parquet as pq df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True) assert b'pandas' in arrow_table.schema.metadata - pq.write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0") table_read = pq.read_pandas(filename.strpath) assert b'pandas' in table_read.schema.metadata @@ -102,13 +110,15 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): @parquet def test_pandas_parquet_custom_metadata(tmpdir): + import pyarrow.parquet as pq + df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True) assert b'pandas' in arrow_table.schema.metadata - pq.write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0") pf = pq.ParquetFile(filename.strpath) md = pf.metadata.metadata @@ -120,6 +130,8 @@ def test_pandas_parquet_custom_metadata(tmpdir): @parquet def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir): + import pyarrow.parquet as pq + df = alltypes_sample(size=10000) filename = tmpdir.join('pandas_rountrip.parquet') @@ -129,7 +141,7 @@ def test_pandas_parquet_2_0_rountrip_read_pandas_no_index_written(tmpdir): js = json.loads(arrow_table.schema.metadata[b'pandas'].decode('utf8')) assert not js['index_columns'] - pq.write_table(arrow_table, filename.strpath, version="2.0") + _write_table(arrow_table, filename.strpath, version="2.0") table_read = pq.read_pandas(filename.strpath) js = json.loads(table_read.schema.metadata[b'pandas'].decode('utf8')) @@ -163,8 +175,8 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df) - pq.write_table(arrow_table, filename.strpath, version="1.0") - table_read = pq.read_table(filename.strpath) + _write_table(arrow_table, filename.strpath, version="1.0") + table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 @@ -183,8 +195,8 @@ def test_pandas_column_selection(tmpdir): }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df) - pq.write_table(arrow_table, filename.strpath) - table_read = pq.read_table(filename.strpath, columns=['uint8']) + _write_table(arrow_table, filename.strpath) + table_read = _read_table(filename.strpath, columns=['uint8']) df_read = table_read.to_pandas() tm.assert_frame_equal(df[['uint8']], df_read) @@ -223,19 +235,21 @@ def test_pandas_parquet_native_file_roundtrip(tmpdir): df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() - pq.write_table(arrow_table, imos, version="2.0") + _write_table(arrow_table, imos, version="2.0") buf = imos.get_result() reader = pa.BufferReader(buf) - df_read = pq.read_table(reader).to_pandas() + df_read = _read_table(reader).to_pandas() tm.assert_frame_equal(df, df_read) @parquet def test_read_pandas_column_subset(tmpdir): + import pyarrow.parquet as pq + df = _test_dataframe(10000) arrow_table = pa.Table.from_pandas(df) imos = pa.BufferOutputStream() - pq.write_table(arrow_table, imos, version="2.0") + _write_table(arrow_table, imos, version="2.0") buf = imos.get_result() reader = pa.BufferReader(buf) df_read = pq.read_pandas(reader, columns=['strings', 'uint8']).to_pandas() @@ -257,11 +271,11 @@ def test_pandas_parquet_pyfile_roundtrip(tmpdir): arrow_table = pa.Table.from_pandas(df) with open(filename, 'wb') as f: - pq.write_table(arrow_table, f, version="1.0") + _write_table(arrow_table, f, version="1.0") data = io.BytesIO(open(filename, 'rb').read()) - table_read = pq.read_table(data) + table_read = _read_table(data) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -287,27 +301,29 @@ def test_pandas_parquet_configuration_options(tmpdir): arrow_table = pa.Table.from_pandas(df) for use_dictionary in [True, False]: - pq.write_table(arrow_table, filename.strpath, - version="2.0", - use_dictionary=use_dictionary) - table_read = pq.read_table(filename.strpath) + _write_table(arrow_table, filename.strpath, + version="2.0", + use_dictionary=use_dictionary) + table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) for compression in ['NONE', 'SNAPPY', 'GZIP']: - pq.write_table(arrow_table, filename.strpath, - version="2.0", - compression=compression) - table_read = pq.read_table(filename.strpath) + _write_table(arrow_table, filename.strpath, + version="2.0", + compression=compression) + table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) def make_sample_file(df): + import pyarrow.parquet as pq + a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) buf = io.BytesIO() - pq.write_table(a_table, buf, compression='SNAPPY', version='2.0') + _write_table(a_table, buf, compression='SNAPPY', version='2.0') buf.seek(0) return pq.ParquetFile(buf) @@ -384,8 +400,8 @@ def test_column_of_arrays(tmpdir): filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema) - pq.write_table(arrow_table, filename.strpath, version="2.0") - table_read = pq.read_table(filename.strpath) + _write_table(arrow_table, filename.strpath, version="2.0") + table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -397,8 +413,8 @@ def test_column_of_lists(tmpdir): filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = pa.Table.from_pandas(df, timestamps_to_ms=True, schema=schema) - pq.write_table(arrow_table, filename.strpath, version="2.0") - table_read = pq.read_table(filename.strpath) + _write_table(arrow_table, filename.strpath, version="2.0") + table_read = _read_table(filename.strpath) df_read = table_read.to_pandas() tm.assert_frame_equal(df, df_read) @@ -449,7 +465,7 @@ def test_date_time_types(): buf = io.BytesIO() with pytest.raises(NotImplementedError): - pq.write_table(table, buf, version="2.0") + _write_table(table, buf, version="2.0") t7 = pa.time64('ns') a7 = pa.Array.from_pandas(data4.astype('int64'), type=t7) @@ -470,13 +486,13 @@ def test_fixed_size_binary(): def _check_roundtrip(table, expected=None, **params): buf = io.BytesIO() - pq.write_table(table, buf, **params) + _write_table(table, buf, **params) buf.seek(0) if expected is None: expected = table - result = pq.read_table(buf) + result = _read_table(buf) assert result.equals(expected) @@ -487,13 +503,13 @@ def test_multithreaded_read(): table = pa.Table.from_pandas(df, timestamps_to_ms=True) buf = io.BytesIO() - pq.write_table(table, buf, compression='SNAPPY', version='2.0') + _write_table(table, buf, compression='SNAPPY', version='2.0') buf.seek(0) - table1 = pq.read_table(buf, nthreads=4) + table1 = _read_table(buf, nthreads=4) buf.seek(0) - table2 = pq.read_table(buf, nthreads=1) + table2 = _read_table(buf, nthreads=1) assert table1.equals(table2) @@ -504,26 +520,28 @@ def test_min_chunksize(): table = pa.Table.from_pandas(data.reset_index()) buf = io.BytesIO() - pq.write_table(table, buf, chunk_size=-1) + _write_table(table, buf, chunk_size=-1) buf.seek(0) - result = pq.read_table(buf) + result = _read_table(buf) assert result.equals(table) with pytest.raises(ValueError): - pq.write_table(table, buf, chunk_size=0) + _write_table(table, buf, chunk_size=0) @parquet def test_pass_separate_metadata(): + import pyarrow.parquet as pq + # ARROW-471 df = alltypes_sample(size=10000) a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) buf = io.BytesIO() - pq.write_table(a_table, buf, compression='snappy', version='2.0') + _write_table(a_table, buf, compression='snappy', version='2.0') buf.seek(0) metadata = pq.ParquetFile(buf).metadata @@ -537,6 +555,8 @@ def test_pass_separate_metadata(): @parquet def test_read_single_row_group(): + import pyarrow.parquet as pq + # ARROW-471 N, K = 10000, 4 df = alltypes_sample(size=N) @@ -544,8 +564,8 @@ def test_read_single_row_group(): a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) buf = io.BytesIO() - pq.write_table(a_table, buf, row_group_size=N / K, - compression='snappy', version='2.0') + _write_table(a_table, buf, row_group_size=N / K, + compression='snappy', version='2.0') buf.seek(0) @@ -560,13 +580,15 @@ def test_read_single_row_group(): @parquet def test_read_single_row_group_with_column_subset(): + import pyarrow.parquet as pq + N, K = 10000, 4 df = alltypes_sample(size=N) a_table = pa.Table.from_pandas(df, timestamps_to_ms=True) buf = io.BytesIO() - pq.write_table(a_table, buf, row_group_size=N / K, - compression='snappy', version='2.0') + _write_table(a_table, buf, row_group_size=N / K, + compression='snappy', version='2.0') buf.seek(0) pf = pq.ParquetFile(buf) @@ -579,11 +601,13 @@ def test_read_single_row_group_with_column_subset(): @parquet def test_parquet_piece_read(tmpdir): + import pyarrow.parquet as pq + df = _test_dataframe(1000) table = pa.Table.from_pandas(df) path = tmpdir.join('parquet_piece_read.parquet').strpath - pq.write_table(table, path, version='2.0') + _write_table(table, path, version='2.0') piece1 = pq.ParquetDatasetPiece(path) @@ -593,6 +617,8 @@ def test_parquet_piece_read(tmpdir): @parquet def test_parquet_piece_basics(): + import pyarrow.parquet as pq + path = '/baz.parq' piece1 = pq.ParquetDatasetPiece(path) @@ -612,6 +638,8 @@ def test_parquet_piece_basics(): @parquet def test_partition_set_dictionary_type(): + import pyarrow.parquet as pq + set1 = pq.PartitionSet('key1', [u('foo'), u('bar'), u('baz')]) set2 = pq.PartitionSet('key2', [2007, 2008, 2009]) @@ -625,6 +653,8 @@ def test_partition_set_dictionary_type(): @parquet def test_read_partitioned_directory(tmpdir): + import pyarrow.parquet as pq + foo_keys = [0, 1] bar_keys = ['a', 'b', 'c'] partition_spec = [ @@ -681,7 +711,7 @@ def _generate_partition_directories(base_dir, partition_spec, df): filtered_df = _filter_partition(df, this_part_keys) part_table = pa.Table.from_pandas(filtered_df) - pq.write_table(part_table, file_path) + _write_table(part_table, file_path) else: _visit_level(level_dir, level + 1, this_part_keys) @@ -690,6 +720,8 @@ def _generate_partition_directories(base_dir, partition_spec, df): @parquet def test_read_common_metadata_files(tmpdir): + import pyarrow.parquet as pq + N = 100 df = pd.DataFrame({ 'index': np.arange(N), @@ -700,7 +732,7 @@ def test_read_common_metadata_files(tmpdir): data_path = pjoin(base_path, 'data.parquet') table = pa.Table.from_pandas(df) - pq.write_table(table, data_path) + _write_table(table, data_path) metadata_path = pjoin(base_path, '_metadata') pq.write_metadata(table.schema, metadata_path) @@ -729,6 +761,8 @@ def _filter_partition(df, part_keys): @parquet def test_read_multiple_files(tmpdir): + import pyarrow.parquet as pq + nfiles = 10 size = 5 @@ -746,7 +780,7 @@ def test_read_multiple_files(tmpdir): path = pjoin(dirpath, '{0}.parquet'.format(i)) table = pa.Table.from_pandas(df) - pq.write_table(table, path) + _write_table(table, path) test_data.append(table) paths.append(path) @@ -792,7 +826,7 @@ def test_read_multiple_files(tmpdir): bad_apple_path = tmpdir.join('{0}.parquet'.format(guid())).strpath t = pa.Table.from_pandas(bad_apple) - pq.write_table(t, bad_apple_path) + _write_table(t, bad_apple_path) bad_meta = pq.ParquetFile(bad_apple_path).metadata
