Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package python-dask for openSUSE:Factory checked in at 2023-03-28 17:50:14
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-dask (Old)
 and      /work/SRC/openSUSE:Factory/.python-dask.new.31432 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-dask"

Tue Mar 28 17:50:14 2023 rev:64 rq:1074794 version:2023.3.2

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-dask/python-dask.changes	2023-03-15 18:54:26.372361646 +0100
+++ /work/SRC/openSUSE:Factory/.python-dask.new.31432/python-dask.changes	2023-03-28 17:50:19.515252752 +0200
@@ -1,0 +2,38 @@
+Mon Mar 27 16:40:11 UTC 2023 - Ben Greiner <c...@bnavigator.de>
+
+- Update to 2023.3.2
+  ## Enhancements
+  * Deprecate observed=False for groupby with categoricals
+    (GH#10095) Irina Truong
+  * Deprecate axis= for some groupby operations (GH#10094) James
+    Bourbeau
+  * The axis keyword in DataFrame.rolling/Series.rolling is
+    deprecated (GH#10110) Irina Truong
+  * DataFrame._data deprecation in pandas (GH#10081) Irina Truong
+  * Use importlib_metadata backport to avoid CLI UserWarning
+    (GH#10070) Thomas Grainger
+  * Port option parsing logic from dask.dataframe.read_parquet to
+    to_parquet (GH#9981) Anton Loukianov
+  ## Bug Fixes
+  * Avoid using dd.shuffle in groupby-apply (GH#10043) Richard
+    (Rick) Zamora
+  * Enable null hive partitions with pyarrow parquet engine
+    (GH#10007) Richard (Rick) Zamora
+  * Support unknown shapes in *_like functions (GH#10064) Doug
+    Davis
+  ## Maintenance
+  * Restore Entrypoints compatibility (GH#10113) Jacob Tomlinson
+  * Allow pyarrow build to continue on failures (GH#10097) James
+    Bourbeau
+  * Fix test_set_index_on_empty with pyarrow strings active
+    (GH#10054) Irina Truong
+  * Temporarily skip pyarrow_compat tests with pandas 2.0
+    (GH#10063) James Bourbeau
+
+-------------------------------------------------------------------
+Sun Mar 26 17:13:15 UTC 2023 - Ben Greiner <c...@bnavigator.de>
+
+- Add dask-pr10042-parquetstats.patch gh#dask/dask#10042
+- Enable python311 build: numba is not a strict requirement
+
+-------------------------------------------------------------------

Old:
----
  dask-2023.3.1.tar.gz

New:
----
  dask-2023.3.2.tar.gz
  dask-pr10042-parquetstats.patch
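For context on the groupby deprecation entries in the changelog above, a
minimal illustrative sketch (hypothetical data, not taken from the release
notes): with categorical group keys, code relying on the observed=False
default should now state observed explicitly.

    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame(
        {
            "key": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
            "x": [1, 2, 3],
        }
    )
    ddf = dd.from_pandas(pdf, npartitions=1)

    # Relying on the observed=False default with categorical keys is
    # deprecated as of 2023.3.2 (GH#10095) and is expected to warn.
    ddf.groupby("key").x.sum().compute()

    # Forward-compatible spelling: pass observed explicitly.
    ddf.groupby("key", observed=True).x.sum().compute()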
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-dask.spec ++++++
--- /var/tmp/diff_new_pack.XnGtOj/_old	2023-03-28 17:50:20.127255705 +0200
+++ /var/tmp/diff_new_pack.XnGtOj/_new	2023-03-28 17:50:20.131255724 +0200
@@ -49,18 +49,18 @@
 %if "%{flavor}" == ""
 %bcond_with test
 %endif
-# Numba is not ready for python 3.11 yet gh#numba/numba#8304
-%define skip_python311 1
 
 Name:           python-dask%{psuffix}
 # ===> Note: python-dask MUST be updated in sync with python-distributed! <===
-Version:        2023.3.1
+Version:        2023.3.2
 Release:        0
 Summary:        Minimal task scheduling abstraction
 License:        BSD-3-Clause
 URL:            https://dask.org
 # SourceRepository: https://github.com/dask/dask
 Source0:        https://files.pythonhosted.org/packages/source/d/dask/dask-%{version}.tar.gz
+# PATCH-FIX-UPSTREAM dask-pr10042-parquetstats.patch gh#dask/dask#10042
+Patch0:         dask-pr10042-parquetstats.patch
 BuildRequires:  %{python_module base >= 3.8}
 BuildRequires:  %{python_module packaging >= 20.0}
 BuildRequires:  %{python_module pip}
@@ -72,6 +72,7 @@
 Requires:       python-click >= 7
 Requires:       python-cloudpickle >= 1.1.1
 Requires:       python-fsspec >= 0.6.0
+Requires:       python-importlib-metadata >= 4.13.0
 Requires:       python-packaging >= 20.0
 Requires:       python-partd >= 1.2.0
 Requires:       python-toolz >= 0.8.2
@@ -116,13 +117,13 @@
 BuildRequires:  %{python_module matplotlib}
 BuildRequires:  %{python_module mimesis}
 BuildRequires:  %{python_module multipledispatch}
-BuildRequires:  %{python_module numba}
+BuildRequires:  %{python_module numba if %python-base < 3.11}
 # snappy required for using fastparquet
 BuildRequires:  %{python_module python-snappy}
 BuildRequires:  %{python_module requests}
 BuildRequires:  %{python_module scikit-image}
 BuildRequires:  %{python_module scipy}
-BuildRequires:  %{python_module sparse}
+BuildRequires:  %{python_module sparse if %python-base < 3.11}
 BuildRequires:  %{python_module tables}
 BuildRequires:  %{python_module xarray if %python-base >= 3.9}
 BuildRequires:  %{python_module zarr}
@@ -153,9 +154,9 @@
 Requires:       %{name}-diagnostics = %{version}
 Requires:       %{name}-distributed = %{version}
 Requires:       %{name}-dot = %{version}
+Requires:       python-lz4 >= 4.3.2
 # Added to the [complete] extra in 2023.3.1, not available for TW yet
 #Requires:      python-pyarrow >= 7
-Requires:       python-lz4 >= 4.3.2
 Provides:       %{name}-all = %{version}-%{release}
 Obsoletes:      %{name}-all < %{version}-%{release}
@@ -375,9 +376,6 @@
 donttest="(test_datasets and test_deterministic)"
 # upstreams test if their ci is up to date, irrelevant for obs
 donttest+=" or test_development_guidelines_matches_ci"
-# requires otherwise optional pyarrow (not available on TW) -- https://github.com/dask/dask/issues/10042
-donttest+=" or (test_select_filtered_column and fastparquet)"
-donttest+=" or test_read_parquet_convert_string_fastparquet_warns"
 if [[ $(getconf LONG_BIT) -eq 32 ]]; then
     # https://github.com/dask/dask/issues/8620
     donttest+=" or test_query_with_meta"

++++++ _multibuild ++++++
--- /var/tmp/diff_new_pack.XnGtOj/_old	2023-03-28 17:50:20.163255878 +0200
+++ /var/tmp/diff_new_pack.XnGtOj/_new	2023-03-28 17:50:20.167255898 +0200
@@ -2,6 +2,6 @@
   <package>test-py38</package>
   <package>test-py39</package>
   <package>test-py310</package>
-  <!-- package>test-py311</package -->
+  <package>test-py311</package>
 </multibuild>

++++++ dask-2023.3.1.tar.gz -> dask-2023.3.2.tar.gz ++++++
/work/SRC/openSUSE:Factory/python-dask/dask-2023.3.1.tar.gz /work/SRC/openSUSE:Factory/.python-dask.new.31432/dask-2023.3.2.tar.gz differ: char 5, line 1
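The new python-importlib-metadata Requires in the spec above backs the
changelog entry "Use importlib_metadata backport to avoid CLI UserWarning"
(GH#10070). A rough sketch of the usual backport-first import pattern
(illustrative only, not a copy of dask's code): the backport's
entry_points() accepts the group= selector on all supported Pythons, while
the stdlib version only gained it in Python 3.10.

    # Prefer the backport so entry_points(group=...) works on Python < 3.10,
    # falling back to the stdlib module otherwise.
    try:
        from importlib_metadata import entry_points
    except ImportError:
        from importlib.metadata import entry_points

    # Example: list console scripts registered in the environment.
    for ep in entry_points(group="console_scripts"):
        print(ep.name, "->", ep.value)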
++++++ dask-pr10042-parquetstats.patch ++++++
diff --git a/dask/dataframe/io/parquet/core.py b/dask/dataframe/io/parquet/core.py
index 07fa3a0e8f6..cc9ecaddb20 100644
--- a/dask/dataframe/io/parquet/core.py
+++ b/dask/dataframe/io/parquet/core.py
@@ -1375,14 +1375,20 @@ def apply_conjunction(parts, statistics, conjunction):
                 out_statistics.append(stats)
             else:
                 if (
-                    operator != "is not"
-                    and min is None
-                    and max is None
-                    and null_count
+                    # Must allow row-groups with "missing" stats
+                    (min is None and max is None and not null_count)
+                    # Check "is" and "is not" filters first
                     or operator == "is"
                     and null_count
                     or operator == "is not"
                     and (not pd.isna(min) or not pd.isna(max))
+                    # Allow all-null row-groups if not filtering out nulls
+                    or operator != "is not"
+                    and min is None
+                    and max is None
+                    and null_count
+                    # Start conventional (non-null) filtering
+                    # (min/max cannot be None for remaining checks)
                     or operator in ("==", "=")
                     and min <= value <= max
                     or operator == "!="
diff --git a/dask/dataframe/io/tests/test_parquet.py b/dask/dataframe/io/tests/test_parquet.py
index f2403106eb9..f9c10d573dc 100644
--- a/dask/dataframe/io/tests/test_parquet.py
+++ b/dask/dataframe/io/tests/test_parquet.py
@@ -4718,10 +4718,28 @@ def test_fsspec_to_parquet_filesystem_option(tmp_path):
 
 
 def test_select_filtered_column(tmp_path, engine):
     df = pd.DataFrame({"a": range(10), "b": ["cat"] * 10})
     path = tmp_path / "test_select_filtered_column.parquet"
-    df.to_parquet(path, index=False)
+    stats = {"write_statistics" if engine == "pyarrow" else "stats": True}
+    df.to_parquet(path, engine=engine, index=False, **stats)
 
     with pytest.warns(UserWarning, match="Sorted columns detected"):
         ddf = dd.read_parquet(path, engine=engine, filters=[("b", "==", "cat")])
+    assert_eq(df, ddf)
+
+    with pytest.warns(UserWarning, match="Sorted columns detected"):
+        ddf = dd.read_parquet(path, engine=engine, filters=[("b", "is not", None)])
+    assert_eq(df, ddf)
+
+
+def test_select_filtered_column_no_stats(tmp_path, engine):
+    df = pd.DataFrame({"a": range(10), "b": ["cat"] * 10})
+    path = tmp_path / "test_select_filtered_column_no_stats.parquet"
+    stats = {"write_statistics" if engine == "pyarrow" else "stats": False}
+    df.to_parquet(path, engine=engine, **stats)
+
+    ddf = dd.read_parquet(path, engine=engine, filters=[("b", "==", "cat")])
+    assert_eq(df, ddf)
+
+    ddf = dd.read_parquet(path, engine=engine, filters=[("b", "is not", None)])
     assert_eq(df, ddf)
@@ -4793,6 +4811,7 @@ def test_read_parquet_convert_string_nullable_mapper(tmp_path, engine):
     assert_eq(ddf, expected)
 
 
+@PYARROW_MARK  # We get an error instead of a warning without pyarrow
 @FASTPARQUET_MARK
 @pytest.mark.skipif(
     not PANDAS_GT_200, reason="dataframe.convert_string requires pandas>=2.0"
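To make the patched condition easier to follow, here is a condensed,
standalone restatement of the row-group pruning rule for a single filter
term. This is a sketch that mirrors the patch's variable names (min, max and
null_count shadow the builtins, as in the diff), not dask's actual helper,
and it only covers the operators visible in the hunk above.

    import pandas as pd

    def row_group_may_match(operator, value, min, max, null_count):
        """True if the row-group's statistics cannot rule out the filter."""
        # Row-groups with no usable statistics at all must always be kept.
        if min is None and max is None and not null_count:
            return True
        # Null-membership filters consult only null_count and stat presence.
        if operator == "is":
            return bool(null_count)
        if operator == "is not":
            return not pd.isna(min) or not pd.isna(max)
        # All-null row-groups survive every remaining (non-"is not") filter.
        if min is None and max is None and null_count:
            return True
        # Conventional range checks; min/max are non-None from here on.
        if operator in ("==", "="):
            return min <= value <= max
        if operator == "!=":
            # The hunk truncates before this branch; a conservative rule is
            # to prune only when every value provably equals the operand.
            return not (min == max == value)
        raise ValueError(f"unsupported operator: {operator!r}")

The companion tests exercise both sides of the first rule: one test writes
the file with statistics enabled (write_statistics= for pyarrow, stats= for
fastparquet) and one with them disabled, which is also what allowed the spec
above to drop its fastparquet-related donttest skips.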