Bug#969648: dask, pandas 1.1
On 19/10/2020 20:07, Stefano Rivera wrote: Hi Rebecca (2020.10.19_11:51:33_-0700) Or maybe not an actual regression...it's a ~5e-7 difference and one of the things the patch does (at around dask/dataframe/tests/test_rolling.py:270) is _tighten_ the tolerance on that test. Hrm, I didn't see that failure. Testing again on a 32bit arch to be sure... My log is from amd64, but I don't know if it's reproducible.
Bug#969648: dask, pandas 1.1
Or maybe not an actual regression...it's a ~5e-7 difference and one of the things the patch does (at around dask/dataframe/tests/test_rolling.py:270) is _tighten_ the tolerance on that test. I have filed a separate bug (#972516) for the fsspec issues.
Bug#969648: dask, pandas 1.1
I have now tested it. (The dask tests are run in autopkgtest, not build.) The attached is what I have so far, but it had these failures. The first two happen with or without 969648.patch and (from debci results) appear to be triggered by the new fsspec, but the last is a *regression* caused by this patch. === FAILURES === _ test_errors __ dir_server = '/tmp/tmpuxg_g6b8' def test_errors(dir_server): f = open_files("http://localhost:8999/doesnotexist")[0] with pytest.raises(requests.exceptions.RequestException): with f as f: > f.read() /usr/lib/python3/dist-packages/dask/bytes/tests/test_http.py:117: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ /usr/lib/python3/dist-packages/fsspec/implementations/http.py:343: in read self._fetch_all() /usr/lib/python3/dist-packages/fsspec/asyn.py:121: in wrapper return maybe_sync(func, self, *args, **kwargs) /usr/lib/python3/dist-packages/fsspec/asyn.py:100: in maybe_sync return sync(loop, func, *args, **kwargs) /usr/lib/python3/dist-packages/fsspec/asyn.py:71: in sync raise exc.with_traceback(tb) /usr/lib/python3/dist-packages/fsspec/asyn.py:55: in f result[0] = await future /usr/lib/python3/dist-packages/fsspec/implementations/http.py:360: in async_fetch_all r.raise_for_status() _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ self = <ClientResponse(http://localhost:8999/doesnotexist) [404 File not found]> <CIMultiDictProxy('Date': '... GMT', 'Connection': 'close', 'Content-Type': 'text/html;charset=utf-8', 'Content-Length': '469')> def raise_for_status(self) -> None: if 400 <= self.status: # reason should always be not None for a started response assert self.reason is not None self.release() > raise ClientResponseError( self.request_info, self.history, status=self.status, message=self.reason, headers=self.headers) E aiohttp.client_exceptions.ClientResponseError: 404, message='File not found', url=URL('http://localhost:8999/doesnotexist') /usr/lib/python3/dist-packages/aiohttp/client_reqrep.py:941: 
ClientResponseError - Captured stderr call - 127.0.0.1 - - [19/Oct/2020 17:38:10] code 404, message File not found 127.0.0.1 - - [19/Oct/2020 17:38:10] "HEAD /doesnotexist HTTP/1.1" 404 - 127.0.0.1 - - [19/Oct/2020 17:38:10] code 404, message File not found 127.0.0.1 - - [19/Oct/2020 17:38:10] "GET /doesnotexist HTTP/1.1" 404 - test_urlpath_inference_errors _ def test_urlpath_inference_errors(): # Empty list with pytest.raises(ValueError, match="empty"): get_fs_token_paths([]) # Protocols differ with pytest.raises(ValueError, match="the same protocol"): get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"]) # Options differ with pytest.raises(ValueError, match="the same file-system options"): get_fs_token_paths( [ "ftp://myu...@node.com/test/path.csv", "ftp://otheru...@node.com/other/path.csv", ] ) # Unknown type with pytest.raises(TypeError): > get_fs_token_paths( { "sets/are.csv", "unordered/so/they.csv", "should/not/be.csv", "allowed.csv", } ) E Failed: DID NOT RAISE /usr/lib/python3/dist-packages/dask/bytes/tests/test_local.py:86: Failed __ test_time_rolling_methods[window3-std-args6-True] ___ method = 'std', args = (), window = <5 * Seconds>, check_less_precise = {} @pytest.mark.parametrize( "method,args,check_less_precise", rolling_method_args_check_less_precise ) @pytest.mark.parametrize("window", ["1S", "2S", "3S", pd.offsets.Second(5)]) def test_time_rolling_methods(method, args, window, check_less_precise): if dd._compat.PANDAS_GT_110: check_less_precise = {} else: check_less_precise = {"check_less_precise": check_less_precise} # DataFrame if method == "apply": kwargs = {"raw": False} else: kwargs = {} prolling = ts.rolling(window) drolling = dts.rolling(window) > assert_eq( getattr(prolling, method)(*args, **kwargs), getattr(drolling, method)(*args, **kwargs), **check_less_precise, ) /usr/lib/python3/dist-packages/dask/dataframe/tests/test_rolling.py:288: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
Bug#969648: dask, pandas 1.1
Hi Rebecca (2020.10.19_12:07:08_-0700) > > Or maybe not an actual regression...it's a ~5e-7 difference and one of the > > things the patch does (at around dask/dataframe/tests/test_rolling.py:270) > > is _tighten_ the tolerance on that test. > > Hrm, I didn't see that failure. Testing again on a 32bit arch to be > sure... Aha. Reproduced. And found https://github.com/dask/dask/pull/6502 SR -- Stefano Rivera http://tumbleweed.org.za/ +1 415 683 3272
Bug#969648: dask, pandas 1.1
Hi Rebecca (2020.10.19_11:26:19_-0700) > I have now tested it. (The dask tests are run in autopkgtest, not build.) Thanks. I took your untested patch and tested it, too. It needed some tweaking, which it looks like you've also done. > The attached is what I have so far, but it had these failures. The first > two happen with or without 969648.patch and (from debci results) appear to > be triggered by the new fsspec, but the last is a *regression* caused by > this patch. I cherry picked these to fix these failures: https://github.com/dask/dask/pull/6331 https://github.com/dask/dask/pull/6446 SR -- Stefano Rivera http://tumbleweed.org.za/ +1 415 683 3272
Bug#969648: dask, pandas 1.1
Hi Rebecca (2020.10.19_11:51:33_-0700) > Or maybe not an actual regression...it's a ~5e-7 difference and one of the > things the patch does (at around dask/dataframe/tests/test_rolling.py:270) > is _tighten_ the tolerance on that test. Hrm, I didn't see that failure. Testing again on a 32bit arch to be sure... > That looks like my earlier version, which fails with NameError. Yeah, I applied it as-is first, and then followed up with the fixes, after seeing the test failures. SR -- Stefano Rivera http://tumbleweed.org.za/ +1 415 683 3272
Bug#969648: dask, pandas 1.1
The upstream patch doesn't even apply as-is; this version does, but I don't have time right now to actually test it. There's also a circular dependency problem, as dask indirectly build-depends on itself and my new pandas makes it uninstallable. Description: pandas 1.1 compatibility Origin: part of upstream f212b76fefeb93298205d7d224cbc1f7ed387ce9 Author: Tom Augspurger, Rebecca Palmer diff --git a/dask/dataframe/core.py b/dask/dataframe/core.py index 4a5c6d1f..cedd46fc 100644 --- a/dask/dataframe/core.py +++ b/dask/dataframe/core.py @@ -2487,7 +2487,7 @@ Dask Name: {name}, {task} tasks""" else: is_anchored = offset.isAnchored() -include_right = is_anchored or not hasattr(offset, "_inc") +include_right = is_anchored or not hasattr(offset, "delta") if end == self.npartitions - 1: divs = self.divisions @@ -4106,7 +4106,7 @@ class DataFrame(_Frame): left_index=on is None, right_index=True, left_on=on, -suffixes=[lsuffix, rsuffix], +suffixes=(lsuffix, rsuffix), npartitions=npartitions, shuffle=shuffle, ) diff --git a/dask/dataframe/tests/test_dataframe.py b/dask/dataframe/tests/test_dataframe.py index 64c15000..5e4f2bef 100644 --- a/dask/dataframe/tests/test_dataframe.py +++ b/dask/dataframe/tests/test_dataframe.py @@ -37,6 +37,9 @@ dsk = { meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8")) d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9]) full = d.compute() +CHECK_FREQ = {} +if dd._compat.PANDAS_GT_110: +CHECK_FREQ["check_freq"] = False def test_dataframe_doc(): @@ -222,7 +225,18 @@ def test_index_names(): assert ddf.index.compute().name == "x" -@pytest.mark.parametrize("npartitions", [1, pytest.param(2, marks=pytest.mark.xfail)]) +@pytest.mark.parametrize( +"npartitions", +[ +1, +pytest.param( +2, +marks=pytest.mark.xfail( +not dd._compat.PANDAS_GT_110, reason="Fixed upstream." 
+), +), +], +) def test_timezone_freq(npartitions): s_naive = pd.Series(pd.date_range("20130101", periods=10)) s_aware = pd.Series(pd.date_range("20130101", periods=10, tz="US/Eastern")) @@ -385,12 +399,48 @@ def test_describe_numeric(method, test_values): (None, None, None, ["c", "d", "g"]), # numeric + bool (None, None, None, ["c", "d", "f", "g"]), # numeric + bool + timedelta (None, None, None, ["f", "g"]), # bool + timedelta -("all", None, None, None), -(["number"], None, [0.25, 0.5], None), -([np.timedelta64], None, None, None), -(["number", "object"], None, [0.25, 0.75], None), -(None, ["number", "object"], None, None), -(["object", "datetime", "bool"], None, None, None), +pytest.param( +"all", +None, +None, +None, +marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"), +), +pytest.param( +["number"], +None, +[0.25, 0.5], +None, +marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"), +), +pytest.param( +[np.timedelta64], +None, +None, +None, +marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"), +), +pytest.param( +["number", "object"], +None, +[0.25, 0.75], +None, +marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"), +), +pytest.param( +None, +["number", "object"], +None, +None, +marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"), +), +pytest.param( +["object", "datetime", "bool"], +None, +None, +None, +marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"), +), ], ) def test_describe(include, exclude, percentiles, subset): @@ -2522,15 +2572,17 @@ def test_to_timestamp(): index = pd.period_range(freq="A", start="1/1/2001", end="12/1/2004") df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]}, index=index) ddf = dd.from_pandas(df, npartitions=3) -assert_eq(ddf.to_timestamp(), df.to_timestamp()) +assert_eq(ddf.to_timestamp(), df.to_timestamp(), **CHECK_FREQ) assert_eq( ddf.to_timestamp(freq="M", how="s").compute(), df.to_timestamp(freq="M", how="s"), +**CHECK_FREQ ) 
assert_eq(ddf.x.to_timestamp(), df.x.to_timestamp()) assert_eq( ddf.x.to_timestamp(freq="M", how="s").compute(), df.x.to_timestamp(freq="M", how="s"), +**CHECK_FREQ ) diff --git a/dask/dataframe/tests/test_extensions.py