Bug#969648: dask, pandas 1.1

2020-10-19 Thread Rebecca N. Palmer

On 19/10/2020 20:07, Stefano Rivera wrote:

> Hi Rebecca (2020.10.19_11:51:33_-0700)
>
> > Or maybe not an actual regression...it's a ~5e-7 difference and one of the
> > things the patch does (at around dask/dataframe/tests/test_rolling.py:270)
> > is _tighten_ the tolerance on that test.
>
> Hrm, I didn't see that failure. Testing again on a 32bit arch to be
> sure...


My log is from amd64, but I don't know if it's reproducible.



Bug#969648: dask, pandas 1.1

2020-10-19 Thread Rebecca N. Palmer
Or maybe not an actual regression...it's a ~5e-7 difference and one of 
the things the patch does (at around 
dask/dataframe/tests/test_rolling.py:270) is _tighten_ the tolerance on 
that test.
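
To put that number in perspective, here is a rough sketch (not from the 
test itself) of how a ~5e-7 difference behaves against a loose versus a 
tightened tolerance, using numpy.allclose as a stand-in for the comparison 
that assert_eq ends up doing:

    import numpy as np

    expected = 1.0
    actual = expected + 5e-7  # roughly the observed rolling-std discrepancy

    # Loose tolerance: a ~5e-7 relative difference is accepted.
    print(np.allclose(actual, expected, rtol=1e-5))          # True
    # Tightened tolerance: the same difference now fails.
    print(np.allclose(actual, expected, rtol=1e-7, atol=0))  # False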


I have filed a separate bug (#972516) for the fsspec issues.



Bug#969648: dask, pandas 1.1

2020-10-19 Thread Rebecca N. Palmer

I have now tested it.  (The dask tests are run in autopkgtest, not during the build.)

The attached is what I have so far, but it had these failures.  The 
first two happen with or without 969648.patch and (from debci results) 
appear to be triggered by the new fsspec, but the last is a *regression* 
caused by this patch.
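
To rerun just these tests against the installed packages, something like 
the following should work (my invocation, not the autopkgtest command; the 
file paths and test names are taken from the tracebacks below):

    import pytest

    pytest.main([
        "/usr/lib/python3/dist-packages/dask/bytes/tests/test_http.py::test_errors",
        "/usr/lib/python3/dist-packages/dask/bytes/tests/test_local.py::test_urlpath_inference_errors",
        "/usr/lib/python3/dist-packages/dask/dataframe/tests/test_rolling.py::test_time_rolling_methods",
    ])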


=== FAILURES ===
_ test_errors __


dir_server = '/tmp/tmpuxg_g6b8'

    def test_errors(dir_server):
        f = open_files("http://localhost:8999/doesnotexist")[0]
        with pytest.raises(requests.exceptions.RequestException):
            with f as f:
>               f.read()

/usr/lib/python3/dist-packages/dask/bytes/tests/test_http.py:117:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

/usr/lib/python3/dist-packages/fsspec/implementations/http.py:343: in read
    self._fetch_all()
/usr/lib/python3/dist-packages/fsspec/asyn.py:121: in wrapper
    return maybe_sync(func, self, *args, **kwargs)
/usr/lib/python3/dist-packages/fsspec/asyn.py:100: in maybe_sync
    return sync(loop, func, *args, **kwargs)
/usr/lib/python3/dist-packages/fsspec/asyn.py:71: in sync
    raise exc.with_traceback(tb)
/usr/lib/python3/dist-packages/fsspec/asyn.py:55: in f
    result[0] = await future
/usr/lib/python3/dist-packages/fsspec/implementations/http.py:360: in async_fetch_all
    r.raise_for_status()
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _


self = <ClientResponse(http://localhost:8999/doesnotexist) [404 File not found]>
<CIMultiDictProxy(..., 'Connection': 'close', 'Content-Type': 'text/html;charset=utf-8', 'Content-Length': '469')>



    def raise_for_status(self) -> None:
        if 400 <= self.status:
            # reason should always be not None for a started response
            assert self.reason is not None
            self.release()
>           raise ClientResponseError(
                self.request_info,
                self.history,
                status=self.status,
                message=self.reason,
                headers=self.headers)
E   aiohttp.client_exceptions.ClientResponseError: 404, message='File not found', url=URL('http://localhost:8999/doesnotexist')


/usr/lib/python3/dist-packages/aiohttp/client_reqrep.py:941: ClientResponseError
- Captured stderr call -

127.0.0.1 - - [19/Oct/2020 17:38:10] code 404, message File not found
127.0.0.1 - - [19/Oct/2020 17:38:10] "HEAD /doesnotexist HTTP/1.1" 404 -
127.0.0.1 - - [19/Oct/2020 17:38:10] code 404, message File not found
127.0.0.1 - - [19/Oct/2020 17:38:10] "GET /doesnotexist HTTP/1.1" 404 -
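
For what it's worth, the mechanism here appears to be that the new async 
fsspec HTTP code raises aiohttp exceptions, which are not part of the 
requests exception hierarchy the test expects; a minimal check (assuming 
both libraries are importable):

    import aiohttp
    import requests

    # pytest.raises(requests.exceptions.RequestException) cannot catch
    # aiohttp's ClientResponseError: the two hierarchies are unrelated.
    print(issubclass(aiohttp.ClientResponseError,
                     requests.exceptions.RequestException))  # False
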
_ test_urlpath_inference_errors _


    def test_urlpath_inference_errors():
        # Empty list
        with pytest.raises(ValueError, match="empty"):
            get_fs_token_paths([])

        # Protocols differ
        with pytest.raises(ValueError, match="the same protocol"):
            get_fs_token_paths(["s3://test/path.csv", "/other/path.csv"])

        # Options differ
        with pytest.raises(ValueError, match="the same file-system options"):
            get_fs_token_paths(
                [
                    "ftp://myu...@node.com/test/path.csv",
                    "ftp://otheru...@node.com/other/path.csv",
                ]
            )

        # Unknown type
        with pytest.raises(TypeError):
>           get_fs_token_paths(
                {
                    "sets/are.csv",
                    "unordered/so/they.csv",
                    "should/not/be.csv",
                    "allowed.csv",
                }
            )
E           Failed: DID NOT RAISE <class 'TypeError'>

/usr/lib/python3/dist-packages/dask/bytes/tests/test_local.py:86: Failed
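
A minimal way to poke at just that last case outside pytest (assuming the 
test is ultimately exercising fsspec.core.get_fs_token_paths, which dask 
re-exports):

    from fsspec.core import get_fs_token_paths

    # The test expects a TypeError for an unordered set of paths; with the
    # newer fsspec this apparently returns normally instead of raising.
    print(get_fs_token_paths({"sets/are.csv", "unordered/so/they.csv"}))
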
__ test_time_rolling_methods[window3-std-args6-True] ___


method = 'std', args = (), window = <5 * Seconds>, check_less_precise = {}

    @pytest.mark.parametrize(
        "method,args,check_less_precise", rolling_method_args_check_less_precise
    )
    @pytest.mark.parametrize("window", ["1S", "2S", "3S", pd.offsets.Second(5)])
    def test_time_rolling_methods(method, args, window, check_less_precise):
        if dd._compat.PANDAS_GT_110:
            check_less_precise = {}
        else:
            check_less_precise = {"check_less_precise": check_less_precise}

        # DataFrame
        if method == "apply":
            kwargs = {"raw": False}
        else:
            kwargs = {}
        prolling = ts.rolling(window)
        drolling = dts.rolling(window)
>       assert_eq(
            getattr(prolling, method)(*args, **kwargs),
            getattr(drolling, method)(*args, **kwargs),
            **check_less_precise,
        )

/usr/lib/python3/dist-packages/dask/dataframe/tests/test_rolling.py:288:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
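
For background on the check_less_precise juggling in that test: pandas 1.1 
deprecated that flag on its assert_* helpers in favour of explicit 
rtol/atol, so on pandas >= 1.1 a tolerance would be stated directly, 
roughly like this:

    import pandas as pd

    a = pd.Series([1.0, 2.0])
    b = pd.Series([1.0, 2.0 + 5e-7])

    # pandas < 1.1 style, now deprecated:
    #   pd.testing.assert_series_equal(a, b, check_less_precise=True)
    # pandas >= 1.1 style: pass rtol/atol explicitly instead.
    pd.testing.assert_series_equal(a, b, rtol=1e-4)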

Bug#969648: dask, pandas 1.1

2020-10-19 Thread Stefano Rivera
Hi Rebecca (2020.10.19_12:07:08_-0700)
> > Or maybe not an actual regression...it's a ~5e-7 difference and one of the
> > things the patch does (at around dask/dataframe/tests/test_rolling.py:270)
> > is _tighten_ the tolerance on that test.
> 
> Hrm, I didn't see that failure. Testing again on a 32bit arch to be
> sure...

Aha. Reproduced.

And found https://github.com/dask/dask/pull/6502

SR

-- 
Stefano Rivera
  http://tumbleweed.org.za/
  +1 415 683 3272



Bug#969648: dask, pandas 1.1

2020-10-19 Thread Stefano Rivera
Hi Rebecca (2020.10.19_11:26:19_-0700)

> I have now tested it.  (The dask tests are run in autopkgtest, not during the build.)

Thanks. I took your untested patch and tested it, too.

It needed some tweaking, which it looks like you've also done.

> The attached is what I have so far, but it had these failures.  The first
> two happen with or without 969648.patch and (from debci results) appear to
> be triggered by the new fsspec, but the last is a *regression* caused by
> this patch.

I cherry-picked these to fix those failures:
https://github.com/dask/dask/pull/6331
https://github.com/dask/dask/pull/6446

SR

-- 
Stefano Rivera
  http://tumbleweed.org.za/
  +1 415 683 3272



Bug#969648: dask, pandas 1.1

2020-10-19 Thread Stefano Rivera
Hi Rebecca (2020.10.19_11:51:33_-0700)

> Or maybe not an actual regression...it's a ~5e-7 difference and one of the
> things the patch does (at around dask/dataframe/tests/test_rolling.py:270)
> is _tighten_ the tolerance on that test.

Hrm, I didn't see that failure. Testing again on a 32bit arch to be
sure...

> That looks like my earlier version, which fails with NameError.

Yeah, I applied it as-is first, and then followed up with the fixes,
after seeing the test failures.

SR

-- 
Stefano Rivera
  http://tumbleweed.org.za/
  +1 415 683 3272



Bug#969648: dask, pandas 1.1

2020-10-19 Thread Rebecca N. Palmer
The upstream patch doesn't even apply as-is; this version does, but I 
don't have time right now to actually test it.


There's also a circular dependency problem, as dask indirectly 
build-depends on itself and my new pandas makes it uninstallable.
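
As a sanity check on the first hunk below (hasattr(offset, "_inc") -> 
hasattr(offset, "delta")): with the Tick offset the rolling tests use, the 
attributes behave roughly like this (that _inc disappears on pandas >= 1.1 
is my reading of the patch, not something I have verified on every version):

    import pandas as pd

    off = pd.offsets.Second(5)   # Tick offset, as in the rolling tests
    print(off.delta)             # Timedelta('0 days 00:00:05'); public attribute
    print(hasattr(off, "_inc"))  # private attribute; expected False on pandas >= 1.1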


Description: pandas 1.1 compatibility

Origin: part of upstream f212b76fefeb93298205d7d224cbc1f7ed387ce9
Author: Tom Augspurger, Rebecca Palmer

diff --git a/dask/dataframe/core.py b/dask/dataframe/core.py
index 4a5c6d1f..cedd46fc 100644
--- a/dask/dataframe/core.py
+++ b/dask/dataframe/core.py
@@ -2487,7 +2487,7 @@ Dask Name: {name}, {task} tasks"""
         else:
             is_anchored = offset.isAnchored()

-        include_right = is_anchored or not hasattr(offset, "_inc")
+        include_right = is_anchored or not hasattr(offset, "delta")

         if end == self.npartitions - 1:
             divs = self.divisions
@@ -4106,7 +4106,7 @@ class DataFrame(_Frame):
             left_index=on is None,
             right_index=True,
             left_on=on,
-            suffixes=[lsuffix, rsuffix],
+            suffixes=(lsuffix, rsuffix),
             npartitions=npartitions,
             shuffle=shuffle,
         )
diff --git a/dask/dataframe/tests/test_dataframe.py b/dask/dataframe/tests/test_dataframe.py
index 64c15000..5e4f2bef 100644
--- a/dask/dataframe/tests/test_dataframe.py
+++ b/dask/dataframe/tests/test_dataframe.py
@@ -37,6 +37,9 @@ dsk = {
 meta = make_meta({"a": "i8", "b": "i8"}, index=pd.Index([], "i8"))
 d = dd.DataFrame(dsk, "x", meta, [0, 5, 9, 9])
 full = d.compute()
+CHECK_FREQ = {}
+if dd._compat.PANDAS_GT_110:
+    CHECK_FREQ["check_freq"] = False


 def test_dataframe_doc():
@@ -222,7 +225,18 @@ def test_index_names():
     assert ddf.index.compute().name == "x"


-@pytest.mark.parametrize("npartitions", [1, pytest.param(2, marks=pytest.mark.xfail)])
+@pytest.mark.parametrize(
+    "npartitions",
+    [
+        1,
+        pytest.param(
+            2,
+            marks=pytest.mark.xfail(
+                not dd._compat.PANDAS_GT_110, reason="Fixed upstream."
+            ),
+        ),
+    ],
+)
 def test_timezone_freq(npartitions):
     s_naive = pd.Series(pd.date_range("20130101", periods=10))
     s_aware = pd.Series(pd.date_range("20130101", periods=10, tz="US/Eastern"))

@@ -385,12 +399,48 @@ def test_describe_numeric(method, test_values):
         (None, None, None, ["c", "d", "g"]),  # numeric + bool
         (None, None, None, ["c", "d", "f", "g"]),  # numeric + bool + timedelta
         (None, None, None, ["f", "g"]),  # bool + timedelta
-        ("all", None, None, None),
-        (["number"], None, [0.25, 0.5], None),
-        ([np.timedelta64], None, None, None),
-        (["number", "object"], None, [0.25, 0.75], None),
-        (None, ["number", "object"], None, None),
-        (["object", "datetime", "bool"], None, None, None),
+        pytest.param(
+            "all",
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            ["number"],
+            None,
+            [0.25, 0.5],
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            [np.timedelta64],
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            ["number", "object"],
+            None,
+            [0.25, 0.75],
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            None,
+            ["number", "object"],
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
+        pytest.param(
+            ["object", "datetime", "bool"],
+            None,
+            None,
+            None,
+            marks=pytest.mark.xfail(PANDAS_GT_110, reason="upstream changes"),
+        ),
     ],
 )
 def test_describe(include, exclude, percentiles, subset):
@@ -2522,15 +2572,17 @@ def test_to_timestamp():
     index = pd.period_range(freq="A", start="1/1/2001", end="12/1/2004")
     df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]}, index=index)
     ddf = dd.from_pandas(df, npartitions=3)
-    assert_eq(ddf.to_timestamp(), df.to_timestamp())
+    assert_eq(ddf.to_timestamp(), df.to_timestamp(), **CHECK_FREQ)
     assert_eq(
         ddf.to_timestamp(freq="M", how="s").compute(),
         df.to_timestamp(freq="M", how="s"),
+        **CHECK_FREQ
     )
     assert_eq(ddf.x.to_timestamp(), df.x.to_timestamp())
     assert_eq(
         ddf.x.to_timestamp(freq="M", how="s").compute(),
         df.x.to_timestamp(freq="M", how="s"),
+        **CHECK_FREQ
     )


diff --git a/dask/dataframe/tests/test_extensions.py