Source: dask
Version: 2022.02.0+dfsg-1
Severity: normal
Control: forwarded -1 https://github.com/dask/dask/issues/8620
dask 2022.02.0 is failing two CI tests on 32-bit arches (armhf, i386):
one in test_query_with_meta, the other in test_categorize_info.
The test_query_with_meta error is reported upstream at
https://github.com/dask/dask/issues/8620
The test_categorize_info error was dealt with upstream by your patch,
applied in https://github.com/dask/dask/pull/8851, which should be
included in the 2022.04.0 release.
Since the pyarrow dependency is getting in the way of upgrading
to the more recent dask releases, as noted in Bug#1013080, should we pull
the PR#8851 patch into debian/patches to fix test_categorize_info?
_ test_query_with_meta _
db = 'sqlite:tmp/tmp61ugakdn.'
def test_query_with_meta(db):
from sqlalchemy import sql
data = {
"name": pd.Series([], name="name", dtype="str"),
"age": pd.Series([], name="age", dtype="int"),
}
index = pd.Index([], name="number", dtype="int")
meta = pd.DataFrame(data, index=index)
s1 = sql.select(
[sql.column("number"), sql.column("name"), sql.column("age")]
).select_from(sql.table("test"))
out = read_sql_query(s1, db, npartitions=2, index_col="number",
meta=meta)
# Don't check dtype for windows https://github.com/dask/dask/issues/8620
> assert_eq(out, df[["name", "age"]], check_dtype=sys.platform != "win32")
/usr/lib/python3/dist-packages/dask/dataframe/io/tests/test_sql.py:443:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
a =name age
number
0 Alice 33
1 Bob 40
2 Chris 22
3 Dora 16
4 Edith 53
5 Francis 30
6 Garreth 20
b =name age
number
0 Alice 33
1 Bob 40
2 Chris 22
3 Dora 16
4 Edith 53
5 Francis 30
6 Garreth 20
check_names = True, check_dtype = True, check_divisions = True
check_index = True, scheduler = 'sync', kwargs = {}
def assert_eq(
a,
b,
check_names=True,
check_dtype=True,
check_divisions=True,
check_index=True,
scheduler="sync",
**kwargs,
):
if check_divisions:
assert_divisions(a, scheduler=scheduler)
assert_divisions(b, scheduler=scheduler)
if hasattr(a, "divisions") and hasattr(b, "divisions"):
at = type(np.asarray(a.divisions).tolist()[0]) # numpy to python
bt = type(np.asarray(b.divisions).tolist()[0]) # scalar conversion
assert at == bt, (at, bt)
assert_sane_keynames(a)
assert_sane_keynames(b)
a = _check_dask(
a, check_names=check_names, check_dtypes=check_dtype,
scheduler=scheduler
)
b = _check_dask(
b, check_names=check_names, check_dtypes=check_dtype,
scheduler=scheduler
)
if hasattr(a, "to_pandas"):
a = a.to_pandas()
if hasattr(b, "to_pandas"):
b = b.to_pandas()
if isinstance(a, (pd.DataFrame, pd.Series)):
a = _maybe_sort(a, check_index)
b = _maybe_sort(b, check_index)
if not check_index:
a = a.reset_index(drop=True)
b = b.reset_index(drop=True)
if isinstance(a, pd.DataFrame):
> tm.assert_frame_equal(
a, b, check_names=check_names, check_dtype=check_dtype, **kwargs
E AssertionError: Attributes of DataFrame.iloc[:, 1] (column
name="age") are different
E
E Attribute "dtype" are different
E [left]: int32
E [right]: int64
/usr/lib/python3/dist-packages/dask/dataframe/utils.py:562: AssertionError
_ test_categorize_info _
@pytest.mark.skipif(not PANDAS_GT_120, reason="need newer version of
Pandas")
def test_categorize_info():
# assert that we can call info after categorize
# workaround for: https://github.com/pydata/pandas/issues/14368
from io import StringIO
pandas_format._put_lines = put_lines
df = pd.DataFrame(
{"x": [1, 2, 3, 4], "y": pd.Series(list("aabc")), "z":
pd.Series(list("aabc"))},
index=[0, 1, 2, 3],
)
ddf = dd.from_pandas(df, npartitions=4).categorize(["y"])
# Verbose=False
buf = StringIO()
ddf.info(buf=buf, verbose=True)
expected = (
"\n"
"Int64Index: 4 entries, 0 to 3\n"
"Data columns (total 3 columns):\n"
" # Column Non-Null Count Dtype\n"
"--- -- -- -\n"
" 0 x 4