Hello community,

here is the log from the commit of package python-dask for openSUSE:Factory checked in at 2019-12-09 21:35:45
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-dask (Old)
 and      /work/SRC/openSUSE:Factory/.python-dask.new.4691 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-dask"

Mon Dec 9 21:35:45 2019 rev:25 rq:755171 version:2.9.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-dask/python-dask.changes	2019-11-30 10:40:48.708147466 +0100
+++ /work/SRC/openSUSE:Factory/.python-dask.new.4691/python-dask.changes	2019-12-09 21:36:03.158078558 +0100
@@ -1,0 +2,29 @@
+Sat Dec 7 19:08:29 UTC 2019 - Arun Persaud <[email protected]>
+
+- update to version 2.9.0:
+  * Array
+    + Fix da.std to work with NumPy arrays (:pr:`5681`) James Bourbeau
+  * Core
+    + Register sizeof functions for Numba and RMM (:pr:`5668`) John A
+      Kirkham
+    + Update meeting time (:pr:`5682`) Tom Augspurger
+  * DataFrame
+    + Modify dd.DataFrame.drop to use shallow copy (:pr:`5675`)
+      Richard J Zamora
+    + Fix bug in _get_md_row_groups (:pr:`5673`) Richard J Zamora
+    + Close sqlalchemy engine after querying DB (:pr:`5629`) Krishan
+      Bhasin
+    + Allow dd.map_partitions to not enforce meta (:pr:`5660`) Matthew
+      Rocklin
+    + Generalize concat_unindexed_dataframes to support cudf-backend
+      (:pr:`5659`) Richard J Zamora
+    + Add dataframe resample methods (:pr:`5636`) Ben Zaitlen
+    + Compute length of dataframe as length of first column
+      (:pr:`5635`) Matthew Rocklin
+  * Documentation
+    + Doc fixup (:pr:`5665`) James Bourbeau
+    + Update doc build instructions (:pr:`5640`) James Bourbeau
+    + Fix ADL link (:pr:`5639`) Ray Bell
+    + Add documentation build (:pr:`5617`) James Bourbeau
+
+-------------------------------------------------------------------

Old:
----
  dask-2.8.1.tar.gz

New:
----
  dask-2.9.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-dask.spec ++++++
--- /var/tmp/diff_new_pack.dDBzzg/_old	2019-12-09 21:36:04.170078159 +0100
+++ /var/tmp/diff_new_pack.dDBzzg/_new	2019-12-09 21:36:04.174078158 +0100
@@ -27,7 +27,7 @@
 %endif
 %define skip_python2 1
 Name: python-dask%{psuffix}
-Version: 2.8.1
+Version: 2.9.0
 Release: 0
 Summary: Minimal task scheduling abstraction
 License: BSD-3-Clause

++++++ dask-2.8.1.tar.gz -> dask-2.9.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/PKG-INFO new/dask-2.9.0/PKG-INFO
--- old/dask-2.8.1/PKG-INFO	2019-11-23 05:31:55.000000000 +0100
+++ new/dask-2.9.0/PKG-INFO	2019-12-06 22:48:02.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dask
-Version: 2.8.1
+Version: 2.9.0
 Summary: Parallel PyData with Task Scheduling
 Home-page: https://github.com/dask/dask/
 Maintainer: Matthew Rocklin
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/_version.py new/dask-2.9.0/dask/_version.py
--- old/dask-2.8.1/dask/_version.py	2019-11-23 05:31:55.000000000 +0100
+++ new/dask-2.9.0/dask/_version.py	2019-12-06 22:48:02.000000000 +0100
@@ -11,8 +11,8 @@
 {
  "dirty": false,
  "error": null,
- "full-revisionid": "eee9b78da60c24897e1df984f01dd9f36245fcb1",
- "version": "2.8.1"
+ "full-revisionid": "5a96ec7c04877487c7c6ae4f9bb1802566f7e36e",
+ "version": "2.9.0"
 }
 ''' # END VERSION_JSON
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/array/reductions.py new/dask-2.9.0/dask/array/reductions.py
--- old/dask-2.8.1/dask/array/reductions.py	2019-11-20 02:10:36.000000000 +0100
+++ new/dask-2.9.0/dask/array/reductions.py	2019-12-06 22:32:26.000000000 +0100
@@ -813,7 +813,8 @@
 @wraps(chunk.std)
 def std(a, axis=None, dtype=None, keepdims=False, ddof=0, split_every=None, out=None):
     result = sqrt(
-
a.var( + var( + a, axis=axis, dtype=dtype, keepdims=keepdims, diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/array/tests/test_reductions.py new/dask-2.9.0/dask/array/tests/test_reductions.py --- old/dask-2.8.1/dask/array/tests/test_reductions.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/array/tests/test_reductions.py 2019-12-06 22:32:26.000000000 +0100 @@ -49,6 +49,9 @@ def reduction_1d_test(da_func, darr, np_func, narr, use_dtype=True, split_every=True): assert_eq(da_func(darr), np_func(narr)) + assert_eq( + da_func(narr), np_func(narr) + ) # Ensure Dask reductions work with NumPy arrays assert_eq(da_func(darr, keepdims=True), np_func(narr, keepdims=True)) assert_eq(da_func(darr, axis=()), np_func(narr, axis=())) assert same_keys(da_func(darr), da_func(darr)) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/core.py new/dask-2.9.0/dask/dataframe/core.py --- old/dask-2.8.1/dask/dataframe/core.py 2019-11-23 05:10:40.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/core.py 2019-12-06 22:32:26.000000000 +0100 @@ -70,6 +70,7 @@ valid_divisions, hash_object_dispatch, check_matching_columns, + drop_by_shallow_copy, ) no_default = "__no_default__" @@ -417,13 +418,19 @@ def index(self): """Return dask Index instance""" return self.map_partitions( - getattr, "index", token=self._name + "-index", meta=self._meta.index + getattr, + "index", + token=self._name + "-index", + meta=self._meta.index, + enforce_metadata=False, ) @index.setter def index(self, value): self.divisions = value.divisions - result = map_partitions(methods.assign_index, self, value) + result = map_partitions( + methods.assign_index, self, value, enforce_metadata=False + ) self.dask = result.dask self._name = result._name self._meta = result._meta @@ -448,7 +455,9 @@ drop : boolean, default False Do not try to insert index into dataframe columns. 
""" - return self.map_partitions(M.reset_index, drop=drop).clear_divisions() + return self.map_partitions( + M.reset_index, drop=drop, enforce_metadata=False + ).clear_divisions() @property def known_divisions(self): @@ -1155,6 +1164,7 @@ limit=limit, axis=axis, meta=meta, + enforce_metadata=False, **kwargs ) @@ -1247,7 +1257,11 @@ @derived_from(pd.DataFrame) def replace(self, to_replace=None, value=None, regex=False): return self.map_partitions( - M.replace, to_replace=to_replace, value=value, regex=regex + M.replace, + to_replace=to_replace, + value=value, + regex=regex, + enforce_metadata=False, ) def to_dask_array(self, lengths=None): @@ -1271,7 +1285,7 @@ ------- """ if lengths is True: - lengths = tuple(self.map_partitions(len).compute()) + lengths = tuple(self.map_partitions(len, enforce_metadata=False).compute()) arr = self.values @@ -1401,7 +1415,9 @@ raise TypeError("periods must be an integer") if axis == 1: - return self.map_partitions(M.diff, token="diff", periods=periods, axis=1) + return self.map_partitions( + M.diff, token="diff", periods=periods, axis=1, enforce_metadata=False + ) before, after = (periods, 0) if periods > 0 else (0, -periods) return self.map_overlap(M.diff, before, after, token="diff", periods=periods) @@ -1414,7 +1430,12 @@ if axis == 1: return self.map_partitions( - M.shift, token="shift", periods=periods, freq=freq, axis=1 + M.shift, + token="shift", + periods=periods, + freq=freq, + axis=1, + enforce_metadata=False, ) if freq is None: @@ -1431,6 +1452,7 @@ periods=periods, freq=freq, meta=meta, + enforce_metadata=False, transform_divisions=False, ) return maybe_shift_divisions(out, periods, freq=freq) @@ -1464,7 +1486,7 @@ def abs(self): _raise_if_object_series(self, "abs") meta = self._meta_nonempty.abs() - return self.map_partitions(M.abs, meta=meta) + return self.map_partitions(M.abs, meta=meta, enforce_metadata=False) @derived_from(pd.DataFrame) def all(self, axis=None, skipna=True, split_every=False, out=None): @@ -1543,6 +1565,7 @@ token=self._token_prefix + fn, skipna=skipna, axis=axis, + enforce_metadata=False, ) else: scalar = not is_series_like(meta) @@ -1575,6 +1598,7 @@ token=self._token_prefix + fn, skipna=skipna, axis=axis, + enforce_metadata=False, ) else: scalar = not is_series_like(meta) @@ -1600,7 +1624,9 @@ token = self._token_prefix + "count" if axis == 1: meta = self._meta_nonempty.count(axis=axis) - return self.map_partitions(M.count, meta=meta, token=token, axis=axis) + return self.map_partitions( + M.count, meta=meta, token=token, axis=axis, enforce_metadata=False + ) else: meta = self._meta_nonempty.count() result = self.reduction( @@ -1627,6 +1653,7 @@ token=self._token_prefix + "mean", axis=axis, skipna=skipna, + enforce_metadata=False, ) return handle_out(out, result) else: @@ -1634,7 +1661,14 @@ s = num.sum(skipna=skipna, split_every=split_every) n = num.count(split_every=split_every) name = self._token_prefix + "mean-%s" % tokenize(self, axis, skipna) - result = map_partitions(methods.mean_aggregate, s, n, token=name, meta=meta) + result = map_partitions( + methods.mean_aggregate, + s, + n, + token=name, + meta=meta, + enforce_metadata=False, + ) if isinstance(self, DataFrame): result.divisions = (min(self.columns), max(self.columns)) return handle_out(out, result) @@ -1655,6 +1689,7 @@ axis=axis, skipna=skipna, ddof=ddof, + enforce_metadata=False, ) return handle_out(out, result) else: @@ -1800,12 +1835,15 @@ axis=axis, skipna=skipna, ddof=ddof, + enforce_metadata=False, ) return handle_out(out, result) else: v = 
self.var(skipna=skipna, ddof=ddof, split_every=split_every) name = self._token_prefix + "std" - result = map_partitions(np.sqrt, v, meta=meta, token=name) + result = map_partitions( + np.sqrt, v, meta=meta, token=name, enforce_metadata=False + ) return handle_out(out, result) @derived_from(pd.DataFrame) @@ -1828,7 +1866,9 @@ v = num.var(skipna=skipna, ddof=ddof, split_every=split_every) n = num.count(split_every=split_every) name = self._token_prefix + "sem" - result = map_partitions(np.sqrt, v / n, meta=meta, token=name) + result = map_partitions( + np.sqrt, v / n, meta=meta, token=name, enforce_metadata=False + ) if isinstance(self, DataFrame): result.divisions = (min(self.columns), max(self.columns)) @@ -1856,7 +1896,13 @@ # Not supported, the result will have current index as columns raise ValueError("'q' must be scalar when axis=1 is specified") return map_partitions( - M.quantile, self, q, axis, token=keyname, meta=(q, "f8") + M.quantile, + self, + q, + axis, + token=keyname, + enforce_metadata=False, + meta=(q, "f8"), ) else: _raise_if_object_series(self, "quantile") @@ -2136,24 +2182,24 @@ def where(self, cond, other=np.nan): # cond and other may be dask instance, # passing map_partitions via keyword will not be aligned - return map_partitions(M.where, self, cond, other) + return map_partitions(M.where, self, cond, other, enforce_metadata=False) @derived_from(pd.DataFrame) def mask(self, cond, other=np.nan): - return map_partitions(M.mask, self, cond, other) + return map_partitions(M.mask, self, cond, other, enforce_metadata=False) @derived_from(pd.DataFrame) def notnull(self): - return self.map_partitions(M.notnull) + return self.map_partitions(M.notnull, enforce_metadata=False) @derived_from(pd.DataFrame) def isnull(self): - return self.map_partitions(M.isnull) + return self.map_partitions(M.isnull, enforce_metadata=False) @derived_from(pd.DataFrame) def isna(self): if hasattr(pd, "isna"): - return self.map_partitions(M.isna) + return self.map_partitions(M.isna, enforce_metadata=False) else: raise NotImplementedError( "Need more recent version of Pandas " @@ -2174,7 +2220,9 @@ # We wrap values in a delayed for two reasons: # - avoid serializing data in every task # - avoid cost of traversal of large list in optimizations - return self.map_partitions(M.isin, delayed(values), meta=meta) + return self.map_partitions( + M.isin, delayed(values), meta=meta, enforce_metadata=False + ) @derived_from(pd.DataFrame) def astype(self, dtype): @@ -2195,7 +2243,9 @@ meta = clear_known_categories(meta, cols=set_unknown) elif is_categorical_dtype(dtype) and getattr(dtype, "categories", None) is None: meta = clear_known_categories(meta) - return self.map_partitions(M.astype, dtype=dtype, meta=meta) + return self.map_partitions( + M.astype, dtype=dtype, meta=meta, enforce_metadata=False + ) @derived_from(pd.Series) def append(self, other, interleave_partitions=False): @@ -2217,7 +2267,12 @@ M.align, self, other, join, axis=axis, fill_value=fill_value ) aligned = self.map_partitions( - M.align, other, join=join, axis=axis, fill_value=fill_value + M.align, + other, + join=join, + axis=axis, + fill_value=fill_value, + enforce_metadata=False, ) token = tokenize(self, other, join, axis, fill_value) @@ -2602,7 +2657,7 @@ res = self if inplace else self.copy() res.name = index else: - res = self.map_partitions(M.rename, index) + res = self.map_partitions(M.rename, index, enforce_metadata=False) if self.known_divisions: if sorted_index and (callable(index) or is_dict_like(index)): old = 
pd.Series(range(self.npartitions + 1), index=self.divisions) @@ -2704,7 +2759,7 @@ @derived_from(pd.Series, version="0.25.0") def explode(self): meta = self._meta.explode() - return self.map_partitions(M.explode, meta=meta) + return self.map_partitions(M.explode, meta=meta, enforce_metadata=False) def unique(self, split_every=None, split_out=1): """ @@ -2802,7 +2857,7 @@ @derived_from(pd.Series) def dropna(self): - return self.map_partitions(M.dropna) + return self.map_partitions(M.dropna, enforce_metadata=False) @derived_from(pd.Series) def between(self, left, right, inclusive=True): @@ -2815,15 +2870,21 @@ if out is not None: raise ValueError("'out' must be None") # np.clip may pass out - return self.map_partitions(M.clip, lower=lower, upper=upper) + return self.map_partitions( + M.clip, lower=lower, upper=upper, enforce_metadata=False + ) @derived_from(pd.Series) def clip_lower(self, threshold): - return self.map_partitions(M.clip_lower, threshold=threshold) + return self.map_partitions( + M.clip_lower, threshold=threshold, enforce_metadata=False + ) @derived_from(pd.Series) def clip_upper(self, threshold): - return self.map_partitions(M.clip_upper, threshold=threshold) + return self.map_partitions( + M.clip_upper, threshold=threshold, enforce_metadata=False + ) @derived_from(pd.Series) def align(self, other, join="outer", axis=None, fill_value=None): @@ -2992,7 +3053,9 @@ @derived_from(pd.Series) def memory_usage(self, index=True, deep=False): - result = self.map_partitions(M.memory_usage, index=index, deep=deep) + result = self.map_partitions( + M.memory_usage, index=index, deep=deep, enforce_metadata=False + ) return delayed(sum)(result.to_delayed()) def __divmod__(self, other): @@ -3222,6 +3285,14 @@ return _iLocIndexer(self) + def __len__(self): + try: + s = self[self.columns[0]] + except IndexError: + return super().__len__() + else: + return len(s) + def __getitem__(self, key): name = "getitem-%s" % tokenize(self, key) if np.isscalar(key) or isinstance(key, (tuple, str)): @@ -3582,21 +3653,29 @@ @derived_from(pd.DataFrame) def dropna(self, how="any", subset=None, thresh=None): - return self.map_partitions(M.dropna, how=how, subset=subset, thresh=thresh) + return self.map_partitions( + M.dropna, how=how, subset=subset, thresh=thresh, enforce_metadata=False + ) @derived_from(pd.DataFrame) def clip(self, lower=None, upper=None, out=None): if out is not None: raise ValueError("'out' must be None") - return self.map_partitions(M.clip, lower=lower, upper=upper) + return self.map_partitions( + M.clip, lower=lower, upper=upper, enforce_metadata=False + ) @derived_from(pd.DataFrame) def clip_lower(self, threshold): - return self.map_partitions(M.clip_lower, threshold=threshold) + return self.map_partitions( + M.clip_lower, threshold=threshold, enforce_metadata=False + ) @derived_from(pd.DataFrame) def clip_upper(self, threshold): - return self.map_partitions(M.clip_upper, threshold=threshold) + return self.map_partitions( + M.clip_upper, threshold=threshold, enforce_metadata=False + ) @derived_from(pd.DataFrame) def squeeze(self, axis=None): @@ -3623,7 +3702,7 @@ @derived_from(pd.DataFrame, version="0.25.0") def explode(self, column): meta = self._meta.explode(column) - return self.map_partitions(M.explode, column, meta=meta) + return self.map_partitions(M.explode, column, meta=meta, enforce_metadata=False) def to_bag(self, index=False): """Convert to a dask Bag of tuples of each row. 
@@ -3672,7 +3751,7 @@ axis = self._validate_axis(axis) if (axis == 1) or (columns is not None): return self.map_partitions( - M.drop, labels=labels, axis=axis, columns=columns, errors=errors + drop_by_shallow_copy, columns or labels, errors=errors ) raise NotImplementedError( "Drop currently only works for axis=1 or when columns is not None" @@ -3876,11 +3955,18 @@ meta=meta, axis=axis, fill_value=fill_value, + enforce_metadata=False, ) meta = _emulate(op, self, other, axis=axis, fill_value=fill_value) return map_partitions( - op, self, other, meta=meta, axis=axis, fill_value=fill_value + op, + self, + other, + meta=meta, + axis=axis, + fill_value=fill_value, + enforce_metadata=False, ) meth.__doc__ = skip_doctest(op.__doc__) @@ -4742,7 +4828,14 @@ @insert_meta_param_description -def map_partitions(func, *args, **kwargs): +def map_partitions( + func, + *args, + meta=no_default, + enforce_metadata=True, + transform_divisions=True, + **kwargs +): """ Apply Python function on each DataFrame partition. Parameters @@ -4755,11 +4848,13 @@ ``Scalar``, ``Delayed`` or regular python objects. DataFrame-like args (both dask and pandas) will be repartitioned to align (if necessary) before applying the function. + enforce_metadata : bool + Whether or not to enforce the structure of the metadata at runtime. + This will rename and reorder columns for each partition, + and will raise an error if this doesn't work or types don't match. $META """ - meta = kwargs.pop("meta", no_default) name = kwargs.pop("token", None) - transform_divisions = kwargs.pop("transform_divisions", True) assert callable(func) if name is not None: @@ -4813,21 +4908,33 @@ args2.append(arg) kwargs3 = {} + simple = True for k, v in kwargs.items(): v = normalize_arg(v) v, collections = unpack_collections(v) dependencies.extend(collections) kwargs3[k] = v + if collections: + simple = False - dsk = partitionwise_graph( - apply_and_enforce, - name, - *args2, - dependencies=dependencies, - _func=func, - _meta=meta, - **kwargs3 - ) + if enforce_metadata: + dsk = partitionwise_graph( + apply_and_enforce, + name, + *args2, + dependencies=dependencies, + _func=func, + _meta=meta, + **kwargs3 + ) + elif not simple: + dsk = partitionwise_graph( + apply, name, func, *args2, **kwargs3, dependencies=dependencies + ) + else: + dsk = partitionwise_graph( + func, name, *args2, **kwargs, dependencies=dependencies + ) divisions = dfs[0].divisions if transform_divisions and isinstance(dfs[0], Index) and len(dfs) == 1: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/io/parquet/arrow.py new/dask-2.9.0/dask/dataframe/io/parquet/arrow.py --- old/dask-2.8.1/dask/dataframe/io/parquet/arrow.py 2019-11-23 05:12:29.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/io/parquet/arrow.py 2019-12-06 22:32:26.000000000 +0100 @@ -31,7 +31,7 @@ row_group = piece.get_metadata().row_group(rg) for c in range(row_group.num_columns): if not row_group.column(c).statistics: - return [] + return (None, None) row_groups.append(row_group) row_groups_per_piece.append(num_row_groups) if len(row_groups) == len(pieces): diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/io/sql.py new/dask-2.9.0/dask/dataframe/io/sql.py --- old/dask-2.8.1/dask/dataframe/io/sql.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/io/sql.py 2019-12-06 22:32:26.000000000 +0100 @@ -195,6 +195,8 @@ ) ) + engine.dispose() + return from_delayed(parts, 
meta, divisions=divisions) @@ -202,8 +204,9 @@ import sqlalchemy as sa engine_kwargs = engine_kwargs or {} - conn = sa.create_engine(uri, **engine_kwargs) - df = pd.read_sql(q, conn, **kwargs) + engine = sa.create_engine(uri, **engine_kwargs) + df = pd.read_sql(q, engine, **kwargs) + engine.dispose() if df.empty: return meta else: diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/multi.py new/dask-2.9.0/dask/dataframe/multi.py --- old/dask-2.8.1/dask/dataframe/multi.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/multi.py 2019-12-06 22:23:39.000000000 +0100 @@ -862,7 +862,7 @@ def concat_and_check(dfs): if len(set(map(len, dfs))) != 1: raise ValueError("Concatenated DataFrames of different lengths") - return pd.concat(dfs, axis=1) + return methods.concat(dfs, axis=1) def concat_unindexed_dataframes(dfs): @@ -873,7 +873,7 @@ for i in range(dfs[0].npartitions) } - meta = pd.concat([df._meta for df in dfs], axis=1) + meta = methods.concat([df._meta for df in dfs], axis=1) graph = HighLevelGraph.from_collections(name, dsk, dependencies=dfs) return new_dd_object(graph, name, meta, dfs[0].divisions) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/tests/test_dataframe.py new/dask-2.9.0/dask/dataframe/tests/test_dataframe.py --- old/dask-2.8.1/dask/dataframe/tests/test_dataframe.py 2019-11-22 04:35:27.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/tests/test_dataframe.py 2019-12-06 22:32:26.000000000 +0100 @@ -259,6 +259,8 @@ assert ds.name == "z" assert_eq(ds, s) + +def test_rename_series_method_2(): # Series index s = pd.Series(["a", "b", "c", "d", "e", "f", "g"], name="x") ds = dd.from_pandas(s, 2) @@ -1018,6 +1020,8 @@ def test_len(): assert len(d) == len(full) assert len(d.a) == len(full.a) + assert len(dd.from_pandas(pd.DataFrame(), npartitions=1)) == 0 + assert len(dd.from_pandas(pd.DataFrame(columns=[1, 2]), npartitions=1)) == 0 def test_size(): @@ -4111,3 +4115,13 @@ assert s.name == "y" assert ddf.columns == ["x"] assert_eq(ddf, df[["x"]]) + + +def test_simple_map_partitions(): + data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]} + df = pd.DataFrame(data) + ddf = dd.from_pandas(df, npartitions=2) + ddf = ddf.clip(-4, 6) + task = ddf.__dask_graph__()[ddf.__dask_keys__()[0]] + [v] = task[0].dsk.values() + assert v[0] == M.clip or v[1] == M.clip diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/tests/test_multi.py new/dask-2.9.0/dask/dataframe/tests/test_multi.py --- old/dask-2.8.1/dask/dataframe/tests/test_multi.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/tests/test_multi.py 2019-12-06 22:23:39.000000000 +0100 @@ -1997,3 +1997,51 @@ dd.multi.warn_dtype_mismatch(df1, df2, "a", "a") assert len(r) == 0 + + [email protected]("engine", ["pandas", "cudf"]) +def test_groupby_concat_cudf(engine): + + # NOTE: Issue #5643 Reproducer + + size = 6 + npartitions = 3 + d1 = pd.DataFrame( + { + "a": np.random.permutation(np.arange(size)), + "b": np.random.randint(100, size=size), + } + ) + d2 = pd.DataFrame( + { + "c": np.random.permutation(np.arange(size)), + "d": np.random.randint(100, size=size), + } + ) + + if engine == "cudf": + # NOTE: engine == "cudf" requires cudf/dask_cudf, + # will be skipped by non-GPU CI. 
+ + cudf = pytest.importorskip("cudf") + dask_cudf = pytest.importorskip("dask_cudf") + + d1 = cudf.from_pandas(d1) + d2 = cudf.from_pandas(d2) + dd1 = dask_cudf.from_cudf(d1, npartitions) + dd2 = dask_cudf.from_cudf(d2, npartitions) + else: + dd1 = dd.from_pandas(d1, npartitions) + dd2 = dd.from_pandas(d2, npartitions) + + grouped_d1 = d1.groupby(["a"]).sum() + grouped_d2 = d2.groupby(["c"]).sum() + res = concat([grouped_d1, grouped_d2], axis=1) + + grouped_dd1 = dd1.groupby(["a"]).sum() + grouped_dd2 = dd2.groupby(["c"]).sum() + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + res_dd = dd.concat([grouped_dd1, grouped_dd2], axis=1) + + assert_eq(res_dd.compute().sort_index(), res.sort_index()) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/tseries/resample.py new/dask-2.9.0/dask/dataframe/tseries/resample.py --- old/dask-2.8.1/dask/dataframe/tseries/resample.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/tseries/resample.py 2019-12-06 22:23:39.000000000 +0100 @@ -213,6 +213,10 @@ return self._agg("max") @derived_from(pd_Resampler) + def nunique(self): + return self._agg("nunique") + + @derived_from(pd_Resampler) def ohlc(self): return self._agg("ohlc") @@ -229,9 +233,17 @@ return self._agg("std") @derived_from(pd_Resampler) + def size(self): + return self._agg("size") + + @derived_from(pd_Resampler) def sum(self): return self._agg("sum") @derived_from(pd_Resampler) def var(self): return self._agg("var") + + @derived_from(pd_Resampler) + def quantile(self): + return self._agg("quantile") diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/tseries/tests/test_resample.py new/dask-2.9.0/dask/dataframe/tseries/tests/test_resample.py --- old/dask-2.8.1/dask/dataframe/tseries/tests/test_resample.py 2019-06-25 19:26:25.000000000 +0200 +++ new/dask-2.9.0/dask/dataframe/tseries/tests/test_resample.py 2019-12-06 22:23:39.000000000 +0100 @@ -3,7 +3,7 @@ import pandas as pd import pytest -from dask.dataframe.utils import assert_eq +from dask.dataframe.utils import assert_eq, PANDAS_VERSION import dask.dataframe as dd @@ -36,6 +36,7 @@ result = resample(ds, freq, how=method, closed=closed, label=label) expected = resample(ps, freq, how=method, closed=closed, label=label) + assert_eq(result, expected, check_dtype=False) divisions = result.divisions @@ -100,3 +101,18 @@ ddf = dd.from_pandas(df, npartitions=4) assert ddf.resample("D").mean().head().index.name == "date" + + [email protected](PANDAS_VERSION <= "0.23.4", reason="quantile not in 0.23") [email protected]("agg", ["nunique", "mean", "count", "size", "quantile"]) +def test_common_aggs(agg): + index = pd.date_range("2000-01-01", "2000-02-15", freq="h") + ps = pd.Series(range(len(index)), index=index) + ds = dd.from_pandas(ps, npartitions=2) + + f = lambda df: getattr(df, agg)() + + res = f(ps.resample("1d")) + expected = f(ds.resample("1d")) + + assert_eq(res, expected, check_dtype=False) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/dataframe/utils.py new/dask-2.9.0/dask/dataframe/utils.py --- old/dask-2.8.1/dask/dataframe/utils.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/dataframe/utils.py 2019-12-06 22:32:26.000000000 +0100 @@ -952,3 +952,13 @@ return False return True + + +def drop_by_shallow_copy(df, columns, errors="raise"): + """ Use shallow copy to drop columns in 
place + """ + df2 = df.copy(deep=False) + if not pd.api.types.is_list_like(columns): + columns = [columns] + df2.drop(columns=columns, inplace=True, errors=errors) + return df2 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask/sizeof.py new/dask-2.9.0/dask/sizeof.py --- old/dask-2.8.1/dask/sizeof.py 2019-11-20 02:10:36.000000000 +0100 +++ new/dask-2.9.0/dask/sizeof.py 2019-12-06 22:32:26.000000000 +0100 @@ -45,6 +45,27 @@ return int(x.nbytes) [email protected]_lazy("numba") +def register_numba(): + import numba.cuda + + @sizeof.register(numba.cuda.cudadrv.devicearray.DeviceNDArray) + def sizeof_numba_devicendarray(x): + return int(x.nbytes) + + [email protected]_lazy("rmm") +def register_rmm(): + import rmm + + # Only included in 0.11.0+ + if hasattr(rmm, "DeviceBuffer"): + + @sizeof.register(rmm.DeviceBuffer) + def sizeof_rmm_devicebuffer(x): + return int(x.nbytes) + + @sizeof.register_lazy("numpy") def register_numpy(): import numpy as np diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/dask.egg-info/PKG-INFO new/dask-2.9.0/dask.egg-info/PKG-INFO --- old/dask-2.8.1/dask.egg-info/PKG-INFO 2019-11-23 05:31:54.000000000 +0100 +++ new/dask-2.9.0/dask.egg-info/PKG-INFO 2019-12-06 22:48:01.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: dask -Version: 2.8.1 +Version: 2.9.0 Summary: Parallel PyData with Task Scheduling Home-page: https://github.com/dask/dask/ Maintainer: Matthew Rocklin diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/docs/source/changelog.rst new/dask-2.9.0/docs/source/changelog.rst --- old/dask-2.8.1/docs/source/changelog.rst 2019-11-23 05:27:27.000000000 +0100 +++ new/dask-2.9.0/docs/source/changelog.rst 2019-12-06 22:45:08.000000000 +0100 @@ -1,6 +1,36 @@ Changelog ========= +2.9.0 / 2019-12-06 +------------------ + +Array ++++++ +- Fix ``da.std`` to work with NumPy arrays (:pr:`5681`) `James Bourbeau`_ + +Core +++++ +- Register ``sizeof`` functions for Numba and RMM (:pr:`5668`) `John A Kirkham`_ +- Update meeting time (:pr:`5682`) `Tom Augspurger`_ + +DataFrame ++++++++++ +- Modify ``dd.DataFrame.drop`` to use shallow copy (:pr:`5675`) `Richard J Zamora`_ +- Fix bug in ``_get_md_row_groups`` (:pr:`5673`) `Richard J Zamora`_ +- Close sqlalchemy engine after querying DB (:pr:`5629`) `Krishan Bhasin`_ +- Allow ``dd.map_partitions`` to not enforce meta (:pr:`5660`) `Matthew Rocklin`_ +- Generalize ``concat_unindexed_dataframes`` to support cudf-backend (:pr:`5659`) `Richard J Zamora`_ +- Add dataframe resample methods (:pr:`5636`) `Ben Zaitlen`_ +- Compute length of dataframe as length of first column (:pr:`5635`) `Matthew Rocklin`_ + +Documentation ++++++++++++++ +- Doc fixup (:pr:`5665`) `James Bourbeau`_ +- Update doc build instructions (:pr:`5640`) `James Bourbeau`_ +- Fix ADL link (:pr:`5639`) `Ray Bell`_ +- Add documentation build (:pr:`5617`) `James Bourbeau`_ + + 2.8.1 / 2019-11-22 ------------------ @@ -2730,3 +2760,4 @@ .. _`Gina Helfrich`: https://github.com/Dr-G .. _`ossdev07`: https://github.com/ossdev07 .. _`Nuno Gomes Silva`: https://github.com/mgsnuno +.. 
_`Ray Bell`: https://github.com/raybellwaves
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/docs/source/scheduling.rst new/dask-2.9.0/docs/source/scheduling.rst
--- old/dask-2.8.1/docs/source/scheduling.rst	2019-06-23 05:15:12.000000000 +0200
+++ new/dask-2.9.0/docs/source/scheduling.rst	2019-12-06 22:32:26.000000000 +0100
@@ -47,7 +47,7 @@
 it incurs no costs to transfer data between tasks.
 However, due to Python's Global Interpreter Lock (GIL), this scheduler
 only provides parallelism when your computation is dominated by non-Python code,
-such as is the case when operating on numeric data in NumPy arrays, Pandas DataFrames,
+as is primarily the case when operating on numeric data in NumPy arrays, Pandas DataFrames,
 or using any of the other C/C++/Cython based projects in the ecosystem.
 
 The threaded scheduler is the default choice for
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/docs/source/setup/cloud.rst new/dask-2.9.0/docs/source/setup/cloud.rst
--- old/dask-2.8.1/docs/source/setup/cloud.rst	2019-11-20 02:10:36.000000000 +0100
+++ new/dask-2.9.0/docs/source/setup/cloud.rst	2019-12-06 22:23:39.000000000 +0100
@@ -26,7 +26,7 @@
 
 - `s3fs <https://s3fs.readthedocs.io/>`_ for Amazon's S3
 - `gcsfs <https://gcsfs.readthedocs.io/>`_ for Google's GCS
-- `adlfs <https://azure-datalake-store.readthedocs.io/>`_ for Microsoft's ADL
+- `adlfs <https://github.com/dask/adlfs/>`_ for Microsoft's ADL
 
 Historical Libraries
 --------------------
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-2.8.1/docs/source/support.rst new/dask-2.9.0/docs/source/support.rst
--- old/dask-2.8.1/docs/source/support.rst	2019-11-20 02:10:36.000000000 +0100
+++ new/dask-2.9.0/docs/source/support.rst	2019-12-06 22:32:26.000000000 +0100
@@ -22,13 +22,17 @@
    questions and bug reports on gitter and instead ask people to use Stack
    Overflow or GitHub.
 4. **Monthly developer meeting** happens the first Thursday of the month at
-   11:00 US Central Time in `this video meeting <https://zoom.us/j/802251830>`_.
+   10:00 US Central Time in `this video meeting <https://zoom.us/j/802251830>`_.
    Meeting notes are available at
    https://docs.google.com/document/d/1UqNAP87a56ERH_xkQsS5Q_0PKYybd5Lj2WANy_hRzI0/edit
 
+   .. raw:: html
+
+      <iframe src="https://calendar.google.com/calendar/embed?src=4l0vts0c1cgdbq5jhcogj55sfs%40group.calendar.google.com" style="border: 0" width="800" height="600" frameborder="0" scrolling="no"></iframe>
+
 You can subscribe to this calendar to be notified of changes:
 
- * `Google Calendar <https://calendar.google.com/calendar/embed?src=4l0vts0c1cgdbq5jhcogj55sfs%40group.calendar.google.com&ctz=America%2FChicago>`__
+ * `Google Calendar <https://calendar.google.com/calendar/embed?src=4l0vts0c1cgdbq5jhcogj55sfs%40group.calendar.google.com>`__
  * `iCal <https://calendar.google.com/calendar/ical/4l0vts0c1cgdbq5jhcogj55sfs%40group.calendar.google.com/public/basic.ics>`__
 
 .. _`Stack Overflow with the #dask tag`: https://stackoverflow.com/questions/tagged/dask
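
Usage sketches for selected changes:
------------------------------------
For the da.std fix (:pr:`5681`): dask array reductions now accept plain NumPy
arrays, where da.std previously failed because it called the .var() method of
its input (see the reductions.py hunk above). A minimal sketch, assuming
dask >= 2.9.0; the random data is illustrative only:

    import numpy as np
    import dask.array as da

    x = np.random.random(1000)   # a plain NumPy array, not a dask array
    result = da.std(x)           # failed with a plain NumPy input before this fix
    # float() works whether the reduction returns a dask scalar or a NumPy scalar
    print(float(result), float(np.std(x)))  # the two values agree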
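
The sizeof registrations for Numba device arrays and RMM device buffers
(:pr:`5668`) hook into dask's sizeof dispatch. The same mechanism can be
exercised without a GPU by registering a handler for a custom type;
MyDeviceBuffer below is a made-up stand-in, not part of dask:

    from dask.sizeof import sizeof

    class MyDeviceBuffer:
        """Stand-in for an object that reports its size via an nbytes attribute."""
        def __init__(self, nbytes):
            self.nbytes = nbytes

    @sizeof.register(MyDeviceBuffer)
    def sizeof_my_device_buffer(x):
        # Mirrors the new handlers: report the raw byte count of the buffer
        return int(x.nbytes)

    print(sizeof(MyDeviceBuffer(1024)))  # 1024, via the registered handler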
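
dd.DataFrame.drop (:pr:`5675`) now routes column drops through the new
drop_by_shallow_copy helper (see the dask/dataframe/utils.py hunk above),
which avoids deep-copying partition data. The pattern itself is plain pandas;
the toy frame below is illustrative:

    import pandas as pd

    def drop_by_shallow_copy(df, columns, errors="raise"):
        """Drop columns without deep-copying the underlying data."""
        df2 = df.copy(deep=False)  # new frame object; column data is shared, not copied
        if not pd.api.types.is_list_like(columns):
            columns = [columns]
        df2.drop(columns=columns, inplace=True, errors=errors)
        return df2

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    print(drop_by_shallow_copy(pdf, "b").columns.tolist())  # ['a']
    print(pdf.columns.tolist())                             # ['a', 'b']; original untouched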
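
The read_sql change (:pr:`5629`) disposes of the SQLAlchemy engine once a
query has been issued, so pooled connections are not left open. A sketch of
the same pattern outside dask; the in-memory SQLite URI and the trivial query
are illustrative:

    import pandas as pd
    import sqlalchemy as sa

    engine = sa.create_engine("sqlite://")                   # illustrative in-memory DB
    df = pd.read_sql(sa.text("SELECT 1 AS answer"), engine)  # run the query
    engine.dispose()                                         # then release pooled connections
    print(df)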
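
dd.map_partitions gained an enforce_metadata keyword (:pr:`5660`). With
enforce_metadata=False the per-partition result is used as-is: columns are not
renamed or reordered to match the declared meta and dtypes are not checked at
runtime, which skips the apply_and_enforce wrapper seen in the core.py hunk.
A usage sketch, assuming dask >= 2.9.0; the frame and the lambda are
illustrative:

    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"x": range(10), "y": range(10)})
    ddf = dd.from_pandas(pdf, npartitions=2)

    # Trust the function's output instead of coercing it to match the meta
    shifted = ddf.map_partitions(lambda part: part + 1, enforce_metadata=False)
    print(shifted.compute())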
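
The new resample aggregations nunique, size and quantile (:pr:`5636`) reuse
the existing _agg machinery in tseries/resample.py. A usage sketch with
made-up hourly data, assuming dask >= 2.9.0 and a pandas version that
supports Resampler.quantile:

    import pandas as pd
    import dask.dataframe as dd

    index = pd.date_range("2000-01-01", "2000-01-15", freq="h")
    ps = pd.Series(range(len(index)), index=index)
    ds = dd.from_pandas(ps, npartitions=2)

    print(ds.resample("1d").size().compute())      # observations per day
    print(ds.resample("1d").nunique().compute())   # distinct values per day
    print(ds.resample("1d").quantile().compute())  # per-day quantile (default q=0.5)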
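
With the length change (:pr:`5635`), len() of a dask DataFrame is computed
from its first column only instead of materializing every column, falling
back to the previous behaviour for a frame with no columns. A small sketch
with toy data, assuming dask >= 2.9.0:

    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"a": range(1000), "b": range(1000)})
    ddf = dd.from_pandas(pdf, npartitions=4)

    print(len(ddf))  # 1000, computed by counting only column "a"
    print(len(dd.from_pandas(pd.DataFrame(), npartitions=1)))  # 0 for an empty frame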
