Hello community,

here is the log from the commit of package python-dask for openSUSE:Factory checked in at 2018-11-26 10:29:33
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-dask (Old)
 and      /work/SRC/openSUSE:Factory/.python-dask.new.19453 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-dask"

Mon Nov 26 10:29:33 2018 rev:13 rq:651331 version:0.20.2

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-dask/python-dask.changes 2018-11-12 09:46:50.128699242 +0100
+++ /work/SRC/openSUSE:Factory/.python-dask.new.19453/python-dask.changes 2018-11-26 10:29:46.425065452 +0100
@@ -1,0 +2,19 @@
+Thu Nov 22 22:46:17 UTC 2018 - Arun Persaud <[email protected]>
+
+- update to version 0.20.2:
+  * Array
+    + Avoid fusing dependencies of atop reductions (:pr:`4207`)
+      Matthew Rocklin
+  * Dataframe
+    + Improve memory footprint for dataframe correlation (:pr:`4193`)
+      Damien Garaud
+    + Add empty DataFrame check to boundary_slice (:pr:`4212`) James
+      Bourbeau
+  * Documentation
+    + Copy edit documentation (:pr:`4197`) (:pr:`4204`) (:pr:`4198`)
+      (:pr:`4199`) (:pr:`4200`) (:pr:`4202`) (:pr:`4209`) Miguel
+      Farrajota
+    + Add stats module namespace (:pr:`4206`) James Bourbeau
+    + Fix link in dataframe documentation (:pr:`4208`) James Bourbeau
+
+-------------------------------------------------------------------

Old:
----
  dask-0.20.1.tar.gz

New:
----
  dask-0.20.2.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-dask.spec ++++++
--- /var/tmp/diff_new_pack.PYvgUv/_old 2018-11-26 10:29:47.025064749 +0100
+++ /var/tmp/diff_new_pack.PYvgUv/_new 2018-11-26 10:29:47.025064749 +0100
@@ -22,7 +22,7 @@
 # python(2/3)-distributed has a dependency loop with python(2/3)-dask
 %bcond_with test_distributed
 Name: python-dask
-Version: 0.20.1
+Version: 0.20.2
 Release: 0
 Summary: Minimal task scheduling abstraction
 License: BSD-3-Clause

++++++ dask-0.20.1.tar.gz -> dask-0.20.2.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/PKG-INFO new/dask-0.20.2/PKG-INFO
--- old/dask-0.20.1/PKG-INFO 2018-11-09 19:45:38.000000000 +0100
+++ new/dask-0.20.2/PKG-INFO 2018-11-15 15:10:02.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dask
-Version: 0.20.1
+Version: 0.20.2
 Summary: Parallel PyData with Task Scheduling
 Home-page: http://github.com/dask/dask/
 Maintainer: Matthew Rocklin
@@ -44,9 +44,9 @@
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
-Provides-Extra: complete
-Provides-Extra: distributed
 Provides-Extra: bag
-Provides-Extra: array
 Provides-Extra: delayed
+Provides-Extra: complete
+Provides-Extra: array
+Provides-Extra: distributed
 Provides-Extra: dataframe
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/_version.py new/dask-0.20.2/dask/_version.py
--- old/dask-0.20.1/dask/_version.py 2018-11-09 19:45:38.000000000 +0100
+++ new/dask-0.20.2/dask/_version.py 2018-11-15 15:10:02.000000000 +0100
@@ -11,8 +11,8 @@
 {
  "dirty": false,
  "error": null,
- "full-revisionid": "df4c6dc05461c77d568e747dfceb5043c065fb4d",
- "version": "0.20.1"
+ "full-revisionid": "f74a9e9f8a73b6f9d1e15c54f40f2c83ea801657",
+ "version": "0.20.2"
 }
 '''  # END VERSION_JSON
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/array/tests/test_atop.py new/dask-0.20.2/dask/array/tests/test_atop.py
--- old/dask-0.20.1/dask/array/tests/test_atop.py 2018-11-09 19:21:51.000000000 +0100
+++ new/dask-0.20.2/dask/array/tests/test_atop.py 2018-11-15 14:53:58.000000000 +0100
@@ -269,7 +269,7 @@
     assert z.chunks == ((7,), (7,), (2, 2, 1))
     assert_eq(z,
np.ones((7, 7, 5))) - w = atop(lambda x: x[:, 0, 0], 'a', z, 'abc', dtype=x.dtype, concatenate=concatenate) + w = atop(lambda x: x[:, 0, 0], 'a', z, 'abc', dtype=x.dtype, concatenate=True) assert w.chunks == ((7,),) assert_eq(w, np.ones((7,))) @@ -413,3 +413,16 @@ D = da.Array(array_dsk, name, chunks, dtype=A.dtype) D.sum(axis=0).compute() + + +def test_dont_merge_before_reductions(): + x = da.ones(10, chunks=(5,)) + y = da.atop(inc, 'i', x, 'i', dtype=x.dtype) + z = da.atop(sum, '', y, 'i', dtype=y.dtype) + w = da.atop(sum, '', z, '', dtype=y.dtype) + + dsk = optimize_atop(w.dask) + + assert len([d for d in dsk.dicts.values() if isinstance(d, TOP)]) == 2 + + z.compute() diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/array/top.py new/dask-0.20.2/dask/array/top.py --- old/dask-0.20.1/dask/array/top.py 2018-11-09 19:21:51.000000000 +0100 +++ new/dask-0.20.2/dask/array/top.py 2018-11-15 14:53:58.000000000 +0100 @@ -601,20 +601,32 @@ deps = set(top_layers) while deps: # we gather as many sub-layers as we can dep = deps.pop() - if (dep in layers and - isinstance(layers[dep], TOP) and - not (dep != layer and dep in keep) and # output layer - layers[dep].concatenate == layers[layer].concatenate): # punt on mixed concatenate - top_layers.add(dep) - - # traverse further to this child's children - for d in full_graph.dependencies.get(dep, ()): - if len(dependents[d]) <= 1: - deps.add(d) - else: - stack.append(d) - else: + if dep not in layers: + stack.append(dep) + continue + if not isinstance(layers[dep], TOP): + stack.append(dep) + continue + if (dep != layer and dep in keep): + stack.append(dep) + continue + if layers[dep].concatenate != layers[layer].concatenate: stack.append(dep) + continue + + # passed everything, proceed + top_layers.add(dep) + + # traverse further to this child's children + for d in full_graph.dependencies.get(dep, ()): + # Don't allow reductions to proceed + output_indices = set(layers[dep].output_indices) + input_indices = {i for _, ind in layers[dep].indices if ind for i in ind} + + if len(dependents[d]) <= 1 and output_indices.issuperset(input_indices): + deps.add(d) + else: + stack.append(d) # Merge these TOP layers into one new_layer = rewrite_atop([layers[l] for l in top_layers]) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/dataframe/core.py new/dask-0.20.2/dask/dataframe/core.py --- old/dask-0.20.1/dask/dataframe/core.py 2018-11-09 19:21:51.000000000 +0100 +++ new/dask-0.20.2/dask/dataframe/core.py 2018-11-15 14:53:58.000000000 +0100 @@ -3893,18 +3893,27 @@ def cov_corr_chunk(df, corr=False): - """Chunk part of a covariance or correlation computation""" - mat = df.astype('float64', copy=False).values - mask = np.isfinite(mat) - keep = np.bitwise_and(mask[:, None, :], mask[:, :, None]) - - x = np.where(keep, mat[:, None, :], np.nan) - sums = np.nansum(x, 0) - counts = keep.astype('int').sum(0) + """Chunk part of a covariance or correlation computation + """ + shape = (df.shape[1], df.shape[1]) + sums = np.zeros(shape) + counts = np.zeros(shape) + df = df.astype('float64', copy=False) + for idx, col in enumerate(df): + mask = df[col].notnull() + sums[idx] = df[mask].sum().values + counts[idx] = df[mask].count().values cov = df.cov().values dtype = [('sum', sums.dtype), ('count', counts.dtype), ('cov', cov.dtype)] if corr: - m = np.nansum((x - sums / np.where(counts, counts, np.nan)) ** 2, 0) + mu = (sums / counts).T + m = np.zeros(shape) + 
mask = df.isnull().values + for idx, x in enumerate(df): + mu_discrepancy = np.subtract.outer(df[x], mu[idx]) ** 2 + mu_discrepancy[mask] = np.nan + m[idx] = np.nansum(mu_discrepancy, axis=0) + m = m.T dtype.append(('m', m.dtype)) out = np.empty(counts.shape, dtype=dtype) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/dataframe/methods.py new/dask-0.20.2/dask/dataframe/methods.py --- old/dask-0.20.1/dask/dataframe/methods.py 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/dask/dataframe/methods.py 2018-11-15 14:53:58.000000000 +0100 @@ -51,6 +51,8 @@ kind='loc'): """Index slice start/stop. Can switch include/exclude boundaries. + Examples + -------- >>> df = pd.DataFrame({'x': [10, 20, 30, 40, 50]}, index=[1, 2, 2, 3, 4]) >>> boundary_slice(df, 2, None) x @@ -69,7 +71,18 @@ 1 10 2 20 2 30 + + Empty input DataFrames are returned + + >>> df_empty = pd.DataFrame() + >>> boundary_slice(df_empty, 1, 3) + Empty DataFrame + Columns: [] + Index: [] """ + if df.empty: + return df + if kind == 'loc' and not df.index.is_monotonic: # Pandas treats missing keys differently for label-slicing # on monotonic vs. non-monotonic indexes diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/dataframe/tests/test_dataframe.py new/dask-0.20.2/dask/dataframe/tests/test_dataframe.py --- old/dask-0.20.1/dask/dataframe/tests/test_dataframe.py 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/dask/dataframe/tests/test_dataframe.py 2018-11-15 14:53:58.000000000 +0100 @@ -3076,6 +3076,13 @@ tm.assert_frame_equal(result, expected) +def test_boundary_slice_empty(): + df = pd.DataFrame() + result = methods.boundary_slice(df, 1, 4) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('start, stop, right_boundary, left_boundary, drop', [ (-1, None, False, False, [-1, -2]), (-1, None, False, True, [-2]), diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask/dataframe/tests/test_multi.py new/dask-0.20.2/dask/dataframe/tests/test_multi.py --- old/dask-0.20.1/dask/dataframe/tests/test_multi.py 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/dask/dataframe/tests/test_multi.py 2018-11-15 14:53:58.000000000 +0100 @@ -239,6 +239,28 @@ hash_join(a, 'y', b, 'y', 'outer', shuffle=shuffle)._name) +def test_sequential_joins(): + # Pandas version of multiple inner joins + df1 = pd.DataFrame({'key': list(range(6)), + 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) + df2 = pd.DataFrame({'key': list(range(4)), + 'B': ['B0', 'B1', 'B2', 'B3']}) + df3 = pd.DataFrame({'key': list(range(1, 5)), + 'C': ['C0', 'C1', 'C2', 'C3']}) + + join_pd = df1.join(df2, how='inner', lsuffix='_l', rsuffix='_r') + multi_join_pd = join_pd.join(df3, how='inner', lsuffix='_l', rsuffix='_r') + + # Dask version of multiple inner joins + ddf1 = dd.from_pandas(df1, npartitions=3) + ddf2 = dd.from_pandas(df2, npartitions=2) + ddf3 = dd.from_pandas(df3, npartitions=2) + + join_dd = ddf1.join(ddf2, how='inner', lsuffix='_l', rsuffix='_r') + multi_join_dd = join_dd.join(ddf3, how='inner', lsuffix='_l', rsuffix='_r') + assert_eq(multi_join_pd, multi_join_dd) + + @pytest.mark.parametrize('join', ['inner', 'outer']) def test_indexed_concat(join): A = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')}, diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/dask.egg-info/PKG-INFO 
new/dask-0.20.2/dask.egg-info/PKG-INFO --- old/dask-0.20.1/dask.egg-info/PKG-INFO 2018-11-09 19:45:38.000000000 +0100 +++ new/dask-0.20.2/dask.egg-info/PKG-INFO 2018-11-15 15:10:02.000000000 +0100 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: dask -Version: 0.20.1 +Version: 0.20.2 Summary: Parallel PyData with Task Scheduling Home-page: http://github.com/dask/dask/ Maintainer: Matthew Rocklin @@ -44,9 +44,9 @@ Classifier: Programming Language :: Python :: 3.6 Classifier: Programming Language :: Python :: 3.7 Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.* -Provides-Extra: complete -Provides-Extra: distributed Provides-Extra: bag -Provides-Extra: array Provides-Extra: delayed +Provides-Extra: complete +Provides-Extra: array +Provides-Extra: distributed Provides-Extra: dataframe diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/array-gufunc.rst new/dask-0.20.2/docs/source/array-gufunc.rst --- old/dask-0.20.1/docs/source/array-gufunc.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/array-gufunc.rst 2018-11-15 14:53:58.000000000 +0100 @@ -4,12 +4,12 @@ EXPERIMENTAL FEATURE added to Version 0.18.0 and above - see :ref:`disclaimer<disclaimer>`. -`numpy <http://www.numpy.org>`_ provides the concept of `generalized ufuncs <http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html>`_. Generalized ufuncs are functions +`NumPy <http://www.numpy.org>`_ provides the concept of `generalized ufuncs <http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html>`_. Generalized ufuncs are functions that distinguish the various dimensions of passed arrays in the two classes loop dimensions -and core dimensions. To accomplish this, a `signature <https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html#details-of-signature>`_ is specified for numpy generalized ufuncs. +and core dimensions. To accomplish this, a `signature <https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html#details-of-signature>`_ is specified for NumPy generalized ufuncs. -`dask <https://dask.org/>`_ integrates interoperability with numpy's generalized ufuncs -by adhering to respective `ufunc protocol <https://docs.scipy.org/doc/numpy/reference/arrays.classes.html#numpy.class.__array_ufunc__>`_, as well as provides a wrapper to make a Python function a generalized ufunc. +`Dask <https://dask.org/>`_ integrates interoperability with NumPy's generalized ufuncs +by adhering to respective `ufunc protocol <https://docs.scipy.org/doc/numpy/reference/arrays.classes.html#numpy.class.__array_ufunc__>`_, and provides a wrapper to make a Python function a generalized ufunc. Usage @@ -17,8 +17,10 @@ NumPy generalized ufunc ~~~~~~~~~~~~~~~~~~~~~~~ -Note: `numpy <http://www.numpy.org>`_ generalized ufuncs are currently (v1.14.3 and below) stored in -inside ``np.linalg._umath_linalg`` and might change in the future. +.. note:: + + `NumPy <http://www.numpy.org>`_ generalized ufuncs are currently (v1.14.3 and below) stored in + inside ``np.linalg._umath_linalg`` and might change in the future. .. code-block:: python @@ -66,10 +68,10 @@ ---------- This experimental generalized ufunc integration is not complete: -* ``gufunc`` does not create a true generalized ufunc to be used with other input arrays, but dask. - I.e. at the moment ``gufunc`` casts all input arguments to ``dask.array.Array``. +* ``gufunc`` does not create a true generalized ufunc to be used with other input arrays besides Dask. 
+ I.e., at the moment, ``gufunc`` casts all input arguments to ``dask.array.Array`` -* Inferring ``output_dtypes`` automatically is not implemented yet. +* Inferring ``output_dtypes`` automatically is not implemented yet API diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/array-stack.rst new/dask-0.20.2/docs/source/array-stack.rst --- old/dask-0.20.1/docs/source/array-stack.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/array-stack.rst 2018-11-15 14:53:58.000000000 +0100 @@ -6,7 +6,7 @@ might have many HDF5/NetCDF files on disk, one for every day, but we want to do operations that span multiple days. -To solve this problem we use the functions ``da.stack``, ``da.concatenate``, +To solve this problem, we use the functions ``da.stack``, ``da.concatenate``, and ``da.block``. Stack @@ -63,7 +63,7 @@ ----- We can handle a larger variety of cases with ``da.block`` as it allows -concatenation to be applied over multiple dimensions at once. This is useful if +concatenation to be applied over multiple dimensions at once. This is useful if your chunks tile a space, for example if small squares tile a larger 2-D plane. .. code-block:: python diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/array-stats.rst new/dask-0.20.2/docs/source/array-stats.rst --- old/dask-0.20.1/docs/source/array-stats.rst 2018-11-09 19:21:51.000000000 +0100 +++ new/dask-0.20.2/docs/source/array-stats.rst 2018-11-15 14:53:58.000000000 +0100 @@ -29,7 +29,7 @@ >>> a = da.random.uniform(size=(50,), chunks=(25,)) >>> b = a + da.random.uniform(low=-0.15, high=0.15, size=(50,), chunks=(25,)) - >>> result = ttest_rel(a, b) + >>> result = stats.ttest_rel(a, b) >>> result.compute() Ttest_relResult(statistic=-1.5102104380013242, pvalue=0.13741197274874514) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/bag-creation.rst new/dask-0.20.2/docs/source/bag-creation.rst --- old/dask-0.20.1/docs/source/bag-creation.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/bag-creation.rst 2018-11-15 14:53:58.000000000 +0100 @@ -1,7 +1,7 @@ Create Dask Bags ================ -There are several ways to create Dask.bags around your data: +There are several ways to create Dask bags around your data: ``db.from_sequence`` -------------------- @@ -19,11 +19,11 @@ >>> b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=2) -This controls the granularity of the parallelism that you expose. By default -dask will try to partition your data into about 100 partitions. +This controls the granularity of the parallelism that you expose. By default, +Dask will try to partition your data into about 100 partitions. -IMPORTANT: do not load your data into Python and then load that data into -dask.bag. Instead, use dask.bag to load your data. This +IMPORTANT: do not load your data into Python and then load that data into a +Dask bag. Instead, use Dask Bag to load your data. This parallelizes the loading step and reduces inter-worker communication: .. code-block:: python @@ -34,9 +34,9 @@ ``db.read_text`` ---------------- -Dask.bag can load data directly from textfiles. -You can pass either a single filename, a list of filenames, or a globstring. -The resulting bag will have one item per line, one file per partition: +Dask Bag can load data directly from text files. 
You can pass either a +single file name, a list of file names, or a globstring. The resulting +bag will have one item per line and one file per partition: .. code-block:: python @@ -45,8 +45,8 @@ >>> b = db.read_text('myfile.*.txt') This handles standard compression libraries like ``gzip``, ``bz2``, ``xz``, or -any easily installed compression library that has a File-like object. -Compression will be inferred by filename extension, or by using the +any easily installed compression library that has a file-like object. +Compression will be inferred by the file name extension, or by using the ``compression='gzip'`` keyword: .. code-block:: python @@ -54,7 +54,7 @@ >>> b = db.read_text('myfile.*.txt.gz') The resulting items in the bag are strings. If you have encoded data like -line-delimited JSON then you may want to map a decoding or load function across +line-delimited JSON, then you may want to map a decoding or load function across the bag: .. code-block:: python @@ -62,7 +62,7 @@ >>> import json >>> b = db.read_text('myfile.*.json').map(json.loads) -Or do string munging tasks. For convenience there is a string namespace +Or do string munging tasks. For convenience, there is a string namespace attached directly to bags with ``.str.methodname``: .. code-block:: python @@ -73,11 +73,11 @@ ``db.from_avro`` ---------------- -Dask.bag can read binary files in the `Avro`_ format, if `fastavro`_ is installed. -A bag can be made from one or more -files, with optional chunking within files. The resulting bag will have one item per -Avro record, which will be a dictionary of the form given by the Avro schema. There will -be at least one partition per input file. +Dask Bag can read binary files in the `Avro`_ format if `fastavro`_ is installed. +A bag can be made from one or more files, with optional chunking within files. +The resulting bag will have one item per Avro record, which will be a dictionary +of the form given by the Avro schema. There will be at least one partition per +input file: .. code-block:: python @@ -87,11 +87,13 @@ .. _Avro: https://avro.apache.org/docs/1.8.2/ .. _fastavro: https://fastavro.readthedocs.io -By default, Dask will split data files into chunks of approximately ``blocksize`` bytes in -size. The actual blocks you would get depend on the internal blocking of the file. - -For files that are compressed after creation (this is not the same as the internal "codec" used -by Avro), then no chunking should be used, and there will be exactly one partition per file: +By default, Dask will split data files into chunks of approximately ``blocksize`` +bytes in size. The actual blocks you would get depend on the internal blocking +of the file. + +For files that are compressed after creation (this is not the same as the internal +"codec" used by Avro), no chunking should be used, and there will be exactly one +partition per file: .. code-block:: python @@ -101,10 +103,9 @@ ``db.from_delayed`` ------------------- -You can construct a dask bag from :doc:`dask.delayed <delayed>` values -using the ``db.from_delayed`` function. See -:doc:`documentation on using dask.delayed with collections <delayed-collections>` -for more information. +You can construct a Dask bag from :doc:`dask.delayed <delayed>` values using the +``db.from_delayed`` function. For more information, see +:doc:`documentation on using dask.delayed with collections <delayed-collections>`. 
Store Dask Bags @@ -113,7 +114,8 @@ In Memory --------- -You can convert a dask bag to a list or Python iterable by calling ``compute()`` or by converting the object into a list +You can convert a Dask bag to a list or Python iterable by calling ``compute()`` +or by converting the object into a list: .. code-block:: python @@ -121,11 +123,11 @@ or >>> result = list(b) -To Textfiles ------------- +To Text Files +------------- -You can convert a dask bag into a sequence of files on disk by calling the -``.to_textfiles()`` method +You can convert a Dask bag into a sequence of files on disk by calling the +``.to_textfiles()`` method: .. autofunction:: dask.bag.core.to_textfiles @@ -134,14 +136,14 @@ Dask bags can be written directly to Avro binary format using `fastavro`_. One file will be written per bag partition. This requires the user to provide a fully-specified -schema dictionary, see the docstring of the ``.to_avro()`` method. +schema dictionary (see the docstring of the ``.to_avro()`` method). .. autofunction:: dask.bag.avro.to_avro To DataFrames ------------- -You can convert a dask bag into a :doc:`dask dataframe<dataframe>` and use +You can convert a Dask bag into a :doc:`Dask DataFrame<dataframe>` and use those storage solutions. .. automethod:: dask.bag.core.Bag.to_dataframe @@ -150,7 +152,7 @@ To Delayed Values ----------------- -You can convert a dask bag into a list of :doc:`dask delayed values<delayed>` +You can convert a Dask bag into a list of :doc:`Dask delayed values<delayed>` and custom storage solutions from there. .. automethod:: dask.bag.core.Bag.to_delayed diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/bag-overview.rst new/dask-0.20.2/docs/source/bag-overview.rst --- old/dask-0.20.1/docs/source/bag-overview.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/bag-overview.rst 2018-11-15 14:53:58.000000000 +0100 @@ -1,10 +1,10 @@ Overview ======== -Dask.Bag implements operations like ``map``, ``filter``, ``fold``, and -``groupby`` on collections of Python objects. It does this in parallel and in -small memory using Python iterators. It is similar to a parallel version of -PyToolz_ or a Pythonic version of the `PySpark RDD`_. +Dask Bag implements operations like ``map``, ``filter``, ``fold``, and +``groupby`` on collections of Python objects. It does this in parallel with a +small memory footprint using Python iterators. It is similar to a parallel +version of PyToolz_ or a Pythonic version of the `PySpark RDD`_. .. _PyToolz: https://toolz.readthedocs.io/en/latest/ .. _`PySpark RDD`: http://spark.apache.org/docs/latest/api/python/pyspark.html @@ -28,7 +28,7 @@ Execution on bags provide two benefits: 1. Parallel: data is split up, allowing multiple cores or machines to execute - in parallel. + in parallel 2. Iterating: data processes lazily, allowing smooth execution of larger-than-memory data, even on a single machine within a single partition @@ -36,11 +36,11 @@ Default scheduler ~~~~~~~~~~~~~~~~~ -By default ``dask.bag`` uses ``dask.multiprocessing`` for computation. As a -benefit Dask bypasses the GIL_ and uses multiple cores on Pure Python objects. -As a drawback Dask.bag doesn't perform well on computations that include a +By default, ``dask.bag`` uses ``dask.multiprocessing`` for computation. As a +benefit, Dask bypasses the GIL_ and uses multiple cores on pure Python objects. 
+As a drawback, Dask Bag doesn't perform well on computations that include a great deal of inter-worker communication. For common operations this is rarely -an issue as most Dask.bag workflows are embarrassingly parallel or result in +an issue as most Dask Bag workflows are embarrassingly parallel or result in reductions with little data moving between workers. .. _GIL: https://docs.python.org/3/glossary.html#term-gil @@ -50,13 +50,13 @@ ~~~~~~~ Some operations, like ``groupby``, require substantial inter-worker -communication. On a single machine, dask uses partd_ to perform efficient, -parallel, spill-to-disk shuffles. When working in a cluster, dask uses a task +communication. On a single machine, Dask uses partd_ to perform efficient, +parallel, spill-to-disk shuffles. When working in a cluster, Dask uses a task based shuffle. These shuffle operations are expensive and better handled by projects like ``dask.dataframe``. It is best to use ``dask.bag`` to clean and process data, -then transform it into an array or dataframe before embarking on the more +then transform it into an array or DataFrame before embarking on the more complex operations that require shuffle steps. .. _partd: https://github.com/mrocklin/partd @@ -65,32 +65,32 @@ Known Limitations ----------------- -Bags provide very general computation (any Python function.) This generality +Bags provide very general computation (any Python function). This generality comes at cost. Bags have the following known limitations: -1. By default they rely on the multiprocessing scheduler, which has its own +1. By default, they rely on the multiprocessing scheduler, which has its own set of known limitations (see :doc:`shared`) 2. Bags are immutable and so you can not change individual elements -3. Bag operations tend to be slower than array/dataframe computations in the +3. Bag operations tend to be slower than array/DataFrame computations in the same way that standard Python containers tend to be slower than NumPy - arrays and Pandas dataframes. -4. ``Bag.groupby`` is slow. You should try to use ``Bag.foldby`` if possible. - Using ``Bag.foldby`` requires more thought. + arrays and Pandas DataFrames +4. Bag's ``groupby`` is slow. You should try to use Bag's ``foldby`` if possible. + Using ``foldby`` requires more thought tough Name ---- *Bag* is the mathematical name for an unordered collection allowing repeats. It -is a friendly synonym to multiset_. A bag or a multiset is a generalization of +is a friendly synonym to multiset_. A bag, or a multiset, is a generalization of the concept of a set that, unlike a set, allows multiple instances of the -multiset's elements. +multiset's elements: * ``list``: *ordered* collection *with repeats*, ``[1, 2, 3, 2]`` * ``set``: *unordered* collection *without repeats*, ``{1, 2, 3}`` * ``bag``: *unordered* collection *with repeats*, ``{1, 2, 2, 3}`` -So a bag is like a list, but it doesn't guarantee an ordering among elements. +So, a bag is like a list, but it doesn't guarantee an ordering among elements. There can be repeated elements but you can't ask for the ith element. .. 
_multiset: http://en.wikipedia.org/wiki/Bag_(mathematics) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/bag.rst new/dask-0.20.2/docs/source/bag.rst --- old/dask-0.20.1/docs/source/bag.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/bag.rst 2018-11-15 14:53:58.000000000 +0100 @@ -1,7 +1,7 @@ Bag === -Dask.Bag parallelizes computations across a large collection of generic Python +Dask Bag parallelizes computations across a large collection of generic Python objects. It is particularly useful when dealing with large quantities of semi-structured data like JSON blobs or log files. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/changelog.rst new/dask-0.20.2/docs/source/changelog.rst --- old/dask-0.20.1/docs/source/changelog.rst 2018-11-09 19:43:50.000000000 +0100 +++ new/dask-0.20.2/docs/source/changelog.rst 2018-11-15 14:57:39.000000000 +0100 @@ -1,6 +1,29 @@ Changelog ========= +0.20.2 / 2018-11-15 +------------------- + +Array ++++++ + +- Avoid fusing dependencies of atop reductions (:pr:`4207`) `Matthew Rocklin`_ + +Dataframe ++++++++++ + +- Improve memory footprint for dataframe correlation (:pr:`4193`) `Damien Garaud`_ +- Add empty DataFrame check to boundary_slice (:pr:`4212`) `James Bourbeau`_ + + +Documentation ++++++++++++++ + +- Copy edit documentation (:pr:`4197`) (:pr:`4204`) (:pr:`4198`) (:pr:`4199`) (:pr:`4200`) (:pr:`4202`) (:pr:`4209`) `Miguel Farrajota`_ +- Add stats module namespace (:pr:`4206`) `James Bourbeau`_ +- Fix link in dataframe documentation (:pr:`4208`) `James Bourbeau`_ + + 0.20.1 / 2018-11-09 ------------------- diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/dataframe-create.rst new/dask-0.20.2/docs/source/dataframe-create.rst --- old/dask-0.20.1/docs/source/dataframe-create.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/dataframe-create.rst 2018-11-15 14:53:58.000000000 +0100 @@ -1,19 +1,19 @@ Create and Store Dask DataFrames ================================ -Dask can create dataframes from various data storage formats like CSV, HDF, -Apache Parquet, and others. For most formats this data can live on various +Dask can create DataFrames from various data storage formats like CSV, HDF, +Apache Parquet, and others. For most formats, this data can live on various storage systems including local disk, network file systems (NFS), the Hadoop File System (HDFS), and Amazon's S3 (excepting HDF, which is only available on POSIX like file systems). See the :doc:`Overview section <dataframe-overview>` for an in depth -discussion of ``dask.dataframe`` scope, use, limitations. +discussion of ``dask.dataframe`` scope, use, and limitations. API --- -The following functions provide access to convert between Dask Dataframes, +The following functions provide access to convert between Dask DataFrames, file formats, and other Dask or Python collections. .. currentmodule:: dask.dataframe @@ -51,9 +51,9 @@ Locations --------- -For text, CSV, and Apache Parquet formats data can come from local disk, from -the Hadoop File System, from S3FS, or others, by prepending the filenames with -a protocol. +For text, CSV, and Apache Parquet formats, data can come from local disk, +the Hadoop File System, S3FS, or other sources, by prepending the filenames with +a protocol: .. 
code-block:: python @@ -61,10 +61,10 @@ >>> df = dd.read_csv('hdfs:///path/to/my-data-*.csv') >>> df = dd.read_csv('s3://bucket-name/my-data-*.csv') -For remote systems like HDFS or S3 credentials may be an issue. Usually these -are handled by configuration files on disk (such as a ``.boto`` file for S3) +For remote systems like HDFS or S3, credentials may be an issue. Usually, these +are handled by configuration files on disk (such as a ``.boto`` file for S3), but in some cases you may want to pass storage-specific options through to the -storage backend. You can do this with the ``storage_options=`` keyword. +storage backend. You can do this with the ``storage_options=`` keyword: .. code-block:: python @@ -75,11 +75,11 @@ Dask Delayed ------------ -For more complex situations not covered by the functions above you may want to -use :doc:`dask.delayed<delayed>` , which lets you construct -Dask.dataframes out of arbitrary Python function calls that load dataframes. -This can allow you to handle new formats easily, or bake in particular logic -around loading data if, for example, your data is stored with some special +For more complex situations not covered by the functions above, you may want to +use :doc:`dask.delayed<delayed>`, which lets you construct Dask DataFrames out +of arbitrary Python function calls that load DataFrames. This can allow you to +handle new formats easily or bake in particular logic around loading data if, +for example, your data is stored with some special format. See :doc:`documentation on using dask.delayed with collections<delayed-collections>` or an `example notebook @@ -87,36 +87,36 @@ how to create a Dask DataFrame from a nested directory structure of Feather files (as a stand in for any custom file format). -Dask.delayed is particularly useful when simple ``map`` operations aren't +Dask delayed is particularly useful when simple ``map`` operations aren't sufficient to capture the complexity of your data layout. From Raw Dask Graphs -------------------- -This section is mainly for developers wishing to extend dask.dataframe. It +This section is mainly for developers wishing to extend ``dask.dataframe``. It discusses internal API not normally needed by users. Everything below can be done just as effectively with :doc:`dask.delayed<delayed>` described -just above. You should never need to create a dataframe object by hand. +just above. You should never need to create a DataFrame object by hand. To construct a DataFrame manually from a dask graph you need the following information: -1. dask: a dask graph with keys like ``{(name, 0): ..., (name, 1): ...}`` as +1. Dask: a Dask graph with keys like ``{(name, 0): ..., (name, 1): ...}`` as well as any other tasks on which those tasks depend. The tasks corresponding to ``(name, i)`` should produce ``pandas.DataFrame`` objects - that correspond to the columns and divisions information discussed below. -2. name: The special name used above -3. columns: A list of column names -4. divisions: A list of index values that separate the different partitions. - Alternatively, if you don't know the divisions (this is common) you can + that correspond to the columns and divisions information discussed below +2. Name: the special name used above +3. Columns: a list of column names +4. Divisions: a list of index values that separate the different partitions. + Alternatively, if you don't know the divisions (this is common), you can provide a list of ``[None, None, None, ...]`` with as many partitions as - you have plus one. 
For more information see the Partitions section in the - :doc:`dataframe documentation <dataframe>`. + you have plus one. For more information, see the Partitions section in the + :doc:`DataFrame documentation <dataframe>` As an example, we build a DataFrame manually that reads several CSV files that -have a datetime index separated by day. Note, you should never do this. The -``dd.read_csv`` function does this for you. +have a datetime index separated by day. Note that you should **never** do this. +The ``dd.read_csv`` function does this for you: .. code-block:: Python diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/dataframe-performance.rst new/dask-0.20.2/docs/source/dataframe-performance.rst --- old/dask-0.20.1/docs/source/dataframe-performance.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/dataframe-performance.rst 2018-11-15 14:53:58.000000000 +0100 @@ -190,7 +190,7 @@ ----------------------------------- HDF5 is a popular choice for Pandas users with high performance needs. We -encourage Dask.dataframe users to :doc:`store and load data dataframe-create>` +encourage Dask.dataframe users to :doc:`store and load data <dataframe-create>` using Parquet instead. `Apache Parquet <http://parquet.apache.org/>`_ is a columnar binary format that is easy to split into multiple files (easier for parallel loading) and is generally much simpler to deal with than HDF5 (from diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/dask-0.20.1/docs/source/dataframe.rst new/dask-0.20.2/docs/source/dataframe.rst --- old/dask-0.20.1/docs/source/dataframe.rst 2018-11-02 15:42:48.000000000 +0100 +++ new/dask-0.20.2/docs/source/dataframe.rst 2018-11-15 14:53:58.000000000 +0100 @@ -1,4 +1,4 @@ -Dataframe +DataFrame ========= .. toctree:: @@ -13,17 +13,17 @@ dataframe-indexing.rst dataframe-extend.rst -A Dask DataFrame is a large parallel dataframe composed of many smaller Pandas -dataframes, split along the index. These pandas dataframes may live on disk +A Dask DataFrame is a large parallel DataFrame composed of many smaller Pandas +DataFrames, split along the index. These Pandas DataFrames may live on disk for larger-than-memory computing on a single machine, or on many different -machines in a cluster. One Dask dataframe operation triggers many operations -on the constituent Pandas dataframes. +machines in a cluster. One Dask DataFrame operation triggers many operations +on the constituent Pandas DataFrames. Design ------ -Dask dataframes coordinate many Pandas DataFrames/Series arranged along the -index. Dask.dataframe is partitioned *row-wise*, grouping rows by index value +Dask DataFrames coordinate many Pandas DataFrames/Series arranged along the +index. A Dask DataFrame is partitioned *row-wise*, grouping rows by index value for efficiency. These Pandas objects may live on disk or on other machines. .. image:: images/dask-dataframe.svg @@ -31,12 +31,12 @@ :width: 40% -Dask.dataframe copies the Pandas API +Dask DataFrame copies the Pandas API ------------------------------------ Because the ``dask.dataframe`` application programming interface (API) is a -subset of the Pandas API it should be familiar to Pandas users. There are some -slight alterations due to the parallel nature of dask: +subset of the Pandas API, it should be familiar to Pandas users. There are some +slight alterations due to the parallel nature of Dask: .. 
code-block:: python @@ -53,7 +53,7 @@ >>> df2 = df[df.y == 'a'].x + 1 -As with all dask collections one triggers computation by calling the +As with all Dask collections, one triggers computation by calling the ``.compute()`` method: .. code-block:: python @@ -67,24 +67,24 @@ Common Uses and Anti-Uses ------------------------- -Dask.dataframe is used in situations where Pandas is commonly needed, but when -Pandas fails due to data size or computation speed. +Dask DataFrame is used in situations where Pandas is commonly needed, usually when +Pandas fails due to data size or computation speed: - Manipulating large datasets, even when those datasets don't fit in memory - Accelerating long computations by using many cores - Distributed computing on large datasets with standard Pandas operations like groupby, join, and time series computations -Dask dataframe may not be the best choice in the following situations: +Dask DataFrame may not be the best choice in the following situations: -* If your dataset fits comfortably into RAM on your laptop then you may be - better off just using Pandas . There may be simpler ways to improve - performance than through parallelism. -* If your dataset doesn't fit neatly into the Pandas tabular model then you +* If your dataset fits comfortably into RAM on your laptop, then you may be + better off just using Pandas. There may be simpler ways to improve + performance than through parallelism +* If your dataset doesn't fit neatly into the Pandas tabular model, then you might find more use in :doc:`dask.bag <bag>` or :doc:`dask.array <array>` -* If you need functions that are not implemented in Dask dataframe then you +* If you need functions that are not implemented in Dask DataFrame, then you might want to look at :doc:`dask.delayed <delayed>` which offers more - flexibility. + flexibility * If you need a proper database with all that databases offer you might prefer something like Postgres_ @@ -95,16 +95,16 @@ Scope ----- -Dask.dataframe covers a well-used portion of the Pandas API. +Dask DataFrame covers a well-used portion of the Pandas API. 
The following class of computations works well: * Trivially parallelizable operations (fast): - * Elementwise operations: ``df.x + df.y``, ``df * df`` + * Element-wise operations: ``df.x + df.y``, ``df * df`` * Row-wise selections: ``df[df.x > 0]`` * Loc: ``df.loc[4.0:10.5]`` * Common aggregations: ``df.x.max()``, ``df.max()`` * Is in: ``df[df.x.isin([1, 2, 3])]`` - * Datetime/string accessors: ``df.timestamp.month`` + * Date time/string accessors: ``df.timestamp.month`` * Cleverly parallelizable operations (fast): * groupby-aggregate (with common aggregations): ``df.groupby(df.x).y.max()``, ``df.groupby('x').max()`` @@ -116,26 +116,26 @@ or ``dd.merge(df1, df2, on=['idx', 'x'])`` where ``idx`` is the index name for both ``df1`` and ``df2`` * Join with Pandas DataFrames: ``dd.merge(df1, df2, on='id')`` - * Elementwise operations with different partitions / divisions: ``df1.x + df2.y`` - * Datetime resampling: ``df.resample(...)`` + * Element-wise operations with different partitions / divisions: ``df1.x + df2.y`` + * Date time resampling: ``df.resample(...)`` * Rolling averages: ``df.rolling(...)`` - * Pearson Correlations: ``df[['col1', 'col2']].corr()`` + * Pearson's correlation: ``df[['col1', 'col2']].corr()`` * Operations requiring a shuffle (slow-ish, unless on index) * Set index: ``df.set_index(df.x)`` * groupby-apply not on index (with anything): ``df.groupby(df.x).apply(myfunc)`` * Join not on the index: ``dd.merge(df1, df2, on='name')`` -However Dask dataframe does not implement the entire Pandas interface. Users -expecting this will be disappointed. Notably, Dask dataframe has the following +However, Dask DataFrame does not implement the entire Pandas interface. Users +expecting this will be disappointed. Notably, Dask DataFrame has the following limitations: 1. Setting a new index from an unsorted column is expensive -2. Many operations, like groupby-apply and join on unsorted columns require +2. Many operations like groupby-apply and join on unsorted columns require setting the index, which as mentioned above, is expensive -3. The Pandas API is very large. Dask dataframe does not attempt to implement +3. The Pandas API is very large. Dask DataFrame does not attempt to implement many Pandas features or any of the more exotic data structures like NDFrames 4. Operations that were slow on Pandas, like iterating through row-by-row, - remain slow on Dask dataframe + remain slow on Dask DataFrame See :doc:`DataFrame API documentation<dataframe-api>` for a more extensive list. @@ -143,13 +143,13 @@ Execution --------- -By default ``dask.dataframe`` uses the multi-threaded scheduler. -This exposes some parallelism when Pandas or the underlying numpy operations -release the global interpreter lock (GIL). Generally Pandas is more GIL +By default, Dask DataFrame uses the multi-threaded scheduler. +This exposes some parallelism when Pandas or the underlying NumPy operations +release the global interpreter lock (GIL). Generally, Pandas is more GIL bound than NumPy, so multi-core speed-ups are not as pronounced for -``dask.dataframe`` as they are for ``dask.array``. This is changing, and +Dask DataFrame as they are for Dask Array. This is changing, and the Pandas development team is actively working on releasing the GIL. -When dealing with text data you may see speedups by switching to the newer +When dealing with text data, you may see speedups by switching to the newer :doc:`distributed scheduler <setup/single-distributed>` either on a cluster or single machine.
