Hello community,

here is the log from the commit of package python-dask for openSUSE:Factory 
checked in at 2018-12-03 10:12:53
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-dask (Old)
 and      /work/SRC/openSUSE:Factory/.python-dask.new.19453 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "python-dask"

Mon Dec  3 10:12:53 2018 rev:14 rq:653465 version:1.0.0

Changes:
--------
--- /work/SRC/openSUSE:Factory/python-dask/python-dask.changes  2018-11-26 
10:29:46.425065452 +0100
+++ /work/SRC/openSUSE:Factory/.python-dask.new.19453/python-dask.changes       
2018-12-03 10:12:58.339565562 +0100
@@ -1,0 +2,23 @@
+Sat Dec  1 18:36:31 UTC 2018 - Arun Persaud <a...@gmx.de>
+
+- update to version 1.0.0:
+  * Array
+    + Add nancumsum/nancumprod unit tests (:pr:`4215`) Guido Imperiale
+  * DataFrame
+    + Add index to to_dask_dataframe docstring (:pr:`4232`) James
+      Bourbeau
+    + Test and fix when appending categoricals with fastparquet
+      (:pr:`4245`) Martin Durant
+    + Don't reread metadata when passing ParquetFile to read_parquet
+      (:pr:`4247`) Martin Durant
+  * Documentation
+    + Copy edit documentation (:pr:`4222`) (:pr:`4224`) (:pr:`4228`)
+      (:pr:`4231`) (:pr:`4230`) (:pr:`4234`) (:pr:`4235`) (:pr:`4254`)
+      Miguel Farrajota
+    + Updated doc for the new scheduler keyword (:pr:`4251`) @milesial
+  * Core
+    + Avoid a few warnings (:pr:`4223`) Matthew Rocklin
+    + Remove dask.store module (:pr:`4221`) Matthew Rocklin
+    + Remove AUTHORS.md Jim Crist
+
+-------------------------------------------------------------------
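
For context, the "new scheduler keyword" mentioned above is the scheduler=
argument accepted by compute()/persist() and by dask.config.set(). A minimal
sketch with illustrative values only:

    import dask
    import dask.array as da

    x = da.ones((1000, 1000), chunks=(100, 100))

    # pick a scheduler for a single call ...
    x.sum().compute(scheduler='threads')

    # ... or for a whole block of code
    with dask.config.set(scheduler='synchronous'):
        x.sum().compute()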

Old:
----
  dask-0.20.2.tar.gz

New:
----
  dask-1.0.0.tar.gz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ python-dask.spec ++++++
--- /var/tmp/diff_new_pack.S5v8Ll/_old  2018-12-03 10:12:58.803565132 +0100
+++ /var/tmp/diff_new_pack.S5v8Ll/_new  2018-12-03 10:12:58.803565132 +0100
@@ -22,7 +22,7 @@
 # python(2/3)-distributed has a dependency loop with python(2/3)-dask
 %bcond_with     test_distributed
 Name:           python-dask
-Version:        0.20.2
+Version:        1.0.0
 Release:        0
 Summary:        Minimal task scheduling abstraction
 License:        BSD-3-Clause

++++++ dask-0.20.2.tar.gz -> dask-1.0.0.tar.gz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/PKG-INFO new/dask-1.0.0/PKG-INFO
--- old/dask-0.20.2/PKG-INFO    2018-11-15 15:10:02.000000000 +0100
+++ new/dask-1.0.0/PKG-INFO     2018-11-28 16:46:01.000000000 +0100
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dask
-Version: 0.20.2
+Version: 1.0.0
 Summary: Parallel PyData with Task Scheduling
 Home-page: http://github.com/dask/dask/
 Maintainer: Matthew Rocklin
@@ -44,9 +44,9 @@
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
+Provides-Extra: array
+Provides-Extra: dataframe
 Provides-Extra: bag
 Provides-Extra: delayed
 Provides-Extra: complete
-Provides-Extra: array
 Provides-Extra: distributed
-Provides-Extra: dataframe
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/_version.py 
new/dask-1.0.0/dask/_version.py
--- old/dask-0.20.2/dask/_version.py    2018-11-15 15:10:02.000000000 +0100
+++ new/dask-1.0.0/dask/_version.py     2018-11-28 16:46:01.000000000 +0100
@@ -11,8 +11,8 @@
 {
  "dirty": false,
  "error": null,
- "full-revisionid": "f74a9e9f8a73b6f9d1e15c54f40f2c83ea801657",
- "version": "0.20.2"
+ "full-revisionid": "38676863a6ae336bcb307a29a521de32480722b0",
+ "version": "1.0.0"
 }
 '''  # END VERSION_JSON
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/array/core.py 
new/dask-1.0.0/dask/array/core.py
--- old/dask-0.20.2/dask/array/core.py  2018-11-12 15:07:31.000000000 +0100
+++ new/dask-1.0.0/dask/array/core.py   2018-11-27 19:27:13.000000000 +0100
@@ -1158,6 +1158,17 @@
         ----------
         columns: list or string
             list of column names if DataFrame, single string if Series
+        index : dask.dataframe.Index, optional
+            An optional *dask* Index to use for the output Series or DataFrame.
+
+            The default output index depends on whether the array has any 
unknown
+            chunks. If there are any unknown chunks, the output has ``None``
+            for all the divisions (one per chunk). If all the chunks are known,
+            a default index with known divisions is created.
+
+            Specifying ``index`` can be useful if you're conforming a Dask 
Array
+            to an existing dask Series or DataFrame, and you would like the
+            indices to match.
 
         See Also
         --------
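
A minimal sketch of the index keyword documented above, assuming an array whose
row chunks line up with an existing frame's partitions (all names here are
hypothetical):

    import pandas as pd
    import dask.array as da
    import dask.dataframe as dd

    x = da.ones((4, 2), chunks=(2, 2))
    existing = dd.from_pandas(
        pd.DataFrame({'a': range(4)}, index=list('wxyz')),
        npartitions=2)

    # reuse the existing frame's index so the two objects line up
    df = x.to_dask_dataframe(columns=['c0', 'c1'], index=existing.index)
    df.compute()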
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/array/tests/test_reductions.py 
new/dask-1.0.0/dask/array/tests/test_reductions.py
--- old/dask-0.20.2/dask/array/tests/test_reductions.py 2018-11-12 
15:07:31.000000000 +0100
+++ new/dask-1.0.0/dask/array/tests/test_reductions.py  2018-11-27 
19:27:14.000000000 +0100
@@ -427,14 +427,22 @@
     assert_eq(x, func(np.ones((10, 10)), axis=0))
 
 
-@pytest.mark.parametrize("func", ["cumsum", "cumprod"])
+cum_funcs = ["cumsum", "cumprod"]
+if np.__version__ >= '1.12.0':
+    cum_funcs += ["nancumsum", "nancumprod"]
+
+
+@pytest.mark.parametrize("func", cum_funcs)
+@pytest.mark.parametrize("use_nan", [False, True])
 @pytest.mark.parametrize("axis", [None, 0, 1, -1])
-def test_array_cumreduction_axis(func, axis):
+def test_array_cumreduction_axis(func, use_nan, axis):
     np_func = getattr(np, func)
     da_func = getattr(da, func)
 
     s = (10, 11, 12)
     a = np.arange(np.prod(s)).reshape(s)
+    if use_nan:
+        a[1] = np.nan
     d = da.from_array(a, chunks=(4, 5, 6))
 
     a_r = np_func(a, axis=axis)
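
The new tests exercise the existing da.nancumsum/da.nancumprod wrappers
(NumPy >= 1.12, as the version guard above suggests). A quick sketch of the
behaviour being tested, with made-up data:

    import numpy as np
    import dask.array as da

    a = np.array([1.0, np.nan, 2.0, np.nan, 3.0])
    x = da.from_array(a, chunks=2)

    # NaNs are treated as zero in the running sum
    da.nancumsum(x).compute()    # array([1., 1., 3., 3., 6.])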
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/dataframe/core.py 
new/dask-1.0.0/dask/dataframe/core.py
--- old/dask-0.20.2/dask/dataframe/core.py      2018-11-15 14:53:58.000000000 
+0100
+++ new/dask-1.0.0/dask/dataframe/core.py       2018-11-27 19:27:14.000000000 
+0100
@@ -3906,7 +3906,9 @@
     cov = df.cov().values
     dtype = [('sum', sums.dtype), ('count', counts.dtype), ('cov', cov.dtype)]
     if corr:
-        mu = (sums / counts).T
+        with warnings.catch_warnings(record=True):
+            warnings.simplefilter("always")
+            mu = (sums / counts).T
         m = np.zeros(shape)
         mask = df.isnull().values
         for idx, x in enumerate(df):
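
The "avoid a few warnings" change wraps the division in a warnings-capture
block, so 0/0 and x/0 RuntimeWarnings are recorded rather than shown to the
user. A standalone sketch of that pattern with made-up values:

    import warnings
    import numpy as np

    sums, counts = np.array([1.0, 2.0]), np.array([0.0, 2.0])

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        mu = (sums / counts).T    # divide-by-zero warning is captured, not printed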
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/dataframe/io/parquet.py 
new/dask-1.0.0/dask/dataframe/io/parquet.py
--- old/dask-0.20.2/dask/dataframe/io/parquet.py        2018-11-12 
15:07:31.000000000 +0100
+++ new/dask-1.0.0/dask/dataframe/io/parquet.py 2018-11-28 16:10:27.000000000 
+0100
@@ -583,7 +583,7 @@
             raise ValueError('Appended columns not the same.\n'
                              'Previous: {} | New: {}'
                              .format(pf.columns, list(df.columns)))
-        elif set(pf.dtypes[c] for c in pf.columns) != 
set(df[pf.columns].dtypes):
+        elif (pd.Series(pf.dtypes).loc[pf.columns] != 
df[pf.columns].dtypes).any():
             raise ValueError('Appended dtypes differ.\n{}'
                              .format(set(pf.dtypes.items()) ^
                                      set(df.dtypes.iteritems())))
@@ -1137,6 +1137,7 @@
             mode='rb',
             storage_options=storage_options
         )
+        paths = path
     else:
         read = get_engine(engine)['read']
         fs, fs_token, paths = get_fs_token_paths(
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/dataframe/io/tests/test_parquet.py 
new/dask-1.0.0/dask/dataframe/io/tests/test_parquet.py
--- old/dask-0.20.2/dask/dataframe/io/tests/test_parquet.py     2018-11-12 
15:07:31.000000000 +0100
+++ new/dask-1.0.0/dask/dataframe/io/tests/test_parquet.py      2018-11-28 
16:10:27.000000000 +0100
@@ -1345,3 +1345,34 @@
     assert 'foo' in str(info.value)
     assert 'arrow' in str(info.value)
     assert 'fastparquet' in str(info.value)
+
+
+def test_append_cat_fp(tmpdir):
+    pytest.importorskip('fastparquet')
+    path = str(tmpdir)
+    # https://github.com/dask/dask/issues/4120
+    df = pd.DataFrame({"x": ["a", "a", "b", "a", "b"]})
+    df["x"] = df["x"].astype("category")
+    ddf = dd.from_pandas(df, npartitions=1)
+
+    dd.to_parquet(ddf, path)
+
+    # this fails:
+    dd.to_parquet(ddf, path, append=True, ignore_divisions=True)
+    d = dd.read_parquet(path).compute()
+    assert d['x'].tolist() == ["a", "a", "b", "a", "b"] * 2
+
+
+def test_passing_parquetfile(tmpdir):
+    import shutil
+    fp = pytest.importorskip('fastparquet')
+    path = str(tmpdir)
+    df = pd.DataFrame({"x": [1, 3, 2, 4]})
+    ddf = dd.from_pandas(df, npartitions=1)
+
+    dd.to_parquet(ddf, path)
+    pf = fp.ParquetFile(path)
+    shutil.rmtree(path)
+
+    # should pass, because no need to re-read metadata
+    dd.read_parquet(pf)
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/dataframe/methods.py 
new/dask-1.0.0/dask/dataframe/methods.py
--- old/dask-0.20.2/dask/dataframe/methods.py   2018-11-15 14:53:58.000000000 
+0100
+++ new/dask-1.0.0/dask/dataframe/methods.py    2018-11-27 19:27:14.000000000 
+0100
@@ -117,14 +117,18 @@
 
 def mean_aggregate(s, n):
     try:
-        return s / n
+        with warnings.catch_warnings(record=True):
+            warnings.simplefilter('always')
+            return s / n
     except ZeroDivisionError:
         return np.float64(np.nan)
 
 
 def var_aggregate(x2, x, n, ddof):
     try:
-        result = (x2 / n) - (x / n)**2
+        with warnings.catch_warnings(record=True):
+            warnings.simplefilter('always')
+            result = (x2 / n) - (x / n)**2
         if ddof != 0:
             result = result * n / (n - ddof)
         return result
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/dataframe/tests/test_groupby.py 
new/dask-1.0.0/dask/dataframe/tests/test_groupby.py
--- old/dask-0.20.2/dask/dataframe/tests/test_groupby.py        2018-11-12 
15:07:31.000000000 +0100
+++ new/dask-1.0.0/dask/dataframe/tests/test_groupby.py 2018-11-27 
19:27:14.000000000 +0100
@@ -1473,7 +1473,7 @@
         'custom_mode',
         lambda s: s.apply(lambda s: [s.value_counts()]),
         agg_mode,
-        lambda s: s.map(lambda i: i[0].argmax()),
+        lambda s: s.map(lambda i: i[0].idxmax()),
     )
 
     d = pd.DataFrame({
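
The test change swaps Series.argmax() for Series.idxmax(), presumably to keep
the label-returning behaviour that newer Pandas deprecates under argmax. A tiny
illustration with throwaway data:

    import pandas as pd

    s = pd.Series([10, 30, 20], index=['a', 'b', 'c'])
    s.idxmax()    # 'b' -- the label of the maximum value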
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/store/__init__.py 
new/dask-1.0.0/dask/store/__init__.py
--- old/dask-0.20.2/dask/store/__init__.py      2018-10-24 23:39:42.000000000 
+0200
+++ new/dask-1.0.0/dask/store/__init__.py       1970-01-01 01:00:00.000000000 
+0100
@@ -1 +0,0 @@
-from .core import Store
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/store/core.py 
new/dask-1.0.0/dask/store/core.py
--- old/dask-0.20.2/dask/store/core.py  2018-11-02 15:42:48.000000000 +0100
+++ new/dask-1.0.0/dask/store/core.py   1970-01-01 01:00:00.000000000 +0100
@@ -1,115 +0,0 @@
-from __future__ import absolute_import, division, print_function
-
-from collections import defaultdict
-from operator import getitem
-from datetime import datetime
-from time import time
-
-from ..compatibility import MutableMapping
-from ..core import istask, ishashable
-from ..utils_test import add  # noqa: F401
-
-
-class Store(MutableMapping):
-    """ Store - A storage of data and computation
-
-    Examples
-    --------
-
-    Store data like a dictionary
-
-    >>> import dask.store as ds
-    >>> s = ds.Store()
-    >>> s['x'] = 10
-    >>> s['x']
-    10
-
-    Also store computation on that data
-
-    >>> s['y'] = (add, 'x', 5)
-
-    Accessing these keys results in computations.  Results may be cached for
-    reuse.
-
-    >>> s['y']
-    15
-
-    Design
-    ------
-
-    A Store maintains the following state
-
-    dsk: dict
-        A dask to define all computation
-    cache: dict-like
-        Stores both ground data and cached intermediate values
-    data: set
-        The keys in the cache that can not be removed for correctness.
-    compute_time: dict:: {key: float}
-        dict mapping the time it took to compute each key
-    access_times: dict:: {key: [datetimes]}
-        The times at which a key was accessed
-    """
-
-    def __init__(self, cache=None):
-        self.dsk = dict()
-        if cache is None:
-            cache = dict()
-        self.cache = cache
-        self.data = set()
-        self.compute_time = dict()
-        self.access_times = defaultdict(list)
-
-    def __setitem__(self, key, value):
-        if key in self.dsk:
-            if (self.dsk[key] == value or
-                self.dsk[key] == (getitem, self.cache, key) and
-               self.cache[key] == value):
-                return
-            else:
-                raise KeyError("Can not overwrite data")
-        if istask(value):
-            self.dsk[key] = value
-        else:
-            self.cache[key] = value
-            self.dsk[key] = (getitem, self.cache, key)
-            self.data.add(key)
-
-    def __getitem__(self, key):
-        if isinstance(key, list):
-            return (self[item] for item in key)
-        if not ishashable(key):
-            return key
-        if key not in self.dsk:
-            return key
-
-        self.access_times[key].append(datetime.now())
-
-        if key in self.cache:
-            return self.cache[key]
-
-        task = self.dsk[key]
-        func, args = task[0], task[1:]
-
-        if func == getitem and args[0] is self.cache:
-            return self.cache[args[1]]
-
-        args = [self[arg] for arg in args]
-
-        start = time()
-        result = func(*args)
-        end = time()
-
-        self.cache[key] = result
-        self.compute_time[key] = end - start
-
-        return result
-
-    def __len__(self):
-        return len(self.dsk)
-
-    def __iter__(self):
-        return iter(self.dsk)
-
-    def __delitem__(self, key):
-        raise ValueError("Dask Store does not support deletion")
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask/store/tests/test_store.py 
new/dask-1.0.0/dask/store/tests/test_store.py
--- old/dask-0.20.2/dask/store/tests/test_store.py      2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/dask/store/tests/test_store.py       1970-01-01 
01:00:00.000000000 +0100
@@ -1,50 +0,0 @@
-import pytest
-
-from dask.store import Store
-from dask.utils_test import inc, add
-
-
-def test_basic():
-    s = Store()
-    s['x'] = 1
-    s['y'] = (inc, 'x')
-    s['z'] = (add, 'x', 'y')
-
-    assert s.data == set(['x'])
-
-    assert s['z'] == 3
-    assert 'x' in s.data
-    assert s.cache['z'] == 3
-    assert s.cache['y'] == 2
-
-    assert len(s.access_times['z']) == 1
-    assert len(s.access_times['y']) == 1
-    assert len(s.access_times['x']) == 2
-    assert s.compute_time['z'] < 0.1
-
-    cache = s.cache.copy()
-    assert s['z'] == 3
-    assert s.cache == cache
-    assert len(s.access_times['z']) == 2
-    assert len(s.access_times['y']) == 1
-    assert len(s.access_times['x']) == 2
-
-    assert s[5] == 5
-    assert list(s[['x', 'y']]) == [s['x'], s['y']]
-
-    def reassign():
-        s['x'] = 2
-    pytest.raises(Exception, reassign)
-
-
-def test_update():
-    s = Store()
-
-    dsk = {'x': 1, 'y': (inc, 'x')}
-    s.update(dsk)
-
-    assert s['y'] == 2
-
-    pytest.raises(Exception, lambda: s.update({'x': 2}))
-    # Test that it doesn't raise
-    s.update({'x': 1})
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask.egg-info/PKG-INFO 
new/dask-1.0.0/dask.egg-info/PKG-INFO
--- old/dask-0.20.2/dask.egg-info/PKG-INFO      2018-11-15 15:10:02.000000000 
+0100
+++ new/dask-1.0.0/dask.egg-info/PKG-INFO       2018-11-28 16:46:01.000000000 
+0100
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dask
-Version: 0.20.2
+Version: 1.0.0
 Summary: Parallel PyData with Task Scheduling
 Home-page: http://github.com/dask/dask/
 Maintainer: Matthew Rocklin
@@ -44,9 +44,9 @@
 Classifier: Programming Language :: Python :: 3.6
 Classifier: Programming Language :: Python :: 3.7
 Requires-Python: >=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*
+Provides-Extra: array
+Provides-Extra: dataframe
 Provides-Extra: bag
 Provides-Extra: delayed
 Provides-Extra: complete
-Provides-Extra: array
 Provides-Extra: distributed
-Provides-Extra: dataframe
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/dask.egg-info/SOURCES.txt 
new/dask-1.0.0/dask.egg-info/SOURCES.txt
--- old/dask-0.20.2/dask.egg-info/SOURCES.txt   2018-11-15 15:10:02.000000000 
+0100
+++ new/dask-1.0.0/dask.egg-info/SOURCES.txt    2018-11-28 16:46:01.000000000 
+0100
@@ -176,10 +176,6 @@
 dask/diagnostics/tests/__init__.py
 dask/diagnostics/tests/test_profiler.py
 dask/diagnostics/tests/test_progress.py
-dask/store/__init__.py
-dask/store/core.py
-dask/store/tests/__init__.py
-dask/store/tests/test_store.py
 dask/tests/__init__.py
 dask/tests/test_base.py
 dask/tests/test_cache.py
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/changelog.rst 
new/dask-1.0.0/docs/source/changelog.rst
--- old/dask-0.20.2/docs/source/changelog.rst   2018-11-15 14:57:39.000000000 
+0100
+++ new/dask-1.0.0/docs/source/changelog.rst    2018-11-28 16:44:29.000000000 
+0100
@@ -1,6 +1,36 @@
 Changelog
 =========
 
+1.0.0 / 2018-11-28
+------------------
+
+Array
++++++
+
+-  Add nancumsum/nancumprod unit tests (:pr:`4215`) `Guido Imperiale`_
+
+DataFrame
++++++++++
+
+-  Add index to to_dask_dataframe docstring (:pr:`4232`) `James Bourbeau`_
+-  Test and fix when appending categoricals with fastparquet (:pr:`4245`) 
`Martin Durant`_
+-  Don't reread metadata when passing ParquetFile to read_parquet (:pr:`4247`) 
`Martin Durant`_
+
+Documentation
++++++++++++++
+
+-  Copy edit documentation (:pr:`4222`) (:pr:`4224`) (:pr:`4228`) (:pr:`4231`) 
(:pr:`4230`) (:pr:`4234`) (:pr:`4235`) (:pr:`4254`) `Miguel Farrajota`_
+-  Updated doc for the new scheduler keyword (:pr:`4251`) `@milesial`_
+
+
+Core
+++++
+
+-  Avoid a few warnings (:pr:`4223`) `Matthew Rocklin`_
+-  Remove dask.store module (:pr:`4221`) `Matthew Rocklin`_
+-  Remove AUTHORS.md `Jim Crist`_
+
+
 0.20.2 / 2018-11-15
 -------------------
 
@@ -1620,3 +1650,4 @@
 .. _`Damien Garaud`: https://github.com/geraud
 .. _`Jonathan Fraine`: https://github.com/exowanderer
 .. _`Carlos Valiente`: https://github.com/carletes
+.. _`@milesial`: https://github.com/milesial
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/cite.rst 
new/dask-1.0.0/docs/source/cite.rst
--- old/dask-0.20.2/docs/source/cite.rst        2018-11-02 15:42:48.000000000 
+0100
+++ new/dask-1.0.0/docs/source/cite.rst 2018-11-28 16:10:27.000000000 +0100
@@ -21,8 +21,7 @@
      url = {https://dask.org},
    }
 
-The full author list is available using git, or by looking at the `AUTHORS file
-<https://raw.githubusercontent.com/dask/dask/master/AUTHORS.md>`_.
+The full author list is available using ``git`` (e.g. ``git shortlog -ns``).
 
 Papers about parts of Dask
 --------------------------
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/dataframe-design.rst 
new/dask-1.0.0/docs/source/dataframe-design.rst
--- old/dask-0.20.2/docs/source/dataframe-design.rst    2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/docs/source/dataframe-design.rst     2018-11-27 
19:27:14.000000000 +0100
@@ -3,23 +3,23 @@
 Internal Design
 ===============
 
-Dask dataframes coordinate many Pandas DataFrames/Series arranged along an
-index. We define a ``dask.dataframe`` object with the following components:
+Dask DataFrames coordinate many Pandas DataFrames/Series arranged along an
+index.  We define a Dask DataFrame object with the following components:
 
-- A dask graph with a special set of keys designating partitions, such as
-  ``('x', 0), ('x', 1), ...``.
-- A name to identify which keys in the dask graph refer to this dataframe, such
-  as ``'x'``.
-- An empty pandas object containing appropriate metadata (e.g.  column names,
-  dtypes, etc...).
-- A sequence of partition boundaries along the index, called ``divisions``.
+- A Dask graph with a special set of keys designating partitions, such as
+  ``('x', 0), ('x', 1), ...``
+- A name to identify which keys in the Dask graph refer to this DataFrame, such
+  as ``'x'``
+- An empty Pandas object containing appropriate metadata (e.g.  column names,
+  dtypes, etc.)
+- A sequence of partition boundaries along the index called ``divisions``
 
 Metadata
 --------
 
-Many dataframe operations rely on knowing the name and dtype of columns. To
-keep track of this information, all ``dask.dataframe`` objects have a ``_meta``
-attribute which contains an empty pandas object with the same dtypes and names.
+Many DataFrame operations rely on knowing the name and dtype of columns.  To
+keep track of this information, all Dask DataFrame objects have a ``_meta``
+attribute which contains an empty Pandas object with the same dtypes and names.
 For example:
 
 .. code-block:: python
@@ -35,7 +35,7 @@
    b    object
    dtype: object
 
-Internally ``dask.dataframe`` does its best to propagate this information
+Internally, Dask DataFrame does its best to propagate this information
 through all operations, so most of the time a user shouldn't have to worry
 about this.  Usually this is done by evaluating the operation on a small sample
 of fake data, which can be found on the ``_meta_nonempty`` attribute:
@@ -50,20 +50,20 @@
 Sometimes this operation may fail in user defined functions (e.g. when using
 ``DataFrame.apply``), or may be prohibitively expensive.  For these cases, many
 functions support an optional ``meta`` keyword, which allows specifying the
-metadata directly, avoiding the inference step. For convenience, this supports
+metadata directly, avoiding the inference step.  For convenience, this supports
 several options:
 
-1. A pandas object with appropriate dtypes and names. If not empty, an empty
+1. A Pandas object with appropriate dtypes and names.  If not empty, an empty
    slice will be taken:
 
 .. code-block:: python
 
   >>> ddf.map_partitions(foo, meta=pd.DataFrame({'a': [1], 'b': [2]}))
 
-2. A description of the appropriate names and dtypes. This can take several 
forms:
+2. A description of the appropriate names and dtypes.  This can take several 
forms:
 
     * A ``dict`` of ``{name: dtype}`` or an iterable of ``(name, dtype)``
-      specifies a dataframe
+      specifies a DataFrame
     * A tuple of ``(name, dtype)`` specifies a series
     * A dtype object or string (e.g. ``'f8'``) specifies a scalar
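
As a rough, hypothetical illustration of option 2 above (column names and
dtypes made up):

    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(
        pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']}),
        npartitions=2)

    # meta given as an iterable of (name, dtype), skipping the inference step
    ddf.map_partitions(
        lambda df: df.assign(c=df.a + 1),
        meta=[('a', 'i8'), ('b', 'object'), ('c', 'i8')]).compute()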
 
@@ -74,22 +74,22 @@
 Categoricals
 ------------
 
-Dask dataframe divides `categorical data`_ into two types:
+Dask DataFrame divides `categorical data`_ into two types:
 
 - Known categoricals have the ``categories`` known statically (on the ``_meta``
-  attribute). Each partition **must** have the same categories as found on the
-  ``_meta`` attribute.
+  attribute).  Each partition **must** have the same categories as found on the
+  ``_meta`` attribute
 - Unknown categoricals don't know the categories statically, and may have
   different categories in each partition.  Internally, unknown categoricals are
   indicated by the presence of ``dd.utils.UNKNOWN_CATEGORIES`` in the
-  categories on the ``_meta`` attribute.  Since most dataframe operations
+  categories on the ``_meta`` attribute.  Since most DataFrame operations
   propagate the categories, the known/unknown status should propagate through
-  operations (similar to how ``NaN`` propagates).
+  operations (similar to how ``NaN`` propagates)
 
 For metadata specified as a description (option 2 above), unknown categoricals
 are created.
 
-Certain operations are only available for known categoricals. For example,
+Certain operations are only available for known categoricals.  For example,
 ``df.col.cat.categories`` would only work if ``df.col`` has known categories,
 since the categorical mapping is only known statically on the metadata of known
 categoricals.
@@ -103,12 +103,12 @@
     False
 
 Additionally, an unknown categorical can be converted to known using
-``.cat.as_known()``. If you have multiple categorical columns in a dataframe,
+``.cat.as_known()``.  If you have multiple categorical columns in a DataFrame,
 you may instead want to use ``df.categorize(columns=...)``, which will convert
-all specified columns to known categoricals. Since getting the categories
+all specified columns to known categoricals.  Since getting the categories
 requires a full scan of the data, using ``df.categorize()`` is more efficient
 than calling ``.cat.as_known()`` for each column (which would result in
-multiple scans).
+multiple scans):
 
 .. code-block:: python
 
@@ -120,7 +120,7 @@
     True
 
 To convert a known categorical to an unknown categorical, there is also the
-``.cat.as_unknown()`` method. This requires no computation, as it's just a
+``.cat.as_unknown()`` method. This requires no computation as it's just a
 change in the metadata.
 
 Non-categorical columns can be converted to categoricals in a few different
@@ -136,7 +136,7 @@
     # categorize requires computation, and results in known categoricals
     ddf = ddf.categorize(columns=['mycol', ...])
 
-Additionally, with pandas 0.19.2 and up ``dd.read_csv`` and ``dd.read_table``
+Additionally, with Pandas 0.19.2 and up, ``dd.read_csv`` and ``dd.read_table``
 can read data directly into unknown categorical columns by specifying a column
 dtype as ``'category'``:
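
A rough sketch of the conversion paths described in this section (not part of
the upstream file; the column name is hypothetical):

    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(pd.DataFrame({'mycol': ['a', 'b', 'a', 'c']}),
                         npartitions=2)

    ddf['mycol'] = ddf['mycol'].astype('category')   # unknown categories, no data scan
    print(ddf['mycol'].cat.known)                    # False

    ddf = ddf.categorize(columns=['mycol'])          # one scan, categories become known
    print(ddf['mycol'].cat.known)                    # True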
 
@@ -146,7 +146,7 @@
 
 .. _`categorical data`: 
http://pandas.pydata.org/pandas-docs/stable/categorical.html
 
-With pandas 0.21.0 and up, ``dd.read_csv`` and ``dd.read_table`` can read
+Moreover, with Pandas 0.21.0 and up, ``dd.read_csv`` and ``dd.read_table`` can 
read
 data directly into *known* categoricals by specifying instances of
 ``pd.api.types.CategoricalDtype``:
 
@@ -159,18 +159,18 @@
 Partitions
 ----------
 
-Internally a dask dataframe is split into many partitions, and each partition
-is one pandas dataframe.  These dataframes are split vertically along the
+Internally, a Dask DataFrame is split into many partitions, where each 
partition
+is one Pandas DataFrame.  These DataFrames are split vertically along the
 index.  When our index is sorted and we know the values of the divisions of our
 partitions, then we can be clever and efficient with expensive algorithms (e.g.
 groupby's, joins, etc...).
 
-For example, if we have a time-series index then our partitions might be
-divided by month.  All of January will live in one partition while all of
-February will live in the next.  In these cases operations like ``loc``,
+For example, if we have a time-series index, then our partitions might be
+divided by month: all of January will live in one partition while all of
+February will live in the next.  In these cases, operations like ``loc``,
 ``groupby``, and ``join/merge`` along the index can be *much* more efficient
 than would otherwise be possible in parallel.  You can view the number of
-partitions and divisions of your dataframe with the following fields:
+partitions and divisions of your DataFrame with the following fields:
 
 .. code-block:: python
 
@@ -180,8 +180,8 @@
    ['2015-01-01', '2015-02-01', '2015-03-01', '2015-04-01', '2015-04-31']
 
 Divisions includes the minimum value of every partition's index and the maximum
-value of the last partition's index.  In the example above if the user searches
-for a specific datetime range then we know which partitions we need to inspect
+value of the last partition's index.  In the example above, if the user 
searches
+for a specific datetime range, then we know which partitions we need to inspect
 and which we can drop:
 
 .. code-block:: python
@@ -189,7 +189,7 @@
    >>> df.loc['2015-01-20': '2015-02-10']  # Must inspect first two partitions
 
 Often we do not have such information about our partitions.  When reading CSV
-files for example we do not know, without extra user input, how the data is
+files, for example, we do not know, without extra user input, how the data is
 divided.  In this case ``.divisions`` will be all ``None``:
 
 .. code-block:: python
@@ -197,6 +197,6 @@
    >>> df.divisions
    [None, None, None, None, None]
 
-In these cases any operation that requires a cleanly partitioned dataframe with
+In these cases, any operation that requires a cleanly partitioned DataFrame 
with
 known divisions will have to perform a sort.  This can generally achieved by
 calling ``df.set_index(...)``.
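
A small sketch of the npartitions/divisions attributes discussed in this
section, using made-up time-series data:

    import pandas as pd
    import dask.dataframe as dd

    df = pd.DataFrame({'v': range(6)},
                      index=pd.date_range('2015-01-01', periods=6, freq='10D'))
    ddf = dd.from_pandas(df, npartitions=3)

    print(ddf.npartitions)       # 3
    print(ddf.known_divisions)   # True -- from_pandas keeps the sorted index
    print(ddf.divisions)         # min of each partition plus the final max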
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/dataframe-extend.rst 
new/dask-1.0.0/docs/source/dataframe-extend.rst
--- old/dask-0.20.2/docs/source/dataframe-extend.rst    2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/docs/source/dataframe-extend.rst     2018-11-27 
19:27:14.000000000 +0100
@@ -9,29 +9,29 @@
 -  ...
 
 These projects may also want to produce parallel variants of themselves with
-Dask, and may want to reuse some of the code in Dask Dataframe.
+Dask, and may want to reuse some of the code in Dask DataFrame.
 This document describes how to do this.  It is intended for maintainers of
-these libraries, and not for general users.
+these libraries and not for general users.
 
 
 Implement dask, name, meta, and divisions
 -----------------------------------------
 
 You will need to implement ``._meta``, ``.dask``, ``.divisions``, and
-``._name`` as defined in the :doc:`dataframe design docs <dataframe-design>`.
+``._name`` as defined in the :doc:`DataFrame design docs <dataframe-design>`.
 
 
 Extend Dispatched Methods
 -------------------------
 
 If you are going to pass around Pandas-like objects that are not normal Pandas
-objects then we ask you to extend a few dispatched methods.
+objects, then we ask you to extend a few dispatched methods.
 
 make_meta
 ~~~~~~~~~
 
 This function returns an empty version of one of your non-Dask objects, given a
-non-empty non-Dask object.
+non-empty non-Dask object:
 
 .. code-block:: python
 
@@ -52,11 +52,11 @@
        return ind[:0]
 
 
-Additionally you should create a similar function that returns a non-empty
-version of your non-Dask dataframe objects, filled with a few rows of
+Additionally, you should create a similar function that returns a non-empty
+version of your non-Dask DataFrame objects filled with a few rows of
 representative or random data.  This is used to guess types when they are not
 provided.  It should expect an empty version of your object with columns,
-dtypes, index name, and it should return a non-empty version.
+dtypes, index name, and it should return a non-empty version:
 
 .. code-block:: python
 
@@ -82,7 +82,7 @@
 get_parallel_type
 ~~~~~~~~~~~~~~~~~
 
-Given a non-Dask dataframe object, return the Dask equivalent
+Given a non-Dask DataFrame object, return the Dask equivalent:
 
 .. code-block:: python
 
@@ -106,8 +106,8 @@
 concat
 ~~~~~~
 
-Concatenate many of your non-Dask dataframe objects together.  It should expect
-a list of your objects (homogeneously typed).
+Concatenate many of your non-Dask DataFrame objects together.  It should expect
+a list of your objects (homogeneously typed):
 
 .. code-block:: python
 
@@ -121,9 +121,9 @@
 Extension Arrays
 ----------------
 
-Rather than subclassing Pandas dataframes, you may be interested in extending
+Rather than subclassing Pandas DataFrames, you may be interested in extending
 Pandas with `Extension Arrays 
<https://pandas.pydata.org/pandas-docs/stable/extending.html>`_.
 
-API support for extension arrays isn't in Dask Dataframe yet (though this would
+API support for extension arrays isn't in Dask DataFrame yet (though this would
 be a good contribution), but many of the complications above will go away if
-your objects are genuinely Pandas dataframes, rather than a subclass.
+your objects are genuinely Pandas DataFrames, rather than a subclass.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/dataframe-groupby.rst 
new/dask-1.0.0/docs/source/dataframe-groupby.rst
--- old/dask-0.20.2/docs/source/dataframe-groupby.rst   2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/docs/source/dataframe-groupby.rst    2018-11-27 
19:27:14.000000000 +0100
@@ -5,7 +5,7 @@
 
 Operations like ``groupby``, ``join``, and ``set_index`` have special
 performance considerations that are different from normal Pandas due to the
-parallel, larger-than-memory, and distributed nature of dask.dataframe.
+parallel, larger-than-memory, and distributed nature of Dask DataFrame.
 
 Easy Case
 ---------
@@ -15,16 +15,16 @@
 var, count, nunique`` are all quite fast and efficient, even if partitions are
 not cleanly divided with known divisions.  This is the common case.
 
-Additionally, if divisions are known then applying an arbitrary function to
+Additionally, if divisions are known, then applying an arbitrary function to
 groups is efficient when the grouping columns include the index.
 
-Joins are also quite fast when joining a Dask dataframe to a Pandas dataframe
-or when joining two Dask dataframes along their index.  No special
+Joins are also quite fast when joining a Dask DataFrame to a Pandas DataFrame
+or when joining two Dask DataFrames along their index.  No special
 considerations need to be made when operating in these common cases.
 
-So if you're doing common groupby and join operations then you can stop reading
-this.  Everything will scale nicely.  Fortunately this is true most of the
-time.
+So, if you're doing common groupby and join operations, then you can stop 
reading
+this.  Everything will scale nicely.  Fortunately, this is true most of the
+time:
 
 .. code-block:: python
 
@@ -40,7 +40,7 @@
 In some cases, such as when applying an arbitrary function to groups (when not
 grouping on index with known divisions), when joining along non-index columns,
 or when explicitly setting an unsorted column to be the index, we may need to
-trigger a full dataset shuffle
+trigger a full dataset shuffle:
 
 .. code-block:: python
 
@@ -49,10 +49,10 @@
    >>> df.set_index(column)                          # Requires shuffle
 
 A shuffle is necessary when we need to re-sort our data along a new index.  For
-example if we have banking records that are organized by time and we now want
-to organize them by user ID then we'll need to move a lot of data around.  In
-Pandas all of this data fit in memory, so this operation was easy.  Now that we
-don't assume that all data fits in memory we must be a bit more careful.
+example, if we have banking records that are organized by time and we now want
+to organize them by user ID, then we'll need to move a lot of data around.  In
+Pandas all of this data fits in memory, so this operation was easy.  Now that 
we
+don't assume that all data fits in memory, we must be a bit more careful.
 
 Re-sorting the data can be avoided by restricting yourself to the easy cases
 mentioned above.
@@ -61,12 +61,13 @@
 ---------------
 
 There are currently two strategies to shuffle data depending on whether you are
-on a single machine or on a distributed cluster.
+on a single machine or on a distributed cluster: shuffle on disk and shuffle 
+over the network.
 
 Shuffle on Disk
 ```````````````
 
-When operating on larger-than-memory data on a single machine we shuffle by
+When operating on larger-than-memory data on a single machine, we shuffle by
 dumping intermediate results to disk.  This is done using the partd_ project
 for on-disk shuffles.
 
@@ -75,8 +76,8 @@
 Shuffle over the Network
 ````````````````````````
 
-When operating on a distributed cluster the Dask workers may not have access to
-a shared hard drive.  In this case we shuffle data by breaking input partitions
+When operating on a distributed cluster, the Dask workers may not have access 
to
+a shared hard drive.  In this case, we shuffle data by breaking input 
partitions
 into many pieces based on where they will end up and moving these pieces
 throughout the network.  This prolific expansion of intermediate partitions
 can stress the task scheduler.  To manage for many-partitioned datasets we
@@ -86,9 +87,9 @@
 Selecting methods
 `````````````````
 
-Dask will use on-disk shuffling by default but will switch to task-based
+Dask will use on-disk shuffling by default, but will switch to task-based
 distributed shuffling if the default scheduler is set to use a
-``dask.distributed.Client`` such as would be the case if the user sets the
+``dask.distributed.Client``, such as would be the case if the user sets the
 Client as default:
 
 .. code-block:: python
@@ -97,7 +98,7 @@
 
 Alternatively, if you prefer to avoid defaults, you can configure the global
 shuffling method by using the ``dask.config.set(shuffle=...)`` command.
-This can be done globally,
+This can be done globally:
 
 .. code-block:: python
 
@@ -105,7 +106,7 @@
 
     df.groupby(...).apply(...)
 
-or as a context manager
+or as a context manager:
 
 .. code-block:: python
 
@@ -114,7 +115,7 @@
 
 
 In addition, ``set_index`` also accepts a ``shuffle`` keyword argument that
-can be used to select either on-disk or task-based shuffling
+can be used to select either on-disk or task-based shuffling:
 
 .. code-block:: python
 
@@ -125,15 +126,15 @@
 Aggregate
 =========
 
-Dask support Pandas' ``aggregate`` syntax to run multiple reductions on the
-same groups. Common reductions, such as ``max``, ``sum``, ``mean`` are directly
-supported:
+Dask supports Pandas' ``aggregate`` syntax to run multiple reductions on the
+same groups.  Common reductions such as ``max``, ``sum``, and ``mean`` are 
+directly supported:
 
 .. code-block:: python
 
     >>> df.groupby(columns).aggregate(['sum', 'mean', 'max', 'min'])
 
-Dask also supports user defined reductions. To ensure proper performance, the
+Dask also supports user defined reductions.  To ensure proper performance, the
 reduction has to be formulated in terms of three independent steps. The
 ``chunk`` step is applied to each partition independently and reduces the data
 within a partition. The ``aggregate`` combines the within partition results.
@@ -142,7 +143,7 @@
 recognize the reduction, it has to be passed as an instance of
 ``dask.dataframe.Aggregation``.
 
-For example, ``sum`` could be implemented as
+For example, ``sum`` could be implemented as:
 
 .. code-block:: python
 
@@ -150,12 +151,12 @@
     df.groupby('g').agg(custom_sum)
 
 The name argument should be different from existing reductions to avoid data
-corruption. The arguments to each function are pre-grouped series objects,
+corruption.  The arguments to each function are pre-grouped series objects,
 similar to ``df.groupby('g')['value']``.
 
 Many reductions can only be implemented with multiple temporaries. To implement
 these reductions, the steps should return tuples and expect multiple arguments.
-A mean function can be implemented as
+A mean function can be implemented as:
 
 .. code-block:: python
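
Not part of the upstream file, just a hedged sketch of the multi-temporary
reduction described above -- a custom mean built from per-partition counts and
sums (illustrative data):

    import pandas as pd
    import dask.dataframe as dd

    custom_mean = dd.Aggregation(
        'custom_mean',
        chunk=lambda s: (s.count(), s.sum()),                 # per-partition temporaries
        agg=lambda count, total: (count.sum(), total.sum()),  # combine partition results
        finalize=lambda count, total: total / count,          # final value per group
    )

    df = pd.DataFrame({'g': ['a', 'a', 'b'], 'value': [1.0, 3.0, 5.0]})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf.groupby('g')['value'].agg(custom_mean).compute()      # a -> 2.0, b -> 5.0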
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/dataframe-indexing.rst 
new/dask-1.0.0/docs/source/dataframe-indexing.rst
--- old/dask-0.20.2/docs/source/dataframe-indexing.rst  2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/docs/source/dataframe-indexing.rst   2018-11-27 
19:27:14.000000000 +0100
@@ -3,7 +3,7 @@
 Indexing into Dask DataFrames
 =============================
 
-Dask DataFrame supports some of pandas' indexing behavior.
+Dask DataFrame supports some of Pandas' indexing behavior.
 
 .. currentmodule:: dask.dataframe
 
@@ -15,14 +15,14 @@
 Label-based Indexing
 --------------------
 
-Just like pandas, Dask DataFrame supports label-based indexing with the 
``.loc``
+Just like Pandas, Dask DataFrame supports label-based indexing with the 
``.loc``
 accessor for selecting rows or columns, and ``__getitem__`` (square brackets)
 for selecting just columns.
 
 .. note::
 
    To select rows, the DataFrame's divisions must be known (see
-   :ref:`dataframe.design` and :ref:`dataframe.performance`) for more.
+   :ref:`dataframe.design` and :ref:`dataframe.performance` for more 
information.)
 
 .. code-block:: python
 
@@ -76,7 +76,7 @@
    c                ...
    Dask Name: loc, 2 tasks
 
-Dask DataFrame supports pandas' `partial-string indexing 
<https://pandas.pydata.org/pandas-docs/stable/timeseries.html#partial-string-indexing>`_:
+Dask DataFrame supports Pandas' `partial-string indexing 
<https://pandas.pydata.org/pandas-docs/stable/timeseries.html#partial-string-indexing>`_:
 
 .. code-block:: python
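
Not part of the upstream file; a small sketch of ``.loc`` with a datetime index
(made-up data, divisions known via from_pandas):

    import pandas as pd
    import dask.dataframe as dd

    df = pd.DataFrame({'a': range(4)},
                      index=pd.date_range('2018-01-01', periods=4, freq='D'))
    ddf = dd.from_pandas(df, npartitions=2)

    ddf.loc['2018-01-02':'2018-01-03'].compute()   # label/partial-string slice on the index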
 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/dataframe-performance.rst 
new/dask-1.0.0/docs/source/dataframe-performance.rst
--- old/dask-0.20.2/docs/source/dataframe-performance.rst       2018-11-15 
14:53:58.000000000 +0100
+++ new/dask-1.0.0/docs/source/dataframe-performance.rst        2018-11-27 
19:27:14.000000000 +0100
@@ -7,27 +7,27 @@
 ----------
 
 For data that fits into RAM, Pandas can often be faster and easier to use than
-Dask.dataframe.  While "Big Data" tools can be exciting, they are almost always
+Dask DataFrame.  While "Big Data" tools can be exciting, they are almost always
 worse than normal data tools while those remain appropriate.
 
-Pandas Performance Tips Apply to Dask.dataframe
-------------------------------------------------
+Pandas Performance Tips Apply to Dask DataFrame
+-----------------------------------------------
 
-Normal Pandas performance tips, like avoiding apply, using vectorized
-operations, using categoricals, etc. all apply equally to Dask.dataframe.  See
-`Modern Pandas <https://tomaugspurger.github.io/modern-1.html>`_ by `Tom
-Augspurger <https://github.com/TomAugspurger>`_ is a good read here.
+Usual Pandas performance tips like avoiding apply, using vectorized
+operations, using categoricals, etc., all apply equally to Dask DataFrame.  See
+`Modern Pandas <https://tomaugspurger.github.io/modern-1-intro>`_ by `Tom
+Augspurger <https://github.com/TomAugspurger>`_ for a good read on this topic.
 
 Use the Index
 -------------
 
-Dask.dataframe can be optionally sorted along a single index column.  Some
-operations against this column can be very fast.  For example if your dataset
-is sorted by time you can quickly select data for a particular day, perform
+Dask DataFrame can be optionally sorted along a single index column.  Some
+operations against this column can be very fast.  For example, if your dataset
+is sorted by time, you can quickly select data for a particular day, perform
 time series joins, etc.  You can check if your data is sorted by looking at the
 ``df.known_divisions`` attribute.  You can set an index column using the
-``.set_index(columnname)`` method.  This operation is expensive though, so use
-it sparingly (see below).
+``.set_index(column_name)`` method.  This operation is expensive though, so use
+it sparingly (see below):
 
 .. code-block:: python
 
@@ -39,30 +39,29 @@
 Avoid Shuffles
 --------------
 
-Setting an index is an important (see above) but expensive operation.  You
+Setting an index is an important but expensive operation (see above).  You
 should do it infrequently and you should persist afterwards (see below).
 
 Some operations like ``set_index`` and ``merge/join`` are harder to do in a
-parallel or distributed setting than they are in-memory on a single machine.
-In particular *shuffling operations* that rearrange data become much more
-communication intensive.  For example if your data is arranged by customer ID
-but now you want to arrange it by time all of your partitions will have to talk
-to each other to exchange shards of data.  This can be an intense process,
+parallel or distributed setting than if they are in-memory on a single machine.
+In particular, *shuffling operations* that rearrange data become much more
+communication intensive.  For example, if your data is arranged by customer ID
+but now you want to arrange it by time, all of your partitions will have to 
talk
+to each other to exchange shards of data.  This can be an intensive process,
 particularly on a cluster.
 
-So definitely set the index, but try do so infrequently.  After you set the
-index then you may want to ``persist`` your data if you are on a cluster.
+So, definitely set the index but try do so infrequently.  After you set the
+index, you may want to ``persist`` your data if you are on a cluster:
 
 .. code-block:: python
 
-   df = df.set_index('column-name')  # do this infrequently
+   df = df.set_index('column_name')  # do this infrequently
 
-Additionally, set_index has a few options that can accelerate it in some
-situations.  For example if you know that your dataset is sorted or you already
-know the values by which it is divided you can provide these to accelerate the
-set_index operation.  See the `set_index docstring
-<http://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.set_index>`_
-for more information.
+Additionally, ``set_index`` has a few options that can accelerate it in some
+situations.  For example, if you know that your dataset is sorted or you 
already
+know the values by which it is divided, you can provide these to accelerate the
+``set_index`` operation.  For more information, see the `set_index docstring
+<http://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.set_index>`_.
 
 .. code-block:: python
 
@@ -72,9 +71,9 @@
 Persist Intelligently
 ---------------------
 
-*This section is only relevant to users on distributed systems.*
+.. note:: This section is only relevant to users on distributed systems.
 
-Often dataframe workloads look like the following:
+Often DataFrame workloads look like the following:
 
 1.  Load data from files
 2.  Filter data to a particular subset
@@ -82,11 +81,11 @@
 4.  Several complex queries on top of this indexed data
 
 It is often ideal to load, filter, and shuffle data once and keep this result 
in
-memory.  Afterwards each of the several complex queries can be based off of
+memory.  Afterwards, each of the several complex queries can be based off of
 this in-memory data rather than have to repeat the full load-filter-shuffle
 process each time.  To do this, use the `client.persist
 
<https://distributed.dask.org/en/latest/api.html#distributed.client.Client.persist>`_
-method.
+method:
 
 .. code-block:: python
 
@@ -103,43 +102,43 @@
    >>> df.groupby(df.city).size().compute()
    ...
 
-Persist is important because Dask.dataframe is *lazy by default*.  Persist is a
-way of telling the cluster that it should start computing on the computations
-that you have defined so far and that it should try to keep those results in
-memory.  You will get back a new dataframe that is semantically equivalent to
-your old dataframe, but now points to running data.  Your old dataframe still
-points to lazy computations
+Persist is important because Dask DataFrame is *lazy by default*.  It is a
+way of telling the cluster that it should start executing the computations
+that you have defined so far, and that it should try to keep those results in
+memory.  You will get back a new DataFrame that is semantically equivalent to
+your old DataFrame, but now points to running data.  Your old DataFrame still
+points to lazy computations:
 
 .. code-block:: python
 
    # Don't do this
-   client.persist(df)  # Persist doesn't change the input in-place
+   client.persist(df)  # persist doesn't change the input in-place
 
    # Do this instead
-   df = client.persist(df)  # Replace your old lazy dataframe
+   df = client.persist(df)  # replace your old lazy DataFrame
 
 
 Repartition to Reduce Overhead
 ------------------------------
 
-Your Dask.dataframe is split up into many Pandas dataframes.  We sometimes call
-these "partitions".  Often the number of partitions is decided for you; for
-example it might be the number of CSV files from which you are reading. However
-over time as you reduce or increase the size of your pandas dataframes by
-filtering or joining it may be wise to reconsider how many partitions you need.
+Your Dask DataFrame is split up into many Pandas DataFrames.  We sometimes call
+these "partitions", and often the number of partitions is decided for you. For
+example, it might be the number of CSV files from which you are reading. 
However,
+over time, as you reduce or increase the size of your pandas DataFrames by
+filtering or joining, it may be wise to reconsider how many partitions you 
need.
 There is a cost to having too many or having too few.
 
 Partitions should fit comfortably in memory (smaller than a gigabyte) but also
-not be too numerous.  Every operation on every partition takes the central
+not be too many.  Every operation on every partition takes the central
 scheduler a few hundred microseconds to process.  If you have a few thousand
 tasks this is barely noticeable, but it is nice to reduce the number if
 possible.
 
 A common situation is that you load lots of data into reasonably sized
-partitions (dask's defaults make decent choices) but then you filter down your
-dataset to only a small fraction of the original.  At this point it is wise to
-regroup your many small partitions into a few larger ones.  You can do this
-with the ``repartition`` method:
+partitions (Dask's defaults make decent choices), but then you filter down your
+dataset to only a small fraction of the original.  At this point, it is wise to
+regroup your many small partitions into a few larger ones.  You can do this by
+using the ``repartition`` method:
 
 .. code-block:: python
 
@@ -154,29 +153,28 @@
 data each.
 
 Additionally, reducing partitions is very helpful just before shuffling, which
-creates ``n log(n)`` tasks relative to the number of partitions.  Dataframes
-with less than 100 partitions are much easier to shuffle than dataframes with
+creates ``n log(n)`` tasks relative to the number of partitions.  DataFrames
+with less than 100 partitions are much easier to shuffle than DataFrames with
 tens of thousands.
 
 
 Joins
 -----
 
-Joining two dataframes can be either very expensive or very cheap depending on
+Joining two DataFrames can be either very expensive or very cheap depending on
 the situation.  It is cheap in the following cases:
 
-1.  Joining a Dask.dataframe with a Pandas dataframe
-2.  Joining a Dask.dataframe with a Dask.dataframe of a single partition.
-3.  Joining Dask.dataframes along their indexes
+1.  Joining a Dask DataFrame with a Pandas DataFrame
+2.  Joining a Dask DataFrame with another Dask DataFrame of a single partition
+3.  Joining Dask DataFrames along their indexes
 
-It is expensive in the following case:
+Also, it is expensive in the following case:
 
-1.  Joining Dask.dataframes along columns that are not their index
+1.  Joining Dask DataFrames along columns that are not their index
 
-The expensive case requires a shuffle.  This is fine, and Dask.dataframe will
+The expensive case requires a shuffle.  This is fine, and Dask DataFrame will
 complete the job well, but it will be more expensive than a typical linear-time
-operation.
-
+operation:
 
 .. code-block:: python
 
@@ -190,35 +188,36 @@
 -----------------------------------
 
 HDF5 is a popular choice for Pandas users with high performance needs.  We
-encourage Dask.dataframe users to :doc:`store and load data <dataframe-create>`
+encourage Dask DataFrame users to :doc:`store and load data <dataframe-create>`
 using Parquet instead.  `Apache Parquet <http://parquet.apache.org/>`_ is a
 columnar binary format that is easy to split into multiple files (easier for
 parallel loading) and is generally much simpler to deal with than HDF5 (from
 the library's perspective).  It is also a common format used by other big data
 systems like `Apache Spark <http://spark.apache.org/>`_ and `Apache Impala
-<http://impala.apache.org/>`_ and so is useful to interchange with other
-systems.
+<http://impala.apache.org/>`_, and so it is useful to interchange with other
+systems:
 
 .. code-block:: python
 
    df.to_parquet('path/to/my-results/')
    df = dd.read_parquet('path/to/my-results/')
 
-Dask supports reading with multiple implementations of the Apache Parquet
-format for Python.
+Dask supports reading parquet files with different engine implementations of 
+the Apache Parquet format for Python:
 
 .. code-block:: python
 
    df1 = dd.read_parquet('path/to/my-results/', engine='fastparquet')
    df2 = dd.read_parquet('path/to/my-results/', engine='pyarrow')
 
-These libraries be installed using
+These libraries can be installed using:
 
 .. code-block:: shell
 
    conda install fastparquet pyarrow -c conda-forge
 
-Fastparquet is a Python-based implementation that uses the `Numba
-<http://numba.pydata.org/>`_ Python-to-LLVM compiler. PyArrow is part of the
+`fastparquet <https://github.com/dask/fastparquet/>`_ is a Python-based 
+implementation that uses the `Numba <http://numba.pydata.org/>`_ 
+Python-to-LLVM compiler. PyArrow is part of the
 `Apache Arrow <http://arrow.apache.org/>`_ project and uses the `C++
 implementation of Apache Parquet <https://github.com/apache/parquet-cpp>`_.
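
Tying together the "Repartition to Reduce Overhead" advice above, a toy sketch
with sizes chosen only for illustration:

    import pandas as pd
    import dask.dataframe as dd

    ddf = dd.from_pandas(pd.DataFrame({'x': range(1000)}), npartitions=100)

    small = ddf[ddf.x < 10]                    # most partitions are now tiny or empty
    small = small.repartition(npartitions=1)   # regroup into one comfortable partition
    small.compute()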
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/delayed-best-practices.rst 
new/dask-1.0.0/docs/source/delayed-best-practices.rst
--- old/dask-0.20.2/docs/source/delayed-best-practices.rst      2018-11-12 
15:07:31.000000000 +0100
+++ new/dask-1.0.0/docs/source/delayed-best-practices.rst       2018-11-28 
16:10:27.000000000 +0100
@@ -3,13 +3,13 @@
 
 It is easy to get started with Dask delayed, but using it *well* does require
 some experience.  This page contains suggestions for best practices, and
-includes solutions to common problems:
+includes solutions to common problems.
 
 
 Call delayed on the function, not the result
 --------------------------------------------
 
-Dask.delayed operates on functions, like ``dask.delayed(f)(x, y)``, not on 
their results like ``dask.delayed(f(x, y))``.  When you do the latter Python 
first calculates ``f(x, y)`` before Dask has a chance to step in
+Dask delayed operates on functions like ``dask.delayed(f)(x, y)``, not on 
their results like ``dask.delayed(f(x, y))``.  When you do the latter, Python 
first calculates ``f(x, y)`` before Dask has a chance to step in.
 
 **Don't**
 
@@ -27,9 +27,9 @@
 Compute on lots of computation at once
 --------------------------------------
 
-To improve parallelism you want to include lots of computation in each compute 
call.
-Ideally you want to make many dask.delayed calls to define your computation and
-then only call ``dask.compute`` at the end.  It's ok to call ``dask.compute``
+To improve parallelism, you want to include lots of computation in each 
compute call.
+Ideally, you want to make many ``dask.delayed`` calls to define your 
computation and
+then call ``dask.compute`` only at the end.  It is ok to call ``dask.compute``
 in the middle of your computation as well, but everything will stop there as
 Dask computes those results before moving forward with your code.
 
@@ -56,7 +56,7 @@
 Don't mutate inputs
 -------------------
 
-Your functions should not change the inputs directly
+Your functions should not change the inputs directly.
 
 **Don't**
 
@@ -75,7 +75,7 @@
    def f(x):
        return x + 1
 
-If you need to use a mutable operation then make a copy within your function 
first
+If you need to use a mutable operation, then make a copy within your function 
first:
 
 .. code-block:: python
 
@@ -89,9 +89,9 @@
 Avoid global state
 ------------------
 
-Ideally your operations shouldn't rely on global state.  Using global state
+Ideally, your operations shouldn't rely on global state.  Using global state
 *might* work if you only use threads, but when you move to multiprocessing or
-distributed computing then you will likely encounter confusing errors
+distributed computing, you will likely encounter confusing errors.
 
 **Don't**
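
A hedged sketch contrasting shared state with a pure-function version:

.. code-block:: python

   import dask

   # Don't: append to a module-level list from inside delayed functions;
   # this may appear to work with threads but breaks with processes or
   # distributed workers, where each task runs in a different process.
   results = []

   @dask.delayed
   def record(x):
       results.append(x)

   # Do: keep each task a pure function of its inputs and collect outputs
   # through return values instead.
   @dask.delayed
   def double(x):
       return 2 * x

   gathered = dask.compute(*[double(i) for i in range(5)])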
 
@@ -128,11 +128,11 @@
 Break up computations into many pieces
 --------------------------------------
 
-Every dask.delayed function call is a single operation from Dask's perspective.
-You achieve parallelism by having many dask.delayed calls, not by using only a
-single one.  Dask will not look inside a function decorated with dask.delayed
-and parallelize that code internally.  It needs your help to find good places
-to break up a computation.
+Every ``dask.delayed`` function call is a single operation from Dask's 
perspective.
+You achieve parallelism by having many delayed calls, not by using only a
+single one: Dask will not look inside a function decorated with 
``@dask.delayed``
+and parallelize that code internally.  To accomplish that, it needs your help 
to 
+find good places to break up a computation.
 
 **Don't**
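
A hedged sketch of both shapes, with ``inc`` standing in for real per-item work:

.. code-block:: python

   import dask

   def inc(x):
       return x + 1

   def inc_all(xs):
       return [inc(x) for x in xs]

   data = list(range(10))

   # Don't: a single delayed call around the whole loop is one opaque task
   everything = dask.delayed(inc_all)(data)

   # Do: one small task per element gives the scheduler many independent pieces
   pieces = [dask.delayed(inc)(x) for x in data]
   results = dask.compute(*pieces)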
 
@@ -191,9 +191,9 @@
 --------------------
 
 Every delayed task has an overhead of a few hundred microseconds.  Usually this
-is ok, but it can become a problem if you apply dask.delayed too finely.  In
-this case it's often best to break up your many tasks into batches, or use one
-of the dask collections to help you.
+is ok, but it can become a problem if you apply ``dask.delayed`` too finely.  
In
+this case, it's often best to break up your many tasks into batches or use one
+of the Dask collections to help you.
 
 **Don't**
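
One hedged way to batch, assuming a cheap element-wise ``process`` function:

.. code-block:: python

   import dask

   def process(x):
       return x + 1

   data = list(range(100000))

   # Too fine: one task per element means the few hundred microseconds of
   # per-task overhead dwarfs the actual work.
   # tasks = [dask.delayed(process)(x) for x in data]

   # Better: give each task a whole batch so the overhead is amortized
   @dask.delayed
   def process_batch(batch):
       return [process(x) for x in batch]

   batches = [data[i:i + 1000] for i in range(0, len(data), 1000)]
   results = dask.compute(*[process_batch(b) for b in batches])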
 
@@ -233,11 +233,11 @@
 Avoid calling delayed within delayed functions
 ----------------------------------------------
 
-Often if you are new to using Dask.delayed you place dask.delayed calls
-everywhere and hope for the best.  While this may actually work it's usually
+Often, if you are new to using Dask delayed, you place ``dask.delayed`` calls
+everywhere and hope for the best.  While this may actually work, it's usually
 slow and results in hard-to-understand solutions.
 
-Usually you never call dask.delayed within dask.delayed functions.
+You should rarely, if ever, call ``dask.delayed`` within ``dask.delayed`` functions.
 
 **Don't**
 
@@ -253,7 +253,7 @@
 
 **Do**
 
-Instead, because this function only does delayed work it is very fast and so
+Instead, because this function only does delayed work, it is very fast and so
 there is no reason to delay it.
 
 .. code-block:: python
@@ -266,17 +266,16 @@
         return result
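
A fuller hedged sketch of the same idea (names are illustrative): the outer
helper stays a plain function that merely wires delayed tasks together, and
only the leaf work is delayed.

.. code-block:: python

   import dask

   @dask.delayed
   def double(x):
       return 2 * x

   def double_everything(values):
       # cheap plain-Python wiring; no need to delay this function itself
       return [double(v) for v in values]

   total = dask.delayed(sum)(double_everything(range(10)))
   total.compute()   # -> 90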
 
 
-
 Don't call dask.delayed on other Dask collections
 -------------------------------------------------
 
-When you place a dask array or dask dataframe into a delayed call that function
-will receive the Numpy or Pandas equivalent.  Beware that if your array is
-large then this might crash your workers.
+When you place a Dask array or Dask DataFrame into a delayed call, that 
function
+will receive the NumPy or Pandas equivalent.  Beware that if your array is
+large, then this might crash your workers.
 
 Instead, it's more common to use methods like ``da.map_blocks`` or
-``df.map_partitions``, or to turn your arrays or dataframes into *many* delayed
-objects
+``df.map_partitions``, or to turn your arrays or DataFrames into *many* delayed
+objects.
 
 **Don't**
 
@@ -300,15 +299,14 @@
 
    delayed_values = [dask.delayed(train)(part) for part in partitions]
 
-However, if you don't mind turning your dask array/dataframe into a single
-chunk then this is ok.
+However, if you don't mind turning your Dask array/DataFrame into a single
+chunk, then this is ok.
 
 .. code-block:: python
 
    dask.delayed(train)(..., y=df.sum())
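
Returning to the per-partition route, a hedged sketch of how ``partitions``
can be produced with ``to_delayed`` (``train`` is a stand-in here):

.. code-block:: python

   import dask
   import dask.dataframe as dd
   import pandas as pd

   df = dd.from_pandas(pd.DataFrame({'x': range(10)}), npartitions=4)

   def train(part):
       return part['x'].mean()   # placeholder for real per-partition work

   # Each partition becomes its own delayed object, so ``train`` receives
   # ordinary Pandas DataFrames one at a time
   partitions = df.to_delayed()
   delayed_values = [dask.delayed(train)(part) for part in partitions]
   results = dask.compute(*delayed_values)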
 
 
-
 Avoid repeatedly putting large inputs into delayed calls
 --------------------------------------------------------
 
@@ -329,7 +327,7 @@
    results = [dask.delayed(train)(x, i) for i in range(1000)]
 
 
-Every call to ``dask.delayed(train)(x, ...)`` has to hash the numpy array 
``x``, which slows things down.
+Every call to ``dask.delayed(train)(x, ...)`` has to hash the NumPy array 
``x``, which slows things down.
 
 
 **Do**
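
One hedged way to do this is to delay the large array once and reuse the
resulting object, so it is hashed a single time (sizes here are illustrative):

.. code-block:: python

   import dask
   import numpy as np

   def train(x, i):
       return x.sum() + i   # placeholder for real training work

   x = np.ones(1000000)

   # Wrap the large array once; each later call passes around the same
   # small Delayed reference instead of re-hashing the full array.
   x = dask.delayed(x)
   results = [dask.delayed(train)(x, i) for i in range(1000)]
   totals = dask.compute(*results)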
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/delayed-collections.rst 
new/dask-1.0.0/docs/source/delayed-collections.rst
--- old/dask-0.20.2/docs/source/delayed-collections.rst 2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/docs/source/delayed-collections.rst  2018-11-28 
16:10:27.000000000 +0100
@@ -1,20 +1,20 @@
 Working with Collections
 ========================
 
-Often we want to do a bit of custom work with ``dask.delayed`` (for example
+Often we want to do a bit of custom work with ``dask.delayed`` (for example,
 for complex data ingest), then leverage the algorithms in ``dask.array`` or
 ``dask.dataframe``, and then switch back to custom work.  To this end, all
 collections support ``from_delayed`` functions and ``to_delayed``
 methods.
 
 As an example, consider the case where we store tabular data in a custom format
-not known by ``dask.dataframe``.  This format is naturally broken apart into
+not known by Dask DataFrame.  This format is naturally broken apart into
 pieces and we have a function that reads one piece into a Pandas DataFrame.
 We use ``dask.delayed`` to lazily read these files into Pandas DataFrames,
 use ``dd.from_delayed`` to wrap these pieces up into a single
-``dask.dataframe``, use the complex algorithms within ``dask.dataframe``
-(groupby, join, etc..) and then switch back to delayed to save our results
-back to the custom format.
+Dask DataFrame, use the complex algorithms within the DataFrame
+(groupby, join, etc.), and then switch back to ``dask.delayed`` to save our 
results
+back to the custom format:
 
 .. code-block:: python
 
@@ -34,6 +34,6 @@
 
    dd.compute(*writes)
 
-Data science is often complex, ``dask.delayed`` provides a release valve for
+Data science is often complex, and ``dask.delayed`` provides a release valve 
for
 users to manage this complexity on their own, and solve the last mile problem
 for custom formats and complex situations.
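
A hedged end-to-end sketch of that round trip (the file names, the ``value``
column, and the reader and writer functions are all illustrative):

.. code-block:: python

   import dask
   import dask.dataframe as dd
   import pandas as pd

   @dask.delayed
   def load(filename):
       return pd.read_csv(filename)        # stand-in for a custom reader

   @dask.delayed
   def save(part, filename):
       part.to_csv(filename, index=False)  # stand-in for a custom writer

   filenames = ['piece-0.csv', 'piece-1.csv']
   pieces = [load(fn) for fn in filenames]

   df = dd.from_delayed(pieces)            # delayed pieces -> Dask DataFrame
   df = df[df.value > 0]                   # any dask.dataframe algorithms here

   writes = [save(part, 'out-%d.csv' % i)
             for i, part in enumerate(df.to_delayed())]
   dask.compute(*writes)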
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/delayed.rst 
new/dask-1.0.0/docs/source/delayed.rst
--- old/dask-0.20.2/docs/source/delayed.rst     2018-11-02 15:42:48.000000000 
+0100
+++ new/dask-1.0.0/docs/source/delayed.rst      2018-11-28 16:10:27.000000000 
+0100
@@ -12,7 +12,7 @@
 Sometimes problems don't fit into one of the collections like ``dask.array`` or
 ``dask.dataframe``. In these cases, users can parallelize custom algorithms
 using the simpler ``dask.delayed`` interface. This allows one to create graphs
-directly with a light annotation of normal python code.
+directly with a light annotation of normal Python code:
 
 .. code-block:: python
 
@@ -29,8 +29,8 @@
 Example
 -------
 
-Sometimes we face problems that are parallelizable, but don't fit high-level
-abstractions Dask array or Dask dataframe.  Consider the following example:
+Sometimes we face problems that are parallelizable, but don't fit into 
high-level
+abstractions like Dask Array or Dask DataFrame.  Consider the following 
example:
 
 .. code-block:: python
 
@@ -54,15 +54,15 @@
 
     total = sum(output)
 
-There is clearly parallelism in this problem (many of the ``inc`` and
-``double`` and ``add`` functions can evaluate independently), but it's not
-clear how to convert this to a big array or big dataframe computation.
+There is clearly parallelism in this problem (many of the ``inc``,
+``double``, and ``add`` functions can evaluate independently), but it's not
+clear how to convert this to a big array or big DataFrame computation.
 
-As written this code runs sequentially in a single thread.  However we see that
+As written, this code runs sequentially in a single thread.  However, we see 
that
 a lot of this could be executed in parallel.
 
 The Dask ``delayed`` function decorates your functions so that they operate
-*lazily*.  Rather than executing your function immediately it will defer
+*lazily*.  Rather than executing your function immediately, it will defer
 execution, placing the function and its arguments into a task graph.
 
 .. currentmodule:: dask.delayed
@@ -71,7 +71,7 @@
     delayed
 
 We slightly modify our code by wrapping functions in ``delayed``.
-This delays the execution of the function and generates a dask graph instead.
+This delays the execution of the function and generates a Dask graph instead:
 
 .. code-block:: python
 
@@ -87,11 +87,11 @@
     total = dask.delayed(sum)(output)
 
 We used the ``dask.delayed`` function to wrap the function calls that we want
-to turn into tasks.  None of the ``inc``, ``double``, ``add`` or ``sum`` calls
-have happened yet, instead the object ``total`` is a ``Delayed`` result that
+to turn into tasks.  None of the ``inc``, ``double``, ``add``, or ``sum`` calls
+have happened yet. Instead, the object ``total`` is a ``Delayed`` result that
 contains a task graph of the entire computation.  Looking at the graph we see
-clear opportunities for parallel execution.  The dask schedulers will exploit
-this parallelism, generally improving performance.  (although not in this
+clear opportunities for parallel execution.  The Dask schedulers will exploit
+this parallelism, generally improving performance (although not in this
 example, because these functions are already very small and fast).
 
 .. code-block:: python
@@ -113,7 +113,7 @@
 ---------
 
 It is also common to see the delayed function used as a decorator.  Here is a
-reproduction of our original problem as a parallel code.
+reproduction of our original problem as a parallel code:
 
 .. code-block:: python
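
   # A hedged reconstruction of the decorator form of the earlier example;
   # the data values are illustrative.
   import dask

   @dask.delayed
   def inc(x):
       return x + 1

   @dask.delayed
   def double(x):
       return x * 2

   @dask.delayed
   def add(x, y):
       return x + y

   data = [1, 2, 3, 4, 5]

   output = []
   for x in data:
       a = inc(x)
       b = double(x)
       c = add(a, b)
       output.append(c)

   total = dask.delayed(sum)(output)
   total.compute()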
 
@@ -147,11 +147,11 @@
 ---------
 
 Sometimes you want to create and destroy work during execution, launch tasks
-from other tasks, etc..  For this, see the :doc:`Futures <futures>` interface.
+from other tasks, etc.  For this, see the :doc:`Futures <futures>` interface.
 
 
 Best Practices
 --------------
 
 For a list of common problems and recommendations see :doc:`Delayed Best
-Practices <delayed-best-practices>`
+Practices <delayed-best-practices>`.
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/docs/source/scheduler-overview.rst 
new/dask-1.0.0/docs/source/scheduler-overview.rst
--- old/dask-0.20.2/docs/source/scheduler-overview.rst  2018-11-02 
15:42:48.000000000 +0100
+++ new/dask-1.0.0/docs/source/scheduler-overview.rst   2018-11-28 
16:10:27.000000000 +0100
@@ -55,7 +55,7 @@
 
 The compute method takes a number of keywords:
 
-- ``get``: a scheduler ``get`` function, overrides the default for the 
collection
+- ``scheduler``: the name of the desired scheduler like ``"threads"``, 
``"processes"``, or ``"single-threaded"`, a ``get`` function, or a 
``dask.distributed.Client`` object.  Overrides the default for the collection.
 - ``**kwargs``: extra keywords to pass on to the scheduler ``get`` function.
 
 See also: :ref:`configuring-schedulers`.
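
For example, a minimal sketch with a Dask Bag (the distributed variant
assumes a running ``dask.distributed`` cluster):

.. code-block:: python

   import dask.bag as db

   def inc(x):
       return x + 1

   b = db.from_sequence(range(1000)).map(inc)

   b.compute(scheduler='processes')        # pick a scheduler per call
   b.compute(scheduler='single-threaded')  # handy when debugging with pdb

   # from dask.distributed import Client
   # client = Client()
   # b.compute(scheduler=client)           # hand work to a distributed cluster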
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/setup.cfg new/dask-1.0.0/setup.cfg
--- old/dask-0.20.2/setup.cfg   2018-11-15 15:10:02.000000000 +0100
+++ new/dask-1.0.0/setup.cfg    2018-11-28 16:46:01.000000000 +0100
@@ -28,6 +28,7 @@
 markers = 
        skip_if_np_ge_114: Skip a test when NumPy is older than 1.14
        skip_if_np_lt_114: Skip a test when NumPy is at least 1.14
+addopts = -rsx -v --durations=10
 
 [egg_info]
 tag_build = 
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' 
'--exclude=.svnignore' old/dask-0.20.2/setup.py new/dask-1.0.0/setup.py
--- old/dask-0.20.2/setup.py    2018-11-02 15:42:48.000000000 +0100
+++ new/dask-1.0.0/setup.py     2018-11-27 19:27:14.000000000 +0100
@@ -17,7 +17,7 @@
 }
 extras_require['complete'] = sorted(set(sum(extras_require.values(), [])))
 
-packages = ['dask', 'dask.array', 'dask.bag', 'dask.store', 'dask.bytes',
+packages = ['dask', 'dask.array', 'dask.bag', 'dask.bytes',
             'dask.dataframe', 'dask.dataframe.io', 'dask.dataframe.tseries',
             'dask.diagnostics']
 

