Script 'mail_helper' called by obssrc

Hello community,

here is the log from the commit of package python-swifter for openSUSE:Factory checked in at 2023-08-02 16:50:05

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/python-swifter (Old)
 and      /work/SRC/openSUSE:Factory/.python-swifter.new.22712 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-swifter" Wed Aug 2 16:50:05 2023 rev:11 rq:1101765 version:1.4.0 Changes: -------- --- /work/SRC/openSUSE:Factory/python-swifter/python-swifter.changes 2023-06-03 00:07:36.386124530 +0200 +++ /work/SRC/openSUSE:Factory/.python-swifter.new.22712/python-swifter.changes 2023-08-02 16:51:58.542054579 +0200 @@ -1,0 +2,8 @@ +Tue Aug 1 08:59:00 UTC 2023 - Markéta Machová <mmach...@suse.com> + +- Update to 1.4.0 + * Significantly reduced core dependencies of swifter library. + * Removed deprecated loffset parameter + * Updated README to be more readable for darkmode users + +------------------------------------------------------------------- Old: ---- swifter-1.3.4.tar.gz New: ---- swifter-1.4.0.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-swifter.spec ++++++ --- /var/tmp/diff_new_pack.hHzDoT/_old 2023-08-02 16:51:59.742061828 +0200 +++ /var/tmp/diff_new_pack.hHzDoT/_new 2023-08-02 16:51:59.766061973 +0200 @@ -17,7 +17,7 @@ Name: python-swifter -Version: 1.3.4 +Version: 1.4.0 Release: 0 Summary: Tool to speed up pandas calculations License: MIT @@ -28,28 +28,20 @@ BuildRequires: %{python_module wheel} BuildRequires: fdupes BuildRequires: python-rpm-macros -Requires: python-bleach >= 3.1.1 -Requires: python-cloudpickle >= 0.2.2 Requires: python-dask-dataframe >= 2.10.0 -Requires: python-ipywidgets >= 7.0.0 Requires: python-pandas >= 1.0 -Requires: python-parso > 0.4 Requires: python-psutil >= 5.6.6 Requires: python-tqdm >= 4.33.0 +Suggests: python-ipywidgets >= 7.0.0 Suggests: python-ray >= 1.0 BuildArch: noarch # SECTION test requirements -BuildRequires: %{python_module bleach >= 3.1.1} -BuildRequires: %{python_module cloudpickle >= 0.2.2} BuildRequires: %{python_module dask-dataframe >= 2.10.0} BuildRequires: %{python_module ipywidgets >= 7.0.0} BuildRequires: %{python_module pandas >= 1.0} -BuildRequires: %{python_module parso > 0.4} BuildRequires: %{python_module psutil >= 5.6.6} BuildRequires: %{python_module pytest-xdist} BuildRequires: %{python_module pytest} -# Not available -#BuildRequires: %%{python_module ray >= 1.0} BuildRequires: %{python_module tqdm >= 4.33.0} # /SECTION %python_subpackages ++++++ swifter-1.3.4.tar.gz -> swifter-1.4.0.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/.circleci/codecov.yml new/swifter-1.4.0/.circleci/codecov.yml --- old/swifter-1.3.4/.circleci/codecov.yml 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/.circleci/codecov.yml 2023-07-31 20:01:04.000000000 +0200 @@ -12,7 +12,7 @@ target: number threshold: 75% base: auto - patch: yes + patch: off changes: no parsers: @@ -30,4 +30,4 @@ ignore: - "/usr/local/lib/**/*" - - "/usr/lal/lib/**/*" \ No newline at end of file + - "/usr/lal/lib/**/*" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/.circleci/config.yml new/swifter-1.4.0/.circleci/config.yml --- old/swifter-1.3.4/.circleci/config.yml 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/.circleci/config.yml 2023-07-31 20:01:04.000000000 +0200 @@ -1,10 +1,11 @@ version: 2.1 orbs: + win: circleci/windows@5.0 codecov: codecov/codecov@1.0.5 jobs: - unittest-lint-codecov: + unittest-lint-codecov-linux: parallelism: 1 - resource_class: xlarge + resource_class: xlarge # 8 vCPU 16GB RAM working_directory: ~/repo docker: - image: python:3.9 @@ -17,6 +18,7 @@ - run: name: Install requirements command: | + pip 
install --upgrade pip pip install -r docker/requirements-dev.txt - run: name: Black lint check @@ -34,8 +36,30 @@ - store_artifacts: path: htmlcov + unittest-windows: + parallelism: 1 + working_directory: ~/repo + executor: + name: win/default + size: large # 8 vCPU 30GB RAM + + steps: + - checkout + - run: + name: Install requirements + command: | + pip install --upgrade pip + pip install -r docker/requirements-windows.txt + shell: bash.exe + - run: + name: Unit tests + command: | + python -m unittest swifter/swifter_tests.py + shell: bash.exe + workflows: version: 2 build-and-test: jobs: - - unittest-lint-codecov + - unittest-lint-codecov-linux + - unittest-windows diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/MANIFEST new/swifter-1.4.0/MANIFEST --- old/swifter-1.3.4/MANIFEST 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/MANIFEST 1970-01-01 01:00:00.000000000 +0100 @@ -1,7 +0,0 @@ -# file GENERATED by distutils, do NOT edit -setup.cfg -setup.py -swifter/__init__.py -swifter/swifter.py -swifter/swifter_tests.py -swifter/tqdm_dask_progressbar.py diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/Makefile new/swifter-1.4.0/Makefile --- old/swifter-1.3.4/Makefile 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/Makefile 2023-07-31 20:01:04.000000000 +0200 @@ -44,7 +44,7 @@ dev-start: ## Primary make command for dev, spins up containers docker-compose -f docker/docker-compose.yml --project-name ${PROJECT} up -d --build -dev-stop: dev-start ## Spins down active containers +dev-stop: ## Spins down active containers docker-compose -f docker/docker-compose.yml --project-name ${PROJECT} down sphinx: ## Creates docs using sphinx diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/README.md new/swifter-1.4.0/README.md --- old/swifter-1.3.4/README.md 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/README.md 2023-07-31 20:01:04.000000000 +0200 @@ -23,9 +23,11 @@ ``` $ pip install -U pandas # upgrade pandas $ pip install swifter # first time installation +$ pip install swifter[notebook] # first time installation including dependency for rich progress bar in jupyter notebooks $ pip install swifter[groupby] # first time installation including dependency for groupby.apply functionality $ pip install -U swifter # upgrade to latest version if already installed +$ pip install -U swifter[notebook] # upgrade to latest version to include dependency for rich progress bar in jupyter notebooks $ pip install -U swifter[groupby] # upgrade to latest version to include dependency for groupby.apply functionality ``` @@ -68,16 +70,15 @@ ``` ## Vectorizes your function, when possible - - + ## When vectorization is not possible, automatically decides which is faster: to use dask parallel processing or a simple pandas apply - - + ## Highly performant, even for groupby applies - - + + +See the [speed benchmark notebook](examples/swifter_speed_comparison.ipynb) for source of the above performance plots. ## Notes 1. The function is documented in the .py file. In Jupyter Notebooks, you can see the docs by pressing Shift+Tab(x3). Also, check out the complete documentation [here](docs/documentation.md) along with the [changelog](docs/changelog.md). 
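For context on the accessor the README diff above documents, here is a minimal usage sketch, assuming swifter 1.4.0 installed from PyPI; the DataFrame, column name, and lambda are illustrative, not taken from the diff:

```python
import pandas as pd
import swifter  # noqa: F401 -- importing registers the .swifter accessor

df = pd.DataFrame({"x": range(1_000_000)})

# swifter first times the function on a small sample, then runs the full
# apply with plain pandas or with dask, whichever it estimates to be
# faster; progress_bar(enable=False) disables the tqdm progress bar
result = df["x"].swifter.progress_bar(enable=False).apply(lambda v: v ** 2)
```

With 1.4.0, the tqdm/ipywidgets progress bar in Jupyter requires the `swifter[notebook]` extra, since `ipywidgets` was dropped from the core dependencies.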
Binary files old/swifter-1.3.4/assets/groupby_parallel_vs_single_compatible.png and new/swifter-1.4.0/assets/groupby_parallel_vs_single_compatible.png differ Binary files old/swifter-1.3.4/assets/multiprocessing_vs_single_compatible.png and new/swifter-1.4.0/assets/multiprocessing_vs_single_compatible.png differ Binary files old/swifter-1.3.4/assets/vectorizes_when_possible_compatible.png and new/swifter-1.4.0/assets/vectorizes_when_possible_compatible.png differ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/docker/Dockerfile new/swifter-1.4.0/docker/Dockerfile --- old/swifter-1.3.4/docker/Dockerfile 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/docker/Dockerfile 1970-01-01 01:00:00.000000000 +0100 @@ -1,5 +0,0 @@ -FROM python:3.9 -ADD requirements.txt /build/requirements.txt -WORKDIR /build/ -RUN pip install -r requirements.txt -WORKDIR /mnt/ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/docker/Dockerfile-dev new/swifter-1.4.0/docker/Dockerfile-dev --- old/swifter-1.3.4/docker/Dockerfile-dev 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/docker/Dockerfile-dev 2023-07-31 20:01:04.000000000 +0200 @@ -1,6 +1,8 @@ FROM python:3.9 -ADD requirements-dev.txt /build/requirements.txt +ADD requirements-windows.txt /build/requirements-windows.txt +ADD requirements-dev.txt /build/requirements-dev.txt WORKDIR /build/ -RUN pip install -r requirements.txt +RUN pip install --upgrade pip +RUN pip install -r requirements-dev.txt WORKDIR /mnt/ ENV PYTHONPATH "${PYTHONPATH}:/mnt" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/docker/requirements-dev.txt new/swifter-1.4.0/docker/requirements-dev.txt --- old/swifter-1.3.4/docker/requirements-dev.txt 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/docker/requirements-dev.txt 2023-07-31 20:01:04.000000000 +0200 @@ -1,17 +1,2 @@ -pandas>=1.0.0 -psutil>=5.6.6 -ray>=1.0.0 -dask[dataframe]>=2.10.0 -modin[dask]>=0.8.1.1 -tqdm>=4.33.0 -ipywidgets>=7.0.0 -cloudpickle>=0.2.2 -parso>0.4.0 -bleach>=3.1.1 -black==22.3.0 -flake8==3.7.7 -perfplot==0.7.3 -pytest==6.2.2 -coverage -codecov -nose +-r requirements-windows.txt +ray>=1.0.0 \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/docker/requirements-windows.txt new/swifter-1.4.0/docker/requirements-windows.txt --- old/swifter-1.3.4/docker/requirements-windows.txt 1970-01-01 01:00:00.000000000 +0100 +++ new/swifter-1.4.0/docker/requirements-windows.txt 2023-07-31 20:01:04.000000000 +0200 @@ -0,0 +1,14 @@ +pandas>=1.0.0 +psutil>=5.6.6 +dask[dataframe]>=2.10.0 +modin[dask]>=0.8.1.1 +tqdm>=4.33.0 +ipywidgets>=7.0.0 +black==22.3.0 +flake8==3.7.7 +perfplot==0.7.3 +pytest==6.2.2 +jupyterlab==3.6.2 +coverage +codecov +nose diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/docs/changelog.md new/swifter-1.4.0/docs/changelog.md --- old/swifter-1.3.4/docs/changelog.md 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/docs/changelog.md 2023-07-31 20:01:04.000000000 +0200 @@ -1,5 +1,17 @@ # Changelog +## Version 1.4.0 -- 2023-07-21 +* Significantly reduced core dependencies of swifter library. 
See https://github.com/jmcarpenter2/swifter/issues/219 for discussion + - Big shout out to @PeterJCLaw for starting this discussion and contributions from @xquyvu as well +* Removed deprecated `loffset` parameter + - Thanks to @bnavigator for identifying this bug +* Updated README to be more readable for darkmode users + - Thank you to @MemphisMeng for identifying this gap + +## Version 1.3.5 -- 2023-06-12 +* Add secondary fallback for series applies +* Code refactoring for simplicity + ## Version 1.3.4 -- 2022-08-16 * Enable indexing after a groupby, e.g. `df.swifter.groupby(by)[key].apply(func)` * Improve groupby apply progress bar diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/requirements.txt new/swifter-1.4.0/requirements.txt --- old/swifter-1.3.4/requirements.txt 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/requirements.txt 1970-01-01 01:00:00.000000000 +0100 @@ -1,8 +0,0 @@ -pandas>=1.0.0 -psutil>=5.6.6 -dask[dataframe]>=2.10.0 -tqdm>=4.33.0 -ipywidgets>=7.0.0 -cloudpickle>=0.2.2 -parso>0.4.0 -bleach>=3.1.1 \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/setup.py new/swifter-1.4.0/setup.py --- old/swifter-1.3.4/setup.py 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/setup.py 2023-07-31 20:01:04.000000000 +0200 @@ -3,25 +3,22 @@ setup( name="swifter", packages=["swifter"], # this must be the same as the name above - version="1.3.4", + version="1.4.0", description="A package which efficiently applies any function to a pandas dataframe or series in the fastest available manner", author="Jason Carpenter", author_email="jcarpen...@manifold.ai", url="https://github.com/jmcarpenter2/swifter", # use the URL to the github repo - download_url=f"https://github.com/jmcarpenter2/swifter/archive/1.3.4.tar.gz", + download_url="https://github.com/jmcarpenter2/swifter/archive/1.4.0.tar.gz", keywords=["pandas", "dask", "apply", "function", "parallelize", "vectorize"], install_requires=[ "pandas>=1.0.0", "psutil>=5.6.6", "dask[dataframe]>=2.10.0", "tqdm>=4.33.0", - "ipywidgets>=7.0.0", - "cloudpickle>=0.2.2", - "parso>0.4.0", - "bleach>=3.1.1", ], extras_require={ - "groupby": ["ray>=1.0.0"] + "groupby": ["ray>=1.0.0"], + "notebook": ["ipywidgets>=7.0.0"], }, classifiers=[], ) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/swifter/__init__.py new/swifter-1.4.0/swifter/__init__.py --- old/swifter-1.3.4/swifter/__init__.py 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/swifter/__init__.py 2023-07-31 20:01:04.000000000 +0200 @@ -22,4 +22,4 @@ "register_parallel_series_accessor", "register_modin", ] -__version__ = "1.3.4" +__version__ = "1.4.0" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/swifter/swifter.py new/swifter-1.4.0/swifter/swifter.py --- old/swifter-1.3.4/swifter/swifter.py 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/swifter/swifter.py 2023-07-31 20:01:04.000000000 +0200 @@ -196,7 +196,6 @@ "label": label, "convention": convention, "kind": kind, - "loffset": loffset, "base": base, "on": on, "level": level, @@ -205,6 +204,8 @@ } if not base: kwds.pop("base") + if loffset is not None: + kwds.update({"loffset": loffset}) return Resampler( self._obj, @@ -228,59 +229,60 @@ return wrapped - def _dask_apply(self, func, convert_dtype, *args, **kwds): + def _pandas_apply(self, df, func, 
convert_dtype, *args, **kwds): + if self._progress_bar: + tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") + return df.progress_apply(func, convert_dtype=convert_dtype, args=args, **kwds) + else: + return df.apply(func, convert_dtype=convert_dtype, args=args, **kwds) + + def _dask_map_partitions(self, df, func, meta, *args, **kwds): + return ( + dd.from_pandas(df, npartitions=self._npartitions) + .map_partitions(func, *args, meta=meta, **kwds) + .compute(scheduler=self._scheduler) + ) + + def _dask_apply(self, df, func, convert_dtype, meta, *args, **kwds): + return ( + dd.from_pandas(df, npartitions=self._npartitions) + .apply( + lambda x: func(x, *args, **kwds), + convert_dtype=convert_dtype, + meta=meta, + ) + .compute(scheduler=self._scheduler) + ) + + def _parallel_apply(self, func, convert_dtype, *args, **kwds): sample = self._obj.iloc[self._SAMPLE_INDEX] with suppress_stdout_stderr_logging(): meta = sample.apply(func, convert_dtype=convert_dtype, args=args, **kwds) try: # check that the dask map partitions matches the pandas apply with suppress_stdout_stderr_logging(): - tmp_df = ( - dd.from_pandas(sample, npartitions=self._npartitions) - .map_partitions(func, *args, meta=meta, **kwds) - .compute(scheduler=self._scheduler) - ) + tmp_df = self._dask_map_partitions(sample, func, meta, *args, **kwds) self._validate_apply( tmp_df.equals(meta), error_message=("Dask map-partitions sample does not match pandas apply sample."), ) if self._progress_bar: with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .map_partitions(func, *args, meta=meta, **kwds) - .compute(scheduler=self._scheduler) - ) + return self._dask_map_partitions(self._obj, func, meta, *args, **kwds) else: - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .map_partitions(func, *args, meta=meta, **kwds) - .compute(scheduler=self._scheduler) - ) + return self._dask_map_partitions(self._obj, func, meta, *args, **kwds) except ERRORS_TO_HANDLE: # if map partitions doesn't match pandas apply, # we can use dask apply, but it will be a bit slower - if self._progress_bar: - with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .apply( - lambda x: func(x, *args, **kwds), - convert_dtype=convert_dtype, - meta=meta, - ) - .compute(scheduler=self._scheduler) - ) - else: - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .apply( - lambda x: func(x, *args, **kwds), - convert_dtype=convert_dtype, - meta=meta, - ) - .compute(scheduler=self._scheduler) - ) + try: + if self._progress_bar: + with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): + return self._dask_apply(self._obj, func, convert_dtype, meta, *args, **kwds) + else: + return self._dask_apply(self._obj, func, convert_dtype, meta, *args, **kwds) + except ERRORS_TO_HANDLE: + # Second fallback to pandas if dask apply fails + return self._pandas_apply(self._obj, func, convert_dtype, *args, **kwds) def apply(self, func, convert_dtype=True, args=(), **kwds): """ @@ -293,7 +295,7 @@ # If parallel processing is forced by the user, then skip the logic and apply dask if self._force_parallel: - return self._dask_apply(func, convert_dtype, *args, **kwds) + return self._parallel_apply(func, convert_dtype, *args, **kwds) sample = self._obj.iloc[self._SAMPLE_INDEX] # check if input is string or @@ -322,13 +324,9 @@ # if pandas sample apply takes too 
long and not performing str processing # then use dask if (est_apply_duration > self._dask_threshold) and allow_dask_processing: - return self._dask_apply(func, convert_dtype, *args, **kwds) + return self._parallel_apply(func, convert_dtype, *args, **kwds) else: # use pandas - if self._progress_bar: - tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") - return self._obj.progress_apply(func, convert_dtype=convert_dtype, args=args, **kwds) - else: - return self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds) + return self._pandas_apply(self._obj, func, convert_dtype, *args, **kwds) @pd.api.extensions.register_dataframe_accessor("swifter") @@ -342,7 +340,31 @@ return wrapped - def _dask_apply(self, func, axis=0, raw=None, result_type=None, *args, **kwds): + def _pandas_apply(self, df, func, axis, raw, result_type, *args, **kwds): + if self._progress_bar: + tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") + apply_func = df.progress_apply + else: + apply_func = df.apply + + return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) + + def _dask_apply(self, df, func, axis, raw, result_type, meta, *args, **kwds): + return ( + dd.from_pandas(df, npartitions=self._npartitions) + .apply( + func, + *args, + axis=axis, + raw=raw, + result_type=result_type, + meta=meta, + **kwds, + ) + .compute(scheduler=self._scheduler) + ) + + def _parallel_apply(self, func, axis=0, raw=None, result_type=None, *args, **kwds): sample = self._obj.iloc[self._SAMPLE_INDEX] with suppress_stdout_stderr_logging(): meta = sample.apply(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) @@ -368,42 +390,12 @@ ) if self._progress_bar: with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Apply"): - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .apply( - func, - *args, - axis=axis, - raw=raw, - result_type=result_type, - meta=meta, - **kwds, - ) - .compute(scheduler=self._scheduler) - ) + return self._dask_apply(self._obj, func, axis, raw, result_type, meta, *args, **kwds) else: - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .apply( - func, - *args, - axis=axis, - raw=raw, - result_type=result_type, - meta=meta, - **kwds, - ) - .compute(scheduler=self._scheduler) - ) + return self._dask_apply(self._obj, func, axis, raw, result_type, meta, *args, **kwds) except ERRORS_TO_HANDLE: # if dask apply doesn't match pandas apply, fallback to pandas - if self._progress_bar: - tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") - apply_func = self._obj.progress_apply - else: - apply_func = self._obj.apply - - return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) + return self._pandas_apply(self._obj, func, axis, raw, result_type, *args, **kwds) def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ @@ -415,7 +407,7 @@ # If parallel processing is forced by the user, then skip the logic and apply dask if self._force_parallel: - return self._dask_apply(func, axis, raw, result_type, *args, **kwds) + return self._parallel_apply(func, axis, raw, result_type, *args, **kwds) sample = self._obj.iloc[self._SAMPLE_INDEX] # check if input is string @@ -440,15 +432,9 @@ # if pandas sample apply takes too long # and not performing str processing, use dask if (est_apply_duration > self._dask_threshold) and allow_dask_processing and axis == 1: - return self._dask_apply(func, axis, raw, result_type, *args, **kwds) + return self._parallel_apply(func, 
axis, raw, result_type, *args, **kwds) else: # use pandas - if self._progress_bar: - tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") - apply_func = self._obj.progress_apply - else: - apply_func = self._obj.apply - - return apply_func(func, axis=axis, raw=raw, result_type=result_type, args=args, **kwds) + return self._pandas_apply(self._obj, func, axis, raw, result_type, *args, **kwds) def _wrapped_applymap(self, func): def wrapped(): @@ -457,44 +443,42 @@ return wrapped - def _dask_applymap(self, func): + def _pandas_applymap(self, df, func): + if self._progress_bar: + tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") + applymap_func = df.progress_applymap + else: + applymap_func = df.applymap + + return applymap_func(func) + + def _dask_applymap(self, df, func, meta): + return ( + dd.from_pandas(df, npartitions=self._npartitions) + .applymap(func, meta=meta) + .compute(scheduler=self._scheduler) + ) + + def _parallel_applymap(self, func): sample = self._obj.iloc[self._SAMPLE_INDEX] with suppress_stdout_stderr_logging(): meta = sample.applymap(func) try: with suppress_stdout_stderr_logging(): # check that the dask apply matches the pandas apply - tmp_df = ( - dd.from_pandas(sample, npartitions=self._npartitions) - .applymap(func, meta=meta) - .compute(scheduler=self._scheduler) - ) + tmp_df = self._dask_applymap(sample, func, meta) self._validate_apply( tmp_df.equals(meta), error_message=("Dask applymap sample does not match pandas applymap sample."), ) if self._progress_bar: with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Applymap"): - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .applymap(func, meta=meta) - .compute(scheduler=self._scheduler) - ) + return self._dask_applymap(self._obj, func, meta) else: - return ( - dd.from_pandas(self._obj, npartitions=self._npartitions) - .applymap(func, meta=meta) - .compute(scheduler=self._scheduler) - ) + return self._dask_applymap(self._obj, func, meta) except ERRORS_TO_HANDLE: # if dask apply doesn't match pandas apply, fallback to pandas - if self._progress_bar: - tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") - applymap_func = self._obj.progress_applymap - else: - applymap_func = self._obj.applymap - - return applymap_func(func) + return self._pandas_applymap(self._obj, func) def applymap(self, func): """ @@ -507,7 +491,7 @@ # If parallel processing is forced by the user, then skip the logic and apply dask if self._force_parallel: - return self._dask_applymap(func) + return self._parallel_applymap(func) sample = self._obj.iloc[self._SAMPLE_INDEX] # check if input is string @@ -532,15 +516,9 @@ # if pandas sample apply takes too long # and not performing str processing, use dask if (est_apply_duration > self._dask_threshold) and allow_dask_processing: - return self._dask_applymap(func) + return self._parallel_applymap(func) else: # use pandas - if self._progress_bar: - tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") - applymap_func = self._obj.progress_applymap - else: - applymap_func = self._obj.applymap - - return applymap_func(func) + return self._pandas_applymap(self._obj, func) def groupby( self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, observed=False, dropna=True @@ -696,8 +674,8 @@ return wrapped @abstractmethod - def _dask_apply(self, func, *args, **kwds): - raise NotImplementedError("Transformation class does not implement _dask_apply") + def _parallel_apply(self, func, *args, **kwds): + raise 
NotImplementedError("Transformation class does not implement _parallel_apply") def apply(self, func, *args, **kwds): """ @@ -709,7 +687,7 @@ # If parallel processing is forced by the user, then skip the logic and apply dask if self._force_parallel: - return self._dask_apply(func, *args, **kwds) + return self._parallel_apply(func, *args, **kwds) # estimate time to pandas apply wrapped = self._wrapped_apply(func, *args, **kwds) @@ -720,7 +698,7 @@ # No `allow_dask_processing` variable here, # because we don't know the dtypes of the transformation if est_apply_duration > self._dask_threshold: - return self._dask_apply(func, *args, **kwds) + return self._parallel_apply(func, *args, **kwds) else: # use pandas if self._progress_bar and hasattr(self._obj_pd, "progress_apply"): tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") @@ -758,7 +736,7 @@ self._obj_pd = self._obj_pd.rolling(**kwds) self._obj_dd = self._obj_dd.rolling(**{k: v for k, v in kwds.items() if k not in ["on", "closed"]}) - def _dask_apply(self, func, *args, **kwds): + def _parallel_apply(self, func, *args, **kwds): try: # check that the dask rolling apply matches the pandas apply with suppress_stdout_stderr_logging(): @@ -821,7 +799,7 @@ else None ) - def _dask_apply(self, func, *args, **kwds): + def _parallel_apply(self, func, *args, **kwds): try: # check that the dask resampler apply matches the pandas apply with suppress_stdout_stderr_logging(): diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-1.3.4/swifter/swifter_tests.py new/swifter-1.4.0/swifter/swifter_tests.py --- old/swifter-1.3.4/swifter/swifter_tests.py 2022-08-17 01:30:01.000000000 +0200 +++ new/swifter-1.4.0/swifter/swifter_tests.py 2023-07-31 20:01:04.000000000 +0200 @@ -1,3 +1,4 @@ +import os import sys import importlib import unittest @@ -12,9 +13,12 @@ import pandas as pd import swifter -from .swifter import GROUPBY_MAX_ROWS_PANDAS_DEFAULT +from .swifter import RAY_INSTALLED, GROUPBY_MAX_ROWS_PANDAS_DEFAULT + from tqdm.auto import tqdm +WINDOWS_CI = "windows" in os.environ.get("CIRCLE_JOB", "") + LOG = logging.getLogger(__name__) LOG.setLevel(logging.INFO) @@ -73,7 +77,21 @@ return True +def run_if_ray_installed(func): + # if ray is installed, run the test/test suite + if RAY_INSTALLED: + return func + else: # if ray isnt installed just skip the test(s) + return True + + class TestSwifter(unittest.TestCase): + def assertLessLinux(self, a, b, msg=None): + if WINDOWS_CI: + pass + else: + super().assertLess(a, b, msg=msg) + def assertSeriesEqual(self, a, b, msg): try: pd.testing.assert_series_equal(a, b) @@ -410,7 +428,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_nonvectorized_math_apply_on_large_series(self): LOG.info("test_nonvectorized_math_apply_on_large_series") @@ -434,7 +452,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_vectorized_force_parallel_math_apply_on_large_series(self): LOG.info("test_vectorized_force_parallel_math_apply_on_large_series") @@ -459,7 +477,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) class TestPandasDataFrame(TestSwifter): @@ -477,6 +495,7 @@ swifter_val = 
df.swifter.applymap(math_vec_square) self.assertEqual(pd_val, swifter_val) # equality test + @run_if_ray_installed def test_groupby_apply_on_empty_dataframe(self): LOG.info("test_groupby_apply_on_empty_dataframe") df = pd.DataFrame(columns=["x", "y"]) @@ -484,6 +503,7 @@ swifter_val = df.swifter.groupby("x").apply(math_vec_square) self.assertEqual(pd_val, swifter_val) # equality test + @run_if_ray_installed def test_groupby_index_apply(self): LOG.info("test_groupby_index_apply") SIZE = GROUPBY_MAX_ROWS_PANDAS_DEFAULT * 2 @@ -537,7 +557,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_nonvectorized_math_apply_on_large_dataframe_broadcast(self): LOG.info("test_nonvectorized_math_apply_on_large_dataframe_broadcast") @@ -560,7 +580,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_nonvectorized_math_apply_on_large_dataframe_reduce(self): LOG.info("test_nonvectorized_math_apply_on_large_dataframe_reduce") @@ -583,7 +603,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_nonvectorized_text_dask_apply_on_large_dataframe(self): LOG.info("test_nonvectorized_text_dask_apply_on_large_dataframe") @@ -612,7 +632,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_vectorized_force_parallel_math_apply_on_large_dataframe(self): LOG.info("test_vectorized_force_parallel_math_apply_on_large_dataframe") @@ -641,7 +661,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_vectorized_math_applymap_on_large_dataframe(self): LOG.info("test_vectorized_math_applymap_on_large_dataframe") @@ -667,7 +687,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_vectorized_force_parallel_math_applymap_on_large_dataframe(self): LOG.info("test_vectorized_force_parallel_math_applymap_on_large_dataframe") @@ -696,7 +716,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_nonvectorized_math_applymap_on_large_dataframe(self): LOG.info("test_nonvectorized_math_applymap_on_large_dataframe") @@ -720,7 +740,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_nonvectorized_math_applymap_on_small_dataframe(self): LOG.info("test_nonvectorized_math_applymap_on_small_dataframe") @@ -736,6 +756,7 @@ swifter_val = df.swifter.progress_bar(enable=False).applymap(math_foo) self.assertEqual(pd_val, swifter_val) # equality test + @run_if_ray_installed def test_vectorized_math_groupby_apply_on_small_dataframe(self): LOG.info("test_vectorized_math_groupby_apply_on_small_dataframe") df = pd.DataFrame( @@ -749,6 +770,7 @@ swifter_val = df.swifter.groupby("g").apply(numeric_func) 
self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_force_parallel_math_groupby_apply_on_small_dataframe(self): LOG.info("test_vectorized_force_parallel_math_groupby_apply_on_small_dataframe") df = pd.DataFrame( @@ -762,6 +784,7 @@ swifter_val = df.swifter.force_parallel(True).groupby("g").apply(numeric_func) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_math_groupby_apply_on_large_dataframe(self): LOG.info("test_vectorized_math_groupby_apply_on_large_dataframe") df = pd.DataFrame( @@ -775,6 +798,7 @@ swifter_val = df.swifter.groupby("g").apply(numeric_func) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_math_groupby_apply_on_large_dataframe_index(self): LOG.info("test_vectorized_math_groupby_apply_on_large_dataframe_index") df = pd.DataFrame( @@ -788,6 +812,7 @@ swifter_val = df.swifter.groupby(df.index).apply(numeric_func) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_force_parallel_math_groupby_apply_on_large_dataframe(self): LOG.info("test_vectorized_force_parallel_math_groupby_apply_on_large_dataframe") df = pd.DataFrame( @@ -801,6 +826,7 @@ swifter_val = df.swifter.force_parallel(True).groupby("g").apply(numeric_func) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_text_groupby_apply_on_small_dataframe(self): LOG.info("test_vectorized_text_groupby_apply_on_small_dataframe") df = pd.DataFrame( @@ -810,6 +836,7 @@ swifter_val = df.swifter.groupby("g").apply(clean_text_foo) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_force_parallel_text_groupby_apply_on_small_dataframe(self): LOG.info("test_vectorized_force_parallel_text_groupby_apply_on_small_dataframe") df = pd.DataFrame( @@ -819,6 +846,7 @@ swifter_val = df.swifter.force_parallel(True).groupby("g").apply(clean_text_foo) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_text_groupby_apply_on_large_dataframe(self): LOG.info("test_vectorized_text_groupby_apply_on_large_dataframe") df = pd.DataFrame( @@ -831,6 +859,7 @@ swifter_val = df.swifter.groupby("g").apply(clean_text_foo) self.assertSeriesEqual(pd_val, swifter_val, "Swifter output does not equal Pandas output") # equality test + @run_if_ray_installed def test_vectorized_force_parallel_text_groupby_apply_on_large_dataframe(self): LOG.info("test_vectorized_force_parallel_text_groupby_apply_on_large_dataframe") df = pd.DataFrame( @@ -925,7 +954,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def test_vectorized_force_parallel_math_apply_on_large_rolling_dataframe(self): LOG.info("test_vectorized_force_parallel_math_apply_on_large_rolling_dataframe") @@ -982,7 +1011,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) def 
test_nonvectorized_force_parallel_math_apply_on_large_resampler_dataframe(self): LOG.info("test_nonvectorized_force_parallel_math_apply_on_large_resampler_dataframe") @@ -1009,7 +1038,7 @@ self.assertEqual(pd_val, swifter_val) # equality test if self.ncores > 1: # speed test - self.assertLess(swifter_time, pd_time) + self.assertLessLinux(swifter_time, pd_time) @run_if_modin_installed @@ -1173,4 +1202,4 @@ self.assertEqual(md_val, swifter_val) # equality test self.assertEqual(md_pd_val, swifter_pd_val) # equality test after converting to pandas - self.assertLess(swifter_time, md_time) # speed test + self.assertLessLinux(swifter_time, md_time) # speed test
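As a reading aid for the test-suite changes above, a minimal sketch of the two gating patterns introduced in swifter_tests.py. Assumptions: the `CIRCLE_JOB` environment variable is set by CircleCI, and `RAY_INSTALLED` is derived here via a try/import for self-containment (upstream imports it from `.swifter`); the `SampleTest` class is hypothetical:

```python
import os
import unittest

# CircleCI exposes the job name; the new Windows job name contains "windows"
WINDOWS_CI = "windows" in os.environ.get("CIRCLE_JOB", "")

try:
    import ray  # noqa: F401
    RAY_INSTALLED = True
except ImportError:
    RAY_INSTALLED = False


def run_if_ray_installed(func):
    # keep the test method only when ray is importable; otherwise return a
    # non-callable, which unittest's loader silently skips (the upstream
    # decorator returns True for the same effect)
    return func if RAY_INSTALLED else True


class SampleTest(unittest.TestCase):
    def assertLessLinux(self, a, b, msg=None):
        # timing comparisons are unreliable on the Windows CI executor,
        # so speed assertions are only enforced on the Linux job
        if not WINDOWS_CI:
            self.assertLess(a, b, msg=msg)

    @run_if_ray_installed
    def test_ray_only_feature(self):
        self.assertTrue(RAY_INSTALLED)
```

This matches the 1.4.0 packaging change above: ray moved out of the core requirements (the `groupby` extra, `Suggests: python-ray` in the spec), so the groupby tests must be skipped when it is absent, and the new Windows CI job keeps equality tests but drops speed tests.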