Hello community, here is the log from the commit of package python-swifter for openSUSE:Factory checked in at 2020-02-03 11:13:29 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/python-swifter (Old) and /work/SRC/openSUSE:Factory/.python-swifter.new.26092 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "python-swifter" Mon Feb 3 11:13:29 2020 rev:2 rq:768875 version:0.300 Changes: -------- --- /work/SRC/openSUSE:Factory/python-swifter/python-swifter.changes 2019-12-04 14:20:33.090446806 +0100 +++ /work/SRC/openSUSE:Factory/.python-swifter.new.26092/python-swifter.changes 2020-02-03 11:13:34.277852604 +0100 @@ -1,0 +2,12 @@ +Thu Jan 30 19:22:19 UTC 2020 - Todd R <[email protected]> + +- Update to 0.300 + * Added new applymap method for pandas dataframes. + df.swifter.applymap(...) +- Update to 0.297 + * Fixed issue causing errors when using swifter on empty + dataframes. Now swifter will perform a pandas apply on empty + dataframes. +- Drop upstream-included use_current_exe.patch + +------------------------------------------------------------------- Old: ---- swifter-0.296.tar.gz use_current_exe.patch New: ---- swifter-0.300.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ python-swifter.spec ++++++ --- /var/tmp/diff_new_pack.VeTRVH/_old 2020-02-03 11:13:35.701853324 +0100 +++ /var/tmp/diff_new_pack.VeTRVH/_new 2020-02-03 11:13:35.705853326 +0100 @@ -1,7 +1,7 @@ # # spec file for package python-swifter # -# Copyright (c) 2019 SUSE LINUX GmbH, Nuernberg, Germany. +# Copyright (c) 2020 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -19,14 +19,12 @@ %{?!python_module:%define python_module() python-%{**} python3-%{**}} %define skip_python2 1 Name: python-swifter -Version: 0.296 +Version: 0.300 Release: 0 Summary: Tool to speed up pandas calculations License: MIT URL: https://github.com/jmcarpenter2/swifter Source: https://github.com/jmcarpenter2/swifter/archive/%{version}.tar.gz#/swifter-%{version}.tar.gz -# PATCH-FIX-UPSTREAM -- use_current_exe.spec -- https://github.com/jmcarpenter2/swifter/pull/92 -Patch0: use_current_exe.patch BuildRequires: %{python_module setuptools} BuildRequires: fdupes BuildRequires: python-rpm-macros @@ -56,7 +54,6 @@ %prep %setup -q -n swifter-%{version} -%autopatch -p1 %build %python_build ++++++ swifter-0.296.tar.gz -> swifter-0.300.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/.circleci/config.yml new/swifter-0.300/.circleci/config.yml --- old/swifter-0.296/.circleci/config.yml 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/.circleci/config.yml 2020-01-16 00:03:39.000000000 +0100 @@ -6,27 +6,27 @@ parallelism: 1 working_directory: ~/repo docker: - - image: manifoldai/orbyter-ml-dev:latest + - image: python:3.6 steps: - checkout - run: name: Black lint check command: | - cd swifter && black -l 120 --check . + pip install black && cd swifter && black -l 120 --check . - run: name: Flake8 lint check command: | - cd swifter && flake8 --max-line-length 120 . + pip install flake8 && cd swifter && flake8 --max-line-length 120 . - run: name: Unit tests command: | - pip install pipenv && pipenv install --dev && pip install coverage && coverage run -m unittest swifter/swifter_tests.py + pip install pipenv && pipenv install && pipenv install coverage && pipenv run coverage run -m unittest swifter/swifter_tests.py - run: name: Codecov report command: | coverage report -i && coverage html -i - pip install codecov && codecov --required -t ${CODECOV_TOKEN} || (sleep 5 && codecov --required -t ${CODECOV_TOKEN}) || (sleep 5 && codecov --required -t ${CODECOV_TOKEN}) || (sleep 5 && codecov --required -t ${CODECOV_TOKEN}) || (sleep 5 && codecov --required -t ${CODECOV_TOKEN}) + pip install codecov && codecov --required || (sleep 5 && codecov --required) || (sleep 5 && codecov --required) || (sleep 5 && codecov --required) || (sleep 5 && codecov --required) - store_artifacts: path: htmlcov diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/README.md new/swifter-0.300/README.md --- old/swifter-0.296/README.md 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/README.md 2020-01-16 00:03:39.000000000 +0100 @@ -3,7 +3,7 @@ [](https://badge.fury.io/py/swifter) [](https://circleci.com/gh/jmcarpenter2/swifter) -[](https://codecov.io/gh/jmcarpenter2/swifter) +[](https://codecov.io/gh/jmcarpenter2/swifter) [](https://github.com/ambv/black)   diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/docs/changelog.md new/swifter-0.300/docs/changelog.md --- old/swifter-0.296/docs/changelog.md 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/docs/changelog.md 2020-01-16 00:03:39.000000000 +0100 @@ -1,5 +1,11 @@ # Changelog +## Version 0.300 +Added new `applymap` method for pandas dataframes. `df.swifter.applymap(...)` + +## Version 0.297 +Fixed issue causing errors when using swifter on empty dataframes. Now swifter will perform a pandas apply on empty dataframes. + ## Version 0.296 Added support for resample objects in syntax that refects pandas. `df.swifter.resample(...).apply(...)` diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/docs/documentation.md new/swifter-0.300/docs/documentation.md --- old/swifter-0.296/docs/documentation.md 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/docs/documentation.md 2020-01-16 00:03:39.000000000 +0100 @@ -86,7 +86,15 @@ The new dataframe/series with the function applied as quickly as possible -## 3. `pandas.DataFrame.swifter.rolling.apply` +## 3. `pandas.DataFrame.swifter.applymap` + +Efficiently applymap any function to a pandas dataframe in the fastest available manner. Applymap is elementwise. + +```python +def pandas.DataFrame.swifter.applymap(func) +``` + +## 4. `pandas.DataFrame.swifter.rolling.apply` Applies over a rolling object on the original series/dataframe in the fastest available manner. @@ -102,7 +110,7 @@ ).apply(func, *args, **kwds) ``` -## 4. `pandas.DataFrame.swifter.resample.apply` +## 5. `pandas.DataFrame.swifter.resample.apply` Applies over a resampler object on the original series/dataframe in the fastest available manner. @@ -121,7 +129,7 @@ ).apply(func, *args, **kwds) ``` -## 5. `pandas.DataFrame.swifter.progress_bar(False).apply` +## 6. `pandas.DataFrame.swifter.progress_bar(False).apply` Enable or disable the TQDM progress bar by setting the enable parameter to True/False, respectively. You can also specify a custom description. @@ -137,7 +145,7 @@ df.swifter.progress_bar(False).apply(lambda x: x+1) ``` -## 6. `pandas.DataFrame.swifter.set_npartitions(npartitions=None).apply` +## 7. `pandas.DataFrame.swifter.set_npartitions(npartitions=None).apply` Specify the number of partitions to allocate to swifter, if parallel processing is chosen to be the quickest apply. If npartitions=None, it defaults to cpu_count()*2 @@ -151,7 +159,7 @@ df.swifter.set_npartitions(2).apply(lambda x: x+1) ``` -## 7. `pandas.DataFrame.swifter.set_dask_threshold(dask_threshold=1).apply` +## 8. `pandas.DataFrame.swifter.set_dask_threshold(dask_threshold=1).apply` Specify the dask threshold (in seconds) for the max allowable time estimate for a pandas apply on the full dataframe ```python @@ -163,7 +171,7 @@ df.swifter.set_dask_threshold(dask_threshold=3).apply(lambda x: x+1) ``` -## 8. `pandas.DataFrame.swifter.set_dask_scheduler(scheduler="processes").apply` +## 9. `pandas.DataFrame.swifter.set_dask_scheduler(scheduler="processes").apply` Set the dask scheduler @@ -177,7 +185,7 @@ df.swifter.set_dask_scheduler(scheduler="threads").apply(lambda x: x+1) ``` -## 9. `pandas.DataFrame.swifter.allow_dask_on_strings(enable=True).apply` +## 10. `pandas.DataFrame.swifter.allow_dask_on_strings(enable=True).apply` Specify whether to allow dask to handle dataframes containing string types. Dask can be particularly slow if you are actually manipulating strings, but if you just have a string column in your data frame this will allow dask to handle the execution. ```python diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/setup.py new/swifter-0.300/setup.py --- old/swifter-0.296/setup.py 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/setup.py 2020-01-16 00:03:39.000000000 +0100 @@ -3,12 +3,12 @@ setup( name="swifter", packages=["swifter"], # this must be the same as the name above - version="0.296", + version="0.300", description="A package which efficiently applies any function to a pandas dataframe or series in the fastest available manner", author="Jason Carpenter", author_email="[email protected]", url="https://github.com/jmcarpenter2/swifter", # use the URL to the github repo - download_url="https://github.com/jmcarpenter2/swifter/archive/0.296.tar.gz", + download_url="https://github.com/jmcarpenter2/swifter/archive/0.300.tar.gz", keywords=["pandas", "dask", "apply", "function", "parallelize", "vectorize"], install_requires=["pandas>=0.23.0", "psutil", "dask[complete]>=0.19.0", "tqdm>=4.33.0", "ipywidgets>=7.0.0", "parso>0.4.0", "numba"], classifiers=[], diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/swifter/__init__.py new/swifter-0.300/swifter/__init__.py --- old/swifter-0.296/swifter/__init__.py 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/swifter/__init__.py 2020-01-16 00:03:39.000000000 +0100 @@ -3,4 +3,4 @@ from .swifter import SeriesAccessor, DataFrameAccessor __all__ = ["SeriesAccessor, DataFrameAccessor"] -__version__ = "0.296" +__version__ = "0.300" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/swifter/swifter.py new/swifter-0.300/swifter/swifter.py --- old/swifter-0.296/swifter/swifter.py 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/swifter/swifter.py 2020-01-16 00:03:39.000000000 +0100 @@ -217,6 +217,11 @@ """ Apply the function to the Series using swifter """ + + # if the series is empty, return early using Pandas + if not self._nrows: + return self._obj.apply(func, convert_dtype=convert_dtype, args=args, **kwds) + sample = self._obj.iloc[: self._npartitions * 2] # check if input is string or if the user is overriding the string processing default allow_dask_processing = True if self._allow_dask_on_strings else (sample.dtype != "object") @@ -324,6 +329,13 @@ """ Apply the function to the DataFrame using swifter """ + + # If there are no rows return early using Pandas + if not self._nrows: + return self._obj.apply( + func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds + ) + sample = self._obj.iloc[: self._npartitions * 2, :] # check if input is string or if the user is overriding the string processing default allow_dask_processing = True if self._allow_dask_on_strings else ("object" not in sample.dtypes.values) @@ -386,6 +398,96 @@ **kwds ) + def _wrapped_applymap(self, func): + def wrapped(): + with suppress_stdout_stderr(): + self._obj.iloc[: self._SAMPLE_SIZE, :].applymap(func) + + return wrapped + + def _dask_applymap(self, func): + sample = self._obj.iloc[: self._npartitions * 2, :] + with suppress_stdout_stderr(): + meta = sample.applymap(func) + try: + with suppress_stdout_stderr(): + # check that the dask apply matches the pandas apply + tmp_df = ( + dd.from_pandas(sample, npartitions=self._npartitions) + .applymap(func, meta=meta) + .compute(scheduler=self._scheduler) + ) + self._validate_apply( + tmp_df.equals(meta), error_message="Dask applymap sample does not match pandas applymap sample." + ) + if self._progress_bar: + with TQDMDaskProgressBar(desc=self._progress_bar_desc or "Dask Applymap"): + return ( + dd.from_pandas(self._obj, npartitions=self._npartitions) + .applymap(func, meta=meta) + .compute(scheduler=self._scheduler) + ) + else: + return ( + dd.from_pandas(self._obj, npartitions=self._npartitions) + .applymap(func, meta=meta) + .compute(scheduler=self._scheduler) + ) + except (AttributeError, ValueError, TypeError, KeyError): + # if dask apply doesn't match pandas apply, fallback to pandas + if self._progress_bar: + tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") + applymap_func = self._obj.progress_applymap + else: + applymap_func = self._obj.applymap + + return applymap_func(func) + + def applymap(self, func): + """ + Applymap the function to the DataFrame using swifter + """ + + # If there are no rows return early using Pandas + if not self._nrows: + return self._obj.applymap(func) + + sample = self._obj.iloc[: self._npartitions * 2, :] + # check if input is string or if the user is overriding the string processing default + allow_dask_processing = True if self._allow_dask_on_strings else ("object" not in sample.dtypes.values) + + try: # try to vectorize + with suppress_stdout_stderr(): + tmp_df = func(sample) + self._validate_apply( + sample.apply(func).equals(tmp_df), + error_message="Vectorized function sample does not match pandas apply sample.", + ) + return func(self._obj) + except ( + AttributeError, + ValueError, + TypeError, + TypingError, + KeyError, + ): # if can't vectorize, estimate time to pandas apply + wrapped = self._wrapped_applymap(func) + timed = timeit.timeit(wrapped, number=N_REPEATS) + sample_proc_est = timed / N_REPEATS + est_apply_duration = sample_proc_est / self._SAMPLE_SIZE * self._obj.shape[0] + + # if pandas sample apply takes too long and not performing str processing, use dask + if (est_apply_duration > self._dask_threshold) and allow_dask_processing: + return self._dask_applymap(func) + else: # use pandas + if self._progress_bar: + tqdm.pandas(desc=self._progress_bar_desc or "Pandas Apply") + applymap_func = self._obj.progress_applymap + else: + applymap_func = self._obj.applymap + + return applymap_func(func) + class Transformation(_SwifterObject): def __init__( @@ -424,6 +526,10 @@ """ Apply the function to the transformed swifter object """ + # if the transformed dataframe is empty, return early using Pandas + if not self._nrows: + return self._obj_pd.apply(func, args=args, **kwds) + # estimate time to pandas apply wrapped = self._wrapped_apply(func, *args, **kwds) timed = timeit.timeit(wrapped, number=N_REPEATS) @@ -508,7 +614,13 @@ self._sample_original = self._sample_pd.copy() self._sample_pd = self._sample_pd.resample(**kwds) self._obj_pd = self._obj_pd.resample(**kwds) - self._obj_dd = self._obj_dd.resample(**{k: v for k, v in kwds.items() if k in ["rule", "closed", "label"]}) + # Setting dask dataframe `self._obj_dd` to None when there are 0 `self._nrows` because + # swifter will immediately return the pandas form during the apply function if there are 0 `self._nrows` + self._obj_dd = ( + self._obj_dd.resample(**{k: v for k, v in kwds.items() if k in ["rule", "closed", "label"]}) + if self._nrows + else None + ) def _wrapped_apply(self, func, *args, **kwds): def wrapped(): @@ -545,6 +657,10 @@ """ Apply the function to the resampler swifter object """ + # if the resampled dataframe is empty, return early using Pandas + if not self._nrows: + return self._obj_pd.apply(func, args=args, **kwds) + # estimate time to pandas apply wrapped = self._wrapped_apply(func, *args, **kwds) timed = timeit.timeit(wrapped, number=N_REPEATS) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/swifter-0.296/swifter/swifter_tests.py new/swifter-0.300/swifter/swifter_tests.py --- old/swifter-0.296/swifter/swifter_tests.py 2019-11-20 21:19:42.000000000 +0100 +++ new/swifter-0.300/swifter/swifter_tests.py 2020-01-16 00:03:39.000000000 +0100 @@ -1,3 +1,4 @@ +import sys import unittest import subprocess import time @@ -7,6 +8,8 @@ import pandas as pd import swifter +from psutil import cpu_count + logging.getLogger(__name__) logging.info(f"Version {swifter.__version__}") @@ -58,6 +61,28 @@ self.addTypeEqualityFunc(pd.DataFrame, self.assertDataFrameEqual) def test_set_npartitions(self): + for swifter_df, set_npartitions, expected in zip( + [ + pd.DataFrame().swifter, + pd.Series().swifter, + pd.DataFrame( + {"x": np.arange(0, 10)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=10) + ).swifter.rolling("1d"), + pd.DataFrame( + {"x": np.arange(0, 10)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=10) + ).swifter.resample("3T"), + ], + [None, 1000, 1001, 1002], + [cpu_count() * 2, 1000, 1001, 1002], + ): + before = swifter_df._npartitions + swifter_df.set_npartitions(set_npartitions) + actual = swifter_df._npartitions + self.assertEqual(actual, expected) + if set_npartitions is not None: + self.assertNotEqual(before, actual) + + def test_set_dask_threshold(self): expected = 1000 for swifter_df in [ pd.DataFrame().swifter, @@ -69,9 +94,9 @@ {"x": np.arange(0, 10)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=10) ).swifter.resample("3T"), ]: - before = swifter_df._npartitions - swifter_df.set_npartitions(expected) - actual = swifter_df._npartitions + before = swifter_df._dask_threshold + swifter_df.set_dask_threshold(expected) + actual = swifter_df._dask_threshold self.assertEqual(actual, expected) self.assertNotEqual(before, actual) @@ -123,7 +148,7 @@ def test_stdout_redirected(self): print_messages = subprocess.check_output( [ - "python", + sys.executable, "-c", "import pandas as pd; import numpy as np; import swifter; " + "df = pd.DataFrame({'x': np.random.normal(size=4)}); " @@ -133,6 +158,50 @@ ) self.assertEqual(len(print_messages.decode("utf-8").rstrip("\n").split("\n")), 1) + def test_apply_on_empty_series(self): + series = pd.Series() + pd_val = series.apply(math_foo, compare_to=1) + swifter_val = series.swifter.apply(math_foo, compare_to=1) + self.assertEqual(pd_val, swifter_val) + + def test_apply_on_empty_dataframe(self): + df = pd.DataFrame(columns=["x", "y"]) + pd_val = df.apply(math_vec_multiply, axis=1) + swifter_val = df.swifter.apply(math_vec_multiply, axis=1) + self.assertEqual(pd_val, swifter_val) + + def test_applymap_on_empty_dataframe(self): + df = pd.DataFrame(columns=["x", "y"]) + pd_val = df.applymap(math_vec_square) + swifter_val = df.swifter.applymap(math_vec_square) + self.assertEqual(pd_val, swifter_val) + + def test_rolling_apply_on_empty_dataframe(self): + df = pd.DataFrame(columns=["x", "y"]) + pd_val = df.rolling(1).apply(math_agg_foo) + swifter_val = df.swifter.rolling(1).apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + + def test_resample_apply_on_empty_dataframe(self): + df = pd.DataFrame(columns=["x", "y"], index=pd.DatetimeIndex(freq="3d", periods=0, start="2020/01/01")) + pd_val = df.resample("1d").apply(math_agg_foo) + swifter_val = df.swifter.resample("1d").apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + + def test_nonvectorized_math_apply_on_small_series(self): + df = pd.DataFrame({"x": np.random.normal(size=1000)}) + series = df["x"] + pd_val = series.apply(math_foo, compare_to=1) + swifter_val = series.swifter.progress_bar(desc="Vec math apply ~ Series").apply(math_foo, compare_to=1) + self.assertEqual(pd_val, swifter_val) + + def test_nonvectorized_math_apply_on_small_series_no_progress_bar(self): + df = pd.DataFrame({"x": np.random.normal(size=1000)}) + series = df["x"] + pd_val = series.apply(math_foo, compare_to=1) + swifter_val = series.swifter.progress_bar(enable=False).apply(math_foo, compare_to=1) + self.assertEqual(pd_val, swifter_val) + def test_vectorized_math_apply_on_large_series(self): df = pd.DataFrame({"x": np.random.normal(size=1_000_000)}) series = df["x"] @@ -143,7 +212,7 @@ pd_time = end_pd - start_pd start_swifter = time.time() - swifter_val = series.swifter.progress_bar(desc="Vec math apply ~ Series").apply(math_vec_square) + swifter_val = series.swifter.progress_bar(desc="Vec math apply ~ Series").apply(math_vec_square, axis=0) end_swifter = time.time() swifter_time = end_swifter - start_swifter @@ -167,6 +236,18 @@ self.assertEqual(pd_val, swifter_val) self.assertLess(swifter_time, pd_time) + def test_nonvectorized_math_apply_on_small_dataframe(self): + df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) + pd_val = df.apply(math_agg_foo) + swifter_val = df.swifter.progress_bar(desc="Vec math apply ~ DF").apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + + def test_nonvectorized_math_apply_on_small_dataframe_no_progress_bar(self): + df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) + pd_val = df.apply(math_agg_foo) + swifter_val = df.swifter.progress_bar(enable=False).apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + def test_vectorized_math_apply_on_large_dataframe(self): df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)}) @@ -183,16 +264,36 @@ self.assertEqual(pd_val, swifter_val) self.assertLess(swifter_time, pd_time) - def test_nonvectorized_math_apply_on_large_dataframe(self): + def test_nonvectorized_math_apply_on_large_dataframe_broadcast(self): df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)}) start_pd = time.time() - pd_val = df.apply(math_agg_foo, axis=1) + pd_val = df.apply(math_agg_foo, axis=1, broadcast=True) end_pd = time.time() pd_time = end_pd - start_pd start_swifter = time.time() - swifter_val = df.swifter.progress_bar(desc="Nonvec math apply ~ DF").apply(math_agg_foo, axis=1) + swifter_val = df.swifter.progress_bar(desc="Nonvec math apply + broadcast ~ DF").apply( + math_agg_foo, axis=1, broadcast=True + ) + end_swifter = time.time() + swifter_time = end_swifter - start_swifter + + self.assertEqual(pd_val, swifter_val) + self.assertLess(swifter_time, pd_time) + + def test_nonvectorized_math_apply_on_large_dataframe_reduce(self): + df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)}) + + start_pd = time.time() + pd_val = df.apply(math_agg_foo, axis=1, reduce=True) + end_pd = time.time() + pd_time = end_pd - start_pd + + start_swifter = time.time() + swifter_val = df.swifter.progress_bar(desc="Nonvec math apply + reduce ~ DF").apply( + math_agg_foo, axis=1, reduce=True + ) end_swifter = time.time() swifter_time = end_swifter - start_swifter @@ -217,6 +318,18 @@ self.assertEqual(pd_val, swifter_val) self.assertLess(swifter_time, pd_time) + def test_nonvectorized_math_apply_on_small_rolling_dataframe(self): + df = pd.DataFrame({"x": np.arange(0, 1000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=1000)) + pd_val = df.rolling("3T").apply(math_agg_foo) + swifter_val = df.swifter.rolling("3T").progress_bar(desc="Nonvec math apply ~ Rolling DF").apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + + def test_nonvectorized_math_apply_on_small_rolling_dataframe_no_progress_bar(self): + df = pd.DataFrame({"x": np.arange(0, 1000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=1000)) + pd_val = df.rolling("3T").apply(math_agg_foo) + swifter_val = df.swifter.rolling("3T").progress_bar(enable=False).apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + def test_vectorized_math_apply_on_large_rolling_dataframe(self): df = pd.DataFrame( {"x": np.arange(0, 1_500_000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=1_500_000) @@ -237,7 +350,7 @@ def test_nonvectorized_math_apply_on_large_rolling_dataframe(self): df = pd.DataFrame( - {"x": np.arange(0, 1_500_000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=1_500_000) + {"x": np.arange(0, 2_000_000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=2_000_000) ) start_pd = time.time() @@ -253,6 +366,12 @@ self.assertEqual(pd_val, swifter_val) self.assertLess(swifter_time, pd_time) + def test_nonvectorized_math_apply_on_small_resampler_dataframe(self): + df = pd.DataFrame({"x": np.arange(0, 1000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=1000)) + pd_val = df.resample("3T").apply(math_agg_foo) + swifter_val = df.swifter.resample("3T").progress_bar(desc="Nonvec math apply ~ Resample DF").apply(math_agg_foo) + self.assertEqual(pd_val, swifter_val) + def test_nonvectorized_math_apply_on_large_resampler_dataframe(self): df = pd.DataFrame( {"x": np.arange(0, 1_000_000)}, index=pd.date_range("2019-01-1", "2020-01-1", periods=1_000_000) @@ -270,3 +389,47 @@ self.assertEqual(pd_val, swifter_val) self.assertLess(swifter_time, pd_time) + + def test_vectorized_math_applymap_on_large_dataframe(self): + df = pd.DataFrame({"x": np.random.normal(size=1_000_000), "y": np.random.uniform(size=1_000_000)}) + + start_pd = time.time() + pd_val = df.applymap(math_vec_square) + end_pd = time.time() + pd_time = end_pd - start_pd + + start_swifter = time.time() + swifter_val = df.swifter.progress_bar(desc="Vec math applymap ~ DF").applymap(math_vec_square) + end_swifter = time.time() + swifter_time = end_swifter - start_swifter + + self.assertEqual(pd_val, swifter_val) + self.assertLess(swifter_time, pd_time) + + def test_nonvectorized_math_applymap_on_large_dataframe(self): + df = pd.DataFrame({"x": np.random.normal(size=2_000_000), "y": np.random.uniform(size=2_000_000)}) + + start_pd = time.time() + pd_val = df.applymap(math_foo) + end_pd = time.time() + pd_time = end_pd - start_pd + + start_swifter = time.time() + swifter_val = df.swifter.progress_bar(desc="Nonvec math applymap ~ DF").applymap(math_foo) + end_swifter = time.time() + swifter_time = end_swifter - start_swifter + + self.assertEqual(pd_val, swifter_val) + self.assertLess(swifter_time, pd_time) + + def test_nonvectorized_math_applymap_on_small_dataframe(self): + df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) + pd_val = df.applymap(math_foo) + swifter_val = df.swifter.applymap(math_foo) + self.assertEqual(pd_val, swifter_val) + + def test_nonvectorized_math_applymap_on_small_dataframe_no_progress_bar(self): + df = pd.DataFrame({"x": np.random.normal(size=1000), "y": np.random.uniform(size=1000)}) + pd_val = df.applymap(math_foo) + swifter_val = df.swifter.progress_bar(enable=False).applymap(math_foo) + self.assertEqual(pd_val, swifter_val)
