This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new be546ff5cd08 [SPARK-49890][PYTHON] Extract the preparation of
df.sample to parent class
be546ff5cd08 is described below
commit be546ff5cd084ff3d883b554776d544493b38d3e
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Oct 7 20:46:17 2024 +0900
[SPARK-49890][PYTHON] Extract the preparation of df.sample to parent class
### What changes were proposed in this pull request?
Extract the argument validation and normalization for `df.sample` into a shared helper on the parent `DataFrame` class.
### Why are the changes needed?
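To deduplicate code: the classic and Spark Connect `DataFrame.sample` implementations each carried their own copy of the same argument-handling logic.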
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Covered by existing tests.
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #48365 from zhengruifeng/py_sql_sample.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/sql/classic/dataframe.py | 40 ++--------------------------
python/pyspark/sql/connect/dataframe.py | 47 +++------------------------------
python/pyspark/sql/dataframe.py | 42 +++++++++++++++++++++++++++++
3 files changed, 48 insertions(+), 81 deletions(-)
diff --git a/python/pyspark/sql/classic/dataframe.py
b/python/pyspark/sql/classic/dataframe.py
index 9f9dedbd3820..e412b98c47de 100644
--- a/python/pyspark/sql/classic/dataframe.py
+++ b/python/pyspark/sql/classic/dataframe.py
@@ -601,44 +601,8 @@ class DataFrame(ParentDataFrame, PandasMapOpsMixin,
PandasConversionMixin):
fraction: Optional[Union[int, float]] = None,
seed: Optional[int] = None,
) -> ParentDataFrame:
- # For the cases below:
- # sample(True, 0.5 [, seed])
- # sample(True, fraction=0.5 [, seed])
- # sample(withReplacement=False, fraction=0.5 [, seed])
- is_withReplacement_set = type(withReplacement) == bool and
isinstance(fraction, float)
-
- # For the case below:
- # sample(faction=0.5 [, seed])
- is_withReplacement_omitted_kwargs = withReplacement is None and
isinstance(fraction, float)
-
- # For the case below:
- # sample(0.5 [, seed])
- is_withReplacement_omitted_args = isinstance(withReplacement, float)
-
- if not (
- is_withReplacement_set
- or is_withReplacement_omitted_kwargs
- or is_withReplacement_omitted_args
- ):
- argtypes = [type(arg).__name__ for arg in [withReplacement,
fraction, seed]]
- raise PySparkTypeError(
- errorClass="NOT_BOOL_OR_FLOAT_OR_INT",
- messageParameters={
- "arg_name": "withReplacement (optional), "
- + "fraction (required) and seed (optional)",
- "arg_type": ", ".join(argtypes),
- },
- )
-
- if is_withReplacement_omitted_args:
- if fraction is not None:
- seed = cast(int, fraction)
- fraction = withReplacement
- withReplacement = None
-
- seed = int(seed) if seed is not None else None
- args = [arg for arg in [withReplacement, fraction, seed] if arg is not
None]
- jdf = self._jdf.sample(*args)
+ _w, _f, _s = self._preapare_args_for_sample(withReplacement, fraction,
seed)
+ jdf = self._jdf.sample(*[_w, _f, _s])
return DataFrame(jdf, self.sparkSession)
def sampleBy(
diff --git a/python/pyspark/sql/connect/dataframe.py
b/python/pyspark/sql/connect/dataframe.py
index 136fe60532df..bb4dcb38c9e5 100644
--- a/python/pyspark/sql/connect/dataframe.py
+++ b/python/pyspark/sql/connect/dataframe.py
@@ -781,53 +781,14 @@ class DataFrame(ParentDataFrame):
fraction: Optional[Union[int, float]] = None,
seed: Optional[int] = None,
) -> ParentDataFrame:
- # For the cases below:
- # sample(True, 0.5 [, seed])
- # sample(True, fraction=0.5 [, seed])
- # sample(withReplacement=False, fraction=0.5 [, seed])
- is_withReplacement_set = type(withReplacement) == bool and
isinstance(fraction, float)
-
- # For the case below:
- # sample(faction=0.5 [, seed])
- is_withReplacement_omitted_kwargs = withReplacement is None and
isinstance(fraction, float)
-
- # For the case below:
- # sample(0.5 [, seed])
- is_withReplacement_omitted_args = isinstance(withReplacement, float)
-
- if not (
- is_withReplacement_set
- or is_withReplacement_omitted_kwargs
- or is_withReplacement_omitted_args
- ):
- argtypes = [type(arg).__name__ for arg in [withReplacement,
fraction, seed]]
- raise PySparkTypeError(
- errorClass="NOT_BOOL_OR_FLOAT_OR_INT",
- messageParameters={
- "arg_name": "withReplacement (optional), "
- + "fraction (required) and seed (optional)",
- "arg_type": ", ".join(argtypes),
- },
- )
-
- if is_withReplacement_omitted_args:
- if fraction is not None:
- seed = cast(int, fraction)
- fraction = withReplacement
- withReplacement = None
-
- if withReplacement is None:
- withReplacement = False
-
- seed = int(seed) if seed is not None else random.randint(0,
sys.maxsize)
-
+ _w, _f, _s = self._preapare_args_for_sample(withReplacement, fraction,
seed)
res = DataFrame(
plan.Sample(
child=self._plan,
lower_bound=0.0,
- upper_bound=fraction, # type: ignore[arg-type]
- with_replacement=withReplacement, # type: ignore[arg-type]
- seed=seed,
+ upper_bound=_f,
+ with_replacement=_w,
+ seed=_s,
),
session=self._session,
)
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 5906108163b4..c21e2271a64a 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -17,6 +17,8 @@
# mypy: disable-error-code="empty-body"
+import sys
+import random
from typing import (
Any,
Callable,
@@ -2040,6 +2042,46 @@ class DataFrame:
"""
...
+ def _preapare_args_for_sample(
+ self,
+ withReplacement: Optional[Union[float, bool]] = None,
+ fraction: Optional[Union[int, float]] = None,
+ seed: Optional[int] = None,
+ ) -> Tuple[bool, float, int]:
+ from pyspark.errors import PySparkTypeError
+
+ if isinstance(withReplacement, bool) and isinstance(fraction, float):
+ # For the cases below:
+ # sample(True, 0.5 [, seed])
+ # sample(True, fraction=0.5 [, seed])
+ # sample(withReplacement=False, fraction=0.5 [, seed])
+ _seed = int(seed) if seed is not None else random.randint(0,
sys.maxsize)
+ return withReplacement, fraction, _seed
+
+ elif withReplacement is None and isinstance(fraction, float):
+ # For the case below:
+ # sample(fraction=0.5 [, seed])
+ _seed = int(seed) if seed is not None else random.randint(0,
sys.maxsize)
+ return False, fraction, _seed
+
+ elif isinstance(withReplacement, float):
+ # For the case below:
+ # sample(0.5 [, seed])
+ _seed = int(fraction) if fraction is not None else
random.randint(0, sys.maxsize)
+ _fraction = float(withReplacement)
+ return False, _fraction, _seed
+
+ else:
+ argtypes = [type(arg).__name__ for arg in [withReplacement,
fraction, seed]]
+ raise PySparkTypeError(
+ errorClass="NOT_BOOL_OR_FLOAT_OR_INT",
+ messageParameters={
+ "arg_name": "withReplacement (optional), "
+ + "fraction (required) and seed (optional)",
+ "arg_type": ", ".join(argtypes),
+ },
+ )
+
@dispatch_df_method
def sampleBy(
self, col: "ColumnOrName", fractions: Dict[Any, float], seed:
Optional[int] = None
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]