This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new b1ddec5757ae [SPARK-49391][PS] Box plot select outliers by distance
from fences
b1ddec5757ae is described below
commit b1ddec5757aeef69bdd4b08f4f75096b129f5d31
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Mon Aug 26 18:10:36 2024 +0800
[SPARK-49391][PS] Box plot select outliers by distance from fences
### What changes were proposed in this pull request?
Box plot select outliers by distance from fences
### Why are the changes needed?
If there are more than 1k outliers, the existing implementation selects the
values by the distance `|value - min(non_outliers)|`, which is not reasonable
because it prefers outliers above the upper fence over outliers below the lower fence.
We should order them by the distance from fences:
1, if value > upper fence, value - upper fence;
2, if value < lower fence, lower fence - value;
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI and manual testing
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #47870 from zhengruifeng/plot_hist_select_outlier.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/pandas/plot/core.py | 42 ++++++++++++++++------
python/pyspark/pandas/plot/matplotlib.py | 2 +-
python/pyspark/pandas/plot/plotly.py | 4 +--
.../pyspark/pandas/tests/plot/test_series_plot.py | 2 +-
4 files changed, 35 insertions(+), 15 deletions(-)
diff --git a/python/pyspark/pandas/plot/core.py
b/python/pyspark/pandas/plot/core.py
index 2e188b411df1..fe5beb0e730d 100644
--- a/python/pyspark/pandas/plot/core.py
+++ b/python/pyspark/pandas/plot/core.py
@@ -420,14 +420,24 @@ class BoxPlotBase:
return minmax.iloc[0][["min", "max"]].values
@staticmethod
- def get_fliers(colname, outliers, min_val):
+ def get_fliers(colname, outliers, lfence, ufence):
# Filters only the outliers, should "showfliers" be True
fliers_df = outliers.filter("`__{}_outlier`".format(colname))
# If it shows fliers, take the top 1k with highest absolute values
- # Here we normalize the values by subtracting the minimum value from
- # each, and use absolute values.
- order_col = F.abs(F.col("`{}`".format(colname)) - min_val.item())
+ # Here we normalize the values by subtracting the fences.
+ formated_colname = "`{}`".format(colname)
+ order_col = (
+ F.when(
+ F.col(formated_colname) > F.lit(ufence),
+ F.col(formated_colname) - F.lit(ufence),
+ )
+ .when(
+ F.col(formated_colname) < F.lit(lfence),
+ F.lit(lfence) - F.col(formated_colname),
+ )
+ .otherwise(F.lit(None))
+ )
fliers = (
fliers_df.select(F.col("`{}`".format(colname)))
.orderBy(order_col)
@@ -439,15 +449,26 @@ class BoxPlotBase:
return fliers
@staticmethod
- def get_multicol_fliers(colnames, multicol_outliers, multicol_whiskers):
+ def get_multicol_fliers(colnames, multicol_outliers, multicol_stats):
scols = []
- extract_colnames = []
for i, colname in enumerate(colnames):
formated_colname = "`{}`".format(colname)
outlier_colname = "__{}_outlier".format(colname)
- min_val = multicol_whiskers[colname]["min"]
+ lfence, ufence = multicol_stats[colname]["lfence"],
multicol_stats[colname]["ufence"]
+ order_col = (
+ F.when(
+ F.col(formated_colname) > F.lit(ufence),
+ F.col(formated_colname) - F.lit(ufence),
+ )
+ .when(
+ F.col(formated_colname) < F.lit(lfence),
+ F.lit(lfence) - F.col(formated_colname),
+ )
+ .otherwise(F.lit(None))
+ )
+
pair_col = F.struct(
- F.abs(F.col(formated_colname) - F.lit(min_val)).alias("ord"),
+ order_col.alias("ord"),
F.col(formated_colname).alias("val"),
)
scols.append(
@@ -457,11 +478,10 @@ class BoxPlotBase:
.alias(f"pair_{i}"),
1001,
False,
- ).alias(f"top_{i}")
+ ).alias(f"top_{i}")["val"]
)
- extract_colnames.append(f"top_{i}.val")
- results =
multicol_outliers.select(scols).select(extract_colnames).first()
+ results = multicol_outliers.select(scols).first()
fliers = {}
for i, colname in enumerate(colnames):
diff --git a/python/pyspark/pandas/plot/matplotlib.py
b/python/pyspark/pandas/plot/matplotlib.py
index f496f2bc664b..3d045ffc8d6b 100644
--- a/python/pyspark/pandas/plot/matplotlib.py
+++ b/python/pyspark/pandas/plot/matplotlib.py
@@ -292,7 +292,7 @@ class PandasOnSparkBoxPlot(PandasBoxPlot, BoxPlotBase):
whiskers = BoxPlotBase.calc_whiskers(spark_column_name, outliers)
if showfliers:
- fliers = BoxPlotBase.get_fliers(spark_column_name, outliers,
whiskers[0])
+ fliers = BoxPlotBase.get_fliers(spark_column_name, outliers,
*col_fences)
else:
fliers = []
diff --git a/python/pyspark/pandas/plot/plotly.py
b/python/pyspark/pandas/plot/plotly.py
index 0afcd6d7e869..995060eb9c12 100644
--- a/python/pyspark/pandas/plot/plotly.py
+++ b/python/pyspark/pandas/plot/plotly.py
@@ -162,7 +162,7 @@ def plot_box(data: Union["ps.DataFrame", "ps.Series"],
**kwargs):
fliers = None
if boxpoints:
- fliers = BoxPlotBase.get_fliers(spark_column_name, outliers,
whiskers[0])
+ fliers = BoxPlotBase.get_fliers(spark_column_name, outliers,
*col_fences)
fliers = [fliers] if len(fliers) > 0 else None
fig.add_trace(
@@ -201,7 +201,7 @@ def plot_box(data: Union["ps.DataFrame", "ps.Series"],
**kwargs):
fliers = None
if boxpoints:
- fliers = BoxPlotBase.get_multicol_fliers(numeric_column_names,
outliers, whiskers)
+ fliers = BoxPlotBase.get_multicol_fliers(numeric_column_names,
outliers, multicol_stats)
i = 0
for colname in numeric_column_names:
diff --git a/python/pyspark/pandas/tests/plot/test_series_plot.py
b/python/pyspark/pandas/tests/plot/test_series_plot.py
index 9daefbc2a23b..9bd335af527e 100644
--- a/python/pyspark/pandas/tests/plot/test_series_plot.py
+++ b/python/pyspark/pandas/tests/plot/test_series_plot.py
@@ -61,7 +61,7 @@ class SeriesPlotTestsMixin:
stats, fences = BoxPlotBase.compute_stats(psdf["a"], "a", whis=k,
precision=0.01)
outliers = BoxPlotBase.outliers(psdf["a"], "a", *fences)
whiskers = BoxPlotBase.calc_whiskers("a", outliers)
- fliers = BoxPlotBase.get_fliers("a", outliers, whiskers[0])
+ fliers = BoxPlotBase.get_fliers("a", outliers, *fences)
expected_mean = pdf["a"].mean()
expected_median = pdf["a"].median()
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]