zhengruifeng commented on code in PR #48447:
URL: https://github.com/apache/spark/pull/48447#discussion_r1798720533
##########
python/pyspark/sql/plot/core.py:
##########
@@ -338,3 +340,154 @@ def pie(self, x: str, y: str, **kwargs: Any) -> "Figure":
},
)
return self(kind="pie", x=x, y=y, **kwargs)
+
+ def box(self, column: Union[str, List[str]], precision: float = 0.01,
**kwargs: Any):
+ """
+ Make a box plot of the DataFrame columns.
+
+ Make a box-and-whisker plot from DataFrame columns, optionally grouped
by some
+ other columns. A box plot is a method for graphically depicting groups
of numerical
+ data through their quartiles. The box extends from the Q1 to Q3
quartile values of
+ the data, with a line at the median (Q2). The whiskers extend from the
edges of box
+ to show the range of the data. By default, they extend no more than
+ 1.5 * IQR (IQR = Q3 - Q1) from the edges of the box, ending at the
farthest data point
+ within that interval. Outliers are plotted as separate dots.
+
+ Parameters
+ ----------
+ column: str or list of str
+ Column name or list of names to be used for creating the boxplot.
+ precision: float, default = 0.01
+ This argument is used by pyspark to compute approximate statistics
+ for building a boxplot.
+ **kwargs
+ Additional keyword arguments.
+
+ Returns
+ -------
+ :class:`plotly.graph_objs.Figure`
+ Return an custom object when ``backend!=plotly``.
+ Return an ndarray when ``subplots=True`` (matplotlib-only).
+
+ Notes
+ -----
+ There are behavior differences between pandas-on-Spark and pandas.
+
+ * pandas-on-Spark computes approximate statistics - expect differences
between
+ pandas and pandas-on-Spark boxplots, especially regarding 1st and
3rd quartiles.
+ * The `whis` argument is only supported as a single number.
+ * pandas-on-Spark doesn't support the following argument(s)
(matplotlib-only).
+
+ * `bootstrap` argument is not supported
+ * `autorange` argument is not supported
+
+ Examples
+ --------
+ Draw a box plot from a DataFrame with four columns of randomly
+ generated data.
+
+ For Series:
+
+ .. plotly::
+
+ >>> data = np.random.randn(25, 4)
+ >>> df = ps.DataFrame(data, columns=list('ABCD'))
+ >>> df['A'].plot.box() # doctest: +SKIP
+
+ This is an unsupported function for DataFrame type
+ """
+ return self(kind="box", column=column, precision=precision, **kwargs)
+
+
+class PySparkBoxPlotBase:
+ @staticmethod
+ def compute_box(
+ sdf: "DataFrame", colnames: List[str], whis: float, precision: float,
showfliers: bool
+ ):
+ assert len(colnames) > 0
+ formatted_colnames = ["`{}`".format(colname) for colname in colnames]
+
+ stats_scols = []
+ for i, colname in enumerate(formatted_colnames):
+ percentiles = F.percentile_approx(colname, [0.25, 0.50, 0.75],
int(1.0 / precision))
+ q1 = F.get(percentiles, 0)
+ med = F.get(percentiles, 1)
+ q3 = F.get(percentiles, 2)
+ iqr = q3 - q1
+ lfence = q1 - F.lit(whis) * iqr
+ ufence = q3 + F.lit(whis) * iqr
+
+ stats_scols.append(
+ F.struct(
+ F.mean(colname).alias("mean"),
+ med.alias("med"),
+ q1.alias("q1"),
+ q3.alias("q3"),
+ lfence.alias("lfence"),
+ ufence.alias("ufence"),
+ ).alias(f"_box_plot_stats_{i}")
+ )
+
+ sdf_stats = sdf.select(*stats_scols)
+
+ result_scols = []
+ for i, colname in enumerate(formatted_colnames):
+ value = F.col(colname)
+
+ lfence = F.col(f"_box_plot_stats_{i}.lfence")
+ ufence = F.col(f"_box_plot_stats_{i}.ufence")
+ mean = F.col(f"_box_plot_stats_{i}.mean")
+ med = F.col(f"_box_plot_stats_{i}.med")
+ q1 = F.col(f"_box_plot_stats_{i}.q1")
+ q3 = F.col(f"_box_plot_stats_{i}.q3")
+
+ outlier = ~value.between(lfence, ufence)
+
+ # Computes min and max values of non-outliers - the whiskers
+ upper_whisker = F.max(F.when(~outlier,
value).otherwise(F.lit(None)))
+ lower_whisker = F.min(F.when(~outlier,
value).otherwise(F.lit(None)))
+
+ # If it shows fliers, take the top 1k with the highest absolute
values
+ # Here we normalize the values by subtracting the median.
+ if showfliers:
+ pair = F.when(
+ outlier,
+ F.struct(F.abs(value - med), value.alias("val")),
+ ).otherwise(F.lit(None))
+ topk = collect_top_k(pair, 1001, False)
+ fliers = F.when(F.size(topk) > 0,
topk["val"]).otherwise(F.lit(None))
+ else:
+ fliers = F.lit(None)
+
+ result_scols.append(
+ F.struct(
+ F.first(mean).alias("mean"),
+ F.first(med).alias("med"),
+ F.first(q1).alias("q1"),
+ F.first(q3).alias("q3"),
+ upper_whisker.alias("upper_whisker"),
+ lower_whisker.alias("lower_whisker"),
+ fliers.alias("fliers"),
+ ).alias(f"_box_plot_results_{i}")
+ )
+
+ sdf_result =
sdf.join(sdf_stats.hint("broadcast")).select(*result_scols)
+ return sdf_result.first()
+
+
+def _invoke_internal_function_over_columns(name: str, *cols: "ColumnOrName")
-> Column:
Review Comment:
TODO for myself: find a proper place for this helper function so that it can be
used by both the PySpark DataFrame (pyspark df) and pandas-on-Spark (ps).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]