HyukjinKwon commented on a change in pull request #34251:
URL: https://github.com/apache/spark/pull/34251#discussion_r727626053



##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -98,25 +98,29 @@ def set_result_text(self, ax):
             )
 
 
-class HistogramPlotBase:
-    @staticmethod
-    def prepare_hist_data(data, bins):
-        # TODO: this logic is similar with KdePlotBase. Might have to 
deduplicate it.
-        from pyspark.pandas.series import Series
+def _prepare_numeric_data(data):

Review comment:
       Can we create a class called `NumericPlotBase`, define this method on 
it, and let `HistogramPlotBase` and `KdePlotBase` inherit from it?

##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -98,25 +98,29 @@ def set_result_text(self, ax):
             )
 
 
-class HistogramPlotBase:
-    @staticmethod
-    def prepare_hist_data(data, bins):
-        # TODO: this logic is similar with KdePlotBase. Might have to 
deduplicate it.
-        from pyspark.pandas.series import Series
+def _prepare_numeric_data(data):

Review comment:
       mypy checks types only when types are provided :)

##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -98,25 +98,29 @@ def set_result_text(self, ax):
             )
 
 
-class HistogramPlotBase:
-    @staticmethod
-    def prepare_hist_data(data, bins):
-        # TODO: this logic is similar with KdePlotBase. Might have to 
deduplicate it.
-        from pyspark.pandas.series import Series
+def _prepare_numeric_data(data):

Review comment:
       mypy checks types only when types are provided :)

##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -117,6 +116,13 @@ def prepare_hist_data(data, bins):
                 "Empty {0!r}: no numeric data to " 
"plot".format(numeric_data.__class__.__name__)
             )
 
+        return data, numeric_data
+
+
+class HistogramPlotBase(NumericPlotBase):
+    @staticmethod
+    def prepare_hist_data(data, bins):
+        data, numeric_data = NumericPlotBase.prepare_numeric_data(data)

Review comment:
       ```suggestion
           data, numeric_data = self.prepare_numeric_data(data)
   ```

##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -340,25 +346,10 @@ def get_fliers(colname, outliers, min_val):
         return fliers
 
 
-class KdePlotBase:
+class KdePlotBase(NumericPlotBase):
     @staticmethod
     def prepare_kde_data(data):
-        # TODO: this logic is similar with HistogramPlotBase. Might have to 
deduplicate it.
-        from pyspark.pandas.series import Series
-
-        if isinstance(data, Series):
-            data = data.to_frame()
-
-        numeric_data = data.select_dtypes(
-            include=["byte", "decimal", "integer", "float", "long", "double", 
np.datetime64]
-        )
-
-        # no empty frames or series allowed
-        if len(numeric_data.columns) == 0:
-            raise TypeError(
-                "Empty {0!r}: no numeric data to " 
"plot".format(numeric_data.__class__.__name__)
-            )
-
+        _, numeric_data = NumericPlotBase.prepare_numeric_data(data)

Review comment:
       ```suggestion
           _, numeric_data = self.prepare_numeric_data(data)
   ```

##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -340,25 +346,10 @@ def get_fliers(colname, outliers, min_val):
         return fliers
 
 
-class KdePlotBase:
+class KdePlotBase(NumericPlotBase):
     @staticmethod
     def prepare_kde_data(data):
-        # TODO: this logic is similar with HistogramPlotBase. Might have to 
deduplicate it.
-        from pyspark.pandas.series import Series
-
-        if isinstance(data, Series):
-            data = data.to_frame()
-
-        numeric_data = data.select_dtypes(
-            include=["byte", "decimal", "integer", "float", "long", "double", 
np.datetime64]
-        )
-
-        # no empty frames or series allowed
-        if len(numeric_data.columns) == 0:
-            raise TypeError(
-                "Empty {0!r}: no numeric data to " 
"plot".format(numeric_data.__class__.__name__)
-            )
-
+        _, numeric_data = self.prepare_numeric_data(data)

Review comment:
       ```suggestion
           _, numeric_data = NumericPlotBase.prepare_numeric_data(data)
   ```

##########
File path: python/pyspark/pandas/plot/core.py
##########
@@ -117,6 +116,13 @@ def prepare_hist_data(data, bins):
                 "Empty {0!r}: no numeric data to " 
"plot".format(numeric_data.__class__.__name__)
             )
 
+        return data, numeric_data
+
+
+class HistogramPlotBase(NumericPlotBase):
+    @staticmethod
+    def prepare_hist_data(data, bins):
+        data, numeric_data = self.prepare_numeric_data(data)

Review comment:
       ```suggestion
           data, numeric_data = NumericPlotBase.prepare_numeric_data(data)
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to