[beam] branch master updated: Only value_counts.drop_na for Pandas 2 (#28500)

tvalentyn Tue, 19 Sep 2023 12:03:24 -0700

This is an automated email from the ASF dual-hosted git repository.

tvalentyn pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git



The following commit(s) were added to refs/heads/master by this push:
     new 0f2f3b12ac6 Only  value_counts.drop_na for Pandas 2 (#28500)
0f2f3b12ac6 is described below

commit 0f2f3b12ac6f13dd068e95c22146f7aa636c6df4
Author: caneff <[email protected]>
AuthorDate: Tue Sep 19 15:03:10 2023 -0400

    Only  value_counts.drop_na for Pandas 2 (#28500)
---
 sdks/python/apache_beam/dataframe/frames.py      | 17 ++++++++++++++---
 sdks/python/apache_beam/dataframe/frames_test.py |  2 ++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/sdks/python/apache_beam/dataframe/frames.py 
b/sdks/python/apache_beam/dataframe/frames.py
index b864e952e19..7929c879bdd 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -2348,8 +2348,13 @@ class DeferredSeries(DeferredDataFrameOrSeries):
 
     result = column.groupby(column, dropna=dropna).size()
 
-    # groupby.size() names the index, which we don't need
-    result.index.name = None
+    # Pandas 2 introduces new naming for the results.
+    if PD_VERSION >= (2, 0):
+      result.index.name = getattr(self, "name", None)
+      result.name = "proportion" if normalize else "count"
+    else:
+      # groupby.size() names the index, which we don't need
+      result.index.name = None
 
     if normalize:
       return result / column.length()
@@ -4007,12 +4012,18 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
       columns = subset or list(self.columns)
 
       if dropna:
-        dropped = self.dropna()
+        # Must include subset here because otherwise we spuriously drop NAs due
+        # to columns outside our subset.
+        dropped = self.dropna(subset=subset)
       else:
         dropped = self
 
       result = dropped.groupby(columns, dropna=dropna).size()
 
+      # Pandas 2 introduces new naming for the results.
+      if PD_VERSION >= (2,0):
+        result.name = "proportion" if normalize else "count"
+
       if normalize:
         return result/dropped.length()
       else:
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py 
b/sdks/python/apache_beam/dataframe/frames_test.py
index 5d904855f87..257d77e0a6b 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -694,6 +694,8 @@ class DeferredFrameTest(_AbstractFrameTest):
 
     self._run_test(lambda df: df.value_counts(), df)
     self._run_test(lambda df: df.value_counts(normalize=True), df)
+    # Ensure we don't drop rows due to nan values in unused columns.
+    self._run_test(lambda df: df.value_counts('num_wings'), df)
 
     if PD_VERSION >= (1, 3):
       # dropna=False is new in pandas 1.3

[beam] branch master updated: Only value_counts.drop_na for Pandas 2 (#28500)

Reply via email to