This is an automated email from the ASF dual-hosted git repository.
tvalentyn pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/beam.git
The following commit(s) were added to refs/heads/master by this push:
new 0f2f3b12ac6 Only value_counts.drop_na for Pandas 2 (#28500)
0f2f3b12ac6 is described below
commit 0f2f3b12ac6f13dd068e95c22146f7aa636c6df4
Author: caneff <[email protected]>
AuthorDate: Tue Sep 19 15:03:10 2023 -0400
Only value_counts.drop_na for Pandas 2 (#28500)
---
sdks/python/apache_beam/dataframe/frames.py | 17 ++++++++++++++---
sdks/python/apache_beam/dataframe/frames_test.py | 2 ++
2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py
index b864e952e19..7929c879bdd 100644
--- a/sdks/python/apache_beam/dataframe/frames.py
+++ b/sdks/python/apache_beam/dataframe/frames.py
@@ -2348,8 +2348,13 @@ class DeferredSeries(DeferredDataFrameOrSeries):
result = column.groupby(column, dropna=dropna).size()
- # groupby.size() names the index, which we don't need
- result.index.name = None
+ # Pandas 2 introduces new naming for the results.
+ if PD_VERSION >= (2, 0):
+ result.index.name = getattr(self, "name", None)
+ result.name = "proportion" if normalize else "count"
+ else:
+ # groupby.size() names the index, which we don't need
+ result.index.name = None
if normalize:
return result / column.length()
@@ -4007,12 +4012,18 @@ class DeferredDataFrame(DeferredDataFrameOrSeries):
columns = subset or list(self.columns)
if dropna:
- dropped = self.dropna()
+ # Must include subset here because otherwise we spuriously drop NAs due
+ # to columns outside our subset.
+ dropped = self.dropna(subset=subset)
else:
dropped = self
result = dropped.groupby(columns, dropna=dropna).size()
+ # Pandas 2 introduces new naming for the results.
+ if PD_VERSION >= (2,0):
+ result.name = "proportion" if normalize else "count"
+
if normalize:
return result/dropped.length()
else:
diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py
index 5d904855f87..257d77e0a6b 100644
--- a/sdks/python/apache_beam/dataframe/frames_test.py
+++ b/sdks/python/apache_beam/dataframe/frames_test.py
@@ -694,6 +694,8 @@ class DeferredFrameTest(_AbstractFrameTest):
self._run_test(lambda df: df.value_counts(), df)
self._run_test(lambda df: df.value_counts(normalize=True), df)
+ # Ensure we don't drop rows due to nan values in unused columns.
+ self._run_test(lambda df: df.value_counts('num_wings'), df)
if PD_VERSION >= (1, 3):
# dropna=False is new in pandas 1.3