yeandy commented on a change in pull request #16615:
URL: https://github.com/apache/beam/pull/16615#discussion_r803185618
##########
File path: sdks/python/apache_beam/dataframe/transforms_test.py
##########
@@ -334,6 +358,49 @@ def test_repeat(self):
})
self.run_scenario(df, lambda df: df.strings.str.repeat(df.repeats))
+ def test_get_dummies(self):
+ # Should not work because series is not a categorical type
+ with self.assertRaisesRegex(
+ frame_base.WontImplementError,
+ r"get_dummies\(\) of non-categorical type is not supported"):
+ s = pd.Series(['a ,b', 'a', 'a, d'])
+ self.run_scenario(s, lambda s: s.str.get_dummies(','), check_subset=True)
+
+ # Different separator
+ s = pd.Series(['a ,b', 'a', 'a, d', 'c'])
+ s = s.astype(pd.CategoricalDtype(categories=['a ,b', 'c', 'b', 'a,d']))
+ self.run_scenario(s, lambda s: s.str.get_dummies(','), check_subset=True)
+
+ # Pandas docs example 1
+ s = pd.Series(['a|b', 'a', 'a|c']).astype('category')
+ self.run_scenario(s, lambda s: s.str.get_dummies(), check_subset=True)
+
+ # Pandas docs example 2
+ # Should still work even though np.nan is not considered a category
+ # because we automatically create a nan column
+ s = pd.Series(['a|b', np.nan, 'a|c']).astype('category')
+ self.run_scenario(s, lambda s: s.str.get_dummies(), check_subset=True)
+
+ # Should have two columns c, nan
+ s = pd.Series(['a|b', 'b|c', 'a|c', 'c', 'd'])
+ s = s.astype(pd.CategoricalDtype(categories=['a', 'b', 'c']))
+ self.run_scenario(s, lambda s: s.str.get_dummies(), check_subset=True)
+
+ # Explicitly pass nan as a category
+ s = pd.Series(['a|b', 'b|c', 'a|c', 'c', 'd'])
+ s = s.astype(pd.CategoricalDtype(categories=['a', 'b', 'c', 'nan']))
+ self.run_scenario(s, lambda s: s.str.get_dummies(), check_subset=True)
+
+ # Bools do not work because they are not a string type
+ with self.assertRaisesRegex(
+ AttributeError, r"Can only use .str accessor with string values"):
+ s = pd.Series([True, False, False, True])
+ self.run_scenario(s, lambda s: s.str.get_dummies(), check_subset=True)
+
+ # Bools cast to string work
+ s = pd.Series([True, False, False, True]).astype('str').astype('category')
+ self.run_scenario(s, lambda s: s.str.get_dummies(), check_subset=True)
Review comment:
Refactored!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]