This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 406455d Revert "[SPARK-36231][PYTHON] Support arithmetic operations of decimal(nan) series" 406455d is described below commit 406455d79f787486f9e6fab1dce0d9a2645b8d14 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Mon Nov 22 11:33:05 2021 +0900 Revert "[SPARK-36231][PYTHON] Support arithmetic operations of decimal(nan) series" This reverts commit 4529dba42769f13ab8cfbb9798a5f82eaaf17b34. --- python/pyspark/pandas/data_type_ops/num_ops.py | 23 +-------- .../pandas/tests/data_type_ops/test_num_ops.py | 56 ++++++++++++++-------- .../pandas/tests/data_type_ops/testing_utils.py | 50 ++++++++++++++++--- 3 files changed, 80 insertions(+), 49 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py index e08d6e9..3e74664 100644 --- a/python/pyspark/pandas/data_type_ops/num_ops.py +++ b/python/pyspark/pandas/data_type_ops/num_ops.py @@ -55,7 +55,7 @@ def _non_fractional_astype( elif isinstance(spark_type, BooleanType): return _as_bool_type(index_ops, dtype) elif isinstance(spark_type, StringType): - return _as_string_type(index_ops, dtype, null_str="NaN") + return _as_string_type(index_ops, dtype, null_str=str(np.nan)) else: return _as_other_type(index_ops, dtype, spark_type) @@ -447,29 +447,10 @@ class DecimalOps(FractionalOps): return index_ops.copy() def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike: + # TODO(SPARK-36230): check index_ops.hasnans after fixing SPARK-36230 dtype, spark_type = pandas_on_spark_type(dtype) - if is_integer_dtype(dtype) and not isinstance(dtype, extension_dtypes): - if index_ops.hasnans: - raise ValueError( - "Cannot convert %s with missing values to integer" % self.pretty_name - ) return _non_fractional_astype(index_ops, dtype, spark_type) - def rpow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex: - _sanitize_list_like(right) - if not isinstance(right, numbers.Number): - raise TypeError("Exponentiation can not be applied to given types.") - - def rpow_func(left: Column, right: Any) -> Column: - return ( - F.when(left.isNull(), np.nan) - .when(SF.lit(right == 1), right) - .otherwise(Column.__rpow__(left, right)) - ) - - right = transform_boolean_operand_to_numeric(right) - return column_op(rpow_func)(left, right) - class IntegralExtensionOps(IntegralOps): """ diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py index f4b36f9..4d1fb23 100644 --- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py +++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py @@ -172,13 +172,11 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): pdf, psdf = self.pdf, self.psdf for col in self.numeric_df_cols: pser, psser = pdf[col], psdf[col] - if col in ["float", "float_w_nan"]: + if col == "float": self.assert_eq(pser ** pser, psser ** psser) self.assert_eq(pser ** pser.astype(bool), psser ** psser.astype(bool)) self.assert_eq(pser ** True, psser ** True) self.assert_eq(pser ** False, psser ** False) - self.assert_eq(pser ** 1, psser ** 1) - self.assert_eq(pser ** 0, psser ** 0) for n_col in self.non_numeric_df_cols: if n_col == "bool": @@ -186,6 +184,18 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): else: self.assertRaises(TypeError, lambda: psser ** psdf[n_col]) + # TODO(SPARK-36031): Merge test_pow_with_nan into test_pow + def test_pow_with_float_nan(self): + for col in self.numeric_w_nan_df_cols: + if col == "float_w_nan": + pser, psser = self.numeric_w_nan_pdf[col], self.numeric_w_nan_psdf[col] + self.assert_eq(pser ** pser, psser ** psser) + self.assert_eq(pser ** pser.astype(bool), psser ** psser.astype(bool)) + self.assert_eq(pser ** True, psser ** True) + self.assert_eq(pser ** False, psser ** False) + self.assert_eq(pser ** 1, psser ** 1) + self.assert_eq(pser ** 0, psser ** 0) + def test_radd(self): pdf, psdf = self.pdf, self.psdf for col in self.numeric_df_cols: @@ -334,36 +344,40 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils): self.assert_eq(ps.from_pandas(pser), psser) def test_isnull(self): - pdf, psdf = self.pdf, self.psdf - for col in self.numeric_df_cols: + pdf, psdf = self.numeric_w_nan_pdf, self.numeric_w_nan_psdf + for col in self.numeric_w_nan_df_cols: self.assert_eq(pdf[col].isnull(), psdf[col].isnull()) def test_astype(self): pdf, psdf = self.pdf, self.psdf for col in self.numeric_df_cols: pser, psser = pdf[col], psdf[col] - - for int_type in [int, np.int32, np.int16, np.int8]: - if not pser.hasnans: - self.assert_eq(pser.astype(int_type), psser.astype(int_type)) - else: - self.assertRaisesRegex( - ValueError, - "Cannot convert %s with missing " - "values to integer" % psser._dtype_op.pretty_name, - lambda: psser.astype(int_type), - ) - - # TODO(SPARK-37039): the np.nan series.astype(bool) should be True - if not pser.hasnans: - self.assert_eq(pser.astype(bool), psser.astype(bool)) - + self.assert_eq(pser.astype(int), psser.astype(int)) self.assert_eq(pser.astype(float), psser.astype(float)) self.assert_eq(pser.astype(np.float32), psser.astype(np.float32)) + self.assert_eq(pser.astype(np.int32), psser.astype(np.int32)) + self.assert_eq(pser.astype(np.int16), psser.astype(np.int16)) + self.assert_eq(pser.astype(np.int8), psser.astype(np.int8)) self.assert_eq(pser.astype(str), psser.astype(str)) + self.assert_eq(pser.astype(bool), psser.astype(bool)) self.assert_eq(pser.astype("category"), psser.astype("category")) cat_type = CategoricalDtype(categories=[2, 1, 3]) self.assert_eq(pser.astype(cat_type), psser.astype(cat_type)) + self.assertRaisesRegex( + ValueError, + "Cannot convert fractions with missing values to integer", + lambda: self.float_withnan_psser.astype(int), + ) + self.assertRaisesRegex( + ValueError, + "Cannot convert fractions with missing values to integer", + lambda: self.float_withnan_psser.astype(np.int32), + ) + self.assert_eq(self.float_withnan_psser.astype(str), self.float_withnan_psser.astype(str)) + self.assert_eq(self.float_withnan_psser.astype(bool), self.float_withnan_psser.astype(bool)) + self.assert_eq( + self.float_withnan_psser.astype("category"), self.float_withnan_psser.astype("category") + ) if extension_object_dtypes_available and extension_float_dtypes_available: pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype()) psser = ps.from_pandas(pser) diff --git a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py index 01bd494..b7c50d2 100644 --- a/python/pyspark/pandas/tests/data_type_ops/testing_utils.py +++ b/python/pyspark/pandas/tests/data_type_ops/testing_utils.py @@ -49,14 +49,8 @@ class TestCasesUtils(object): dtypes = [np.int32, int, np.float32, float] sers = [pd.Series([1, 2, 3], dtype=dtype) for dtype in dtypes] sers.append(pd.Series([decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(3)])) - sers.append(pd.Series([decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(np.nan)])) - sers.append(pd.Series([1, 2, np.nan], dtype=float)) pdf = pd.concat(sers, axis=1) - pdf.columns = [dtype.__name__ for dtype in dtypes] + [ - "decimal", - "decimal_nan", - "float_nan", - ] + pdf.columns = [dtype.__name__ for dtype in dtypes] + ["decimal"] return pdf @property @@ -75,6 +69,25 @@ class TestCasesUtils(object): def integral_psdf(self): return ps.from_pandas(self.integral_pdf) + # TODO(SPARK-36031): Merge self.numeric_w_nan_p(s)df into self.numeric_p(s)df + @property + def numeric_w_nan_pdf(self): + psers = { + "float_w_nan": pd.Series([1, 2, np.nan]), + "decimal_w_nan": pd.Series( + [decimal.Decimal(1), decimal.Decimal(2), decimal.Decimal(np.nan)] + ), + } + return pd.concat(psers, axis=1) + + @property + def numeric_w_nan_psdf(self): + return ps.from_pandas(self.numeric_w_nan_pdf) + + @property + def numeric_w_nan_df_cols(self): + return self.numeric_w_nan_pdf.columns + @property def non_numeric_pdf(self): psers = { @@ -120,10 +133,33 @@ class TestCasesUtils(object): return [ps.from_pandas(pser) for pser in self.numeric_psers] @property + def decimal_withnan_pser(self): + return pd.Series([decimal.Decimal(1.0), decimal.Decimal(2.0), decimal.Decimal(np.nan)]) + + @property + def decimal_withnan_psser(self): + return ps.from_pandas(self.decimal_withnan_pser) + + @property + def float_withnan_pser(self): + return pd.Series([1, 2, np.nan]) + + @property + def float_withnan_psser(self): + return ps.from_pandas(self.float_withnan_pser) + + @property def numeric_pser_psser_pairs(self): return zip(self.numeric_psers, self.numeric_pssers) @property + def numeric_withnan_pser_psser_pairs(self): + return zip( + self.numeric_psers + [self.decimal_withnan_pser, self.float_withnan_pser], + self.numeric_pssers + [self.decimal_withnan_psser, self.float_withnan_psser], + ) + + @property def non_numeric_psers(self): psers = { "string": pd.Series(["x", "y", "z"]), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org