This is an automated email from the ASF dual-hosted git repository. ueshin pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new c418803  [SPARK-35847][PYTHON] Manage InternalField in DataTypeOps.isnull
c418803 is described below

commit c418803df7723d3bebce7792774d2b761a83be40
Author: Takuya UESHIN <ues...@databricks.com>
AuthorDate: Tue Jun 22 12:54:01 2021 -0700

    [SPARK-35847][PYTHON] Manage InternalField in DataTypeOps.isnull

    ### What changes were proposed in this pull request?

    Properly set `InternalField` for `DataTypeOps.isnull`.

    ### Why are the changes needed?

    The result of `DataTypeOps.isnull` must always be non-nullable boolean.
    We should manage `InternalField` for this case.

    ### Does this PR introduce _any_ user-facing change?

    No.

    ### How was this patch tested?

    Added some more tests.

    Closes #33005 from ueshin/issues/SPARK-35847/isnull_field.

    Authored-by: Takuya UESHIN <ues...@databricks.com>
    Signed-off-by: Takuya UESHIN <ues...@databricks.com>
---
 python/pyspark/pandas/data_type_ops/base.py        |  7 +++-
 python/pyspark/pandas/data_type_ops/num_ops.py     | 12 +++++-
 .../pandas/tests/data_type_ops/test_num_ops.py     | 46 +++++++++++++++++-----
 .../pandas/tests/data_type_ops/test_string_ops.py  |  3 ++
 4 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/base.py b/python/pyspark/pandas/data_type_ops/base.py
index 0a411ec..3eef0db 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -333,7 +333,12 @@ class DataTypeOps(object, metaclass=ABCMeta):
         return col.replace({np.nan: None})
 
     def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
-        return index_ops._with_new_scol(index_ops.spark.column.isNull())
+        return index_ops._with_new_scol(
+            index_ops.spark.column.isNull(),
+            field=index_ops._internal.data_fields[0].copy(
+                dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
+            ),
+        )
 
     def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
         raise TypeError("astype can not be applied to %s."
% self.pretty_name)
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
index 851ee2a..7f7a17f 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -367,7 +367,10 @@ class FractionalOps(NumericOps):
 
     def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
         return index_ops._with_new_scol(
-            index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column)
+            index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
+            field=index_ops._internal.data_fields[0].copy(
+                dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
+            ),
         )
 
     def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
@@ -404,7 +407,12 @@ class DecimalOps(FractionalOps):
         return "decimal"
 
     def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
-        return index_ops._with_new_scol(index_ops.spark.column.isNull())
+        return index_ops._with_new_scol(
+            index_ops.spark.column.isNull(),
+            field=index_ops._internal.data_fields[0].copy(
+                dtype=np.dtype("bool"), spark_type=BooleanType(), nullable=False
+            ),
+        )
 
     def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
         dtype, spark_type = pandas_on_spark_type(dtype)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index e4e1eecd..b8b579b 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -320,29 +320,55 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
 
 @unittest.skipIf(not extension_dtypes_available, "pandas extension dtypes are not available")
 class IntegralExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
-    def test_from_to_pandas(self):
-        data = [1, 2, 3, None]
+    @property
+    def intergral_extension_psers(self):
         dtypes = ["Int8", "Int16", "Int32", "Int64"]
-        for dtype in dtypes:
-            pser =
pd.Series(data, dtype=dtype)
-            psser = ps.Series(data, dtype=dtype)
+        return [pd.Series([1, 2, 3, None], dtype=dtype) for dtype in dtypes]
+
+    @property
+    def intergral_extension_pssers(self):
+        return [ps.from_pandas(pser) for pser in self.intergral_extension_psers]
+
+    @property
+    def intergral_extension_pser_psser_pairs(self):
+        return zip(self.intergral_extension_psers, self.intergral_extension_pssers)
+
+    def test_from_to_pandas(self):
+        for pser, psser in self.intergral_extension_pser_psser_pairs:
             self.check_extension(pser, psser.to_pandas())
             self.check_extension(ps.from_pandas(pser), psser)
 
+    def test_isnull(self):
+        for pser, psser in self.intergral_extension_pser_psser_pairs:
+            self.assert_eq(pser.isnull(), psser.isnull())
+
 
 @unittest.skipIf(
     not extension_float_dtypes_available, "pandas extension float dtypes are not available"
 )
 class FractionalExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
-    def test_from_to_pandas(self):
-        data = [0.1, 0.2, 0.3, None]
+    @property
+    def fractional_extension_psers(self):
         dtypes = ["Float32", "Float64"]
-        for dtype in dtypes:
-            pser = pd.Series(data, dtype=dtype)
-            psser = ps.Series(data, dtype=dtype)
+        return [pd.Series([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in dtypes]
+
+    @property
+    def fractional_extension_pssers(self):
+        return [ps.from_pandas(pser) for pser in self.fractional_extension_psers]
+
+    @property
+    def fractional_extension_pser_psser_pairs(self):
+        return zip(self.fractional_extension_psers, self.fractional_extension_pssers)
+
+    def test_from_to_pandas(self):
+        for pser, psser in self.fractional_extension_pser_psser_pairs:
             self.check_extension(pser, psser.to_pandas())
             self.check_extension(ps.from_pandas(pser), psser)
 
+    def test_isnull(self):
+        for pser, psser in self.fractional_extension_pser_psser_pairs:
+            self.assert_eq(pser.isnull(), psser.isnull())
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.data_type_ops.test_num_ops import *  # noqa: F401
diff --git
a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
index 3ca16af..de9cdca 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
@@ -214,6 +214,9 @@ class StringExtensionOpsTest(StringOpsTest, PandasOnSparkTestCase, TestCasesUtil
         self.assert_eq(pser, psser.to_pandas())
         self.assert_eq(ps.from_pandas(pser), psser)
 
+    def test_isnull(self):
+        self.assert_eq(self.pser.isnull(), self.psser.isnull())
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.data_type_ops.test_string_ops import *  # noqa: F401

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org