[spark] branch master updated: [SPARK-35847][PYTHON] Manage InternalField in DataTypeOps.isnull

ueshin Tue, 22 Jun 2021 12:54:40 -0700

This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new c418803  [SPARK-35847][PYTHON] Manage InternalField in DataTypeOps.isnull
c418803 is described below

commit c418803df7723d3bebce7792774d2b761a83be40
Author: Takuya UESHIN <ues...@databricks.com>
AuthorDate: Tue Jun 22 12:54:01 2021 -0700

    [SPARK-35847][PYTHON] Manage InternalField in DataTypeOps.isnull
    
    ### What changes were proposed in this pull request?
    
    Properly set `InternalField` for `DataTypeOps.isnull`.
    
    ### Why are the changes needed?
    
    The result of `DataTypeOps.isnull` must always be a non-nullable boolean.
    We should manage `InternalField` for this case.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Added some more tests.
    
    Closes #33005 from ueshin/issues/SPARK-35847/isnull_field.
    
    Authored-by: Takuya UESHIN <ues...@databricks.com>
    Signed-off-by: Takuya UESHIN <ues...@databricks.com>
---
 python/pyspark/pandas/data_type_ops/base.py        |  7 +++-
 python/pyspark/pandas/data_type_ops/num_ops.py     | 12 +++++-
 .../pandas/tests/data_type_ops/test_num_ops.py     | 46 +++++++++++++++++-----
 .../pandas/tests/data_type_ops/test_string_ops.py  |  3 ++
 4 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/base.py 
b/python/pyspark/pandas/data_type_ops/base.py
index 0a411ec..3eef0db 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -333,7 +333,12 @@ class DataTypeOps(object, metaclass=ABCMeta):
         return col.replace({np.nan: None})
 
     def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
-        return index_ops._with_new_scol(index_ops.spark.column.isNull())
+        return index_ops._with_new_scol(
+            index_ops.spark.column.isNull(),
+            field=index_ops._internal.data_fields[0].copy(
+                dtype=np.dtype("bool"), spark_type=BooleanType(), 
nullable=False
+            ),
+        )
 
     def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> 
T_IndexOps:
         raise TypeError("astype can not be applied to %s." % self.pretty_name)
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py 
b/python/pyspark/pandas/data_type_ops/num_ops.py
index 851ee2a..7f7a17f 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -367,7 +367,10 @@ class FractionalOps(NumericOps):
 
     def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
         return index_ops._with_new_scol(
-            index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column)
+            index_ops.spark.column.isNull() | F.isnan(index_ops.spark.column),
+            field=index_ops._internal.data_fields[0].copy(
+                dtype=np.dtype("bool"), spark_type=BooleanType(), 
nullable=False
+            ),
         )
 
     def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> 
T_IndexOps:
@@ -404,7 +407,12 @@ class DecimalOps(FractionalOps):
         return "decimal"
 
     def isnull(self, index_ops: T_IndexOps) -> T_IndexOps:
-        return index_ops._with_new_scol(index_ops.spark.column.isNull())
+        return index_ops._with_new_scol(
+            index_ops.spark.column.isNull(),
+            field=index_ops._internal.data_fields[0].copy(
+                dtype=np.dtype("bool"), spark_type=BooleanType(), 
nullable=False
+            ),
+        )
 
     def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> 
T_IndexOps:
         dtype, spark_type = pandas_on_spark_type(dtype)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
index e4e1eecd..b8b579b 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_num_ops.py
@@ -320,29 +320,55 @@ class NumOpsTest(PandasOnSparkTestCase, TestCasesUtils):
 
 @unittest.skipIf(not extension_dtypes_available, "pandas extension dtypes are 
not available")
 class IntegralExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
-    def test_from_to_pandas(self):
-        data = [1, 2, 3, None]
+    @property
+    def intergral_extension_psers(self):
         dtypes = ["Int8", "Int16", "Int32", "Int64"]
-        for dtype in dtypes:
-            pser = pd.Series(data, dtype=dtype)
-            psser = ps.Series(data, dtype=dtype)
+        return [pd.Series([1, 2, 3, None], dtype=dtype) for dtype in dtypes]
+
+    @property
+    def intergral_extension_pssers(self):
+        return [ps.from_pandas(pser) for pser in 
self.intergral_extension_psers]
+
+    @property
+    def intergral_extension_pser_psser_pairs(self):
+        return zip(self.intergral_extension_psers, 
self.intergral_extension_pssers)
+
+    def test_from_to_pandas(self):
+        for pser, psser in self.intergral_extension_pser_psser_pairs:
             self.check_extension(pser, psser.to_pandas())
             self.check_extension(ps.from_pandas(pser), psser)
 
+    def test_isnull(self):
+        for pser, psser in self.intergral_extension_pser_psser_pairs:
+            self.assert_eq(pser.isnull(), psser.isnull())
+
 
 @unittest.skipIf(
     not extension_float_dtypes_available, "pandas extension float dtypes are 
not available"
 )
 class FractionalExtensionOpsTest(PandasOnSparkTestCase, TestCasesUtils):
-    def test_from_to_pandas(self):
-        data = [0.1, 0.2, 0.3, None]
+    @property
+    def fractional_extension_psers(self):
         dtypes = ["Float32", "Float64"]
-        for dtype in dtypes:
-            pser = pd.Series(data, dtype=dtype)
-            psser = ps.Series(data, dtype=dtype)
+        return [pd.Series([0.1, 0.2, 0.3, None], dtype=dtype) for dtype in 
dtypes]
+
+    @property
+    def fractional_extension_pssers(self):
+        return [ps.from_pandas(pser) for pser in 
self.fractional_extension_psers]
+
+    @property
+    def fractional_extension_pser_psser_pairs(self):
+        return zip(self.fractional_extension_psers, 
self.fractional_extension_pssers)
+
+    def test_from_to_pandas(self):
+        for pser, psser in self.fractional_extension_pser_psser_pairs:
             self.check_extension(pser, psser.to_pandas())
             self.check_extension(ps.from_pandas(pser), psser)
 
+    def test_isnull(self):
+        for pser, psser in self.fractional_extension_pser_psser_pairs:
+            self.assert_eq(pser.isnull(), psser.isnull())
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.data_type_ops.test_num_ops import *  # noqa: F401
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
index 3ca16af..de9cdca 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
@@ -214,6 +214,9 @@ class StringExtensionOpsTest(StringOpsTest, 
PandasOnSparkTestCase, TestCasesUtil
         self.assert_eq(pser, psser.to_pandas())
         self.assert_eq(ps.from_pandas(pser), psser)
 
+    def test_isnull(self):
+        self.assert_eq(self.pser.isnull(), self.psser.isnull())
+
 
 if __name__ == "__main__":
     from pyspark.pandas.tests.data_type_ops.test_string_ops import *  # noqa: 
F401

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-35847][PYTHON] Manage InternalField in DataTypeOps.isnull

Reply via email to