This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 4f56958c1231 [SPARK-46471][PS][TESTS][FOLLOWUPS] Reorganize
`OpsOnDiffFramesEnabledTests`: Factor out more tests
4f56958c1231 is described below
commit 4f56958c1231794da71160a385427ffd730bb396
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Sun Dec 24 17:43:59 2023 +0800
[SPARK-46471][PS][TESTS][FOLLOWUPS] Reorganize
`OpsOnDiffFramesEnabledTests`: Factor out more tests
### What changes were proposed in this pull request?
factor out following tests:
- test_bitwise
- test_bitwise_extension_dtype
- test_combine_first
- test_compare
- test_concat_column_axis -> test_concat_column_axis_inner &
test_concat_column_axis_outer
### Why are the changes needed?
for testing parallelism
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
ci
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #44469 from zhengruifeng/ps_test_diff_ops_2.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
dev/sparktestsupport/modules.py | 10 +
.../connect/diff_frames_ops/test_parity_bitwise.py | 41 +++
.../diff_frames_ops/test_parity_combine_first.py | 41 +++
.../diff_frames_ops/test_parity_compare_series.py | 41 +++
.../diff_frames_ops/test_parity_concat_inner.py | 41 +++
.../diff_frames_ops/test_parity_concat_outer.py | 41 +++
.../pandas/tests/diff_frames_ops/test_bitwise.py | 110 +++++++++
.../tests/diff_frames_ops/test_combine_first.py | 110 +++++++++
.../tests/diff_frames_ops/test_compare_series.py | 155 ++++++++++++
.../tests/diff_frames_ops/test_concat_inner.py | 123 +++++++++
.../tests/diff_frames_ops/test_concat_outer.py | 81 ++++++
.../pandas/tests/test_ops_on_diff_frames.py | 275 +--------------------
12 files changed, 795 insertions(+), 274 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 33e7dd3af97a..939e88bf95b2 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -872,6 +872,11 @@ pyspark_pandas_slow = Module(
"pyspark.pandas.tests.diff_frames_ops.test_arithmetic_chain_ext_float",
"pyspark.pandas.tests.diff_frames_ops.test_assign_frame",
"pyspark.pandas.tests.diff_frames_ops.test_assign_series",
+ "pyspark.pandas.tests.diff_frames_ops.test_bitwise",
+ "pyspark.pandas.tests.diff_frames_ops.test_combine_first",
+ "pyspark.pandas.tests.diff_frames_ops.test_compare_series",
+ "pyspark.pandas.tests.diff_frames_ops.test_concat_inner",
+ "pyspark.pandas.tests.diff_frames_ops.test_concat_outer",
"pyspark.pandas.tests.diff_frames_ops.test_basic_slow",
"pyspark.pandas.tests.diff_frames_ops.test_cov",
"pyspark.pandas.tests.diff_frames_ops.test_corrwith",
@@ -1239,6 +1244,11 @@ pyspark_pandas_connect_part3 = Module(
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_arithmetic_chain_ext_float",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_frame",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_assign_series",
+ "pyspark.pandas.tests.connect.diff_frames_ops.test_parity_bitwise",
+
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_combine_first",
+
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_compare_series",
+
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_inner",
+
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_outer",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_aggregate",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_groupby_apply",
diff --git
a/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_bitwise.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_bitwise.py
new file mode 100644
index 000000000000..75335adc4162
--- /dev/null
+++ b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_bitwise.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.diff_frames_ops.test_bitwise import BitwiseMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class BitwiseParityTests(
+ BitwiseMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_bitwise import * # noqa
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git
a/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_combine_first.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_combine_first.py
new file mode 100644
index 000000000000..3ee500c316e9
--- /dev/null
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_combine_first.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.diff_frames_ops.test_combine_first import CombineFirstMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class CombineFirstParityTests(
+ CombineFirstMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_combine_first import * # noqa
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git
a/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_compare_series.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_compare_series.py
new file mode 100644
index 000000000000..af866a5948ad
--- /dev/null
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_compare_series.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.diff_frames_ops.test_compare_series import CompareSeriesMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class CompareSeriesParityTests(
+ CompareSeriesMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_compare_series import * # noqa
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git
a/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_concat_inner.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_concat_inner.py
new file mode 100644
index 000000000000..fd13d617792e
--- /dev/null
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_concat_inner.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.diff_frames_ops.test_concat_inner import ConcatInnerMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class ConcatInnerParityTests(
+ ConcatInnerMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_inner import * # noqa
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git
a/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_concat_outer.py
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_concat_outer.py
new file mode 100644
index 000000000000..f7fa2c550c30
--- /dev/null
+++
b/python/pyspark/pandas/tests/connect/diff_frames_ops/test_parity_concat_outer.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.diff_frames_ops.test_concat_outer import ConcatOuterMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class ConcatOuterParityTests(
+ ConcatOuterMixin,
+ PandasOnSparkTestUtils,
+ ReusedConnectTestCase,
+):
+ pass
+
+
+if __name__ == "__main__":
+ from pyspark.pandas.tests.connect.diff_frames_ops.test_parity_concat_outer import * # noqa
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_bitwise.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_bitwise.py
new file mode 100644
index 000000000000..04e9734ff823
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_bitwise.py
@@ -0,0 +1,110 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.typedef.typehints import extension_object_dtypes_available
+
+
+class BitwiseMixin:
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ set_option("compute.ops_on_diff_frames", True)
+
+ @classmethod
+ def tearDownClass(cls):
+ reset_option("compute.ops_on_diff_frames")
+ super().tearDownClass()
+
+ def test_bitwise(self):
+ pser1 = pd.Series([True, False, True, False, np.nan, np.nan, True,
False, np.nan])
+ pser2 = pd.Series([True, False, False, True, True, False, np.nan,
np.nan, np.nan])
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ self.assert_eq(pser1 | pser2, (psser1 | psser2).sort_index())
+ self.assert_eq(pser1 & pser2, (psser1 & psser2).sort_index())
+
+ pser1 = pd.Series([True, False, np.nan], index=list("ABC"))
+ pser2 = pd.Series([False, True, np.nan], index=list("DEF"))
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ self.assert_eq(pser1 | pser2, (psser1 | psser2).sort_index())
+ self.assert_eq(pser1 & pser2, (psser1 & psser2).sort_index())
+
+ @unittest.skipIf(
+ not extension_object_dtypes_available, "pandas extension object dtypes
are not available"
+ )
+ def test_bitwise_extension_dtype(self):
+ pser1 = pd.Series(
+ [True, False, True, False, np.nan, np.nan, True, False, np.nan],
dtype="boolean"
+ )
+ pser2 = pd.Series(
+ [True, False, False, True, True, False, np.nan, np.nan, np.nan],
dtype="boolean"
+ )
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ self.assert_eq((psser1 | psser2).sort_index(), pser1 | pser2)
+ self.assert_eq((psser1 & psser2).sort_index(), pser1 & pser2)
+
+ pser1 = pd.Series([True, False, np.nan], index=list("ABC"),
dtype="boolean")
+ pser2 = pd.Series([False, True, np.nan], index=list("DEF"),
dtype="boolean")
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ # a pandas bug?
+ # assert_eq((psser1 | psser2).sort_index(), pser1 | pser2)
+ # assert_eq((psser1 & psser2).sort_index(), pser1 & pser2)
+ self.assert_eq(
+ (psser1 | psser2).sort_index(),
+ pd.Series([True, None, None, None, True, None],
index=list("ABCDEF"), dtype="boolean"),
+ )
+ self.assert_eq(
+ (psser1 & psser2).sort_index(),
+ pd.Series(
+ [None, False, None, False, None, None], index=list("ABCDEF"),
dtype="boolean"
+ ),
+ )
+
+
+class BitwiseTests(
+ BitwiseMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.diff_frames_ops.test_bitwise import * # noqa:
F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_combine_first.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_combine_first.py
new file mode 100644
index 000000000000..3fae57ac47c0
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_combine_first.py
@@ -0,0 +1,110 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class CombineFirstMixin:
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ set_option("compute.ops_on_diff_frames", True)
+
+ @classmethod
+ def tearDownClass(cls):
+ reset_option("compute.ops_on_diff_frames")
+ super().tearDownClass()
+
+ def test_combine_first(self):
+ pser1 = pd.Series({"falcon": 330.0, "eagle": 160.0})
+ pser2 = pd.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0})
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ self.assert_eq(
+ psser1.combine_first(psser2).sort_index(),
pser1.combine_first(pser2).sort_index()
+ )
+ with self.assertRaisesRegex(
+ TypeError, "`combine_first` only allows `Series` for parameter
`other`"
+ ):
+ psser1.combine_first(50)
+
+ psser1.name = ("X", "A")
+ psser2.name = ("Y", "B")
+ pser1.name = ("X", "A")
+ pser2.name = ("Y", "B")
+ self.assert_eq(
+ psser1.combine_first(psser2).sort_index(),
pser1.combine_first(pser2).sort_index()
+ )
+
+ # MultiIndex
+ midx1 = pd.MultiIndex(
+ [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length",
"power"]],
+ [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]],
+ )
+ midx2 = pd.MultiIndex(
+ [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
+ [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
+ )
+ pser1 = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx1)
+ pser2 = pd.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3],
index=midx2)
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+
+ self.assert_eq(
+ psser1.combine_first(psser2).sort_index(),
pser1.combine_first(pser2).sort_index()
+ )
+
+ # DataFrame
+ pdf1 = pd.DataFrame({"A": [None, 0], "B": [4, None]})
+ psdf1 = ps.from_pandas(pdf1)
+ pdf2 = pd.DataFrame({"C": [3, 3], "B": [1, 1]})
+ psdf2 = ps.from_pandas(pdf2)
+
+ self.assert_eq(pdf1.combine_first(pdf2),
psdf1.combine_first(psdf2).sort_index())
+
+ pdf1.columns = pd.MultiIndex.from_tuples([("A", "willow"), ("B",
"pine")])
+ psdf1 = ps.from_pandas(pdf1)
+ pdf2.columns = pd.MultiIndex.from_tuples([("C", "oak"), ("B", "pine")])
+ psdf2 = ps.from_pandas(pdf2)
+
+ self.assert_eq(pdf1.combine_first(pdf2),
psdf1.combine_first(psdf2).sort_index())
+
+
+class CombineFirstTests(
+ CombineFirstMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.diff_frames_ops.test_combine_first import * #
noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_compare_series.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_compare_series.py
new file mode 100644
index 000000000000..c548f8a2d32c
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_compare_series.py
@@ -0,0 +1,155 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class CompareSeriesMixin:
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ set_option("compute.ops_on_diff_frames", True)
+
+ @classmethod
+ def tearDownClass(cls):
+ reset_option("compute.ops_on_diff_frames")
+ super().tearDownClass()
+
+ def test_compare(self):
+ pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])
+ pser2 = pd.Series(["a", "c", np.nan, np.nan, "h"])
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+ self.assert_eq(
+ pser1.compare(pser2).sort_index(),
+ psser1.compare(psser2).sort_index(),
+ )
+
+ # `keep_shape=True`
+ self.assert_eq(
+ pser1.compare(pser2, keep_shape=True).sort_index(),
+ psser1.compare(psser2, keep_shape=True).sort_index(),
+ )
+ # `keep_equal=True`
+ self.assert_eq(
+ pser1.compare(pser2, keep_equal=True).sort_index(),
+ psser1.compare(psser2, keep_equal=True).sort_index(),
+ )
+ # `keep_shape=True` and `keep_equal=True`
+ self.assert_eq(
+ pser1.compare(pser2, keep_shape=True,
keep_equal=True).sort_index(),
+ psser1.compare(psser2, keep_shape=True,
keep_equal=True).sort_index(),
+ )
+
+ # MultiIndex
+ pser1.index = pd.MultiIndex.from_tuples(
+ [("a", "x"), ("b", "y"), ("c", "z"), ("x", "k"), ("q", "l")]
+ )
+ pser2.index = pd.MultiIndex.from_tuples(
+ [("a", "x"), ("b", "y"), ("c", "z"), ("x", "k"), ("q", "l")]
+ )
+ psser1 = ps.from_pandas(pser1)
+ psser2 = ps.from_pandas(pser2)
+ self.assert_eq(
+ pser1.compare(pser2).sort_index(),
+ psser1.compare(psser2).sort_index(),
+ )
+
+ # `keep_shape=True` with MultiIndex
+ self.assert_eq(
+ pser1.compare(pser2, keep_shape=True).sort_index(),
+ psser1.compare(psser2, keep_shape=True).sort_index(),
+ )
+ # `keep_equal=True` with MultiIndex
+ self.assert_eq(
+ pser1.compare(pser2, keep_equal=True).sort_index(),
+ psser1.compare(psser2, keep_equal=True).sort_index(),
+ )
+ # `keep_shape=True` and `keep_equal=True` with MultiIndex
+ self.assert_eq(
+ pser1.compare(pser2, keep_shape=True,
keep_equal=True).sort_index(),
+ psser1.compare(psser2, keep_shape=True,
keep_equal=True).sort_index(),
+ )
+
+ # Different Index
+ with self.assertRaisesRegex(
+ ValueError, "Can only compare identically-labeled Series objects"
+ ):
+ psser1 = ps.Series(
+ [1, 2, 3, 4, 5],
+ index=pd.Index([1, 2, 3, 4, 5]),
+ )
+ psser2 = ps.Series(
+ [2, 2, 3, 4, 1],
+ index=pd.Index([5, 4, 3, 2, 1]),
+ )
+ psser1.compare(psser2)
+ # Different MultiIndex
+ with self.assertRaisesRegex(
+ ValueError, "Can only compare identically-labeled Series objects"
+ ):
+ psser1 = ps.Series(
+ [1, 2, 3, 4, 5],
+ index=pd.MultiIndex.from_tuples(
+ [("a", "x"), ("b", "y"), ("c", "z"), ("x", "k"), ("q",
"l")]
+ ),
+ )
+ psser2 = ps.Series(
+ [2, 2, 3, 4, 1],
+ index=pd.MultiIndex.from_tuples(
+ [("a", "x"), ("b", "y"), ("c", "a"), ("x", "k"), ("q",
"l")]
+ ),
+ )
+ psser1.compare(psser2)
+ # SPARK-37495: Skip identical index checking of Series.compare when
config
+ # 'compute.eager_check' is disabled
+ psser1 = ps.Series([1, 2, 3, 4, 5], index=pd.Index([1, 2, 3, 4, 5]))
+ psser2 = ps.Series([1, 2, 3, 4, 5, 6], index=pd.Index([1, 2, 4, 3, 6,
7]))
+ expected = ps.DataFrame(
+ {"self": [3, 4, 5, np.nan, np.nan], "other": [4, 3, np.nan, 5.0,
6.0]},
+ index=[3, 4, 5, 6, 7],
+ )
+
+ with ps.option_context("compute.eager_check", False):
+ self.assert_eq(expected, psser1.compare(psser2))
+
+
+class CompareSeriesTests(
+ CompareSeriesMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.diff_frames_ops.test_compare_series import * #
noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_concat_inner.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_concat_inner.py
new file mode 100644
index 000000000000..57e0d3948944
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_concat_inner.py
@@ -0,0 +1,123 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class ConcatTestingFuncMixin:
+ def _test_frames(self):
+ pdf1 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
+ pdf1.columns.names = ["AB"]
+ pdf2 = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
+ pdf2.columns.names = ["CD"]
+ psdf1 = ps.from_pandas(pdf1)
+ psdf2 = ps.from_pandas(pdf2)
+
+ psdf3 = psdf1.copy()
+ psdf4 = psdf2.copy()
+ pdf3 = pdf1.copy()
+ pdf4 = pdf2.copy()
+
+ columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")],
names=["X", "AB"])
+ pdf3.columns = columns
+ psdf3.columns = columns
+
+ columns = pd.MultiIndex.from_tuples([("X", "C"), ("X", "D")],
names=["Y", "CD"])
+ pdf4.columns = columns
+ psdf4.columns = columns
+
+ pdf5 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
+ pdf6 = pd.DataFrame({"C": [1, 2, 3]}, index=[1, 3, 5])
+ psdf5 = ps.from_pandas(pdf5)
+ psdf6 = ps.from_pandas(pdf6)
+
+ objs = [
+ ([psdf1.A, psdf2.C], [pdf1.A, pdf2.C]),
+ # TODO: ([psdf1, psdf2.C], [pdf1, pdf2.C]),
+ ([psdf1.A, psdf2], [pdf1.A, pdf2]),
+ ([psdf1.A, psdf2.C], [pdf1.A, pdf2.C]),
+ ([psdf3[("X", "A")], psdf4[("X", "C")]], [pdf3[("X", "A")],
pdf4[("X", "C")]]),
+ ([psdf3, psdf4[("X", "C")]], [pdf3, pdf4[("X", "C")]]),
+ ([psdf3[("X", "A")], psdf4], [pdf3[("X", "A")], pdf4]),
+ ([psdf3, psdf4], [pdf3, pdf4]),
+ ([psdf5, psdf6], [pdf5, pdf6]),
+ ([psdf6, psdf5], [pdf6, pdf5]),
+ ]
+
+ return objs
+
+
+class ConcatInnerMixin(ConcatTestingFuncMixin):
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ set_option("compute.ops_on_diff_frames", True)
+
+ @classmethod
+ def tearDownClass(cls):
+ reset_option("compute.ops_on_diff_frames")
+ super().tearDownClass()
+
+ def test_concat_column_axis_inner(self):
+ join = "inner"
+
+ objs = self._test_frames()
+ for i, (psdfs, pdfs) in enumerate(objs):
+ for ignore_index in [True, False]:
+ with self.subTest(ignore_index=ignore_index, join=join,
pdfs=pdfs, pair=i):
+ actual = ps.concat(psdfs, axis=1,
ignore_index=ignore_index, join=join)
+ expected = pd.concat(pdfs, axis=1,
ignore_index=ignore_index, join=join)
+ self.assert_eq(
+
repr(actual.sort_values(list(actual.columns)).reset_index(drop=True)),
+
repr(expected.sort_values(list(expected.columns)).reset_index(drop=True)),
+ )
+ actual = ps.concat(
+ psdfs, axis=1, ignore_index=ignore_index, join=join,
sort=True
+ )
+ expected = pd.concat(
+ pdfs, axis=1, ignore_index=ignore_index, join=join,
sort=True
+ )
+ self.assert_eq(
+ repr(actual.reset_index(drop=True)),
+ repr(expected.reset_index(drop=True)),
+ )
+
+
+class ConcatInnerTests(
+ ConcatInnerMixin,
+ PandasOnSparkTestCase,
+ SQLTestUtils,
+):
+ pass
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.diff_frames_ops.test_concat_inner import * #
noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/diff_frames_ops/test_concat_outer.py
b/python/pyspark/pandas/tests/diff_frames_ops/test_concat_outer.py
new file mode 100644
index 000000000000..bc6942b73226
--- /dev/null
+++ b/python/pyspark/pandas/tests/diff_frames_ops/test_concat_outer.py
@@ -0,0 +1,81 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.pandas.config import set_option, reset_option
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+from pyspark.pandas.tests.diff_frames_ops.test_concat_inner import
ConcatTestingFuncMixin
+
+
class ConcatOuterMixin(ConcatTestingFuncMixin):
    """Tests for ps.concat along the column axis with join="outer",
    checked against pandas on frames with differing anchors."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Combining differently-anchored frames requires this option.
        set_option("compute.ops_on_diff_frames", True)

    @classmethod
    def tearDownClass(cls):
        reset_option("compute.ops_on_diff_frames")
        super().tearDownClass()

    def test_concat_column_axis_outer(self):
        join = "outer"

        for pair, (psdfs, pdfs) in enumerate(self._test_frames()):
            for ignore_index in (True, False):
                with self.subTest(ignore_index=ignore_index, join=join, pdfs=pdfs, pair=pair):
                    # Normalize row order (sort on all columns, drop the
                    # index) before comparing the two results.
                    got = ps.concat(psdfs, axis=1, ignore_index=ignore_index, join=join)
                    want = pd.concat(pdfs, axis=1, ignore_index=ignore_index, join=join)
                    self.assert_eq(
                        repr(got.sort_values(list(got.columns)).reset_index(drop=True)),
                        repr(want.sort_values(list(want.columns)).reset_index(drop=True)),
                    )
                    # With sort=True, compare without the extra sort_values step.
                    got = ps.concat(psdfs, axis=1, ignore_index=ignore_index, join=join, sort=True)
                    want = pd.concat(pdfs, axis=1, ignore_index=ignore_index, join=join, sort=True)
                    self.assert_eq(
                        repr(got.reset_index(drop=True)),
                        repr(want.reset_index(drop=True)),
                    )
+
+
class ConcatOuterTests(
    ConcatOuterMixin,
    PandasOnSparkTestCase,
    SQLTestUtils,
):
    # Concrete test class: binds the shared mixin's test cases to the
    # Spark-backed harness; all test logic lives in ConcatOuterMixin.
    pass
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.diff_frames_ops.test_concat_outer import * #
noqa: F401
+
+ try:
+ import xmlrunner
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 505e96e68752..75410a65227d 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -15,21 +15,15 @@
# limitations under the License.
#
-from itertools import product
+
import unittest
import pandas as pd
-import numpy as np
from pyspark import pandas as ps
from pyspark.pandas.config import set_option, reset_option
from pyspark.testing.pandasutils import PandasOnSparkTestCase
from pyspark.testing.sqlutils import SQLTestUtils
-from pyspark.pandas.typedef.typehints import (
- extension_dtypes_available,
- extension_float_dtypes_available,
- extension_object_dtypes_available,
-)
class OpsOnDiffFramesEnabledTestsMixin:
@@ -232,175 +226,6 @@ class OpsOnDiffFramesEnabledTestsMixin:
pser.name = psser.name = "B"
self.assert_eq(pser.loc[pdf2.A > -3].sort_index(), psser.loc[psdf2.A >
-3].sort_index())
- def test_bitwise(self):
- pser1 = pd.Series([True, False, True, False, np.nan, np.nan, True,
False, np.nan])
- pser2 = pd.Series([True, False, False, True, True, False, np.nan,
np.nan, np.nan])
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- self.assert_eq(pser1 | pser2, (psser1 | psser2).sort_index())
- self.assert_eq(pser1 & pser2, (psser1 & psser2).sort_index())
-
- pser1 = pd.Series([True, False, np.nan], index=list("ABC"))
- pser2 = pd.Series([False, True, np.nan], index=list("DEF"))
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- self.assert_eq(pser1 | pser2, (psser1 | psser2).sort_index())
- self.assert_eq(pser1 & pser2, (psser1 & psser2).sort_index())
-
- @unittest.skipIf(
- not extension_object_dtypes_available, "pandas extension object dtypes
are not available"
- )
- def test_bitwise_extension_dtype(self):
- pser1 = pd.Series(
- [True, False, True, False, np.nan, np.nan, True, False, np.nan],
dtype="boolean"
- )
- pser2 = pd.Series(
- [True, False, False, True, True, False, np.nan, np.nan, np.nan],
dtype="boolean"
- )
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- self.assert_eq((psser1 | psser2).sort_index(), pser1 | pser2)
- self.assert_eq((psser1 & psser2).sort_index(), pser1 & pser2)
-
- pser1 = pd.Series([True, False, np.nan], index=list("ABC"),
dtype="boolean")
- pser2 = pd.Series([False, True, np.nan], index=list("DEF"),
dtype="boolean")
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- # a pandas bug?
- # assert_eq((psser1 | psser2).sort_index(), pser1 | pser2)
- # assert_eq((psser1 & psser2).sort_index(), pser1 & pser2)
- self.assert_eq(
- (psser1 | psser2).sort_index(),
- pd.Series([True, None, None, None, True, None],
index=list("ABCDEF"), dtype="boolean"),
- )
- self.assert_eq(
- (psser1 & psser2).sort_index(),
- pd.Series(
- [None, False, None, False, None, None], index=list("ABCDEF"),
dtype="boolean"
- ),
- )
-
- def test_concat_column_axis(self):
- pdf1 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
- pdf1.columns.names = ["AB"]
- pdf2 = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}, index=[1, 3, 5])
- pdf2.columns.names = ["CD"]
- psdf1 = ps.from_pandas(pdf1)
- psdf2 = ps.from_pandas(pdf2)
-
- psdf3 = psdf1.copy()
- psdf4 = psdf2.copy()
- pdf3 = pdf1.copy()
- pdf4 = pdf2.copy()
-
- columns = pd.MultiIndex.from_tuples([("X", "A"), ("X", "B")],
names=["X", "AB"])
- pdf3.columns = columns
- psdf3.columns = columns
-
- columns = pd.MultiIndex.from_tuples([("X", "C"), ("X", "D")],
names=["Y", "CD"])
- pdf4.columns = columns
- psdf4.columns = columns
-
- pdf5 = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5]}, index=[1, 2, 3])
- pdf6 = pd.DataFrame({"C": [1, 2, 3]}, index=[1, 3, 5])
- psdf5 = ps.from_pandas(pdf5)
- psdf6 = ps.from_pandas(pdf6)
-
- ignore_indexes = [True, False]
- joins = ["inner", "outer"]
-
- objs = [
- ([psdf1.A, psdf2.C], [pdf1.A, pdf2.C]),
- # TODO: ([psdf1, psdf2.C], [pdf1, pdf2.C]),
- ([psdf1.A, psdf2], [pdf1.A, pdf2]),
- ([psdf1.A, psdf2.C], [pdf1.A, pdf2.C]),
- ([psdf3[("X", "A")], psdf4[("X", "C")]], [pdf3[("X", "A")],
pdf4[("X", "C")]]),
- ([psdf3, psdf4[("X", "C")]], [pdf3, pdf4[("X", "C")]]),
- ([psdf3[("X", "A")], psdf4], [pdf3[("X", "A")], pdf4]),
- ([psdf3, psdf4], [pdf3, pdf4]),
- ([psdf5, psdf6], [pdf5, pdf6]),
- ([psdf6, psdf5], [pdf6, pdf5]),
- ]
-
- for ignore_index, join in product(ignore_indexes, joins):
- for i, (psdfs, pdfs) in enumerate(objs):
- with self.subTest(ignore_index=ignore_index, join=join,
pdfs=pdfs, pair=i):
- actual = ps.concat(psdfs, axis=1,
ignore_index=ignore_index, join=join)
- expected = pd.concat(pdfs, axis=1,
ignore_index=ignore_index, join=join)
- self.assert_eq(
-
repr(actual.sort_values(list(actual.columns)).reset_index(drop=True)),
-
repr(expected.sort_values(list(expected.columns)).reset_index(drop=True)),
- )
- actual = ps.concat(
- psdfs, axis=1, ignore_index=ignore_index, join=join,
sort=True
- )
- expected = pd.concat(
- pdfs, axis=1, ignore_index=ignore_index, join=join,
sort=True
- )
- self.assert_eq(
- repr(actual.reset_index(drop=True)),
- repr(expected.reset_index(drop=True)),
- )
-
- def test_combine_first(self):
- pser1 = pd.Series({"falcon": 330.0, "eagle": 160.0})
- pser2 = pd.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0})
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- self.assert_eq(
- psser1.combine_first(psser2).sort_index(),
pser1.combine_first(pser2).sort_index()
- )
- with self.assertRaisesRegex(
- TypeError, "`combine_first` only allows `Series` for parameter
`other`"
- ):
- psser1.combine_first(50)
-
- psser1.name = ("X", "A")
- psser2.name = ("Y", "B")
- pser1.name = ("X", "A")
- pser2.name = ("Y", "B")
- self.assert_eq(
- psser1.combine_first(psser2).sort_index(),
pser1.combine_first(pser2).sort_index()
- )
-
- # MultiIndex
- midx1 = pd.MultiIndex(
- [["lama", "cow", "falcon", "koala"], ["speed", "weight", "length",
"power"]],
- [[0, 3, 1, 1, 1, 2, 2, 2], [0, 2, 0, 3, 2, 0, 1, 3]],
- )
- midx2 = pd.MultiIndex(
- [["lama", "cow", "falcon"], ["speed", "weight", "length"]],
- [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
- )
- pser1 = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1], index=midx1)
- pser2 = pd.Series([-45, 200, -1.2, 30, -250, 1.5, 320, 1, -0.3],
index=midx2)
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
-
- self.assert_eq(
- psser1.combine_first(psser2).sort_index(),
pser1.combine_first(pser2).sort_index()
- )
-
- # DataFrame
- pdf1 = pd.DataFrame({"A": [None, 0], "B": [4, None]})
- psdf1 = ps.from_pandas(pdf1)
- pdf2 = pd.DataFrame({"C": [3, 3], "B": [1, 1]})
- psdf2 = ps.from_pandas(pdf2)
-
- self.assert_eq(pdf1.combine_first(pdf2),
psdf1.combine_first(psdf2).sort_index())
-
- pdf1.columns = pd.MultiIndex.from_tuples([("A", "willow"), ("B",
"pine")])
- psdf1 = ps.from_pandas(pdf1)
- pdf2.columns = pd.MultiIndex.from_tuples([("C", "oak"), ("B", "pine")])
- psdf2 = ps.from_pandas(pdf2)
-
- self.assert_eq(pdf1.combine_first(pdf2),
psdf1.combine_first(psdf2).sort_index())
-
def test_insert(self):
#
# Basic DataFrame
@@ -443,104 +268,6 @@ class OpsOnDiffFramesEnabledTestsMixin:
pdf.insert(0, ("b", "c", ""), pser)
self.assert_eq(psdf.sort_index(), pdf.sort_index())
- def test_compare(self):
- pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])
- pser2 = pd.Series(["a", "c", np.nan, np.nan, "h"])
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
- self.assert_eq(
- pser1.compare(pser2).sort_index(),
- psser1.compare(psser2).sort_index(),
- )
-
- # `keep_shape=True`
- self.assert_eq(
- pser1.compare(pser2, keep_shape=True).sort_index(),
- psser1.compare(psser2, keep_shape=True).sort_index(),
- )
- # `keep_equal=True`
- self.assert_eq(
- pser1.compare(pser2, keep_equal=True).sort_index(),
- psser1.compare(psser2, keep_equal=True).sort_index(),
- )
- # `keep_shape=True` and `keep_equal=True`
- self.assert_eq(
- pser1.compare(pser2, keep_shape=True,
keep_equal=True).sort_index(),
- psser1.compare(psser2, keep_shape=True,
keep_equal=True).sort_index(),
- )
-
- # MultiIndex
- pser1.index = pd.MultiIndex.from_tuples(
- [("a", "x"), ("b", "y"), ("c", "z"), ("x", "k"), ("q", "l")]
- )
- pser2.index = pd.MultiIndex.from_tuples(
- [("a", "x"), ("b", "y"), ("c", "z"), ("x", "k"), ("q", "l")]
- )
- psser1 = ps.from_pandas(pser1)
- psser2 = ps.from_pandas(pser2)
- self.assert_eq(
- pser1.compare(pser2).sort_index(),
- psser1.compare(psser2).sort_index(),
- )
-
- # `keep_shape=True` with MultiIndex
- self.assert_eq(
- pser1.compare(pser2, keep_shape=True).sort_index(),
- psser1.compare(psser2, keep_shape=True).sort_index(),
- )
- # `keep_equal=True` with MultiIndex
- self.assert_eq(
- pser1.compare(pser2, keep_equal=True).sort_index(),
- psser1.compare(psser2, keep_equal=True).sort_index(),
- )
- # `keep_shape=True` and `keep_equal=True` with MultiIndex
- self.assert_eq(
- pser1.compare(pser2, keep_shape=True,
keep_equal=True).sort_index(),
- psser1.compare(psser2, keep_shape=True,
keep_equal=True).sort_index(),
- )
-
- # Different Index
- with self.assertRaisesRegex(
- ValueError, "Can only compare identically-labeled Series objects"
- ):
- psser1 = ps.Series(
- [1, 2, 3, 4, 5],
- index=pd.Index([1, 2, 3, 4, 5]),
- )
- psser2 = ps.Series(
- [2, 2, 3, 4, 1],
- index=pd.Index([5, 4, 3, 2, 1]),
- )
- psser1.compare(psser2)
- # Different MultiIndex
- with self.assertRaisesRegex(
- ValueError, "Can only compare identically-labeled Series objects"
- ):
- psser1 = ps.Series(
- [1, 2, 3, 4, 5],
- index=pd.MultiIndex.from_tuples(
- [("a", "x"), ("b", "y"), ("c", "z"), ("x", "k"), ("q",
"l")]
- ),
- )
- psser2 = ps.Series(
- [2, 2, 3, 4, 1],
- index=pd.MultiIndex.from_tuples(
- [("a", "x"), ("b", "y"), ("c", "a"), ("x", "k"), ("q",
"l")]
- ),
- )
- psser1.compare(psser2)
- # SPARK-37495: Skip identical index checking of Series.compare when
config
- # 'compute.eager_check' is disabled
- psser1 = ps.Series([1, 2, 3, 4, 5], index=pd.Index([1, 2, 3, 4, 5]))
- psser2 = ps.Series([1, 2, 3, 4, 5, 6], index=pd.Index([1, 2, 4, 3, 6,
7]))
- expected = ps.DataFrame(
- {"self": [3, 4, 5, np.nan, np.nan], "other": [4, 3, np.nan, 5.0,
6.0]},
- index=[3, 4, 5, 6, 7],
- )
-
- with ps.option_context("compute.eager_check", False):
- self.assert_eq(expected, psser1.compare(psser2))
-
def test_different_columns(self):
psdf1 = self.psdf1
psdf4 = self.psdf4
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]