itholic commented on a change in pull request #34113:
URL: https://github.com/apache/spark/pull/34113#discussion_r718114641
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, I think it might be possible to reduce the job for each iteration,
but at least one Spark job is required for comparing each level of index.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, I think it might be possible to reduce the job for each iteration,
but at least one Spark job is required for comparing each level of index. Let
me address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, I think it might be possible to reduce the Spark job for each
iteration, but at least one Spark job is required for comparing each level of
index. Let me address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, I think it might be possible to reduce the Spark job for each
iteration, but still at least one Spark job is required for comparing each
level of index. Let me address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, I think it might be possible to reduce the Spark job for each
iteration, but still at least one Spark job is required for comparing each
level of index values. Let me address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, it might be possible to reduce the Spark job for each iteration,
but I think at least one Spark job is still required for comparing each level
of index values. Let me address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, it might be possible to reduce the Spark job for each iteration,
but I think at least one Spark job is still required for comparing each level
of index values. Let me try to address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,43 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Notes
+ -----
+ This API can be expensive since it has logic to sort and compare the
values of
+ all levels of indices that belong to MultiIndex.
+
+ Examples
+ --------
+ >>> from pyspark.pandas.config import set_option, reset_option
+ >>> set_option("compute.ops_on_diff_frames", True)
+
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+
+ >>> reset_option("compute.ops_on_diff_frames")
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ for nlevel in range(nlevels):
Review comment:
Yeah, it might be possible to reduce the Spark job for each iteration,
but I think at least one Spark job is still required for comparing each level
of index values. Let me try to address it.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,42 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Examples
+ --------
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ self_sdf = self._internal.spark_frame
+ other_sdf = other._internal.spark_frame
+ subtract_list = []
+ for nlevel in range(nlevels):
+ self_index_scol = self._internal.index_spark_columns[nlevel]
+ other_index_scol = other._internal.index_spark_columns[nlevel]
+ self_subtract_other = self_sdf.select(self_index_scol).subtract(
Review comment:
Correct, and
```python
ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "y"), ("b", "x")])
ps.MultiIndex.from_tuples([("b", "x"), ("a", "y")])
```
is also considered to be the same.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,42 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Examples
+ --------
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ self_sdf = self._internal.spark_frame
+ other_sdf = other._internal.spark_frame
+ subtract_list = []
+ for nlevel in range(nlevels):
+ self_index_scol = self._internal.index_spark_columns[nlevel]
+ other_index_scol = other._internal.index_spark_columns[nlevel]
+ self_subtract_other = self_sdf.select(self_index_scol).subtract(
Review comment:
Correct.
And
```python
ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "y"), ("b", "x")])
ps.MultiIndex.from_tuples([("b", "x"), ("a", "y")])
```
is also considered to be the same.
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,42 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Examples
+ --------
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ self_sdf = self._internal.spark_frame
+ other_sdf = other._internal.spark_frame
+ subtract_list = []
+ for nlevel in range(nlevels):
+ self_index_scol = self._internal.index_spark_columns[nlevel]
+ other_index_scol = other._internal.index_spark_columns[nlevel]
+ self_subtract_other = self_sdf.select(self_index_scol).subtract(
Review comment:
I think we don't have to preserve the same values.
It only compares the unique values of each level.
For example,
```python
>>> pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "y"),
("b", "x")])
>>> pmidx2 = pd.MultiIndex.from_tuples([("b", "x"), ("a", "y")])
>>> pmidx1.equal_levels(pmidx2)
True
```
##########
File path: python/pyspark/pandas/indexes/multi.py
##########
@@ -1137,6 +1137,42 @@ def intersection(self, other: Union[DataFrame, Series,
Index, List]) -> "MultiIn
)
return cast(MultiIndex, DataFrame(internal).index)
+ def equal_levels(self, other: "MultiIndex") -> bool:
+ """
+ Return True if the levels of both MultiIndex objects are the same
+
+ Examples
+ --------
+ >>> psmidx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"z")])
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("b", "y"), ("a", "x"), ("c",
"z")])
+ >>> psmidx1.equal_levels(psmidx2)
+ True
+
+ >>> psmidx2 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c",
"j")])
+ >>> psmidx1.equal_levels(psmidx2)
+ False
+ """
+ nlevels = self.nlevels
+ if nlevels != other.nlevels:
+ return False
+
+ self_sdf = self._internal.spark_frame
+ other_sdf = other._internal.spark_frame
+ subtract_list = []
+ for nlevel in range(nlevels):
+ self_index_scol = self._internal.index_spark_columns[nlevel]
+ other_index_scol = other._internal.index_spark_columns[nlevel]
+ self_subtract_other = self_sdf.select(self_index_scol).subtract(
Review comment:
I think we don't need to preserve the same values.
It only compares the unique values of each level.
For example,
```python
>>> pmidx1 = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("a", "y"),
("b", "x")])
>>> pmidx2 = pd.MultiIndex.from_tuples([("b", "x"), ("a", "y")])
>>> pmidx1.equal_levels(pmidx2)
True
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]