itholic commented on a change in pull request #34114:
URL: https://github.com/apache/spark/pull/34114#discussion_r721904463
##########
File path: python/pyspark/pandas/data_type_ops/base.py
##########
@@ -349,11 +349,90 @@ def ge(self, left: IndexOpsLike, right: Any) ->
SeriesOrIndex:
raise TypeError(">= can not be applied to %s." % self.pretty_name)
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
- from pyspark.pandas.base import column_op
-
- _sanitize_list_like(right)
+ if isinstance(right, (list, tuple)):
+ from pyspark.pandas.series import first_series, scol_for
+ from pyspark.pandas.frame import DataFrame
+ from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME,
InternalField
+
+ sdf = left._internal.spark_frame
+ structed_scol = F.struct(
+ sdf[NATURAL_ORDER_COLUMN_NAME],
+ *left._internal.index_spark_columns,
+ left.spark.column
+ )
+ # The size of the list is expected to be small.
+ collected_structed_scol = F.collect_list(structed_scol)
+ # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee
the order.
+ collected_structed_scol = F.array_sort(collected_structed_scol)
+ right_values_scol = F.array([F.lit(x) for x in right]) # type:
ignore
+ index_scol_names = left._internal.index_spark_column_names
+ scol_name =
left._internal.spark_column_name_for(left._internal.column_labels[0])
+ # Compare the values of left and right by using zip_with function.
+ cond = F.zip_with(
+ collected_structed_scol,
+ right_values_scol,
+ lambda x, y: F.struct(
+ *[
+ x[index_scol_name].alias(index_scol_name)
+ for index_scol_name in index_scol_names
+ ],
+ F.when(x[scol_name].isNull() | y.isNull(), False)
+ .otherwise(
+ F.when(
+ F.assert_true(
+ # If the comparing result is null,
+ # that means the length of `left` and `right`
is not the same.
+ (x[scol_name] == y).isNotNull(),
Review comment:
Oh... yeah, it seems hard to detect a length mismatch based on
nullability.
`null | null` can be null, but a length mismatch is not the only case that
produces a null result.
I'd revert the code to the previous approach, which compares the lengths at
the beginning of the function.
I think it's okay to just compare the lengths, since the input is expected
to be small.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]