[GitHub] spark pull request #22326: [SPARK-25314][SQL] Fix Python UDF accessing attri...

xuanyuanking Tue, 25 Sep 2018 01:33:42 -0700

Github user xuanyuanking commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22326#discussion_r220102842
  
    --- Diff: python/pyspark/sql/tests.py ---
    @@ -547,6 +547,92 @@ def test_udf_in_filter_on_top_of_join(self):
             df = left.crossJoin(right).filter(f("a", "b"))
             self.assertEqual(df.collect(), [Row(a=1, b=1)])
     
    +    def test_udf_in_join_condition(self):
    +        # regression test for SPARK-25314
    +        from pyspark.sql.functions import udf
    +        left = self.spark.createDataFrame([Row(a=1)])
    +        right = self.spark.createDataFrame([Row(b=1)])
    +        f = udf(lambda a, b: a == b, BooleanType())
    +        df = left.join(right, f("a", "b"))
    +        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
    +            self.assertEqual(df.collect(), [Row(a=1, b=1)])
    +
    +    def test_udf_in_left_semi_join_condition(self):
    +        # regression test for SPARK-25314
    +        from pyspark.sql.functions import udf
    +        left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    +        right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1)])
    +        f = udf(lambda a, b: a == b, BooleanType())
    +        df = left.join(right, f("a", "b"), "leftsemi")
    +        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
    +            self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1)])
    +
    +    def test_udf_and_filter_in_join_condition(self):
    +        # regression test for SPARK-25314
    +        # test the complex scenario with both udf(non-deterministic)
    +        # and normal filter(deterministic)
    +        from pyspark.sql.functions import udf
    +        left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    +        right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=2, b1=1, b2=2)])
    +        f = udf(lambda a, b: a == b, BooleanType())
    +        df = left.join(right, [f("a", "b1"), left.a == 1, right.b == 2])
    +        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
    +            self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=2, b1=1, b2=2)])
    +
    +    def test_udf_and_filter_in_left_semi_join_condition(self):
    +        # regression test for SPARK-25314
    +        # test the complex scenario with both udf(non-deterministic)
    +        # and normal filter(deterministic)
    +        from pyspark.sql.functions import udf
    +        left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    +        right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=2, b1=1, b2=2)])
    +        f = udf(lambda a, b: a == b, BooleanType())
    +        df = left.join(right, [f("a", "b1"), left.a == 1, right.b == 2], "left_semi")
    +        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
    +            self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1)])
    +
    +    def test_udf_and_common_filter_in_join_condition(self):
    +        # regression test for SPARK-25314
    +        # test the complex scenario with both udf(non-deterministic)
    +        # and common filter(deterministic)
    +        from pyspark.sql.functions import udf
    +        left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    +        right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
    +        f = udf(lambda a, b: a == b, BooleanType())
    +        df = left.join(right, [f("a", "b"), left.a1 == right.b1])
    +        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
    +            self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1, b=1, b1=1, b2=1)])
    +
    +    def test_udf_and_common_filter_in_left_semi_join_condition(self):
    +        # regression test for SPARK-25314
    +        # test the complex scenario with both udf(non-deterministic)
    +        # and common filter(deterministic)
    +        from pyspark.sql.functions import udf
    +        left = self.spark.createDataFrame([Row(a=1, a1=1, a2=1), Row(a=2, a1=2, a2=2)])
    +        right = self.spark.createDataFrame([Row(b=1, b1=1, b2=1), Row(b=1, b1=3, b2=1)])
    +        f = udf(lambda a, b: a == b, BooleanType())
    +        df = left.join(right, [f("a", "b"), left.a1 == right.b1], "left_semi")
    +        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
    +            self.assertEqual(df.collect(), [Row(a=1, a1=1, a2=1)])
    +
    +    def test_udf_not_supported_in_join_condition(self):
    --- End diff --
    
    Add this test for 
https://github.com/apache/spark/pull/22326/files#diff-a636a87d8843eeccca90140be91d4fafR1249.



---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #22326: [SPARK-25314][SQL] Fix Python UDF accessing attri...

Reply via email to