HyukjinKwon commented on code in PR #37144:
URL: https://github.com/apache/spark/pull/37144#discussion_r917346716
##########
python/pyspark/sql/tests/test_functions.py:
##########
@@ -55,6 +55,74 @@
class FunctionsTests(ReusedSQLTestCase):
+ def test_function_parity(self):
+ # This test compares the available list of functions in
pyspark.sql.functions with those
+ # available in the Scala/Java DataFrame API in
org.apache.spark.sql.functions.
+ #
+ # NOTE FOR DEVELOPERS:
+ # If this test fails one of the following needs to happen
+ # * If a function was added to org.apache.spark.sql.functions it
either needs to be added to
+ # pyspark.sql.functions or added to the below
expected_missing_in_py set.
+ # * If a function was added to pyspark.sql.functions that was already
in
+ # org.apache.spark.sql.functions then it needs to be removed from
expected_missing_in_py
+ # below. If the function has a different name it needs to be added
to py_equiv_jvm
+ # mapping.
+        # * If it's not related to an added/removed function then likely one
of the exclusion lists
+        #   needs to be updated.
+ from pyspark import SparkContext
+ from pyspark.sql import functions as py_functions
+ from inspect import getmembers, isfunction
+
+ assert SparkContext._active_spark_context is not None
+ jvm_functions = SparkContext._active_spark_context._jvm.functions
+
+ jvm_fn_set = {name for (name, value) in getmembers(jvm_functions)}
+ py_fn_set = {
+ name for (name, value) in getmembers(py_functions, isfunction) if
name[0] != "_"
+ }
+
+ # Functions on the JVM side we do not expect to be available in python
because they are
+ # depreciated, irrelevant to python, or have equivalents.
+ jvm_excluded_fn = [
+ "callUDF", # depreciated, use call_udf
+ "typedlit", # Scala only
+ "typedLit", # Scala only
+ "monotonicallyIncreasingId", # depreciated, use
monotonically_increasing_id
+ "negate", # equivalent to python -expression
+ "not", # equivalent to python ~expression
+ ]
+ # Excluded functions on the python side
+ py_excluded_fn = [
+ "pandas_udf", # Python only
Review Comment:
I think we don't need to test this case. It's pretty unlikely that a PySpark
API would mistakenly be left unimplemented only on the JVM side.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]