This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c39a82593c3 [SPARK-44965][PYTHON] Hide internal functions/variables from `pyspark.sql.functions`
c39a82593c3 is described below
commit c39a82593c3b85e507d6431966bc840ba8c06d60
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Tue Aug 29 09:29:22 2023 +0800
[SPARK-44965][PYTHON] Hide internal functions/variables from `pyspark.sql.functions`
### What changes were proposed in this pull request?
Hide internal functions/variables from `pyspark.sql.functions`
### Why are the changes needed?
Internal functions/variables should not be exposed to end users:
```
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.4.1
      /_/
Using Python version 3.10.12 (main, Jul 5 2023 15:02:25)
Spark context Web UI available at http://localhost:4040/
Spark context available as 'sc' (master = local[*], app id = local-1692949938125).
SparkSession available as 'spark'.
In [1]: from pyspark.sql.functions import *
In [2]: ??to_str
Signature: to_str(value: Any) -> Optional[str]
Source:
def to_str(value: Any) -> Optional[str]:
    """
    A wrapper over str(), but converts bool values to lower case strings.
    If None is given, just returns None, instead of converting it to string "None".
    """
    if isinstance(value, bool):
        return str(value).lower()
    elif value is None:
        return value
    else:
        return str(value)
File: ~/.dev/bin/spark-3.4.1-bin-hadoop3/python/pyspark/sql/utils.py
Type: function
```
`to_str` here is an internal helper function.
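The leaked helpers are easy to enumerate. A minimal sketch (assuming a PySpark build that includes this change, since it relies on `F.__all__`) that prints exactly the module-level functions the new `__all__` hides:
```
from inspect import getmembers, isfunction

from pyspark.sql import functions as F

# All public-looking functions defined or re-exported at module level:
module_level = {name for name, _ in getmembers(F, isfunction) if not name.startswith("_")}

# With __all__ in place, the difference is the set of internal helpers
# (to_str, try_remote_functions, ...) that no longer leak through
# `from pyspark.sql.functions import *`.
print(sorted(module_level - set(F.__all__)))
```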
### Does this PR introduce _any_ user-facing change?
Yes, internal helpers are no longer picked up by `from pyspark.sql.functions import *`.
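For example (a hedged sketch of the new behavior, not code from this patch), a wildcard import now binds only the documented functions:
```
from pyspark.sql.functions import *  # noqa: F403

# Documented functions are still bound by the wildcard import ...
assert callable(col) and callable(upper)

# ... but internal helpers are not; referencing one would now raise NameError:
# to_str(True)
```
Code that relied on such helpers can still import them explicitly from their defining module (e.g. `pyspark.sql.utils`), though they remain unsupported internals.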
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
NO
Closes #42680 from zhengruifeng/py_func_all.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/sql/functions.py            | 430 +++++++++++++++++++++++++++++
 python/pyspark/sql/tests/test_functions.py |  33 +++
2 files changed, 463 insertions(+)
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 5d5557cb916..43b82d31368 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -79,6 +79,436 @@ if has_numpy:
# since it requires making every single overridden definition.
+__all__ = [
+ "abs",
+ "acos",
+ "acosh",
+ "add_months",
+ "aes_decrypt",
+ "aes_encrypt",
+ "aggregate",
+ "any_value",
+ "approxCountDistinct",
+ "approx_count_distinct",
+ "approx_percentile",
+ "array",
+ "array_agg",
+ "array_append",
+ "array_compact",
+ "array_contains",
+ "array_distinct",
+ "array_except",
+ "array_insert",
+ "array_intersect",
+ "array_join",
+ "array_max",
+ "array_min",
+ "array_position",
+ "array_prepend",
+ "array_remove",
+ "array_repeat",
+ "array_size",
+ "array_sort",
+ "array_union",
+ "arrays_overlap",
+ "arrays_zip",
+ "asc",
+ "asc_nulls_first",
+ "asc_nulls_last",
+ "ascii",
+ "asin",
+ "asinh",
+ "assert_true",
+ "atan",
+ "atan2",
+ "atanh",
+ "avg",
+ "base64",
+ "bin",
+ "bit_and",
+ "bit_count",
+ "bit_get",
+ "bit_length",
+ "bit_or",
+ "bit_xor",
+ "bitmap_bit_position",
+ "bitmap_bucket_number",
+ "bitmap_construct_agg",
+ "bitmap_count",
+ "bitmap_or_agg",
+ "bitwiseNOT",
+ "bitwise_not",
+ "bool_and",
+ "bool_or",
+ "broadcast",
+ "bround",
+ "btrim",
+ "bucket",
+ "call_function",
+ "call_udf",
+ "cardinality",
+ "cast",
+ "cbrt",
+ "ceil",
+ "ceiling",
+ "char",
+ "char_length",
+ "character_length",
+ "coalesce",
+ "col",
+ "collect_list",
+ "collect_set",
+ "column",
+ "concat",
+ "concat_ws",
+ "contains",
+ "conv",
+ "convert_timezone",
+ "corr",
+ "cos",
+ "cosh",
+ "cot",
+ "count",
+ "countDistinct",
+ "count_distinct",
+ "count_if",
+ "count_min_sketch",
+ "covar_pop",
+ "covar_samp",
+ "crc32",
+ "create_map",
+ "csc",
+ "cume_dist",
+ "curdate",
+ "current_catalog",
+ "current_database",
+ "current_date",
+ "current_schema",
+ "current_timestamp",
+ "current_timezone",
+ "current_user",
+ "date_add",
+ "date_diff",
+ "date_format",
+ "date_from_unix_date",
+ "date_part",
+ "date_sub",
+ "date_trunc",
+ "dateadd",
+ "datediff",
+ "datepart",
+ "day",
+ "dayofmonth",
+ "dayofweek",
+ "dayofyear",
+ "days",
+ "decode",
+ "degrees",
+ "dense_rank",
+ "desc",
+ "desc_nulls_first",
+ "desc_nulls_last",
+ "e",
+ "element_at",
+ "elt",
+ "encode",
+ "endswith",
+ "equal_null",
+ "every",
+ "exists",
+ "exp",
+ "explode",
+ "explode_outer",
+ "expm1",
+ "expr",
+ "extract",
+ "factorial",
+ "filter",
+ "find_in_set",
+ "first",
+ "first_value",
+ "flatten",
+ "floor",
+ "forall",
+ "format_number",
+ "format_string",
+ "from_csv",
+ "from_json",
+ "from_unixtime",
+ "from_utc_timestamp",
+ "get",
+ "get_json_object",
+ "getbit",
+ "greatest",
+ "grouping",
+ "grouping_id",
+ "hash",
+ "hex",
+ "histogram_numeric",
+ "hll_sketch_agg",
+ "hll_sketch_estimate",
+ "hll_union",
+ "hll_union_agg",
+ "hour",
+ "hours",
+ "hypot",
+ "ifnull",
+ "ilike",
+ "initcap",
+ "inline",
+ "inline_outer",
+ "input_file_block_length",
+ "input_file_block_start",
+ "input_file_name",
+ "instr",
+ "isnan",
+ "isnotnull",
+ "isnull",
+ "java_method",
+ "json_array_length",
+ "json_object_keys",
+ "json_tuple",
+ "kurtosis",
+ "lag",
+ "last",
+ "last_day",
+ "last_value",
+ "lcase",
+ "lead",
+ "least",
+ "left",
+ "length",
+ "levenshtein",
+ "like",
+ "lit",
+ "ln",
+ "localtimestamp",
+ "locate",
+ "log",
+ "log10",
+ "log1p",
+ "log2",
+ "lower",
+ "lpad",
+ "ltrim",
+ "make_date",
+ "make_dt_interval",
+ "make_interval",
+ "make_timestamp",
+ "make_timestamp_ltz",
+ "make_timestamp_ntz",
+ "make_ym_interval",
+ "map_concat",
+ "map_contains_key",
+ "map_entries",
+ "map_filter",
+ "map_from_arrays",
+ "map_from_entries",
+ "map_keys",
+ "map_values",
+ "map_zip_with",
+ "mask",
+ "max",
+ "max_by",
+ "md5",
+ "mean",
+ "median",
+ "min",
+ "min_by",
+ "minute",
+ "mode",
+ "monotonically_increasing_id",
+ "month",
+ "months",
+ "months_between",
+ "named_struct",
+ "nanvl",
+ "negate",
+ "negative",
+ "next_day",
+ "now",
+ "nth_value",
+ "ntile",
+ "nullif",
+ "nvl",
+ "nvl2",
+ "octet_length",
+ "overlay",
+ "overload",
+ "parse_url",
+ "percent_rank",
+ "percentile",
+ "percentile_approx",
+ "pi",
+ "pmod",
+ "posexplode",
+ "posexplode_outer",
+ "position",
+ "positive",
+ "pow",
+ "power",
+ "printf",
+ "product",
+ "quarter",
+ "radians",
+ "raise_error",
+ "rand",
+ "randn",
+ "rank",
+ "reduce",
+ "reflect",
+ "regexp",
+ "regexp_count",
+ "regexp_extract",
+ "regexp_extract_all",
+ "regexp_instr",
+ "regexp_like",
+ "regexp_replace",
+ "regexp_substr",
+ "regr_avgx",
+ "regr_avgy",
+ "regr_count",
+ "regr_intercept",
+ "regr_r2",
+ "regr_slope",
+ "regr_sxx",
+ "regr_sxy",
+ "regr_syy",
+ "repeat",
+ "replace",
+ "reverse",
+ "right",
+ "rint",
+ "rlike",
+ "round",
+ "row_number",
+ "rpad",
+ "rtrim",
+ "schema_of_csv",
+ "schema_of_json",
+ "sec",
+ "second",
+ "sentences",
+ "sequence",
+ "session_window",
+ "sha",
+ "sha1",
+ "sha2",
+ "shiftLeft",
+ "shiftRight",
+ "shiftRightUnsigned",
+ "shiftleft",
+ "shiftright",
+ "shiftrightunsigned",
+ "shuffle",
+ "sign",
+ "signum",
+ "sin",
+ "sinh",
+ "size",
+ "skewness",
+ "slice",
+ "some",
+ "sort_array",
+ "soundex",
+ "spark_partition_id",
+ "split",
+ "split_part",
+ "sqrt",
+ "stack",
+ "startswith",
+ "std",
+ "stddev",
+ "stddev_pop",
+ "stddev_samp",
+ "str_to_map",
+ "struct",
+ "substr",
+ "substring",
+ "substring_index",
+ "sum",
+ "sumDistinct",
+ "sum_distinct",
+ "tan",
+ "tanh",
+ "timestamp_micros",
+ "timestamp_millis",
+ "timestamp_seconds",
+ "toDegrees",
+ "toRadians",
+ "to_binary",
+ "to_char",
+ "to_csv",
+ "to_date",
+ "to_json",
+ "to_number",
+ "to_timestamp",
+ "to_timestamp_ltz",
+ "to_timestamp_ntz",
+ "to_unix_timestamp",
+ "to_utc_timestamp",
+ "to_varchar",
+ "transform",
+ "transform_keys",
+ "transform_values",
+ "translate",
+ "trim",
+ "trunc",
+ "try_add",
+ "try_aes_decrypt",
+ "try_avg",
+ "try_divide",
+ "try_element_at",
+ "try_multiply",
+ "try_subtract",
+ "try_sum",
+ "try_to_binary",
+ "try_to_number",
+ "try_to_timestamp",
+ "typeof",
+ "ucase",
+ "udf",
+ "udtf",
+ "unbase64",
+ "unhex",
+ "unix_date",
+ "unix_micros",
+ "unix_millis",
+ "unix_seconds",
+ "unix_timestamp",
+ "unwrap_udt",
+ "upper",
+ "url_decode",
+ "url_encode",
+ "user",
+ "var_pop",
+ "var_samp",
+ "variance",
+ "version",
+ "weekday",
+ "weekofyear",
+ "when",
+ "width_bucket",
+ "window",
+ "window_time",
+ "xpath",
+ "xpath_boolean",
+ "xpath_double",
+ "xpath_float",
+ "xpath_int",
+ "xpath_long",
+ "xpath_number",
+ "xpath_short",
+ "xpath_string",
+ "xxhash64",
+ "year",
+ "years",
+ "zip_with",
+ "pandas_udf",
+ "PandasUDFType",
+]
+
+
 def _get_jvm_function(name: str, sc: SparkContext) -> Callable:
     """
     Retrieves JVM function identified by name from
diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py
index c484e10ec1a..0633b8c4341 100644
--- a/python/pyspark/sql/tests/test_functions.py
+++ b/python/pyspark/sql/tests/test_functions.py
@@ -92,6 +92,39 @@ class FunctionsTestsMixin:
             expected_missing_in_py, missing_in_py, "Missing functions in pyspark not as expected"
         )
+    def test_public_function(self):
+        inspected_list = {name for (name, value) in getmembers(F, isfunction) if name[0] != "_"}
+
+        public_list = set(F.__all__)
+
+        # check alias: both function 'pow' and its alias 'power' should be included
+        self.assertTrue("pow" in inspected_list)
+        self.assertTrue("power" in inspected_list)
+        self.assertTrue("pow" in public_list)
+        self.assertTrue("power" in public_list)
+
+        inspected_excluded_list = {
+            "get_active_spark_context",  # internal helper function
+            "try_remote_functions",  # internal helper function
+            "to_str",  # internal helper function
+        }
+
+        self.assertEqual(
+            inspected_list - public_list,
+            inspected_excluded_list,
+            "Inspected functions NOT exposed!",
+        )
+
+        public_excluded_list = {
+            "PandasUDFType",  # type, not a function
+        }
+
+        self.assertEqual(
+            public_list - inspected_list,
+            public_excluded_list,
+            "Non-existent functions exposed!",
+        )
+
     def test_explode(self):
         d = [
             Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]