This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new b23ae15da019 [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework
b23ae15da019 is described below

commit b23ae15da019082891d71853682329c2d24c2e9e
Author: Haejoon Lee <haejoon....@databricks.com>
AuthorDate: Sun Dec 3 22:49:30 2023 -0800

    [SPARK-46232][PYTHON] Migrate all remaining ValueError into PySpark error framework

    ### What changes were proposed in this pull request?

    This PR proposes to migrate all remaining `ValueError`s raised from `pyspark/sql/*` to the PySpark error framework by raising `PySparkValueError` with dedicated error classes.

    ### Why are the changes needed?

    To improve error handling in PySpark.

    ### Does this PR introduce _any_ user-facing change?

    No API changes, but the user-facing error messages will be improved.

    ### How was this patch tested?

    The existing CI should pass.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #44149 from itholic/migrate_value_error.

    Authored-by: Haejoon Lee <haejoon....@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 python/pyspark/errors/error_classes.py   | 19 +++++++++++++++++--
 python/pyspark/sql/pandas/serializers.py |  5 +++--
 python/pyspark/sql/pandas/typehints.py   | 12 +++++++++---
 python/pyspark/sql/pandas/types.py       |  7 +++++--
 python/pyspark/sql/sql_formatter.py      |  7 ++++---
 5 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index c7199ac938be..d0c0d1c115b0 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -287,6 +287,11 @@ ERROR_CLASSES_JSON = """
       "NumPy array input should be of <dimensions> dimensions."
     ]
   },
+  "INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP" : {
+    "message" : [
+      "Invalid number of dataframes in group <dataframes_in_group>."
+    ]
+  },
   "INVALID_PANDAS_UDF" : {
     "message" : [
       "Invalid function: <detail>"
@@ -803,9 +808,9 @@ ERROR_CLASSES_JSON = """
       "Expected <expected> values for `<item>`, got <actual>."
     ]
   },
-  "TYPE_HINT_REQUIRED" : {
+  "TYPE_HINT_SHOULD_BE_SPECIFIED" : {
     "message" : [
-      "A <arg_type> is required <where>."
+      "Type hints for <target> should be specified; however, got <sig>."
     ]
   },
   "UDF_RETURN_TYPE" : {
     "message" : [
@@ -888,6 +893,11 @@ ERROR_CLASSES_JSON = """
       "Unknown response: <response>."
     ]
   },
+  "UNKNOWN_VALUE_FOR" : {
+    "message" : [
+      "Unknown value for `<var>`."
+    ]
+  },
   "UNSUPPORTED_DATA_TYPE" : {
     "message" : [
       "Unsupported DataType `<data_type>`."
@@ -983,6 +993,11 @@ ERROR_CLASSES_JSON = """
       "Value for `<arg_name>` only supports the 'pearson', got '<arg_value>'."
     ]
   },
+  "VALUE_NOT_PLAIN_COLUMN_REFERENCE" : {
+    "message" : [
+      "Value <val> in <field_name> should be a plain column reference such as `df.col` or `col('column')`."
+    ]
+  },
   "VALUE_NOT_POSITIVE" : {
     "message" : [
       "Value for `<arg_name>` must be positive, got '<arg_value>'."
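For reference, a minimal sketch of how the error-class entries added above are consumed: `error_class` selects a message template from `error_classes.py` and `message_parameters` fills its `<...>` placeholders. The snippet below is illustrative only and is not part of this patch; the value `"3"` is made up, and the rendered form is only approximate:

```python
from pyspark.errors import PySparkValueError

# Raise one of the error classes added above; <dataframes_in_group> in the
# message template is filled from message_parameters.
raise PySparkValueError(
    error_class="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
    message_parameters={"dataframes_in_group": "3"},
)
# Expected to surface roughly as:
# [INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP] Invalid number of dataframes in group 3.
```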
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 8ffb7407714b..6c5bd826a023 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -707,8 +707,9 @@ class CogroupArrowUDFSerializer(ArrowStreamGroupUDFSerializer):
                 yield batches1, batches2

             elif dataframes_in_group != 0:
-                raise ValueError(
-                    "Invalid number of dataframes in group {0}".format(dataframes_in_group)
+                raise PySparkValueError(
+                    error_class="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
+                    message_parameters={"dataframes_in_group": str(dataframes_in_group)},
                 )
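As noted in the PR description, there is no API change, but callers now get structured errors instead of bare strings. A minimal sketch of what this enables, assuming the `getErrorClass()`/`getMessageParameters()` accessors exposed by `pyspark.errors` and that `PySparkValueError` still derives from the built-in `ValueError` (so existing `except ValueError` handlers keep working); the parameter value is made up:

```python
from pyspark.errors import PySparkValueError

try:
    # Stand-in for an internal call site such as the serializer change above.
    raise PySparkValueError(
        error_class="INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP",
        message_parameters={"dataframes_in_group": "3"},
    )
except ValueError as e:  # PySparkValueError is expected to still be a ValueError
    assert isinstance(e, PySparkValueError)
    print(e.getErrorClass())         # INVALID_NUMBER_OF_DATAFRAMES_IN_GROUP
    print(e.getMessageParameters())  # {'dataframes_in_group': '3'}
```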
diff --git a/python/pyspark/sql/pandas/typehints.py b/python/pyspark/sql/pandas/typehints.py
index f0c13e66a63d..37ba02a94d58 100644
--- a/python/pyspark/sql/pandas/typehints.py
+++ b/python/pyspark/sql/pandas/typehints.py
@@ -18,7 +18,7 @@ from inspect import Signature
 from typing import Any, Callable, Dict, Optional, Union, TYPE_CHECKING

 from pyspark.sql.pandas.utils import require_minimum_pandas_version
-from pyspark.errors import PySparkNotImplementedError
+from pyspark.errors import PySparkNotImplementedError, PySparkValueError

 if TYPE_CHECKING:
     from pyspark.sql.pandas._typing import (
@@ -51,12 +51,18 @@ def infer_eval_type(
         annotations[parameter] for parameter in sig.parameters if parameter in annotations
     ]
     if len(parameters_sig) != len(sig.parameters):
-        raise ValueError("Type hints for all parameters should be specified; however, got %s" % sig)
+        raise PySparkValueError(
+            error_class="TYPE_HINT_SHOULD_BE_SPECIFIED",
+            message_parameters={"target": "all parameters", "sig": str(sig)},
+        )

     # Check if the return has a type hint
     return_annotation = type_hints.get("return", sig.return_annotation)
     if sig.empty is return_annotation:
-        raise ValueError("Type hint for the return type should be specified; however, got %s" % sig)
+        raise PySparkValueError(
+            error_class="TYPE_HINT_SHOULD_BE_SPECIFIED",
+            message_parameters={"target": "the return type", "sig": str(sig)},
+        )

     # Series, Frame or Union[DataFrame, Series], ... -> Series or Frame
     is_series_or_frame = all(
diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py
index f4005a47357b..36c982eb519c 100644
--- a/python/pyspark/sql/pandas/types.py
+++ b/python/pyspark/sql/pandas/types.py
@@ -49,7 +49,7 @@ from pyspark.sql.types import (
     UserDefinedType,
     _create_row,
 )
-from pyspark.errors import PySparkTypeError, UnsupportedOperationException
+from pyspark.errors import PySparkTypeError, UnsupportedOperationException, PySparkValueError

 if TYPE_CHECKING:
     import pandas as pd
@@ -716,7 +716,10 @@ def _create_converter_to_pandas(
             return convert_struct_as_dict

         else:
-            raise ValueError(f"Unknown value for `struct_in_pandas`: {_struct_in_pandas}")
+            raise PySparkValueError(
+                error_class="UNKNOWN_VALUE_FOR",
+                message_parameters={"var": str(_struct_in_pandas)},
+            )

     elif isinstance(dt, TimestampType):
         assert timezone is not None
diff --git a/python/pyspark/sql/sql_formatter.py b/python/pyspark/sql/sql_formatter.py
index 5e79b9ff5ea9..a27f7205a2d7 100644
--- a/python/pyspark/sql/sql_formatter.py
+++ b/python/pyspark/sql/sql_formatter.py
@@ -25,6 +25,7 @@ from py4j.java_gateway import is_instance_of
 if typing.TYPE_CHECKING:
     from pyspark.sql import SparkSession, DataFrame
 from pyspark.sql.functions import lit
+from pyspark.errors import PySparkValueError


 class SQLStringFormatter(string.Formatter):
@@ -61,9 +62,9 @@ class SQLStringFormatter(string.Formatter):
             ):
                 return jexpr.sql()
             else:
-                raise ValueError(
-                    "%s in %s should be a plain column reference such as `df.col` "
-                    "or `col('column')`" % (val, field_name)
+                raise PySparkValueError(
+                    error_class="VALUE_NOT_PLAIN_COLUMN_REFERENCE",
+                    message_parameters={"val": str(val), "field_name": field_name},
                 )
         elif isinstance(val, DataFrame):
             for df, n in self._temp_views:

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org