This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 70bb9319e50 [SPARK-45996][PYTHON][CONNECT] Show proper dependency requirement messages for Spark Connect
70bb9319e50 is described below
commit 70bb9319e504a8bde7984a12a6614d2c3e636ee6
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Mon Nov 20 13:03:17 2023 +0900
[SPARK-45996][PYTHON][CONNECT] Show proper dependency requirement messages for Spark Connect
### What changes were proposed in this pull request?
This PR improves the error messages for the dependency requirements of Python Spark Connect.
### Why are the changes needed?
To improve the error messages. Currently, users get:
```
/.../pyspark/shell.py:57: UserWarning: Failed to initialize Spark session.
  warnings.warn("Failed to initialize Spark session.")
Traceback (most recent call last):
  File "/.../pyspark/shell.py", line 52, in <module>
    spark = SparkSession.builder.getOrCreate()
  File "/.../pyspark/sql/session.py", line 476, in getOrCreate
    from pyspark.sql.connect.session import SparkSession as RemoteSparkSession
  File "/.../pyspark/sql/connect/session.py", line 53, in <module>
    from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder
  File "/.../pyspark/sql/connect/client/__init__.py", line 22, in <module>
    from pyspark.sql.connect.client.core import *  # noqa: F401,F403
  File "/.../pyspark/sql/connect/client/core.py", line 51, in <module>
    import google.protobuf.message
ModuleNotFoundError: No module named 'google'
```
```
/.../pyspark/shell.py:57: UserWarning: Failed to initialize Spark session.
  warnings.warn("Failed to initialize Spark session.")
Traceback (most recent call last):
  File "/.../pyspark/shell.py", line 52, in <module>
    spark = SparkSession.builder.getOrCreate()
  File "/.../pyspark/sql/session.py", line 476, in getOrCreate
    from pyspark.sql.connect.session import SparkSession as RemoteSparkSession
  File "/.../pyspark/sql/connect/session.py", line 53, in <module>
    from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder
  File "/.../pyspark/sql/connect/client/__init__.py", line 22, in <module>
    from pyspark.sql.connect.client.core import *  # noqa: F401,F403
  File "/.../pyspark/sql/connect/client/core.py", line 52, in <module>
    from grpc_status import rpc_status
ModuleNotFoundError: No module named 'grpc_status'
```
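The fix wraps these imports in explicit requirement checks so a clear `ImportError` is raised instead. A minimal standalone sketch of the guarded-import pattern (the actual implementation is in the `utils.py` diff below):
```python
# Sketch of the guarded-import pattern: re-raise a bare
# ModuleNotFoundError as an ImportError that names the missing
# package and its minimum required version.
def require_minimum_grpcio_status_version() -> None:
    minimum_version = "1.48.1"
    try:
        import grpc_status  # noqa: F401
    except ImportError as error:
        raise ImportError(
            f"grpcio-status >= {minimum_version} must be installed; "
            "however, it was not found."
        ) from error
```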
### Does this PR introduce _any_ user-facing change?
Yes, it changes the user-facing error messages.
### How was this patch tested?
Manually tested as follows:
```bash
➜ spark git:(master) ✗ conda create -y -n python3.10 python=3.10
...
➜ spark git:(master) ✗ conda activate python3.10
(python3.10) ➜ spark git:(master) ✗ ./bin/pyspark --remote local
...
raise ImportError(
ImportError: Pandas >= 1.4.4 must be installed; however, it was not found.
(python3.10) ➜ spark git:(master) ✗ pip install 'pandas >= 1.4.4'
...
(python3.10) ➜ spark git:(SPARK-45996) ✗ ./bin/pyspark --remote local
...
raise ImportError(
ImportError: PyArrow >= 4.0.0 must be installed; however, it was not found.
(python3.10) ➜ spark git:(SPARK-45996) pip install 'PyArrow >= 4.0.0'
...
(python3.10) ➜ spark git:(SPARK-45996) ./bin/pyspark --remote local
...
raise ImportError(
ImportError: grpcio >= 1.48.1 must be installed; however, it was not found.
(python3.10) ➜ spark git:(SPARK-45996) pip install 'grpcio >= 1.48.1'
...
(python3.10) ➜ spark git:(SPARK-45996) ./bin/pyspark --remote local
...
raise ImportError(
ImportError: grpcio-status >= 1.48.1 must be installed; however, it was not found.
(python3.10) ➜ spark git:(SPARK-45996) ✗ pip install 'grpcio-status >= 1.48.1'
...
(python3.10) ➜ spark git:(SPARK-45996) ✗ ./bin/pyspark --remote local
...
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version 4.0.0.dev0
      /_/
Using Python version 3.10.13 (main, Sep 11 2023 08:39:02)
Client connected to the Spark Connect server at localhost
SparkSession available as 'spark'.
>>> spark.range(10).show()
+---+
| id|
+---+
|  0|
...
```
Note that `grpcio-status` already depends on `googleapis-common-protos`
(see https://github.com/grpc/grpc/blob/master/src/python/grpcio_status/setup.py#L67-L69),
so it did not need to be installed explicitly.
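For reference, one quick way to confirm that transitive dependency locally; a minimal sketch using only the standard library (exact output depends on the installed `grpcio-status` version):
```python
# Sketch: list the dependencies that grpcio-status declares and pick
# out googleapis-common-protos (Python 3.8+ standard library only).
from importlib.metadata import requires

deps = requires("grpcio-status") or []
print([d for d in deps if d.startswith("googleapis-common-protos")])
```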
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43894 from HyukjinKwon/SPARK-45996.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
 dev/requirements.txt                           |  2 +-
 python/docs/source/getting_started/install.rst |  2 +-
 python/pyspark/sql/connect/utils.py            | 33 +++++++++++++++++++++++---
 3 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/dev/requirements.txt b/dev/requirements.txt
index 2658f8eec82..fc76407c448 100644
--- a/dev/requirements.txt
+++ b/dev/requirements.txt
@@ -54,7 +54,7 @@ py
 grpcio>=1.48,<1.57
 grpcio-status>=1.48,<1.57
 protobuf==4.25.1
-googleapis-common-protos==1.56.4
+googleapis-common-protos>=1.56.4
 
 # Spark Connect python proto generation plugin (optional)
 mypy-protobuf==3.3.0
diff --git a/python/docs/source/getting_started/install.rst b/python/docs/source/getting_started/install.rst
index 154429f276a..b01831bc846 100644
--- a/python/docs/source/getting_started/install.rst
+++ b/python/docs/source/getting_started/install.rst
@@ -161,7 +161,7 @@ Package Supported version Note
 `numpy`                    >=1.21                    Required for pandas API on Spark and MLLib DataFrame-based API; Optional for Spark SQL
 `grpcio`                   >=1.48,<1.57              Required for Spark Connect
 `grpcio-status`            >=1.48,<1.57              Required for Spark Connect
-`googleapis-common-protos` ==1.56.4                  Required for Spark Connect
+`googleapis-common-protos` >=1.56.4                  Required for Spark Connect
 ========================== ========================= ======================================================================================
 
 Note that PySpark requires Java 17 or later with ``JAVA_HOME`` properly set.
diff --git a/python/pyspark/sql/connect/utils.py b/python/pyspark/sql/connect/utils.py
index e96529e44f8..fd85d75060b 100644
--- a/python/pyspark/sql/connect/utils.py
+++ b/python/pyspark/sql/connect/utils.py
@@ -34,6 +34,8 @@ def check_dependencies(mod_name: str) -> None:
         require_minimum_pandas_version()
         require_minimum_pyarrow_version()
         require_minimum_grpc_version()
+        require_minimum_grpcio_status_version()
+        require_minimum_googleapis_common_protos_version()
 
 
 def require_minimum_grpc_version() -> None:
@@ -44,14 +46,39 @@ def require_minimum_grpc_version() -> None:
         import grpc
     except ImportError as error:
         raise ImportError(
-            "grpcio >= %s must be installed; however, " "it was not found." % minimum_grpc_version
+            f"grpcio >= {minimum_grpc_version} must be installed; however, it was not found."
         ) from error
     if LooseVersion(grpc.__version__) < LooseVersion(minimum_grpc_version):
         raise ImportError(
-            "grpcio >= %s must be installed; however, "
-            "your version was %s." % (minimum_grpc_version, grpc.__version__)
+            f"grpcio >= {minimum_grpc_version} must be installed; however, "
+            f"your version was {grpc.__version__}."
         )
 
 
+def require_minimum_grpcio_status_version() -> None:
+    """Raise ImportError if grpcio-status is not installed"""
+    minimum_grpc_version = "1.48.1"
+
+    try:
+        import grpc_status  # noqa
+    except ImportError as error:
+        raise ImportError(
+            f"grpcio-status >= {minimum_grpc_version} must be installed; however, it was not found."
+        ) from error
+
+
+def require_minimum_googleapis_common_protos_version() -> None:
+    """Raise ImportError if googleapis-common-protos is not installed"""
+    minimum_common_protos_version = "1.56.4"
+
+    try:
+        import google.rpc  # noqa
+    except ImportError as error:
+        raise ImportError(
+            f"googleapis-common-protos >= {minimum_common_protos_version} must be installed; "
+            "however, it was not found."
+        ) from error
+
+
 def get_python_ver() -> str:
     return "%d.%d" % sys.version_info[:2]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]