This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ee3a612b28c1 [SPARK-48705][PYTHON] Explicitly use worker_main when it 
starts with pyspark
ee3a612b28c1 is described below

commit ee3a612b28c185e7f4d558c09efbf93ebe46be2b
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Wed Jun 26 14:42:06 2024 +0900

    [SPARK-48705][PYTHON] Explicitly use worker_main when it starts with pyspark
    
    ### What changes were proposed in this pull request?
    
    This PR proposes to use worker modules when it starts with `pyspark`.
    
    ### Why are the changes needed?
    
    Because the daemon now imports the worker module from the first argument, it 
could break if users are using custom workers that pass extra parameters. These 
aren't an actual API, but it is better to be more strict about the argument 
provided.
    
    Here is an example of sentry-python
    
    ```
    ModuleNotFoundError                       Traceback (most recent call last)
    File <command-558609978944307>, line 4
          1 import sentry_sdk
          2 from sentry_sdk.integrations.spark import SparkWorkerIntegration
    ----> 4 sentry_sdk.init(
          5     dsn="https://examplePublicKeyo0.ingest.sentry.io/0",
          6     traces_sample_rate=1.0,
          7     profiles_sample_rate=1.0,
          8     integrations=[
          9         SparkWorkerIntegration(),
         10     ],
         11 )
    
    File /.../python3.10/site-packages/sentry_sdk/hub.py:121, in _init(*args, 
**kwargs)
        115 def _init(*args, **kwargs):
        116     # type: (*Optional[str], **Any) -> ContextManager[Any]
        117     """Initializes the SDK and optionally integrations.
        118
        119     This takes the same arguments as the client constructor.
        120     """
    --> 121     client = Client(*args, **kwargs)  # type: ignore
        122     Hub.current.bind_client(client)
        123     _check_python_deprecations()
    
    File /.../python3.10/site-packages/sentry_sdk/client.py:128, in 
_Client.__init__(self, *args, **kwargs)
        124 def __init__(self, *args, **kwargs):
        125     # type: (*Any, **Any) -> None
        126     self.options = get_options(*args, **kwargs)  # type: Dict[str, 
Any]
    --> 128     self._init_impl()
    
    File /.../python3.10/site-packages/sentry_sdk/client.py:162, in 
_Client._init_impl(self)
        155 if self.options["request_bodies"] not in request_bodies:
        156     raise ValueError(
        157         "Invalid value for request_bodies. Must be one of 
{}".format(
        158             request_bodies
        159         )
        160     )
    --> 162 self.integrations = setup_integrations(
        163     self.options["integrations"],
        164     with_defaults=self.options["default_integrations"],
        165     with_auto_enabling_integrations=self.options[
        166         "auto_enabling_integrations"
        167     ],
        168 )
        170 sdk_name = get_sdk_name(list(self.integrations.keys()))
        171 SDK_INFO["name"] = sdk_name
    
    File /.../python3.10/site-packages/sentry_sdk/integrations/__init__.py:124, 
in setup_integrations(integrations, with_defaults, 
with_auto_enabling_integrations)
        120 logger.debug(
        121     "Setting up previously not enabled integration %s", identifier
        122 )
        123 try:
    --> 124     type(integration).setup_once()
        125 except NotImplementedError:
        126     if getattr(integration, "install", None) is not None:
    
    File 
/.../python3.10/site-packages/sentry_sdk/integrations/spark/spark_worker.py:31, 
in SparkWorkerIntegration.setup_once()
         28 @staticmethod
         29 def setup_once():
         30     # type: () -> None
    ---> 31     import pyspark.daemon as original_daemon
         33     original_daemon.worker_main = _sentry_worker_main
    
    File /.../python/pyspark/daemon.py:40
         37 if len(sys.argv) > 1:
         38     import importlib
    ---> 40     worker_module = importlib.import_module(sys.argv[1])
         41     worker_main = worker_module.main
         42 else:
    
    File /usr/lib/python3.10/importlib/__init__.py:126, in import_module(name, 
package)
        124             break
        125         level += 1
    --> 126 return _bootstrap._gcd_import(name[level:], package, level)
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, it recovers the support of 
[sentry-python](https://github.com/getsentry/sentry-python).
    
    ### How was this patch tested?
    
    Manually tested.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #47077 from HyukjinKwon/SPARK-48705.
    
    Authored-by: Hyukjin Kwon <[email protected]>
    Signed-off-by: Hyukjin Kwon <[email protected]>
---
 python/pyspark/daemon.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/daemon.py b/python/pyspark/daemon.py
index bbbc495d053e..a23af109ea6d 100644
--- a/python/pyspark/daemon.py
+++ b/python/pyspark/daemon.py
@@ -30,7 +30,7 @@ from signal import SIGHUP, SIGTERM, SIGCHLD, SIG_DFL, 
SIG_IGN, SIGINT
 
 from pyspark.serializers import read_int, write_int, write_with_length, 
UTF8Deserializer
 
-if len(sys.argv) > 1:
+if len(sys.argv) > 1 and sys.argv[1].startswith("pyspark"):
     import importlib
 
     worker_module = importlib.import_module(sys.argv[1])


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to