This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 003be89e2af8 [SPARK-50687][PYTHON] Optimize the logic to get stack
traces for DataFrameQueryContext
003be89e2af8 is described below
commit 003be89e2af82a32694e23dfa1bc97f5b8043496
Author: Hyukjin Kwon <[email protected]>
AuthorDate: Fri Dec 27 19:08:38 2024 +0900
[SPARK-50687][PYTHON] Optimize the logic to get stack traces for
DataFrameQueryContext
### What changes were proposed in this pull request?
This PR proposes to optimize the logic to get stack traces for
DataFrameQueryContext by avoiding the use of `inspect.getframeinfo`, which is slow.
```
8000 0.019 0.000 0.976 0.000 inspect.py:1658()
```
### Why are the changes needed?
The main overhead is from `inspect.getframeinfo`, so this PR directly uses
the frames obtained from `inspect.currentframe` instead.
### Does this PR introduce _any_ user-facing change?
It improves the performance of `DataFrameQueryContext`.
### How was this patch tested?
Manually tested as below:
```python
import cProfile
from pyspark.sql.functions import col
def foo():
for _ in range(1000):
col("id")
cProfile.run('foo()', sort='tottime')
```
**Before:**
```
5793561 function calls (5789561 primitive calls) in 3.615 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
27000 1.784 0.000 1.784 0.000 {method 'recv_into' of
'_socket.socket' objects}
1000 0.413 0.000 0.787 0.001 inspect.py:969(getmodule)
969000 0.137 0.000 0.171 0.000 inspect.py:283(ismodule)
27000 0.135 0.000 0.135 0.000 {method 'sendall' of
'_socket.socket' objects}
967000 0.097 0.000 0.097 0.000 {built-in method
builtins.hasattr}
27000 0.080 0.000 2.125 0.000
clientserver.py:523(send_command)
8000 0.063 0.000 0.063 0.000 {built-in method
builtins.next}
1000 0.061 0.000 0.061 0.000 {built-in method posix.getcwd}
1266556 0.051 0.000 0.056 0.000 {built-in method
builtins.isinstance}
935000 0.036 0.000 0.036 0.000 {method 'get' of 'dict'
objects}
27000 0.033 0.000 2.213 0.000
java_gateway.py:1015(send_command)
54778 0.032 0.000 0.039 0.000 protocol.py:214(smart_decode)
24000 0.025 0.000 0.030 0.000
inspect.py:1790(_shadowed_dict)
```
**After:**
```
1575042 function calls (1573042 primitive calls) in 2.094 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
27000 1.175 0.000 1.175 0.000 {method 'recv_into' of
'_socket.socket' objects}
27000 0.101 0.000 1.504 0.000
clientserver.py:523(send_command)
27000 0.067 0.000 0.067 0.000 {method 'sendall' of
'_socket.socket' objects}
54852 0.039 0.000 0.048 0.000 protocol.py:214(smart_decode)
27000 0.038 0.000 1.607 0.000
java_gateway.py:1015(send_command)
24000 0.029 0.000 0.035 0.000
inspect.py:1790(_shadowed_dict)
27000 0.027 0.000 1.223 0.000 socket.py:692(readinto)
27000 0.025 0.000 1.249 0.000 {method 'readline' of
'_io.BufferedReader' objects}
9000 0.020 0.000 0.195 0.000
java_gateway.py:1290(_get_args)
27000 0.020 0.000 0.035 0.000
clientserver.py:271(_get_connection)
54000 0.020 0.000 0.036 0.000 __init__.py:1467(debug)
...
```
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #49315 from HyukjinKwon/optimize-get-frame.
Authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/errors/utils.py | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/python/pyspark/errors/utils.py b/python/pyspark/errors/utils.py
index d928afc813a4..5488940645a1 100644
--- a/python/pyspark/errors/utils.py
+++ b/python/pyspark/errors/utils.py
@@ -34,6 +34,8 @@ from typing import (
overload,
cast,
)
+from types import FrameType
+
import pyspark
from pyspark.errors.error_classes import ERROR_CLASSES_MAP
@@ -199,18 +201,15 @@ def _capture_call_site(depth: int) -> str:
# Filtering out PySpark code and keeping user code only
pyspark_root = os.path.dirname(pyspark.__file__)
- def inspect_stack() -> Iterator[inspect.FrameInfo]:
+ def inspect_stack() -> Iterator[FrameType]:
frame = inspect.currentframe()
while frame:
- frameinfo = (frame,) + inspect.getframeinfo(frame, context=0)
- yield inspect.FrameInfo(*frameinfo)
+ yield frame
frame = frame.f_back
- stack = (
- frame_info for frame_info in inspect_stack() if pyspark_root not in
frame_info.filename
- )
+ stack = (f for f in inspect_stack() if pyspark_root not in
f.f_code.co_filename)
- selected_frames: Iterator[inspect.FrameInfo] = itertools.islice(stack,
depth)
+ selected_frames: Iterator[FrameType] = itertools.islice(stack, depth)
# We try import here since IPython is not a required dependency
try:
@@ -226,7 +225,8 @@ def _capture_call_site(depth: int) -> str:
selected_frames = (
frame
for frame in selected_frames
- if (ipy_root not in frame.filename) and (ipykernel_root not in
frame.filename)
+ if (ipy_root not in frame.f_code.co_filename)
+ and (ipykernel_root not in frame.f_code.co_filename)
)
except ImportError:
ipython = None
@@ -234,10 +234,11 @@ def _capture_call_site(depth: int) -> str:
# Identifying the cell is useful when the error is generated from IPython
Notebook
if ipython:
call_sites = [
- f"line {frame.lineno} in cell [{ipython.execution_count}]" for
frame in selected_frames
+ f"line {frame.f_lineno} in cell [{ipython.execution_count}]"
+ for frame in selected_frames
]
else:
- call_sites = [f"{frame.filename}:{frame.lineno}" for frame in
selected_frames]
+ call_sites = [f"{frame.f_code.co_filename}:{frame.f_lineno}" for frame
in selected_frames]
call_sites_str = "\n".join(call_sites)
return call_sites_str
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]