This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new b522f8c5cc GH-39313: [Python] Fix race condition in
_pandas_api#_check_import (#39314)
b522f8c5cc is described below
commit b522f8c5cc152ea97d33340e3bef852e063dad46
Author: Tom Jarosz <[email protected]>
AuthorDate: Tue Jan 9 05:25:21 2024 -0800
GH-39313: [Python] Fix race condition in _pandas_api#_check_import (#39314)
### Rationale for this change
See:
```
cdef inline bint _have_pandas_internal(self):
if not self._tried_importing_pandas:
self._check_import(raise_=False)
return self._have_pandas
```
The method `_check_import`:
1) sets `_tried_importing_pandas` to true
2) does some things which take time...
3) sets `_have_pandas` to true (if we indeed do have pandas)
Suppose thread 1 calls `_have_pandas_internal`. If thread 1 is at step 2
while thread 2 calls `_have_pandas_internal`, `_have_pandas_internal` may
incorrectly return False for thread 2 as thread 1 has set
`_tried_importing_pandas` to true, but has not yet (but will) set
`_have_pandas` to True. `_have_pandas_internal` will return True for thread 1.
After my fix, `_have_pandas_internal` will not return an incorrect value in
the scenario described above. It would instead result in a redundant, but (I
believe) harmless, invocation of `_check_import`.
### What changes are included in this PR?
Changes ordering of "trying to import pandas" and "recording that pandas
import has been tried"
### Are these changes tested?
yes, see test committed
### Are there any user-facing changes?
This PR resolves a user-facing race condition
https://github.com/apache/arrow/issues/39313
* Closes: #39313
Lead-authored-by: Thomas Jarosz <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/pandas-shim.pxi | 22 ++++++++++-------
python/pyarrow/tests/arrow_39313.py | 47 +++++++++++++++++++++++++++++++++++++
python/pyarrow/tests/test_pandas.py | 6 +++++
3 files changed, 67 insertions(+), 8 deletions(-)
diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi
index 273575b779..0409e133ad 100644
--- a/python/pyarrow/pandas-shim.pxi
+++ b/python/pyarrow/pandas-shim.pxi
@@ -18,6 +18,7 @@
# pandas lazy-loading API shim that reduces API call and import overhead
import warnings
+from threading import Lock
cdef class _PandasAPIShim(object):
@@ -34,12 +35,13 @@ cdef class _PandasAPIShim(object):
object _pd, _types_api, _compat_module
object _data_frame, _index, _series, _categorical_type
object _datetimetz_type, _extension_array, _extension_dtype
- object _array_like_types, _is_extension_array_dtype
+ object _array_like_types, _is_extension_array_dtype, _lock
bint has_sparse
bint _pd024
bint _is_v1, _is_ge_v21
def __init__(self):
+ self._lock = Lock()
self._tried_importing_pandas = False
self._have_pandas = 0
@@ -96,13 +98,17 @@ cdef class _PandasAPIShim(object):
self.has_sparse = False
cdef inline _check_import(self, bint raise_=True):
- if self._tried_importing_pandas:
- if not self._have_pandas and raise_:
- self._import_pandas(raise_)
- return
-
- self._tried_importing_pandas = True
- self._import_pandas(raise_)
+ if not self._tried_importing_pandas:
+ with self._lock:
+ if not self._tried_importing_pandas:
+ try:
+ self._import_pandas(raise_)
+ finally:
+ self._tried_importing_pandas = True
+ return
+
+ if not self._have_pandas and raise_:
+ self._import_pandas(raise_)
def series(self, *args, **kwargs):
self._check_import()
diff --git a/python/pyarrow/tests/arrow_39313.py
b/python/pyarrow/tests/arrow_39313.py
new file mode 100644
index 0000000000..1e769f49d9
--- /dev/null
+++ b/python/pyarrow/tests/arrow_39313.py
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file is called from a test in test_pandas.py.
+
+from threading import Thread
+
+import pandas as pd
+from pyarrow.pandas_compat import _pandas_api
+
+if __name__ == "__main__":
+ wait = True
+ num_threads = 10
+ df = pd.DataFrame()
+ results = []
+
+ def rc():
+ while wait:
+ pass
+ results.append(_pandas_api.is_data_frame(df))
+
+ threads = [Thread(target=rc) for _ in range(num_threads)]
+
+ for t in threads:
+ t.start()
+
+ wait = False
+
+ for t in threads:
+ t.join()
+
+ assert len(results) == num_threads
+ assert all(results), "`is_data_frame` returned False when given a
DataFrame"
diff --git a/python/pyarrow/tests/test_pandas.py
b/python/pyarrow/tests/test_pandas.py
index 3353bebce7..d15ee82d5d 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -34,6 +34,7 @@ import pytest
from pyarrow.pandas_compat import get_logical_type, _pandas_api
from pyarrow.tests.util import invoke_script, random_ascii, rands
import pyarrow.tests.strategies as past
+import pyarrow.tests.util as test_util
from pyarrow.vendored.version import Version
import pyarrow as pa
@@ -5008,3 +5009,8 @@ def test_nested_chunking_valid():
schema = pa.schema([("maps", map_type)])
roundtrip(pd.DataFrame({"maps": [map_of_los, map_of_los, map_of_los]}),
schema=schema)
+
+
+def test_is_data_frame_race_condition():
+ # See https://github.com/apache/arrow/issues/39313
+ test_util.invoke_script('arrow_39313.py')