Moritz Meister created ARROW-18276:
--------------------------------------
Summary: Reading from hdfs using pyarrow 10.0.0 throws OSError:
[Errno 22] Opening HDFS file
Key: ARROW-18276
URL: https://issues.apache.org/jira/browse/ARROW-18276
Project: Apache Arrow
Issue Type: Bug
Components: Python
Affects Versions: 10.0.0
Environment: pyarrow 10.0.0
fsspec 2022.7.1
pandas 1.3.3
python 3.8.11.
Reporter: Moritz Meister
Hey!
I am trying to read a CSV file using pyarrow together with fsspec from HDFS.
I used to do this with pyarrow 9.0.0 and fsspec 2022.7.1; however, after I
upgraded to pyarrow 10.0.0, this stopped working.
I am not quite sure whether this is an incompatibility introduced in the new
pyarrow version or a bug in fsspec. If I am in the wrong place here, please
let me know.
Apart from pyarrow 10.0.0 and fsspec 2022.7.1, I am using pandas version 1.3.3
and python 3.8.11.
Here is the full stack trace:
```python
pd.read_csv("hdfs://10.0.2.15:8020/Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py
in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col,
usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters,
true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows,
na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates,
infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates,
iterator, chunksize, compression, thousands, decimal, lineterminator,
quotechar, quoting, doublequote, escapechar, comment, encoding,
encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines,
delim_whitespace, low_memory, memory_map, float_precision, storage_options)
584 kwds.update(kwds_defaults)
585
--> 586 return _read(filepath_or_buffer, kwds)
587
588
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py
in _read(filepath_or_buffer, kwds)
480
481 # Create the parser.
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
483
484 if chunksize or iterator:
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py
in __init__(self, f, engine, **kwds)
809 self.options["has_index_names"] = kwds["has_index_names"]
810
--> 811 self._engine = self._make_engine(self.engine)
812
813 def close(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/readers.py
in _make_engine(self, engine)
1038 )
1039 # error: Too many arguments for "ParserBase"
-> 1040 return mapping[engine](self.f, **self.options) # type:
ignore[call-arg]
1041
1042 def _failover_to_python(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/c_parser_wrapper.py
in __init__(self, src, **kwds)
49
50 # open handles
---> 51 self._open_handles(src, kwds)
52 assert self.handles is not None
53
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/parsers/base_parser.py
in _open_handles(self, src, kwds)
220 Let the readers open IOHandles after they are done with their
potential raises.
221 """
--> 222 self.handles = get_handle(
223 src,
224 "r",
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py
in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text,
errors, storage_options)
607
608 # open URLs
--> 609 ioargs = _get_filepath_or_buffer(
610 path_or_buf,
611 encoding=encoding,
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pandas/io/common.py
in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode,
storage_options)
356
357 try:
--> 358 file_obj = fsspec.open(
359 filepath_or_buffer, mode=fsspec_mode,
**(storage_options or {})
360 ).open()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in
open(self)
133 during the life of the file-like it generates.
134 """
--> 135 return self.__enter__()
136
137 def close(self):
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/core.py in
__enter__(self)
101 mode = self.mode.replace("t", "").replace("b", "") + "b"
102
--> 103 f = self.fs.open(self.path, mode=mode)
104
105 self.fobjects = [f]
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/spec.py in
open(self, path, mode, block_size, cache_options, compression, **kwargs)
1092 else:
1093 ac = kwargs.pop("autocommit", not self._intrans)
-> 1094 f = self._open(
1095 path,
1096 mode=mode,
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py
in wrapper(*args, **kwargs)
15 def wrapper(*args, **kwargs):
16 try:
---> 17 return func(*args, **kwargs)
18 except OSError as exception:
19 if not exception.args:
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/fsspec/implementations/arrow.py
in _open(self, path, mode, block_size, **kwargs)
157 # disable compression auto-detection
158 _kwargs["compression"] = None
--> 159 stream = method(path, **_kwargs)
160
161 return ArrowFile(self, stream, path, mode, block_size, **kwargs)
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/_fs.pyx in
pyarrow._fs.FileSystem.open_input_stream()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in
pyarrow.lib.pyarrow_internal_check_status()
/srv/hops/anaconda/envs/theenv/lib/python3.8/site-packages/pyarrow/error.pxi in
pyarrow.lib.check_status()
OSError: [Errno 22] Opening HDFS file
'10.0.2.15:8020/Projects/test/test_Training_Datasets/td_1/td/part-00000-acb66eca-636e-4917-9673-e6646f0b4486-c000.csv'
failed. Detail: [errno 22] Invalid argument
```
However, if I leave out the namenode IP and port, it works as expected:
```python
pd.read_csv("hdfs:///Projects/testing/testing_Training_Datasets/transactions_view_fraud_batch_fv_1_1/validation/part-00000-42b57ad2-57eb-4a63-bfaa-7375e82863e8-c000.csv")
```
Any help is appreciated, thank you!
--
This message was sent by Atlassian Jira
(v8.20.10#820010)