[
https://issues.apache.org/jira/browse/ARROW-2036?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16345915#comment-16345915
]
ASF GitHub Bot commented on ARROW-2036:
---------------------------------------
wesm closed pull request #1517: ARROW-2036: [Python] Support standard IOBase
methods on NativeFile
URL: https://github.com/apache/arrow/pull/1517
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git a/python/pyarrow/io-hdfs.pxi b/python/pyarrow/io-hdfs.pxi
index 3abf045f9..83b14b687 100644
--- a/python/pyarrow/io-hdfs.pxi
+++ b/python/pyarrow/io-hdfs.pxi
@@ -413,9 +413,7 @@ cdef class HadoopFileSystem:
&wr_handle))
out.wr_file = <shared_ptr[OutputStream]> wr_handle
-
- out.is_readable = False
- out.is_writeable = 1
+ out.is_writable = True
else:
with nogil:
check_status(self.client.get()
@@ -423,7 +421,6 @@ cdef class HadoopFileSystem:
out.rd_file = <shared_ptr[RandomAccessFile]> rd_handle
out.is_readable = True
- out.is_writeable = 0
if c_buffer_size == 0:
c_buffer_size = 2 ** 16
@@ -431,7 +428,7 @@ cdef class HadoopFileSystem:
out.mode = mode
out.buffer_size = c_buffer_size
out.parent = _HdfsFileNanny(self, out)
- out.is_open = True
+ out.closed = False
out.own_file = True
return out
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index bb363bacc..bd508cf57 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -39,13 +39,14 @@ cdef extern from "Python.h":
cdef class NativeFile:
-
def __cinit__(self):
- self.is_open = False
+ self.closed = True
self.own_file = False
+ self.is_readable = False
+ self.is_writable = False
def __dealloc__(self):
- if self.is_open and self.own_file:
+ if self.own_file and not self.closed:
self.close()
def __enter__(self):
@@ -65,34 +66,52 @@ cdef class NativeFile:
def __get__(self):
# Emulate built-in file modes
- if self.is_readable and self.is_writeable:
+ if self.is_readable and self.is_writable:
return 'rb+'
elif self.is_readable:
return 'rb'
- elif self.is_writeable:
+ elif self.is_writable:
return 'wb'
else:
raise ValueError('File object is malformed, has no mode')
+ def readable(self):
+ self._assert_open()
+ return self.is_readable
+
+ def writable(self):
+ self._assert_open()
+ return self.is_writable
+
+ def seekable(self):
+ self._assert_open()
+ return self.is_readable
+
def close(self):
- if self.is_open:
+ if not self.closed:
with nogil:
if self.is_readable:
check_status(self.rd_file.get().Close())
else:
check_status(self.wr_file.get().Close())
- self.is_open = False
+ self.closed = True
+
+ def flush(self):
+ """Flush the buffer stream, if applicable.
+
+ No-op to match the IOBase interface."""
+ self._assert_open()
cdef read_handle(self, shared_ptr[RandomAccessFile]* file):
self._assert_readable()
file[0] = <shared_ptr[RandomAccessFile]> self.rd_file
cdef write_handle(self, shared_ptr[OutputStream]* file):
- self._assert_writeable()
+ self._assert_writable()
file[0] = <shared_ptr[OutputStream]> self.wr_file
def _assert_open(self):
- if not self.is_open:
+ if self.closed:
raise ValueError("I/O operation on closed file")
def _assert_readable(self):
@@ -100,10 +119,10 @@ cdef class NativeFile:
if not self.is_readable:
raise IOError("only valid on readonly files")
- def _assert_writeable(self):
+ def _assert_writable(self):
self._assert_open()
- if not self.is_writeable:
- raise IOError("only valid on writeable files")
+ if not self.is_writable:
+ raise IOError("only valid on writable files")
def size(self):
"""
@@ -175,7 +194,7 @@ cdef class NativeFile:
Write byte from any object implementing buffer protocol (bytes,
bytearray, ndarray, pyarrow.Buffer)
"""
- self._assert_writeable()
+ self._assert_writable()
if isinstance(data, six.string_types):
data = tobytes(data)
@@ -224,6 +243,12 @@ cdef class NativeFile:
return PyObject_to_object(obj)
+ def read1(self, nbytes=None):
+ """Read and return up to n bytes.
+
+ Alias for read, needed to match the IOBase interface."""
+ return self.read(nbytes=None)
+
def read_buffer(self, nbytes=None):
cdef:
int64_t c_nbytes
@@ -333,7 +358,7 @@ cdef class NativeFile:
Pipe file-like object to file
"""
write_queue = Queue(50)
- self._assert_writeable()
+ self._assert_writable()
buffer_size = buffer_size or DEFAULT_BUFFER_SIZE
@@ -390,16 +415,14 @@ cdef class PythonFile(NativeFile):
if mode.startswith('w'):
self.wr_file.reset(new PyOutputStream(handle))
- self.is_readable = 0
- self.is_writeable = 1
+ self.is_writable = True
elif mode.startswith('r'):
self.rd_file.reset(new PyReadableFile(handle))
- self.is_readable = 1
- self.is_writeable = 0
+ self.is_readable = True
else:
raise ValueError('Invalid file mode: {0}'.format(mode))
- self.is_open = True
+ self.closed = False
cdef class MemoryMappedFile(NativeFile):
@@ -409,11 +432,6 @@ cdef class MemoryMappedFile(NativeFile):
cdef:
object path
- def __cinit__(self):
- self.is_open = False
- self.is_readable = 0
- self.is_writeable = 0
-
@staticmethod
def create(path, size):
cdef:
@@ -426,11 +444,11 @@ cdef class MemoryMappedFile(NativeFile):
cdef MemoryMappedFile result = MemoryMappedFile()
result.path = path
- result.is_readable = 1
- result.is_writeable = 1
+ result.is_readable = True
+ result.is_writable = True
result.wr_file = <shared_ptr[OutputStream]> handle
result.rd_file = <shared_ptr[RandomAccessFile]> handle
- result.is_open = True
+ result.closed = False
return result
@@ -444,14 +462,14 @@ cdef class MemoryMappedFile(NativeFile):
if mode in ('r', 'rb'):
c_mode = FileMode_READ
- self.is_readable = 1
+ self.is_readable = True
elif mode in ('w', 'wb'):
c_mode = FileMode_WRITE
- self.is_writeable = 1
+ self.is_writable = True
elif mode in ('r+', 'r+b', 'rb+'):
c_mode = FileMode_READWRITE
- self.is_readable = 1
- self.is_writeable = 1
+ self.is_readable = True
+ self.is_writable = True
else:
raise ValueError('Invalid file mode: {0}'.format(mode))
@@ -460,7 +478,7 @@ cdef class MemoryMappedFile(NativeFile):
self.wr_file = <shared_ptr[OutputStream]> handle
self.rd_file = <shared_ptr[RandomAccessFile]> handle
- self.is_open = True
+ self.closed = False
def memory_map(path, mode='r'):
@@ -484,7 +502,7 @@ def memory_map(path, mode='r'):
def create_memory_map(path, size):
"""
Create memory map at indicated path of the given size, return open
- writeable file object
+ writable file object
Parameters
----------
@@ -513,16 +531,14 @@ cdef class OSFile(NativeFile):
shared_ptr[Readable] handle
c_string c_path = encode_file_path(path)
- self.is_readable = self.is_writeable = 0
-
if mode in ('r', 'rb'):
self._open_readable(c_path, maybe_unbox_memory_pool(memory_pool))
elif mode in ('w', 'wb'):
- self._open_writeable(c_path)
+ self._open_writable(c_path)
else:
raise ValueError('Invalid file mode: {0}'.format(mode))
- self.is_open = True
+ self.closed = False
cdef _open_readable(self, c_string path, CMemoryPool* pool):
cdef shared_ptr[ReadableFile] handle
@@ -530,15 +546,15 @@ cdef class OSFile(NativeFile):
with nogil:
check_status(ReadableFile.Open(path, pool, &handle))
- self.is_readable = 1
+ self.is_readable = True
self.rd_file = <shared_ptr[RandomAccessFile]> handle
- cdef _open_writeable(self, c_string path):
+ cdef _open_writable(self, c_string path):
cdef shared_ptr[FileOutputStream] handle
with nogil:
check_status(FileOutputStream.Open(path, &handle))
- self.is_writeable = 1
+ self.is_writable = True
self.wr_file = <shared_ptr[OutputStream]> handle
@@ -546,9 +562,8 @@ cdef class FixedSizeBufferWriter(NativeFile):
def __cinit__(self, Buffer buffer):
self.wr_file.reset(new CFixedSizeBufferWriter(buffer.buffer))
- self.is_readable = 0
- self.is_writeable = 1
- self.is_open = True
+ self.is_writable = True
+ self.closed = False
def set_memcopy_threads(self, int num_threads):
cdef CFixedSizeBufferWriter* writer = \
@@ -738,14 +753,13 @@ cdef class BufferOutputStream(NativeFile):
self.buffer = _allocate_buffer(maybe_unbox_memory_pool(memory_pool))
self.wr_file.reset(new CBufferOutputStream(
<shared_ptr[CResizableBuffer]> self.buffer))
- self.is_readable = 0
- self.is_writeable = 1
- self.is_open = True
+ self.is_writable = True
+ self.closed = False
def get_result(self):
with nogil:
check_status(self.wr_file.get().Close())
- self.is_open = False
+ self.closed = True
return pyarrow_wrap_buffer(<shared_ptr[CBuffer]> self.buffer)
@@ -753,9 +767,8 @@ cdef class MockOutputStream(NativeFile):
def __cinit__(self):
self.wr_file.reset(new CMockOutputStream())
- self.is_readable = 0
- self.is_writeable = 1
- self.is_open = True
+ self.is_writable = True
+ self.closed = False
def size(self):
return (<CMockOutputStream*>self.wr_file.get()).GetExtentBytesWritten()
@@ -780,9 +793,8 @@ cdef class BufferReader(NativeFile):
self.buffer = frombuffer(obj)
self.rd_file.reset(new CBufferReader(self.buffer.buffer))
- self.is_readable = 1
- self.is_writeable = 0
- self.is_open = True
+ self.is_readable = True
+ self.closed = False
def frombuffer(object obj):
@@ -834,8 +846,8 @@ cdef get_writer(object source, shared_ptr[OutputStream]*
writer):
if isinstance(source, NativeFile):
nf = source
- if not nf.is_writeable:
- raise IOError('Native file is not writeable')
+ if not nf.is_writable:
+ raise IOError('Native file is not writable')
nf.write_handle(writer)
else:
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index 7534b0d0e..a30a228ae 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -429,7 +429,7 @@ def write_tensor(Tensor tensor, NativeFile dest):
int32_t metadata_length
int64_t body_length
- dest._assert_writeable()
+ dest._assert_writable()
with nogil:
check_status(
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
index f264f089c..4081fc50e 100644
--- a/python/pyarrow/ipc.py
+++ b/python/pyarrow/ipc.py
@@ -65,7 +65,7 @@ class RecordBatchStreamWriter(lib._RecordBatchWriter):
Parameters
----------
sink : str, pyarrow.NativeFile, or file-like Python object
- Either a file path, or a writeable file object
+ Either a file path, or a writable file object
schema : pyarrow.Schema
The Arrow schema for data to be written to the file
"""
@@ -96,7 +96,7 @@ class RecordBatchFileWriter(lib._RecordBatchFileWriter):
Parameters
----------
sink : str, pyarrow.NativeFile, or file-like Python object
- Either a file path, or a writeable file object
+ Either a file path, or a writable file object
schema : pyarrow.Schema
The Arrow schema for data to be written to the file
"""
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 90f749d6d..161562c04 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -333,8 +333,8 @@ cdef class NativeFile:
shared_ptr[RandomAccessFile] rd_file
shared_ptr[OutputStream] wr_file
bint is_readable
- bint is_writeable
- bint is_open
+ bint is_writable
+ readonly bint closed
bint own_file
# By implementing these "virtual" functions (all functions in Cython
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 3f7aa2e1c..da26b101d 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-from io import BytesIO
+from io import BytesIO, TextIOWrapper
import gc
import os
import pytest
@@ -482,27 +482,48 @@ def test_native_file_modes(tmpdir):
with pa.OSFile(path, mode='r') as f:
assert f.mode == 'rb'
+ assert f.readable()
+ assert not f.writable()
+ assert f.seekable()
with pa.OSFile(path, mode='rb') as f:
assert f.mode == 'rb'
+ assert f.readable()
+ assert not f.writable()
+ assert f.seekable()
with pa.OSFile(path, mode='w') as f:
assert f.mode == 'wb'
+ assert not f.readable()
+ assert f.writable()
+ assert not f.seekable()
with pa.OSFile(path, mode='wb') as f:
assert f.mode == 'wb'
+ assert not f.readable()
+ assert f.writable()
+ assert not f.seekable()
with open(path, 'wb') as f:
f.write(b'foooo')
with pa.memory_map(path, 'r') as f:
assert f.mode == 'rb'
+ assert f.readable()
+ assert not f.writable()
+ assert f.seekable()
with pa.memory_map(path, 'r+') as f:
assert f.mode == 'rb+'
+ assert f.readable()
+ assert f.writable()
+ assert f.seekable()
with pa.memory_map(path, 'r+b') as f:
assert f.mode == 'rb+'
+ assert f.readable()
+ assert f.writable()
+ assert f.seekable()
def test_native_file_raises_ValueError_after_close(tmpdir):
@@ -511,19 +532,56 @@ def
test_native_file_raises_ValueError_after_close(tmpdir):
f.write(b'foooo')
with pa.OSFile(path, mode='rb') as os_file:
- pass
+ assert not os_file.closed
+ assert os_file.closed
with pa.memory_map(path, mode='rb') as mmap_file:
- pass
+ assert not mmap_file.closed
+ assert mmap_file.closed
files = [os_file,
mmap_file]
methods = [('tell', ()),
('seek', (0,)),
- ('size', ())]
+ ('size', ()),
+ ('flush', ()),
+ ('readable', ()),
+ ('writable', ()),
+ ('seekable', ())]
for f in files:
for method, args in methods:
with pytest.raises(ValueError):
getattr(f, method)(*args)
+
+
+def test_native_file_TextIOWrapper(tmpdir):
+ data = (u'foooo\n'
+ u'barrr\n'
+ u'bazzz\n')
+
+ path = os.path.join(str(tmpdir), guid())
+ with open(path, 'wb') as f:
+ f.write(data.encode('utf-8'))
+
+ with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
+ assert fil.readable()
+ res = fil.read()
+ assert res == data
+ assert fil.closed
+
+ with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
+ # Iteration works
+ lines = list(fil)
+ assert ''.join(lines) == data
+
+ # Writing
+ path2 = os.path.join(str(tmpdir), guid())
+ with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
+ assert fil.writable()
+ fil.write(data)
+
+ with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
+ res = fil.read()
+ assert res == data
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> NativeFile should support standard IOBase methods
> -------------------------------------------------
>
> Key: ARROW-2036
> URL: https://issues.apache.org/jira/browse/ARROW-2036
> Project: Apache Arrow
> Issue Type: Improvement
> Components: Python
> Reporter: Jim Crist
> Priority: Minor
> Labels: pull-request-available
> Fix For: 0.9.0
>
>
> If `NativeFile` supported most/all of the standard IOBase methods
> ([https://docs.python.org/3/library/io.html#io.IOBase),] then it'd be easier
> to use arrow files with other python libraries. Would at least be nice to
> support enough operations to use `io.TextIOWrapper`.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)