bkietz commented on a change in pull request #7156:
URL: https://github.com/apache/arrow/pull/7156#discussion_r426712453
##########
File path: python/pyarrow/_dataset.pyx
##########
@@ -42,6 +43,51 @@ def _forbid_instantiation(klass, subclasses_instead=True):
raise TypeError(msg)
+ctypedef CResult[shared_ptr[CRandomAccessFile]] CCustomOpen()
+
+cdef class FileSource:
+
+ cdef:
+ # XXX why is shared_ptr necessary here? CFileSource shouldn't need it
+ CFileSource wrapped
+
+ def __cinit__(self, file, FileSystem filesystem=None):
+ cdef:
+ shared_ptr[CFileSystem] c_filesystem
+ c_string c_path
+ function[CCustomOpen] c_open
+ shared_ptr[CBuffer] c_buffer
+
+ if isinstance(file, FileSource):
+ self.wrapped = (<FileSource> file).wrapped
+
+ elif isinstance(file, Buffer):
+ c_buffer = pyarrow_unwrap_buffer(file)
+ self.wrapped = CFileSource(move(c_buffer))
+
+ elif _is_path_like(file):
+ if filesystem is None:
+ raise ValueError("cannot construct a FileSource from "
+ "a path without a FileSystem")
+ c_filesystem = filesystem.unwrap()
+ c_path = tobytes(_stringify_path(file))
+ self.wrapped = CFileSource(move(c_path), move(c_filesystem))
+
+ else:
+ c_open = BindMethod[CCustomOpen](
+ wrap_python_file(file, mode='r'),
+ &NativeFile.get_random_access_file)
Review comment:
The intention of `FileSource` is precisely a lazy readable file. For
example, when dealing with an S3 filesystem, this allows us to refer to files
during discovery and dataset construction without even a `HeadObject` request.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org