westonpace commented on code in PR #13662:
URL: https://github.com/apache/arrow/pull/13662#discussion_r929419924


##########
cpp/src/arrow/io/file.h:
##########
@@ -54,11 +54,12 @@ class ARROW_EXPORT FileOutputStream : public OutputStream {
   /// \brief Open a file descriptor for writing.  The underlying file isn't
   /// truncated.
   /// \param[in] fd file descriptor
+  /// \param[in] reuse should we use the page cache for the write

Review Comment:
   ```suggestion
     /// \param[in] reuse if false, the written data will not be kept in the OS 
page cache.
     ///                  This will have some negative impact on write 
throughput but can be
     ///                  used during large writes to avoid evicting more 
important data from
     ///                  the page cache.
   ```



##########
cpp/src/arrow/filesystem/localfs.h:
##########
@@ -37,7 +37,7 @@ struct ARROW_EXPORT LocalFileSystemOptions {
   /// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
   /// or a regular one.
   bool use_mmap = false;
-
+  bool reuse = true;

Review Comment:
   Please add a doc comment explaining the meaning of this flag.



##########
cpp/src/arrow/io/file.cc:
##########
@@ -167,8 +169,23 @@ class OSFile {
     if (length < 0) {
       return Status::IOError("Length must be non-negative");
     }
+#ifdef __linux__
+    if (reuse_) {
+      return ::arrow::internal::FileWrite(fd_.fd(),
+                                          reinterpret_cast<const 
uint8_t*>(data), length);
+    } else {
+      auto status = ::arrow::internal::FileWrite(
+          fd_.fd(), reinterpret_cast<const uint8_t*>(data), length);
+      posix_fadvise(fd_.fd(), offset_, length, POSIX_FADV_DONTNEED);
+      offset_ += length;
+      return status;
+    }
+#else
+    // the posix_fadvise is not implemented on other filesystems.
+    // there could be alternative ways of getting this working but not 
implemented here.

Review Comment:
   ```suggestion
       // posix_fadvise is not implemented on other filesystems.
       // there could be alternative ways of getting this working but not 
implemented here.
   ```



##########
cpp/src/arrow/filesystem/localfs.cc:
##########
@@ -434,14 +434,15 @@ namespace {
 
 Result<std::shared_ptr<io::OutputStream>> OpenOutputStreamGeneric(const 
std::string& path,
                                                                   bool 
truncate,
-                                                                  bool append) 
{
+                                                                  bool append,
+                                                                  bool reuse) {
   RETURN_NOT_OK(ValidatePath(path));
   ARROW_ASSIGN_OR_RAISE(auto fn, PlatformFilename::FromString(path));
   const bool write_only = true;
-  ARROW_ASSIGN_OR_RAISE(
-      auto fd, ::arrow::internal::FileOpenWritable(fn, write_only, truncate, 
append));
+  ARROW_ASSIGN_OR_RAISE(auto fd, ::arrow::internal::FileOpenWritable(
+                                     fn, write_only, truncate, append, 
!reuse));

Review Comment:
   ```suggestion
     // When reuse is false we need to open the file in sync mode because 
fadvise will
     // not drop the data from cache unless it has finished writing.
     ARROW_ASSIGN_OR_RAISE(auto fd, ::arrow::internal::FileOpenWritable(
                                        fn, write_only, truncate, append, 
!reuse));
   ```



##########
cpp/src/arrow/io/file.cc:
##########
@@ -207,6 +224,8 @@ class OSFile {
   FileDescriptor fd_;
   FileMode::type mode_;
   int64_t size_{-1};
+  bool reuse_;
+  int64_t offset_{0};

Review Comment:
   Does this need to be updated on a seek?



##########
python/pyarrow/_fs.pyx:
##########
@@ -782,6 +782,9 @@ cdef class LocalFileSystem(FileSystem):
     use_mmap : bool, default False
         Whether open_input_stream and open_input_file should return
         a mmap'ed file or a regular file.
+    reuse : bool, default True
+        If set to False, will use posix_fadvise to (try) not use the page 
cache.
+        This will only work on Linux!!

Review Comment:
   ```suggestion
           If set to False, will use posix_fadvise to (try) not use the page 
cache.
           This will only work on Linux.
   ```
   
   Why `(try)`?  Also, we should make this comment consistent with the one in 
the C++.  The user should be advised this will have a negative impact on 
performance.  The user should be informed that they might want to do this to 
prevent swapping out important information.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to