westonpace commented on a change in pull request #11535:
URL: https://github.com/apache/arrow/pull/11535#discussion_r739588637



##########
File path: cpp/src/arrow/ipc/reader.cc
##########
@@ -1216,15 +1216,23 @@ class RecordBatchFileReaderImpl : public 
RecordBatchFileReader {
   }
 
   Future<> ReadFooterAsync(arrow::internal::Executor* executor) {
+    // When reading the footer, read this much additional data in an
+    // attempt to avoid a second I/O operation (which can be slow on a
+    // high-latency filesystem like S3)

Review comment:
       ```suggestion
       // When reading the footer, read up to this much additional data in
       // an attempt to avoid a second I/O operation (which can be slow
       // on a high-latency filesystem like S3)
   ```

##########
File path: cpp/src/arrow/ipc/reader.cc
##########
@@ -1235,19 +1243,24 @@ class RecordBatchFileReaderImpl : public 
RecordBatchFileReader {
                                    "from end of file");
           }
 
-          if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, 
magic_size)) {
+          const uint8_t* magic_start = buffer->data() + readahead;
+          if (memcmp(magic_start + sizeof(int32_t), kArrowMagicBytes, 
magic_size)) {
             return Status::Invalid("Not an Arrow file");
           }
 
-          int32_t footer_length = BitUtil::FromLittleEndian(
-              *reinterpret_cast<const int32_t*>(buffer->data()));
-
+          int32_t footer_length =
+              
BitUtil::FromLittleEndian(util::SafeLoadAs<int32_t>(magic_start));
           if (footer_length <= 0 ||
               footer_length > self->footer_offset_ - magic_size * 2 - 4) {
             return Status::Invalid("File is smaller than indicated metadata 
size");
           }
 
           // Now read the footer
+          if (footer_length <= readahead) {
+            return SliceBuffer(buffer, buffer->size() - file_end_size - 
footer_length,
+                               footer_length);
+          }
+

Review comment:
       If this condition is false would it be faster to read just the remaining 
portion instead of rereading a part of the file?

##########
File path: cpp/src/arrow/ipc/read_write_test.cc
##########
@@ -1706,6 +1706,99 @@ TEST_F(TestFileFormat, ReadFieldSubset) { 
TestReadSubsetOfFields(); }
 
 TEST_F(TestFileFormatGenerator, ReadFieldSubset) { TestReadSubsetOfFields(); }
 
+class TrackedRandomAccessFile : public io::RandomAccessFile {
+ public:
+  explicit TrackedRandomAccessFile(io::RandomAccessFile* delegate)
+      : delegate_(delegate), read_(0) {}
+
+  Status Close() override { return delegate_->Close(); }
+  bool closed() const override { return delegate_->closed(); }
+  Result<int64_t> Tell() const override { return delegate_->Tell(); }
+  Status Seek(int64_t position) override { return delegate_->Seek(position); }
+  Result<int64_t> Read(int64_t nbytes, void* out) override {
+    read_++;
+    return delegate_->Read(nbytes, out);
+  }
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override {
+    read_++;
+    return delegate_->Read(nbytes);
+  }
+  bool supports_zero_copy() const override { return 
delegate_->supports_zero_copy(); }
+  Result<int64_t> GetSize() override { return delegate_->GetSize(); }
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override 
{
+    read_++;
+    return delegate_->ReadAt(position, nbytes, out);
+  }
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) 
override {
+    read_++;
+    return delegate_->ReadAt(position, nbytes);
+  }
+  Future<std::shared_ptr<Buffer>> ReadAsync(const io::IOContext& io_context,
+                                            int64_t position, int64_t nbytes) 
override {
+    read_++;
+    return delegate_->ReadAsync(io_context, position, nbytes);
+  }

Review comment:
       This might double count in an implementation like 
`arrow::io::RandomAccessFile::ReadAsync` which (potentially) spawns a thread 
task and calls `self->ReadAt`.

##########
File path: cpp/src/arrow/ipc/message.cc
##########
@@ -290,7 +291,8 @@ Result<std::unique_ptr<Message>> ReadMessage(int64_t 
offset, int32_t metadata_le
                            decoder.next_required_size());
   }
 
-  ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length));
+  int64_t to_read = body_length > 0 ? (metadata_length + body_length) : 
metadata_length;
+  ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, to_read));

Review comment:
       Nit: `metadata` is a slightly inaccurate name now.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to