This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new fed23f388b GH-48844: [C++] Check IPC Message body length consistency 
in IPC file (#48845)
fed23f388b is described below

commit fed23f388bc1ef0e574e9a818d64d10c38193e97
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Jan 13 20:36:07 2026 +0100

    GH-48844: [C++] Check IPC Message body length consistency in IPC file 
(#48845)
    
    ### Rationale for this change
    
    The IPC file exposes [redundant 
information](https://github.com/apache/arrow/blob/d54a2051cf9020a0fdf50836420c38ad14787abb/format/File.fbs#L39-L50)
 about Message sizes so as to allow for random access from the file footer.
    
    We tried adding [consistency 
checks](https://github.com/apache/arrow/issues/19596) in the past but this hit 
a bug in the JavaScript IPC writer at the time, so the checks were left 
disabled.
    
    The JavaScript implementation was fixed soon after (7 years ago), so this 
PR re-enables those checks so as to more easily detect potentially invalid IPC 
files.
    
    ### Are these changes tested?
    
    By existing tests.
    
    ### Are there any user-facing changes?
    
    No, unless they try reading invalid IPC files.
    
    * GitHub Issue: #48844
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/ipc/message.cc |  2 ++
 cpp/src/arrow/ipc/reader.cc  | 33 +++++++++++++++++++--------------
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc
index 7919878f14..8be09956f1 100644
--- a/cpp/src/arrow/ipc/message.cc
+++ b/cpp/src/arrow/ipc/message.cc
@@ -375,6 +375,8 @@ Result<std::unique_ptr<Message>> ReadMessage(int64_t 
offset, int32_t metadata_le
                            decoder.next_required_size());
   }
 
+  // TODO(GH-48846): we should take a body_length just like ReadMessageAsync
+  // and read metadata + body in one go.
   ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length));
   if (metadata->size() < metadata_length) {
     return Status::Invalid("Expected to read ", metadata_length,
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 4910b1596c..6a20dbb8c8 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -1180,31 +1180,36 @@ Status CheckAligned(const FileBlock& block) {
   return Status::OK();
 }
 
+template <typename MessagePtr>
+Result<MessagePtr> CheckBodyLength(MessagePtr message, const FileBlock& block) 
{
+  if (message->body_length() != block.body_length) {
+    return Status::Invalid(
+        "Mismatching body length for IPC message "
+        "(Block.bodyLength: ",
+        block.body_length, " vs. Message.bodyLength: ", 
message->body_length(), ")");
+  }
+  // NOTE: we cannot check metadata length as easily as we would have to 
account
+  // for the additional IPC signalisation (such as optional continuation 
bytes).
+  return message;
+}
+
 Result<std::unique_ptr<Message>> ReadMessageFromBlock(
     const FileBlock& block, io::RandomAccessFile* file,
     const FieldsLoaderFunction& fields_loader) {
   RETURN_NOT_OK(CheckAligned(block));
-  // TODO(wesm): this breaks integration tests, see ARROW-3256
-  // DCHECK_EQ((*out)->body_length(), block.body_length);
-
   ARROW_ASSIGN_OR_RAISE(auto message, ReadMessage(block.offset, 
block.metadata_length,
                                                   file, fields_loader));
-  return message;
+  return CheckBodyLength(std::move(message), block);
 }
 
 Future<std::shared_ptr<Message>> ReadMessageFromBlockAsync(
     const FileBlock& block, io::RandomAccessFile* file, const io::IOContext& 
io_context) {
-  if (!bit_util::IsMultipleOf8(block.offset) ||
-      !bit_util::IsMultipleOf8(block.metadata_length) ||
-      !bit_util::IsMultipleOf8(block.body_length)) {
-    return Status::Invalid("Unaligned block in IPC file");
-  }
-
-  // TODO(wesm): this breaks integration tests, see ARROW-3256
-  // DCHECK_EQ((*out)->body_length(), block.body_length);
-
+  RETURN_NOT_OK(CheckAligned(block));
   return ReadMessageAsync(block.offset, block.metadata_length, 
block.body_length, file,
-                          io_context);
+                          io_context)
+      .Then([block](std::shared_ptr<Message> message) {
+        return CheckBodyLength(std::move(message), block);
+      });
 }
 
 class RecordBatchFileReaderImpl;

Reply via email to