pitrou commented on code in PR #49771:
URL: https://github.com/apache/arrow/pull/49771#discussion_r3201690023


##########
cpp/src/arrow/ipc/message.cc:
##########
@@ -565,6 +565,17 @@ Status DecodeMessage(MessageDecoder* decoder, 
io::InputStream* file) {
   auto metadata_length = decoder->next_required_size();
   ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length));
   if (metadata->size() != metadata_length) {
+    // The first sizeof(int32_t) bytes of the Arrow file magic ("ARRO") may 
have been
+    // misread as metadata_length. Check if the remaining bytes complete the 
magic.
+    const auto remaining_magic = 
internal::kArrowMagicBytes.substr(sizeof(int32_t));
+    if (metadata->size() >= static_cast<int64_t>(remaining_magic.size()) &&
+        std::string_view(reinterpret_cast<const char*>(metadata->data()),
+                         remaining_magic.size()) == remaining_magic) {

Review Comment:
   This would only work on a little-endian machine. We need to be 
endianness-agnostic (see `ConsumeInitialData`).



##########
cpp/src/arrow/ipc/read_write_test.cc:
##########
@@ -2265,6 +2265,52 @@ TEST(TestRecordBatchStreamReader, MalformedInput) {
   ASSERT_RAISES(Invalid, RecordBatchStreamReader::Open(&garbage_reader));
 }
 
+TEST(TestRecordBatchStreamReader, OpenFileFormatSuggestsFileReader) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeIntRecordBatch(&batch));
+
+  FileWriterHelper helper;
+  ASSERT_OK(helper.Init(batch->schema(), IpcWriteOptions::Defaults()));
+  ASSERT_OK(helper.WriteBatch(batch));
+  ASSERT_OK(helper.Finish());
+
+  io::BufferReader reader(helper.buffer_);
+  // Check we mention using the file_reader when we detect file format
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("file reader"),

Review Comment:
   Can we be a bit more specific and match a longer substring? For example "Try 
the IPC file reader".



##########
cpp/src/arrow/ipc/message.cc:
##########
@@ -565,6 +565,17 @@ Status DecodeMessage(MessageDecoder* decoder, 
io::InputStream* file) {
   auto metadata_length = decoder->next_required_size();
   ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length));
   if (metadata->size() != metadata_length) {
+    // The first sizeof(int32_t) bytes of the Arrow file magic ("ARRO") may 
have been
+    // misread as metadata_length. Check if the remaining bytes complete the 
magic.

Review Comment:
   How likely is it to have a metadata message exactly 1330795073 bytes long 
("ARRO" decoded as a 32-bit little-endian integer)? Perhaps we can check up 
front instead of trying to read so much data?
   
   cc @lidavidm @paleolimbot for opinions.



##########
cpp/src/arrow/ipc/reader.cc:
##########
@@ -1890,7 +1890,9 @@ class RecordBatchFileReaderImpl : public 
RecordBatchFileReader {
           const auto magic_start = buffer->data() + sizeof(int32_t);
           if (std::string_view(reinterpret_cast<const char*>(magic_start), 
kMagicSize) !=
               kArrowMagicBytes) {
-            return Status::Invalid("Not an Arrow file");
+            return Status::Invalid(
+                "Not an Arrow file. If this is an Arrow IPC streaming format 
file, use "
+                "the IPC stream reader instead.");

Review Comment:
   ```suggestion
                   "Not an Arrow file. If this is an Arrow IPC stream, use "
                   "the IPC stream reader instead.");
   ```



##########
cpp/src/arrow/ipc/read_write_test.cc:
##########
@@ -2265,6 +2265,52 @@ TEST(TestRecordBatchStreamReader, MalformedInput) {
   ASSERT_RAISES(Invalid, RecordBatchStreamReader::Open(&garbage_reader));
 }
 
+TEST(TestRecordBatchStreamReader, OpenFileFormatSuggestsFileReader) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeIntRecordBatch(&batch));
+
+  FileWriterHelper helper;
+  ASSERT_OK(helper.Init(batch->schema(), IpcWriteOptions::Defaults()));
+  ASSERT_OK(helper.WriteBatch(batch));
+  ASSERT_OK(helper.Finish());
+
+  io::BufferReader reader(helper.buffer_);
+  // Check we mention using the file_reader when we detect file format
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr("file reader"),
+                                  RecordBatchStreamReader::Open(&reader));
+}
+
+TEST(TestRecordBatchStreamReader, CorruptDataDoesNotSuggestFileReader) {
+  // Continuation marker + metadata_length = 100, then 8 bytes of non-magic 
data.
+  const std::string corrupt(
+      "\xff\xff\xff\xff"
+      "\x64\x00\x00\x00"
+      "ABABABAB",
+      16);
+  auto buffer = std::make_shared<Buffer>(corrupt);
+  io::BufferReader reader(buffer);
+  // Validate that we don't suggest file reader when file is just corrupt
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::Not(::testing::HasSubstr("file reader")),
+      RecordBatchStreamReader::Open(&reader));
+}
+
+TEST(TestRecordBatchFileReader, OpenStreamFormatSuggestsStreamReader) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(MakeIntRecordBatch(&batch));
+
+  StreamWriterHelper helper;
+  ASSERT_OK(helper.Init(batch->schema(), IpcWriteOptions::Defaults()));
+  ASSERT_OK(helper.WriteBatch(batch));
+  ASSERT_OK(helper.Finish());
+
+  auto buf_reader = std::make_shared<io::BufferReader>(helper.buffer_);
+  // Check we mention using the stream_reader when we detect stream format
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("stream reader"),

Review Comment:
   Same here.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to