This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 25d8bed210 GH-45497: [C++][CSV] Avoid buffer overflow when a line has too many columns (#45498)
25d8bed210 is described below
commit 25d8bed2100a1bcc0bd5044782ed844e8016befb
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Feb 12 13:54:24 2025 +0100
GH-45497: [C++][CSV] Avoid buffer overflow when a line has too many columns (#45498)
### What changes are included in this PR?
1. Add guard against going past the buffer's end, while minimizing the
performance overhead of the runtime check.
2. Add error propagation for buffer (re)allocation, instead of aborting.
This is unrelated to the reported issue, but is desirable nevertheless.
With these changes, a CSV line with an unexpectedly large number of columns
will raise an error such as "CSV parse error: Expected 13 columns, got ...".
### Are these changes tested?
Yes, by additional unit tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #45497
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/csv/parser.cc | 74 +++++++++++++++++++++++++++++++---------
cpp/src/arrow/csv/parser_test.cc | 72 ++++++++++++++++++++++++++++++++++++++
2 files changed, 130 insertions(+), 16 deletions(-)
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index da3472a9d9..c08c01e2dd 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -126,16 +126,30 @@ class ValueDescWriter {
{static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU, quoted_});
}
- void Finish(std::shared_ptr<Buffer>* out_values) {
- ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
- *out_values = values_buffer_;
+ Result<std::shared_ptr<Buffer>> Finish() {
+ RETURN_NOT_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
+ return std::move(values_buffer_);
+ }
+
+ const Status& status() const { return status_; }
+
+ // Convenience error-checking factory. The arguments are forwarded to the
+ // Derived class constructor.
+ template <typename... Args>
+ static Result<Derived> Make(Args&&... args) {
+ auto self = Derived(std::forward<Args>(args)...);
+ RETURN_NOT_OK(self.status());
+ return self;
}
protected:
ValueDescWriter(MemoryPool* pool, int64_t values_capacity)
- : values_size_(0), values_capacity_(values_capacity) {
- values_buffer_ = *AllocateResizableBuffer(values_capacity_ * sizeof(*values_), pool);
- values_ = reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ : values_size_(0), values_capacity_(values_capacity), status_(Status::OK()) {
+ status_ &= AllocateResizableBuffer(values_capacity_ * sizeof(*values_), pool)
+ .Value(&values_buffer_);
+ if (status_.ok()) {
+ values_ = reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ }
}
std::shared_ptr<ResizableBuffer> values_buffer_;
@@ -145,6 +159,7 @@ class ValueDescWriter {
bool quoted_;
// Checkpointing, for when an incomplete line is encountered at end of block
int64_t saved_values_size_;
+ Status status_;
};
// A helper class handling a growable buffer for values offsets. This class is
@@ -157,11 +172,21 @@ class ResizableValueDescWriter : public ValueDescWriter<ResizableValueDescWriter
void PushValue(ParsedValueDesc v) {
if (ARROW_PREDICT_FALSE(values_size_ == values_capacity_)) {
- values_capacity_ = values_capacity_ * 2;
- ARROW_CHECK_OK(values_buffer_->Resize(values_capacity_ * sizeof(*values_)));
- values_ = reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ int64_t new_capacity = values_capacity_ * 2;
+ auto resize_status = values_buffer_->Resize(new_capacity * sizeof(*values_));
+ if (resize_status.ok()) {
+ values_ = reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ values_capacity_ = new_capacity;
+ }
+ status_ &= std::move(resize_status);
+ }
+ // The `values_` pointer may have become invalid if the `Resize` call above failed.
+ // Note that ResizableValueDescWriter is less performance-critical than
+ // PresizedValueDescWriter, as it should only be called on the first line(s)
+ // of CSV data.
+ if (ARROW_PREDICT_TRUE(status_.ok())) {
+ values_[values_size_++] = v;
}
- values_[values_size_++] = v;
}
};
@@ -171,12 +196,26 @@ class ResizableValueDescWriter : public ValueDescWriter<ResizableValueDescWriter
// faster CSV parsing code.
class PresizedValueDescWriter : public ValueDescWriter<PresizedValueDescWriter> {
public:
+ // The number of offsets being written will be `1 + num_rows * num_cols`,
+ // however we allow for one extraneous write in case of excessive columns,
+ // hence `2 + num_rows * num_cols` (see explanation in PushValue below).
PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
- : ValueDescWriter(pool, /*values_capacity=*/1 + num_rows * num_cols) {}
+ : ValueDescWriter(pool, /*values_capacity=*/2 + num_rows * num_cols) {}
void PushValue(ParsedValueDesc v) {
DCHECK_LT(values_size_, values_capacity_);
- values_[values_size_++] = v;
+ values_[values_size_] = v;
+ // We must take care not to write past the buffer's end if the line being
+ // parsed has more than `num_cols` columns. The obvious solution of setting
+ // an error status hurts too much on benchmarks, which is why we instead
+ // cap `values_size_` to stay inside the buffer.
+ //
+ // Not setting an error immediately is not a problem since the `num_cols`
+ // mismatch is detected later in ParseLine.
+ //
+ // Note that we want `values_size_` to reflect the number of written values
+ // in the nominal case, which is why we choose a slightly larger `values_capacity_`.
+ values_size_ += (values_size_ != values_capacity_ - 1);
}
};
@@ -464,6 +503,7 @@ class BlockParserImpl {
RETURN_NOT_OK((ParseLine<SpecializedOptions, true>(values_writer,
parsed_writer,
data, data_end,
is_final,
&line_end,
bulk_filter)));
+ RETURN_NOT_OK(values_writer->status());
if (line_end == data) {
// Cannot parse any further
*finished_parsing = true;
@@ -477,6 +517,7 @@ class BlockParserImpl {
RETURN_NOT_OK((ParseLine<SpecializedOptions, false>(values_writer,
parsed_writer,
data, data_end,
is_final,
&line_end,
bulk_filter)));
+ RETURN_NOT_OK(values_writer->status());
if (line_end == data) {
// Cannot parse any further
*finished_parsing = true;
@@ -496,8 +537,7 @@ class BlockParserImpl {
}
// Append new buffers and update size
- std::shared_ptr<Buffer> values_buffer;
- values_writer->Finish(&values_buffer);
+ ARROW_ASSIGN_OR_RAISE(auto values_buffer, values_writer->Finish());
if (values_buffer->size() > 0) {
values_size_ += static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc) - 1);
@@ -535,7 +575,7 @@ class BlockParserImpl {
// Can't presize values when the number of columns is not known, first parse
// a single line
const int32_t rows_in_chunk = 1;
- ResizableValueDescWriter values_writer(pool_);
+ ARROW_ASSIGN_OR_RAISE(auto values_writer, ResizableValueDescWriter::Make(pool_));
values_writer.Start(parsed_writer);
RETURN_NOT_OK(ParseChunk<SpecializedOptions>(
@@ -560,7 +600,9 @@ class BlockParserImpl {
rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - batch_.num_rows_);
}
- PresizedValueDescWriter values_writer(pool_, rows_in_chunk, batch_.num_cols_);
+ ARROW_ASSIGN_OR_RAISE(
+ auto values_writer,
+ PresizedValueDescWriter::Make(pool_, rows_in_chunk, batch_.num_cols_));
values_writer.Start(parsed_writer);
RETURN_NOT_OK(ParseChunk<SpecializedOptions>(
diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc
index dd3d025202..0b5e717509 100644
--- a/cpp/src/arrow/csv/parser_test.cc
+++ b/cpp/src/arrow/csv/parser_test.cc
@@ -17,7 +17,9 @@
#include <algorithm>
#include <cstdint>
+#include <sstream>
#include <string>
+#include <string_view>
#include <utility>
#include <vector>
@@ -586,6 +588,28 @@ TEST(BlockParser, QuotesSpecial) {
}
}
+std::vector<std::string> MismatchingNumColumns(int32_t num_cols, int32_t mismatch,
+ int64_t extra_lines = 0) {
+ auto write_line = [](int32_t num_cols, std::string_view prefix, std::ostream* out) {
+ for (int32_t i = 0; i < num_cols; ++i) {
+ *out << prefix << i << ",";
+ }
+ out->seekp(-1, std::ios_base::cur);
+ *out << "\n";
+ };
+
+ std::stringstream csv_data;
+ // Output first line with `num_cols` columns
+ write_line(num_cols, "a", &csv_data);
+ // Output second line with mismatching number of columns
+ write_line(num_cols + mismatch, "b", &csv_data);
+ // Output extra lines with `num_cols` columns each
+ for (int64_t i = 0; i < extra_lines; ++i) {
+ write_line(num_cols, "c", &csv_data);
+ }
+ return {csv_data.str()};
+}
+
TEST(BlockParser, MismatchingNumColumns) {
uint32_t out_size;
{
@@ -621,6 +645,25 @@ TEST(BlockParser, MismatchingNumColumns) {
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, testing::HasSubstr("CSV parse error: Expected 2 columns, got 1: a"), st);
}
+ // Vary the number of columns and mismatch, to catch buffer overflow issues
+ for (int32_t num_cols : {1, 2, 5, 100}) {
+ ARROW_SCOPED_TRACE("num_cols = ", num_cols);
+ for (int32_t mismatch : {-5, -1, 1, 5, 10, 50, 1024, 32767}) {
+ if (mismatch + num_cols <= 0) {
+ continue;
+ }
+ ARROW_SCOPED_TRACE("mismatch = ", mismatch);
+ // Try to parse CSV data
+ auto csv_data = MismatchingNumColumns(num_cols, mismatch);
+ BlockParser parser(ParseOptions::Defaults(), num_cols, /*first_row=*/1);
+ Status st = Parse(parser, MakeCSVData(csv_data), &out_size);
+ std::stringstream expected_error;
+ expected_error << "Row #2: Expected " << num_cols << " columns, got "
+ << num_cols + mismatch << ":";
+ EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, testing::HasSubstr(expected_error.str()),
+ st);
+ }
+ }
}
TEST(BlockParser, MismatchingNumColumnsHandler) {
@@ -724,6 +767,35 @@ TEST(BlockParser, MismatchingNumColumnsHandler) {
ASSERT_NO_FATAL_FAILURE(AssertLastRowEq(parser, {"j", "k"}, {false, false}));
}
+
+ // Vary the number of columns and mismatch, to catch buffer overflow issues
+ for (int32_t num_cols : {1, 2, 5, 100}) {
+ ARROW_SCOPED_TRACE("num_cols = ", num_cols);
+ for (int32_t mismatch : {-5, -1, 1, 5, 10, 50, 1024, 32767}) {
+ if (mismatch + num_cols <= 0) {
+ continue;
+ }
+ ARROW_SCOPED_TRACE("mismatch = ", mismatch);
+ // Parse CSV data
+ auto csv_data = MismatchingNumColumns(num_cols, mismatch, /*extra_lines=*/1);
+ ParseOptions opts = ParseOptions::Defaults();
+ CustomHandler handler;
+ opts.invalid_row_handler = handler;
+ BlockParser parser(opts, num_cols, /*first_row=*/1);
+ ASSERT_NO_FATAL_FAILURE(AssertParseOk(parser, MakeCSVData(csv_data)));
+ ASSERT_EQ(2, parser.num_rows());
+ ASSERT_EQ(3, parser.total_num_rows());
+ ASSERT_EQ(1, handler.rows.size());
+ const auto& invalid_row = handler.rows[0];
+ ASSERT_EQ(num_cols, invalid_row.first.expected_columns);
+ ASSERT_EQ(num_cols + mismatch, invalid_row.first.actual_columns);
+ ASSERT_EQ("b0", invalid_row.second.substr(0, 2));
+ std::vector<std::string> last_row;
+ GetLastRow(parser, &last_row);
+ ASSERT_EQ(last_row.front(), "c0");
+ ASSERT_EQ(last_row.back(), "c" + std::to_string(num_cols - 1));
+ }
+ }
}
TEST(BlockParser, Escaping) {