[arrow] branch master updated: ARROW-10318: [C++] Use pimpl idiom in CSV parser

bkietz Tue, 20 Oct 2020 09:19:57 -0700

This is an automated email from the ASF dual-hosted git repository.

bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 2321542  ARROW-10318: [C++] Use pimpl idiom in CSV parser
2321542 is described below

commit 23215422c717b6783f28fa402e08d63bb9f4afc6
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 20 12:18:23 2020 -0400

    ARROW-10318: [C++] Use pimpl idiom in CSV parser
    
    Hide more implementation details from `parser.h`.
    Also add some more realistic parsing micro-benchmarks.
    
    Interestingly, this increases performance on the CSV parser micro-benchmarks (at least here with clang 10.0 on Ubuntu 20.04).
    * before:
    ```
    -------------------------------------------------------------------------------
    Benchmark                     Time             CPU   Iterations UserCounters...
    -------------------------------------------------------------------------------
    ParseCSVQuotedBlock         264665 ns       264597 ns         7904 bytes_per_second=612.723M/s
    ParseCSVEscapedBlock        258008 ns       257938 ns         7762 bytes_per_second=591.568M/s
    ParseCSVFlightsExample     2241195 ns      2240631 ns          932 bytes_per_second=433.608M/s
    ParseCSVVehiclesExample   19660852 ns     19656220 ns          110 bytes_per_second=583.425M/s
    ParseCSVStocksExample      4323310 ns      4322451 ns          486 bytes_per_second=485.489M/s
    ```
    * after:
    ```
    ParseCSVQuotedBlock         196201 ns       196167 ns        10717 bytes_per_second=826.462M/s
    ParseCSVEscapedBlock        182517 ns       182479 ns        11919 bytes_per_second=836.194M/s
    ParseCSVFlightsExample     1548348 ns      1548079 ns         1366 bytes_per_second=627.588M/s
    ParseCSVVehiclesExample   17251422 ns     17244772 ns          121 bytes_per_second=665.009M/s
    ParseCSVStocksExample      2253296 ns      2252727 ns          943 bytes_per_second=931.539M/s
    ```
    
    Closes #8493 from pitrou/ARROW-10318-csv-parser-pimpl
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Benjamin Kietzman <[email protected]>
---
 cpp/src/arrow/csv/parser.cc           | 725 +++++++++++++++++-----------------
 cpp/src/arrow/csv/parser.h            | 158 ++++----
 cpp/src/arrow/csv/parser_benchmark.cc |  94 ++++-
 3 files changed, 520 insertions(+), 457 deletions(-)

diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index 77ad402..07e5612 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -30,43 +30,22 @@
 namespace arrow {
 namespace csv {
 
-static Status ParseError(const char* message) {
+using detail::DataBatch;
+using detail::ParsedValueDesc;
+
+namespace {
+
+Status ParseError(const char* message) {
   return Status::Invalid("CSV parse error: ", message);
 }
 
-static Status MismatchingColumns(int32_t expected, int32_t actual) {
+Status MismatchingColumns(int32_t expected, int32_t actual) {
   char s[50];
   snprintf(s, sizeof(s), "Expected %d columns, got %d", expected, actual);
   return ParseError(s);
 }
 
-static inline bool IsControlChar(uint8_t c) { return c < ' '; }
-
-int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
-                 const uint8_t** out_data) {
-  const auto end = data + size;
-  int32_t skipped_rows = 0;
-  *out_data = data;
-
-  for (; skipped_rows < num_rows; ++skipped_rows) {
-    uint8_t c;
-    do {
-      while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
-        ++data;
-      }
-      if (ARROW_PREDICT_FALSE(data == end)) {
-        return skipped_rows;
-      }
-      c = *data++;
-    } while (c != '\r' && c != '\n');
-    if (c == '\r' && data < end && *data == '\n') {
-      ++data;
-    }
-    *out_data = data;
-  }
-
-  return skipped_rows;
-}
+inline bool IsControlChar(uint8_t c) { return c < ' '; }
 
 template <bool Quoting, bool Escaping>
 class SpecializedOptions {
@@ -77,9 +56,9 @@ class SpecializedOptions {
 
 // A helper class allocating the buffer for parsed values and writing into it
 // without any further resizes, except at the end.
-class BlockParser::PresizedParsedWriter {
+class PresizedDataWriter {
  public:
-  PresizedParsedWriter(MemoryPool* pool, uint32_t size)
+  PresizedDataWriter(MemoryPool* pool, uint32_t size)
       : parsed_size_(0), parsed_capacity_(size) {
     parsed_buffer_ = *AllocateResizableBuffer(parsed_capacity_, pool);
     parsed_ = parsed_buffer_->mutable_data();
@@ -111,51 +90,44 @@ class BlockParser::PresizedParsedWriter {
   int64_t saved_parsed_size_;
 };
 
-// A helper class handling a growable buffer for values offsets.  This class is
-// used when the number of columns is not yet known and we therefore cannot
-// efficiently presize the target area for a given number of rows.
-class BlockParser::ResizableValuesWriter {
+template <typename Derived>
+class ValueDescWriter {
  public:
-  explicit ResizableValuesWriter(MemoryPool* pool)
-      : values_size_(0), values_capacity_(256) {
-    values_buffer_ = *AllocateResizableBuffer(values_capacity_ * 
sizeof(*values_), pool);
-    values_ = reinterpret_cast<ValueDesc*>(values_buffer_->mutable_data());
-  }
-
-  template <typename ParsedWriter>
-  void Start(ParsedWriter& parsed_writer) {
-    PushValue({static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU, 
false});
-  }
+  Derived* derived() { return static_cast<Derived*>(this); }
 
-  void Finish(std::shared_ptr<Buffer>* out_values) {
-    ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
-    *out_values = values_buffer_;
+  template <typename DataWriter>
+  void Start(DataWriter& parsed_writer) {
+    derived()->PushValue(
+        {static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU, false});
   }
 
   void BeginLine() { saved_values_size_ = values_size_; }
 
+  // Rollback the state that was saved in BeginLine()
+  void RollbackLine() { values_size_ = saved_values_size_; }
+
   void StartField(bool quoted) { quoted_ = quoted; }
 
-  template <typename ParsedWriter>
-  void FinishField(ParsedWriter* parsed_writer) {
-    PushValue({static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU, 
quoted_});
+  template <typename DataWriter>
+  void FinishField(DataWriter* parsed_writer) {
+    derived()->PushValue(
+        {static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU, quoted_});
   }
 
-  // Rollback the state that was saved in BeginLine()
-  void RollbackLine() { values_size_ = saved_values_size_; }
+  void Finish(std::shared_ptr<Buffer>* out_values) {
+    ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
+    *out_values = values_buffer_;
+  }
 
  protected:
-  void PushValue(ValueDesc v) {
-    if (ARROW_PREDICT_FALSE(values_size_ == values_capacity_)) {
-      values_capacity_ = values_capacity_ * 2;
-      ARROW_CHECK_OK(values_buffer_->Resize(values_capacity_ * 
sizeof(*values_)));
-      values_ = reinterpret_cast<ValueDesc*>(values_buffer_->mutable_data());
-    }
-    values_[values_size_++] = v;
+  ValueDescWriter(MemoryPool* pool, int64_t values_capacity)
+      : values_size_(0), values_capacity_(values_capacity) {
+    values_buffer_ = *AllocateResizableBuffer(values_capacity_ * 
sizeof(*values_), pool);
+    values_ = 
reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
   }
 
   std::shared_ptr<ResizableBuffer> values_buffer_;
-  ValueDesc* values_;
+  ParsedValueDesc* values_;
   int64_t values_size_;
   int64_t values_capacity_;
   bool quoted_;
@@ -163,392 +135,415 @@ class BlockParser::ResizableValuesWriter {
   int64_t saved_values_size_;
 };
 
+// A helper class handling a growable buffer for values offsets.  This class is
+// used when the number of columns is not yet known and we therefore cannot
+// efficiently presize the target area for a given number of rows.
+class ResizableValueDescWriter : public 
ValueDescWriter<ResizableValueDescWriter> {
+ public:
+  explicit ResizableValueDescWriter(MemoryPool* pool)
+      : ValueDescWriter(pool, /*values_capacity=*/256) {}
+
+  void PushValue(ParsedValueDesc v) {
+    if (ARROW_PREDICT_FALSE(values_size_ == values_capacity_)) {
+      values_capacity_ = values_capacity_ * 2;
+      ARROW_CHECK_OK(values_buffer_->Resize(values_capacity_ * 
sizeof(*values_)));
+      values_ = 
reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+    }
+    values_[values_size_++] = v;
+  }
+};
+
 // A helper class allocating the buffer for values offsets and writing into it
 // without any further resizes, except at the end.  This class is used once the
 // number of columns is known, as it eliminates resizes and generates simpler,
 // faster CSV parsing code.
-class BlockParser::PresizedValuesWriter {
+class PresizedValueDescWriter : public 
ValueDescWriter<PresizedValueDescWriter> {
  public:
-  PresizedValuesWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
-      : values_size_(0), values_capacity_(1 + num_rows * num_cols) {
-    values_buffer_ = *AllocateResizableBuffer(values_capacity_ * 
sizeof(*values_), pool);
-    values_ = reinterpret_cast<ValueDesc*>(values_buffer_->mutable_data());
-  }
-
-  template <typename ParsedWriter>
-  void Start(ParsedWriter& parsed_writer) {
-    PushValue({static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU, 
false});
-  }
-
-  void Finish(std::shared_ptr<Buffer>* out_values) {
-    ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
-    *out_values = values_buffer_;
-  }
+  PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
+      : ValueDescWriter(pool, /*values_capacity=*/1 + num_rows * num_cols) {}
 
-  void BeginLine() { saved_values_size_ = values_size_; }
-
-  void StartField(bool quoted) { quoted_ = quoted; }
-
-  template <typename ParsedWriter>
-  void FinishField(ParsedWriter* parsed_writer) {
-    PushValue({static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU, 
quoted_});
-  }
-
-  // Rollback the state that was saved in BeginLine()
-  void RollbackLine() { values_size_ = saved_values_size_; }
-
- protected:
-  void PushValue(ValueDesc v) {
+  void PushValue(ParsedValueDesc v) {
     DCHECK_LT(values_size_, values_capacity_);
     values_[values_size_++] = v;
   }
-
-  std::shared_ptr<ResizableBuffer> values_buffer_;
-  ValueDesc* values_;
-  int64_t values_size_;
-  const int64_t values_capacity_;
-  bool quoted_;
-  // Checkpointing, for when an incomplete line is encountered at end of block
-  int64_t saved_values_size_;
 };
 
-template <typename SpecializedOptions, typename ValuesWriter, typename 
ParsedWriter>
-Status BlockParser::ParseLine(ValuesWriter* values_writer, ParsedWriter* 
parsed_writer,
-                              const char* data, const char* data_end, bool 
is_final,
-                              const char** out_data) {
-  int32_t num_cols = 0;
-  char c;
+}  // namespace
+
+class BlockParserImpl {
+ public:
+  BlockParserImpl(MemoryPool* pool, ParseOptions options, int32_t num_cols,
+                  int32_t max_num_rows)
+      : pool_(pool), options_(options), max_num_rows_(max_num_rows), 
batch_(num_cols) {}
+
+  const DataBatch& parsed_batch() const { return batch_; }
 
-  DCHECK_GT(data_end, data);
+  template <typename SpecializedOptions, typename ValueDescWriter, typename 
DataWriter>
+  Status ParseLine(ValueDescWriter* values_writer, DataWriter* parsed_writer,
+                   const char* data, const char* data_end, bool is_final,
+                   const char** out_data) {
+    int32_t num_cols = 0;
+    char c;
 
-  auto FinishField = [&]() { values_writer->FinishField(parsed_writer); };
+    DCHECK_GT(data_end, data);
 
-  values_writer->BeginLine();
-  parsed_writer->BeginLine();
+    auto FinishField = [&]() { values_writer->FinishField(parsed_writer); };
 
-  // The parsing state machine
+    values_writer->BeginLine();
+    parsed_writer->BeginLine();
 
-  // Special case empty lines: do we start with a newline separator?
-  c = *data;
-  if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
-    if (c == '\r') {
-      data++;
-      if (data < data_end && *data == '\n') {
+    // The parsing state machine
+
+    // Special case empty lines: do we start with a newline separator?
+    c = *data;
+    if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
+      if (c == '\r') {
         data++;
+        if (data < data_end && *data == '\n') {
+          data++;
+        }
+        goto EmptyLine;
+      }
+      if (c == '\n') {
+        data++;
+        goto EmptyLine;
       }
-      goto EmptyLine;
-    }
-    if (c == '\n') {
-      data++;
-      goto EmptyLine;
     }
-  }
 
-FieldStart:
-  // At the start of a field
-  // Quoting is only recognized at start of field
-  if (SpecializedOptions::quoting && ARROW_PREDICT_FALSE(*data == 
options_.quote_char)) {
-    ++data;
-    values_writer->StartField(true /* quoted */);
-    goto InQuotedField;
-  } else {
-    values_writer->StartField(false /* quoted */);
-    goto InField;
-  }
+  FieldStart:
+    // At the start of a field
+    // Quoting is only recognized at start of field
+    if (SpecializedOptions::quoting &&
+        ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
+      ++data;
+      values_writer->StartField(true /* quoted */);
+      goto InQuotedField;
+    } else {
+      values_writer->StartField(false /* quoted */);
+      goto InField;
+    }
 
-InField:
-  // Inside a non-quoted part of a field
-  if (ARROW_PREDICT_FALSE(data == data_end)) {
-    goto AbortLine;
-  }
-  c = *data++;
-  if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == 
options_.escape_char)) {
+  InField:
+    // Inside a non-quoted part of a field
     if (ARROW_PREDICT_FALSE(data == data_end)) {
       goto AbortLine;
     }
     c = *data++;
-    parsed_writer->PushFieldChar(c);
-    goto InField;
-  }
-  if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
-    goto FieldEnd;
-  }
-  if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
-    if (c == '\r') {
-      // In the middle of a newline separator?
-      if (ARROW_PREDICT_TRUE(data < data_end) && *data == '\n') {
-        data++;
+    if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == 
options_.escape_char)) {
+      if (ARROW_PREDICT_FALSE(data == data_end)) {
+        goto AbortLine;
       }
-      goto LineEnd;
+      c = *data++;
+      parsed_writer->PushFieldChar(c);
+      goto InField;
     }
-    if (c == '\n') {
-      goto LineEnd;
+    if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
+      goto FieldEnd;
     }
-  }
-  parsed_writer->PushFieldChar(c);
-  goto InField;
+    if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
+      if (c == '\r') {
+        // In the middle of a newline separator?
+        if (ARROW_PREDICT_TRUE(data < data_end) && *data == '\n') {
+          data++;
+        }
+        goto LineEnd;
+      }
+      if (c == '\n') {
+        goto LineEnd;
+      }
+    }
+    parsed_writer->PushFieldChar(c);
+    goto InField;
 
-InQuotedField:
-  // Inside a quoted part of a field
-  if (ARROW_PREDICT_FALSE(data == data_end)) {
-    goto AbortLine;
-  }
-  c = *data++;
-  if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == 
options_.escape_char)) {
+  InQuotedField:
+    // Inside a quoted part of a field
     if (ARROW_PREDICT_FALSE(data == data_end)) {
       goto AbortLine;
     }
     c = *data++;
+    if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c == 
options_.escape_char)) {
+      if (ARROW_PREDICT_FALSE(data == data_end)) {
+        goto AbortLine;
+      }
+      c = *data++;
+      parsed_writer->PushFieldChar(c);
+      goto InQuotedField;
+    }
+    if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
+      if (options_.double_quote && ARROW_PREDICT_TRUE(data < data_end) &&
+          ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
+        // Double-quoting
+        ++data;
+      } else {
+        // End of single-quoting
+        goto InField;
+      }
+    }
     parsed_writer->PushFieldChar(c);
     goto InQuotedField;
-  }
-  if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
-    if (options_.double_quote && ARROW_PREDICT_TRUE(data < data_end) &&
-        ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
-      // Double-quoting
-      ++data;
-    } else {
-      // End of single-quoting
-      goto InField;
-    }
-  }
-  parsed_writer->PushFieldChar(c);
-  goto InQuotedField;
-
-FieldEnd:
-  // At the end of a field
-  FinishField();
-  ++num_cols;
-  if (ARROW_PREDICT_FALSE(data == data_end)) {
-    goto AbortLine;
-  }
-  goto FieldStart;
-
-LineEnd:
-  // At the end of line
-  FinishField();
-  ++num_cols;
-  if (ARROW_PREDICT_FALSE(num_cols != num_cols_)) {
-    if (num_cols_ == -1) {
-      num_cols_ = num_cols;
-    } else {
-      return MismatchingColumns(num_cols_, num_cols);
+
+  FieldEnd:
+    // At the end of a field
+    FinishField();
+    ++num_cols;
+    if (ARROW_PREDICT_FALSE(data == data_end)) {
+      goto AbortLine;
     }
-  }
-  ++num_rows_;
-  *out_data = data;
-  return Status::OK();
+    goto FieldStart;
 
-AbortLine:
-  // Not a full line except perhaps if in final block
-  if (is_final) {
+  LineEnd:
+    // At the end of line
     FinishField();
     ++num_cols;
-    if (num_cols_ == -1) {
-      num_cols_ = num_cols;
-    } else if (num_cols != num_cols_) {
-      return MismatchingColumns(num_cols_, num_cols);
+    if (ARROW_PREDICT_FALSE(num_cols != batch_.num_cols_)) {
+      if (batch_.num_cols_ == -1) {
+        batch_.num_cols_ = num_cols;
+      } else {
+        return MismatchingColumns(batch_.num_cols_, num_cols);
+      }
     }
-    ++num_rows_;
+    ++batch_.num_rows_;
     *out_data = data;
     return Status::OK();
-  }
-  // Truncated line at end of block, rewind parsed state
-  values_writer->RollbackLine();
-  parsed_writer->RollbackLine();
-  return Status::OK();
-
-EmptyLine:
-  if (!options_.ignore_empty_lines) {
-    if (num_cols_ == -1) {
-      // Consider as single value
-      num_cols_ = 1;
+
+  AbortLine:
+    // Not a full line except perhaps if in final block
+    if (is_final) {
+      goto LineEnd;
     }
-    // Record as row of empty (null?) values
-    while (num_cols++ < num_cols_) {
-      values_writer->StartField(false /* quoted */);
-      FinishField();
+    // Truncated line at end of block, rewind parsed state
+    values_writer->RollbackLine();
+    parsed_writer->RollbackLine();
+    return Status::OK();
+
+  EmptyLine:
+    if (!options_.ignore_empty_lines) {
+      if (batch_.num_cols_ == -1) {
+        // Consider as single value
+        batch_.num_cols_ = 1;
+      }
+      // Record as row of empty (null?) values
+      while (num_cols++ < batch_.num_cols_) {
+        values_writer->StartField(false /* quoted */);
+        FinishField();
+      }
+      ++batch_.num_rows_;
     }
-    ++num_rows_;
+    *out_data = data;
+    return Status::OK();
   }
-  *out_data = data;
-  return Status::OK();
-}
 
-template <typename SpecializedOptions, typename ValuesWriter, typename 
ParsedWriter>
-Status BlockParser::ParseChunk(ValuesWriter* values_writer, ParsedWriter* 
parsed_writer,
-                               const char* data, const char* data_end, bool 
is_final,
-                               int32_t rows_in_chunk, const char** out_data,
-                               bool* finished_parsing) {
-  int32_t num_rows_deadline = num_rows_ + rows_in_chunk;
-
-  while (data < data_end && num_rows_ < num_rows_deadline) {
-    const char* line_end = data;
-    RETURN_NOT_OK(ParseLine<SpecializedOptions>(values_writer, parsed_writer, 
data,
-                                                data_end, is_final, 
&line_end));
-    if (line_end == data) {
-      // Cannot parse any further
-      *finished_parsing = true;
-      break;
+  template <typename SpecializedOptions, typename ValueDescWriter, typename 
DataWriter>
+  Status ParseChunk(ValueDescWriter* values_writer, DataWriter* parsed_writer,
+                    const char* data, const char* data_end, bool is_final,
+                    int32_t rows_in_chunk, const char** out_data,
+                    bool* finished_parsing) {
+    int32_t num_rows_deadline = batch_.num_rows_ + rows_in_chunk;
+
+    while (data < data_end && batch_.num_rows_ < num_rows_deadline) {
+      const char* line_end = data;
+      RETURN_NOT_OK(ParseLine<SpecializedOptions>(values_writer, 
parsed_writer, data,
+                                                  data_end, is_final, 
&line_end));
+      if (line_end == data) {
+        // Cannot parse any further
+        *finished_parsing = true;
+        break;
+      }
+      data = line_end;
     }
-    data = line_end;
-  }
-  // Append new buffers and update size
-  std::shared_ptr<Buffer> values_buffer;
-  values_writer->Finish(&values_buffer);
-  if (values_buffer->size() > 0) {
-    values_size_ += static_cast<int32_t>(values_buffer->size() / 
sizeof(ValueDesc) - 1);
-    values_buffers_.push_back(std::move(values_buffer));
+    // Append new buffers and update size
+    std::shared_ptr<Buffer> values_buffer;
+    values_writer->Finish(&values_buffer);
+    if (values_buffer->size() > 0) {
+      values_size_ +=
+          static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc) 
- 1);
+      batch_.values_buffers_.push_back(std::move(values_buffer));
+    }
+    *out_data = data;
+    return Status::OK();
   }
-  *out_data = data;
-  return Status::OK();
-}
 
-template <typename SpecializedOptions>
-Status BlockParser::DoParseSpecialized(const std::vector<util::string_view>& 
views,
-                                       bool is_final, uint32_t* out_size) {
-  num_rows_ = 0;
-  values_size_ = 0;
-  parsed_size_ = 0;
-  values_buffers_.clear();
-  parsed_buffer_.reset();
-  parsed_ = nullptr;
-
-  size_t total_view_length = 0;
-  for (const auto& view : views) {
-    total_view_length += view.length();
-  }
-  if (total_view_length > std::numeric_limits<uint32_t>::max()) {
-    return Status::Invalid("CSV block too large");
-  }
+  template <typename SpecializedOptions>
+  Status ParseSpecialized(const std::vector<util::string_view>& views, bool 
is_final,
+                          uint32_t* out_size) {
+    batch_ = DataBatch{batch_.num_cols_};
+    values_size_ = 0;
 
-  PresizedParsedWriter parsed_writer(pool_, 
static_cast<uint32_t>(total_view_length));
-  uint32_t total_parsed_length = 0;
-
-  for (const auto& view : views) {
-    const char* data = view.data();
-    const char* data_end = view.data() + view.length();
-    bool finished_parsing = false;
-
-    if (num_cols_ == -1) {
-      // Can't presize values when the number of columns is not known, first 
parse
-      // a single line
-      const int32_t rows_in_chunk = 1;
-      ResizableValuesWriter values_writer(pool_);
-      values_writer.Start(parsed_writer);
-
-      RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer, 
&parsed_writer, data,
-                                                   data_end, is_final, 
rows_in_chunk,
-                                                   &data, &finished_parsing));
-      if (num_cols_ == -1) {
-        return ParseError("Empty CSV file or block: cannot infer number of 
columns");
-      }
+    size_t total_view_length = 0;
+    for (const auto& view : views) {
+      total_view_length += view.length();
+    }
+    if (total_view_length > std::numeric_limits<uint32_t>::max()) {
+      return Status::Invalid("CSV block too large");
     }
 
-    while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) {
-      // We know the number of columns, so can presize a values array for
-      // a given number of rows
-      DCHECK_GE(num_cols_, 0);
-
-      int32_t rows_in_chunk;
-      constexpr int32_t kTargetChunkSize = 32768;
-      if (num_cols_ > 0) {
-        rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, 512),
-                                 max_num_rows_ - num_rows_);
-      } else {
-        rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_);
+    PresizedDataWriter parsed_writer(pool_, 
static_cast<uint32_t>(total_view_length));
+    uint32_t total_parsed_length = 0;
+
+    for (const auto& view : views) {
+      const char* data = view.data();
+      const char* data_end = view.data() + view.length();
+      bool finished_parsing = false;
+
+      if (batch_.num_cols_ == -1) {
+        // Can't presize values when the number of columns is not known, first 
parse
+        // a single line
+        const int32_t rows_in_chunk = 1;
+        ResizableValueDescWriter values_writer(pool_);
+        values_writer.Start(parsed_writer);
+
+        RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer, 
&parsed_writer, data,
+                                                     data_end, is_final, 
rows_in_chunk,
+                                                     &data, 
&finished_parsing));
+        if (batch_.num_cols_ == -1) {
+          return ParseError("Empty CSV file or block: cannot infer number of 
columns");
+        }
       }
 
-      PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_);
-      values_writer.Start(parsed_writer);
-
-      RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer, 
&parsed_writer, data,
-                                                   data_end, is_final, 
rows_in_chunk,
-                                                   &data, &finished_parsing));
-    }
-    DCHECK_GE(data, view.data());
-    DCHECK_LE(data, data_end);
-    total_parsed_length += static_cast<uint32_t>(data - view.data());
+      while (!finished_parsing && data < data_end && batch_.num_rows_ < 
max_num_rows_) {
+        // We know the number of columns, so can presize a values array for
+        // a given number of rows
+        DCHECK_GE(batch_.num_cols_, 0);
+
+        int32_t rows_in_chunk;
+        constexpr int32_t kTargetChunkSize = 32768;  // in number of values
+        if (batch_.num_cols_ > 0) {
+          rows_in_chunk = std::min(std::max(kTargetChunkSize / 
batch_.num_cols_, 512),
+                                   max_num_rows_ - batch_.num_rows_);
+        } else {
+          rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - 
batch_.num_rows_);
+        }
+
+        PresizedValueDescWriter values_writer(pool_, rows_in_chunk, 
batch_.num_cols_);
+        values_writer.Start(parsed_writer);
+
+        RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer, 
&parsed_writer, data,
+                                                     data_end, is_final, 
rows_in_chunk,
+                                                     &data, 
&finished_parsing));
+      }
+      DCHECK_GE(data, view.data());
+      DCHECK_LE(data, data_end);
+      total_parsed_length += static_cast<uint32_t>(data - view.data());
 
-    if (data < data_end) {
-      // Stopped early, for some reason
-      break;
+      if (data < data_end) {
+        // Stopped early, for some reason
+        break;
+      }
     }
-  }
 
-  parsed_writer.Finish(&parsed_buffer_);
-  parsed_size_ = static_cast<int32_t>(parsed_buffer_->size());
-  parsed_ = parsed_buffer_->data();
+    parsed_writer.Finish(&batch_.parsed_buffer_);
+    batch_.parsed_size_ = static_cast<int32_t>(batch_.parsed_buffer_->size());
+    batch_.parsed_ = batch_.parsed_buffer_->data();
 
-  DCHECK_EQ(values_size_, num_rows_ * num_cols_);
-  if (num_cols_ == -1) {
-    DCHECK_EQ(num_rows_, 0);
-  }
+    if (batch_.num_cols_ == -1) {
+      DCHECK_EQ(batch_.num_rows_, 0);
+    }
+    DCHECK_EQ(values_size_, batch_.num_rows_ * batch_.num_cols_);
 #ifndef NDEBUG
-  if (num_rows_ > 0) {
-    DCHECK_GT(values_buffers_.size(), 0);
-    auto& last_values_buffer = values_buffers_.back();
-    auto last_values = reinterpret_cast<const 
ValueDesc*>(last_values_buffer->data());
-    auto last_values_size = last_values_buffer->size() / sizeof(ValueDesc);
-    auto check_parsed_size =
-        static_cast<int32_t>(last_values[last_values_size - 1].offset);
-    DCHECK_EQ(parsed_size_, check_parsed_size);
-  } else {
-    DCHECK_EQ(parsed_size_, 0);
-  }
-#endif
-  *out_size = static_cast<uint32_t>(total_parsed_length);
-  return Status::OK();
-}
-
-Status BlockParser::DoParse(const std::vector<util::string_view>& data, bool 
is_final,
-                            uint32_t* out_size) {
-  if (options_.quoting) {
-    if (options_.escaping) {
-      return DoParseSpecialized<SpecializedOptions<true, true>>(data, 
is_final, out_size);
+    if (batch_.num_rows_ > 0) {
+      // Ending parsed offset should be equal to number of parsed bytes
+      DCHECK_GT(batch_.values_buffers_.size(), 0);
+      const auto& last_values_buffer = batch_.values_buffers_.back();
+      const auto last_values =
+          reinterpret_cast<const ParsedValueDesc*>(last_values_buffer->data());
+      const auto last_values_size = last_values_buffer->size() / 
sizeof(ParsedValueDesc);
+      const auto check_parsed_size =
+          static_cast<int32_t>(last_values[last_values_size - 1].offset);
+      DCHECK_EQ(batch_.parsed_size_, check_parsed_size);
     } else {
-      return DoParseSpecialized<SpecializedOptions<true, false>>(data, 
is_final,
-                                                                 out_size);
+      DCHECK_EQ(batch_.parsed_size_, 0);
     }
-  } else {
-    if (options_.escaping) {
-      return DoParseSpecialized<SpecializedOptions<false, true>>(data, 
is_final,
+#endif
+    *out_size = static_cast<uint32_t>(total_parsed_length);
+    return Status::OK();
+  }
+
+  Status Parse(const std::vector<util::string_view>& data, bool is_final,
+               uint32_t* out_size) {
+    if (options_.quoting) {
+      if (options_.escaping) {
+        return ParseSpecialized<SpecializedOptions<true, true>>(data, 
is_final, out_size);
+      } else {
+        return ParseSpecialized<SpecializedOptions<true, false>>(data, 
is_final,
                                                                  out_size);
+      }
     } else {
-      return DoParseSpecialized<SpecializedOptions<false, false>>(data, 
is_final,
+      if (options_.escaping) {
+        return ParseSpecialized<SpecializedOptions<false, true>>(data, 
is_final,
+                                                                 out_size);
+      } else {
+        return ParseSpecialized<SpecializedOptions<false, false>>(data, 
is_final,
                                                                   out_size);
+      }
     }
   }
-}
+
+ protected:
+  MemoryPool* pool_;
+  const ParseOptions options_;
+  // The maximum number of rows to parse from a block
+  int32_t max_num_rows_;
+
+  // Unparsed data size
+  int32_t values_size_;
+  // Parsed data batch
+  DataBatch batch_;
+};
+
+BlockParser::BlockParser(ParseOptions options, int32_t num_cols, int32_t 
max_num_rows)
+    : BlockParser(default_memory_pool(), options, num_cols, max_num_rows) {}
+
+BlockParser::BlockParser(MemoryPool* pool, ParseOptions options, int32_t 
num_cols,
+                         int32_t max_num_rows)
+    : impl_(new BlockParserImpl(pool, std::move(options), num_cols, 
max_num_rows)) {}
+
+BlockParser::~BlockParser() {}
 
 Status BlockParser::Parse(const std::vector<util::string_view>& data,
                           uint32_t* out_size) {
-  return DoParse(data, false /* is_final */, out_size);
+  return impl_->Parse(data, false /* is_final */, out_size);
 }
 
 Status BlockParser::ParseFinal(const std::vector<util::string_view>& data,
                                uint32_t* out_size) {
-  return DoParse(data, true /* is_final */, out_size);
+  return impl_->Parse(data, true /* is_final */, out_size);
 }
 
 Status BlockParser::Parse(util::string_view data, uint32_t* out_size) {
-  return DoParse({data}, false /* is_final */, out_size);
+  return impl_->Parse({data}, false /* is_final */, out_size);
 }
 
 Status BlockParser::ParseFinal(util::string_view data, uint32_t* out_size) {
-  return DoParse({data}, true /* is_final */, out_size);
+  return impl_->Parse({data}, true /* is_final */, out_size);
 }
 
-BlockParser::BlockParser(MemoryPool* pool, ParseOptions options, int32_t 
num_cols,
-                         int32_t max_num_rows)
-    : pool_(pool),
-      options_(options),
-      num_rows_(-1),
-      num_cols_(num_cols),
-      max_num_rows_(max_num_rows) {}
+const DataBatch& BlockParser::parsed_batch() const { return 
impl_->parsed_batch(); }
 
-BlockParser::BlockParser(ParseOptions options, int32_t num_cols, int32_t 
max_num_rows)
-    : BlockParser(default_memory_pool(), options, num_cols, max_num_rows) {}
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+                 const uint8_t** out_data) {
+  const auto end = data + size;
+  int32_t skipped_rows = 0;
+  *out_data = data;
+
+  for (; skipped_rows < num_rows; ++skipped_rows) {
+    uint8_t c;
+    do {
+      while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
+        ++data;
+      }
+      if (ARROW_PREDICT_FALSE(data == end)) {
+        return skipped_rows;
+      }
+      c = *data++;
+    } while (c != '\r' && c != '\n');
+    if (c == '\r' && data < end && *data == '\n') {
+      ++data;
+    }
+    *out_data = data;
+  }
+
+  return skipped_rows;
+}
 
 }  // namespace csv
 }  // namespace arrow
diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h
index ad44ef2..4fcc52f 100644
--- a/cpp/src/arrow/csv/parser.h
+++ b/cpp/src/arrow/csv/parser.h
@@ -35,8 +35,6 @@ class MemoryPool;
 
 namespace csv {
 
-constexpr int32_t kMaxParserNumRows = 100000;
-
 /// Skip at most num_rows from the given input.  The input pointer is updated
 /// and the number of actually skipped rows is returned (may be less than
 /// requested if the input is too short).
@@ -44,6 +42,83 @@ ARROW_EXPORT
 int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
                  const uint8_t** out_data);
 
+class BlockParserImpl;
+
+namespace detail {
+
+struct ParsedValueDesc {
+  uint32_t offset : 31;
+  bool quoted : 1;
+};
+
+class ARROW_EXPORT DataBatch {
+ public:
+  explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {}
+
+  /// \brief Return the number of parsed rows
+  int32_t num_rows() const { return num_rows_; }
+  /// \brief Return the number of parsed columns
+  int32_t num_cols() const { return num_cols_; }
+  /// \brief Return the total size in bytes of parsed data
+  uint32_t num_bytes() const { return parsed_size_; }
+
+  template <typename Visitor>
+  Status VisitColumn(int32_t col_index, Visitor&& visit) const {
+    using detail::ParsedValueDesc;
+
+    for (size_t buf_index = 0; buf_index < values_buffers_.size(); 
++buf_index) {
+      const auto& values_buffer = values_buffers_[buf_index];
+      const auto values = reinterpret_cast<const 
ParsedValueDesc*>(values_buffer->data());
+      const auto max_pos =
+          static_cast<int32_t>(values_buffer->size() / 
sizeof(ParsedValueDesc)) - 1;
+      for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) {
+        auto start = values[pos].offset;
+        auto stop = values[pos + 1].offset;
+        auto quoted = values[pos + 1].quoted;
+        ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename Visitor>
+  Status VisitLastRow(Visitor&& visit) const {
+    using detail::ParsedValueDesc;
+
+    const auto& values_buffer = values_buffers_.back();
+    const auto values = reinterpret_cast<const 
ParsedValueDesc*>(values_buffer->data());
+    const auto start_pos =
+        static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) -
+        num_cols_ - 1;
+    for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+      auto start = values[start_pos + col_index].offset;
+      auto stop = values[start_pos + col_index + 1].offset;
+      auto quoted = values[start_pos + col_index + 1].quoted;
+      ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+    }
+    return Status::OK();
+  }
+
+ protected:
+  // The number of rows in this batch
+  int32_t num_rows_ = 0;
+  // The number of columns
+  int32_t num_cols_ = 0;
+
+  // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
+  // It may help with null parsing...
+  std::vector<std::shared_ptr<Buffer>> values_buffers_;
+  std::shared_ptr<Buffer> parsed_buffer_;
+  const uint8_t* parsed_ = NULLPTR;
+  int32_t parsed_size_ = 0;
+
+  friend class ::arrow::csv::BlockParserImpl;
+};
+
+}  // namespace detail
+
+constexpr int32_t kMaxParserNumRows = 100000;
+
 /// \class BlockParser
 /// \brief A reusable block-based parser for CSV data
 ///
@@ -62,6 +137,7 @@ class ARROW_EXPORT BlockParser {
                        int32_t max_num_rows = kMaxParserNumRows);
   explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t 
num_cols = -1,
                        int32_t max_num_rows = kMaxParserNumRows);
+  ~BlockParser();
 
   /// \brief Parse a block of data
   ///
@@ -86,11 +162,11 @@ class ARROW_EXPORT BlockParser {
   Status ParseFinal(const std::vector<util::string_view>& data, uint32_t* 
out_size);
 
   /// \brief Return the number of parsed rows
-  int32_t num_rows() const { return num_rows_; }
+  int32_t num_rows() const { return parsed_batch().num_rows(); }
   /// \brief Return the number of parsed columns
-  int32_t num_cols() const { return num_cols_; }
+  int32_t num_cols() const { return parsed_batch().num_cols(); }
   /// \brief Return the total size in bytes of parsed data
-  uint32_t num_bytes() const { return parsed_size_; }
+  uint32_t num_bytes() const { return parsed_batch().num_bytes(); }
 
   /// \brief Visit parsed values in a column
   ///
@@ -98,82 +174,18 @@ class ARROW_EXPORT BlockParser {
   /// Status(const uint8_t* data, uint32_t size, bool quoted)
   template <typename Visitor>
   Status VisitColumn(int32_t col_index, Visitor&& visit) const {
-    for (size_t buf_index = 0; buf_index < values_buffers_.size(); 
++buf_index) {
-      const auto& values_buffer = values_buffers_[buf_index];
-      const auto values = reinterpret_cast<const 
ValueDesc*>(values_buffer->data());
-      const auto max_pos =
-          static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) - 1;
-      for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) {
-        auto start = values[pos].offset;
-        auto stop = values[pos + 1].offset;
-        auto quoted = values[pos + 1].quoted;
-        ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
-      }
-    }
-    return Status::OK();
+    return parsed_batch().VisitColumn(col_index, std::forward<Visitor>(visit));
   }
 
   template <typename Visitor>
   Status VisitLastRow(Visitor&& visit) const {
-    const auto& values_buffer = values_buffers_.back();
-    const auto values = reinterpret_cast<const 
ValueDesc*>(values_buffer->data());
-    const auto start_pos =
-        static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) - 
num_cols_ - 1;
-    for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
-      auto start = values[start_pos + col_index].offset;
-      auto stop = values[start_pos + col_index + 1].offset;
-      auto quoted = values[start_pos + col_index + 1].quoted;
-      ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
-    }
-    return Status::OK();
+    return parsed_batch().VisitLastRow(std::forward<Visitor>(visit));
   }
 
  protected:
-  ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
-
-  Status DoParse(const std::vector<util::string_view>& data, bool is_final,
-                 uint32_t* out_size);
-  template <typename SpecializedOptions>
-  Status DoParseSpecialized(const std::vector<util::string_view>& data, bool 
is_final,
-                            uint32_t* out_size);
-
-  template <typename SpecializedOptions, typename ValuesWriter, typename 
ParsedWriter>
-  Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
-                    const char* data, const char* data_end, bool is_final,
-                    int32_t rows_in_chunk, const char** out_data, bool* 
finished_parsing);
-
-  // Parse a single line from the data pointer
-  template <typename SpecializedOptions, typename ValuesWriter, typename 
ParsedWriter>
-  Status ParseLine(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
-                   const char* data, const char* data_end, bool is_final,
-                   const char** out_data);
-
-  MemoryPool* pool_;
-  const ParseOptions options_;
-  // The number of rows parsed from the block
-  int32_t num_rows_;
-  // The number of columns (can be -1 at start)
-  int32_t num_cols_;
-  // The maximum number of rows to parse from this block
-  int32_t max_num_rows_;
-
-  // Linear scratchpad for parsed values
-  struct ValueDesc {
-    uint32_t offset : 31;
-    bool quoted : 1;
-  };
-
-  // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
-  // It may help with null parsing...
-  std::vector<std::shared_ptr<Buffer>> values_buffers_;
-  std::shared_ptr<Buffer> parsed_buffer_;
-  const uint8_t* parsed_;
-  int32_t values_size_;
-  int32_t parsed_size_;
+  std::unique_ptr<BlockParserImpl> impl_;
 
-  class ResizableValuesWriter;
-  class PresizedValuesWriter;
-  class PresizedParsedWriter;
+  const detail::DataBatch& parsed_batch() const;
 };
 
 }  // namespace csv
diff --git a/cpp/src/arrow/csv/parser_benchmark.cc 
b/cpp/src/arrow/csv/parser_benchmark.cc
index 3012754..b279a3c 100644
--- a/cpp/src/arrow/csv/parser_benchmark.cc
+++ b/cpp/src/arrow/csv/parser_benchmark.cc
@@ -30,17 +30,46 @@
 namespace arrow {
 namespace csv {
 
-// Linter stipulates:
-// >> For a static/global string constant, use a C style string instead
-const char* one_row = "abc,\"d,f\",12.34,\n";
-const char* one_row_escaped = "abc,d\\,f,12.34,\n";
-
-const auto num_rows = static_cast<int32_t>((1024 * 64) / strlen(one_row));
-
-static std::string BuildCSVData(const std::string& row, int32_t repeat) {
+struct Example {
+  int32_t num_rows;
+  const char* csv_rows;
+};
+
+const Example quoted_example{1, "abc,\"d,f\",12.34,\n"};
+const Example escaped_example{1, "abc,d\\,f,12.34,\n"};
+
+const Example flights_example{
+    8,
+    
R"(2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,0010,0002,-8,12,0014,280,279,263,2330,0737,4,0750,0741,-9,0,0,,,,,,
+2015,1,1,4,US,840,N171US,SFO,CLT,0020,0018,-2,16,0034,286,293,266,2296,0800,11,0806,0811,5,0,0,,,,,,
+2015,1,1,4,AA,258,N3HYAA,LAX,MIA,0020,0015,-5,15,0030,285,281,258,2342,0748,8,0805,0756,-9,0,0,,,,,,
+2015,1,1,4,AS,135,N527AS,SEA,ANC,0025,0024,-1,11,0035,235,215,199,1448,0254,5,0320,0259,-21,0,0,,,,,,
+2015,1,1,4,DL,806,N3730B,SFO,MSP,0025,0020,-5,18,0038,217,230,206,1589,0604,6,0602,0610,8,0,0,,,,,,
+2015,1,1,4,NK,612,N635NK,LAS,MSP,0025,0019,-6,11,0030,181,170,154,1299,0504,5,0526,0509,-17,0,0,,,,,,
+2015,1,1,4,US,2013,N584UW,LAX,CLT,0030,0044,14,13,0057,273,249,228,2125,0745,8,0803,0753,-10,0,0,,,,,,
+2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,0030,0019,-11,17,0036,195,193,173,1464,0529,3,0545,0532,-13,0,0,,,,,,
+)"};
+
+// NOTE: quoted
+const Example vehicles_example{
+    2,
+    
R"(7088743681,https://greensboro.craigslist.org/ctd/d/cary-2004-honda-element-lx-4dr-suv/7088743681.html,greensboro,https://greensboro.craigslist.org,3995,2004,honda,element,,,gas,212526,clean,automatic,5J6YH18314L006498,fwd,,SUV,orange,https://images.craigslist.org/00E0E_eAUnhFF86M4_600x450.jpg,"2004
 Honda Element LX 4dr SUV     Offered by: Best Import Auto Sales Inc — (919) 
800-0650 — $3,995     EXCELLENT SHAPE INSIDE AND OUT FULLY SERVICED AND READY 
TO GO ,RUNS AND DRIVES PERFECT  [...]
+  
7088744126,https://greensboro.craigslist.org/cto/d/greensboro-2011-jaguar-xf-premier/7088744126.html,greensboro,https://greensboro.craigslist.org,9500,2011,jaguar,xf,excellent,,gas,85000,clean,automatic,,,,,blue,https://images.craigslist.org/00505_f22HGItCRpc_600x450.jpg,"2011
 jaguar XF premium - estate sale. Retired lady executive. Like new, garaged and 
maintained. Very nice leather, heated seats, electric sunroof, metallic blue 
paint. 85K miles bumper-to-bumper warranty. Premium radi [...]
+)"};
+
+const Example stocks_example{
+    3,
+    R"(2,2010-01-27 
00:00:00,002204,华锐铸钢,536498.0,135378.0,2652784.2001924426,14160629.45,5.382023337513902,5.288274712474071,5.382023337513902,5.341540976701248,,5.338025403262254,1.01364599,0.21306505690870553
+3,2010-02-05 
00:00:00,600266,北京城建,1122615.0,1122615.0,8102476.086666377,57695471.0,7.236029036381633,7.025270909108382,7.170459841229955,7.095523618199466,,7.120720923193468,2.3025570905818964,0.4683513939405588
+4,2010-01-04 
00:00:00,600289,亿阳信通,602926.359,602926.359,16393247.138998777,167754890.0,10.381817699665978,9.960037526145015,10.092597009251604,10.321563389162982,,10.233170315655089,4.436963485334562,0.6025431050299465
+)"};
+
+static constexpr int32_t kNumRows = 10000;
+
+static std::string BuildCSVData(const Example& example) {
   std::stringstream ss;
-  for (int32_t i = 0; i < repeat; ++i) {
-    ss << row;
+  for (int32_t i = 0; i < kNumRows; i += example.num_rows) {
+    ss << example.csv_rows;
   }
   return ss.str();
 }
@@ -60,7 +89,7 @@ static void BenchmarkCSVChunking(benchmark::State& state,  // 
NOLINT non-const r
 }
 
 static void ChunkCSVQuotedBlock(benchmark::State& state) {  // NOLINT 
non-const reference
-  auto csv = BuildCSVData(one_row, num_rows);
+  auto csv = BuildCSVData(quoted_example);
   auto options = ParseOptions::Defaults();
   options.quoting = true;
   options.escaping = false;
@@ -70,7 +99,7 @@ static void ChunkCSVQuotedBlock(benchmark::State& state) {  
// NOLINT non-const
 }
 
 static void ChunkCSVEscapedBlock(benchmark::State& state) {  // NOLINT 
non-const reference
-  auto csv = BuildCSVData(one_row_escaped, num_rows);
+  auto csv = BuildCSVData(escaped_example);
   auto options = ParseOptions::Defaults();
   options.quoting = false;
   options.escaping = true;
@@ -81,7 +110,7 @@ static void ChunkCSVEscapedBlock(benchmark::State& state) {  
// NOLINT non-const
 
 static void ChunkCSVNoNewlinesBlock(
     benchmark::State& state) {  // NOLINT non-const reference
-  auto csv = BuildCSVData(one_row_escaped, num_rows);
+  auto csv = BuildCSVData(escaped_example);
   auto options = ParseOptions::Defaults();
   options.quoting = true;
   options.escaping = false;
@@ -94,9 +123,9 @@ static void ChunkCSVNoNewlinesBlock(
 }
 
 static void BenchmarkCSVParsing(benchmark::State& state,  // NOLINT non-const 
reference
-                                const std::string& csv, int32_t rows,
+                                const std::string& csv, int32_t num_rows,
                                 ParseOptions options) {
-  BlockParser parser(options, -1, rows + 1);
+  BlockParser parser(options, -1, num_rows + 1);
 
   while (state.KeepRunning()) {
     uint32_t parsed_size = 0;
@@ -121,29 +150,56 @@ static void BenchmarkCSVParsing(benchmark::State& state,  
// NOLINT non-const re
   state.SetBytesProcessed(state.iterations() * csv.size());
 }
 
+static void BenchmarkCSVParsing(benchmark::State& state,  // NOLINT non-const 
reference
+                                const Example& example, ParseOptions options) {
+  auto csv = BuildCSVData(example);
+  BenchmarkCSVParsing(state, csv, kNumRows, options);
+}
+
 static void ParseCSVQuotedBlock(benchmark::State& state) {  // NOLINT 
non-const reference
-  auto csv = BuildCSVData(one_row, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = true;
   options.escaping = false;
 
-  BenchmarkCSVParsing(state, csv, num_rows, options);
+  BenchmarkCSVParsing(state, quoted_example, options);
 }
 
 static void ParseCSVEscapedBlock(benchmark::State& state) {  // NOLINT 
non-const reference
-  auto csv = BuildCSVData(one_row_escaped, num_rows);
   auto options = ParseOptions::Defaults();
   options.quoting = false;
   options.escaping = true;
 
-  BenchmarkCSVParsing(state, csv, num_rows, options);
+  BenchmarkCSVParsing(state, escaped_example, options);
+}
+
+static void ParseCSVFlightsExample(
+    benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkCSVParsing(state, flights_example, ParseOptions::Defaults());
+}
+
+static void ParseCSVVehiclesExample(
+    benchmark::State& state) {  // NOLINT non-const reference
+  auto options = ParseOptions::Defaults();
+  options.quoting = true;
+  options.escaping = false;
+
+  BenchmarkCSVParsing(state, vehicles_example, options);
+}
+
+static void ParseCSVStocksExample(
+    benchmark::State& state) {  // NOLINT non-const reference
+  BenchmarkCSVParsing(state, stocks_example, ParseOptions::Defaults());
 }
 
 BENCHMARK(ChunkCSVQuotedBlock);
 BENCHMARK(ChunkCSVEscapedBlock);
 BENCHMARK(ChunkCSVNoNewlinesBlock);
+
 BENCHMARK(ParseCSVQuotedBlock);
 BENCHMARK(ParseCSVEscapedBlock);
+BENCHMARK(ParseCSVFlightsExample);
+BENCHMARK(ParseCSVVehiclesExample);
+BENCHMARK(ParseCSVStocksExample);
 
 }  // namespace csv
 }  // namespace arrow

[arrow] branch master updated: ARROW-10318: [C++] Use pimpl idiom in CSV parser

Reply via email to