This is an automated email from the ASF dual-hosted git repository.
bkietz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2321542 ARROW-10318: [C++] Use pimpl idiom in CSV parser
2321542 is described below
commit 23215422c717b6783f28fa402e08d63bb9f4afc6
Author: Antoine Pitrou <[email protected]>
AuthorDate: Tue Oct 20 12:18:23 2020 -0400
ARROW-10318: [C++] Use pimpl idiom in CSV parser
Hide more implementation details from `parser.h`.
Also add some more realistic parsing micro-benchmarks.
Interestingly, this increases performance on the CSV parser
micro-benchmarks (at least here with clang 10.0 on Ubuntu 20.04).
* before:
```
-------------------------------------------------------------------------------
Benchmark                       Time            CPU   Iterations UserCounters...
-------------------------------------------------------------------------------
ParseCSVQuotedBlock        264665 ns      264597 ns         7904 bytes_per_second=612.723M/s
ParseCSVEscapedBlock       258008 ns      257938 ns         7762 bytes_per_second=591.568M/s
ParseCSVFlightsExample    2241195 ns     2240631 ns          932 bytes_per_second=433.608M/s
ParseCSVVehiclesExample  19660852 ns    19656220 ns          110 bytes_per_second=583.425M/s
ParseCSVStocksExample     4323310 ns     4322451 ns          486 bytes_per_second=485.489M/s
```
* after:
```
ParseCSVQuotedBlock        196201 ns      196167 ns        10717 bytes_per_second=826.462M/s
ParseCSVEscapedBlock       182517 ns      182479 ns        11919 bytes_per_second=836.194M/s
ParseCSVFlightsExample    1548348 ns     1548079 ns         1366 bytes_per_second=627.588M/s
ParseCSVVehiclesExample  17251422 ns    17244772 ns          121 bytes_per_second=665.009M/s
ParseCSVStocksExample     2253296 ns     2252727 ns          943 bytes_per_second=931.539M/s
```
Closes #8493 from pitrou/ARROW-10318-csv-parser-pimpl
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
---
cpp/src/arrow/csv/parser.cc | 725 +++++++++++++++++-----------------
cpp/src/arrow/csv/parser.h | 158 ++++----
cpp/src/arrow/csv/parser_benchmark.cc | 94 ++++-
3 files changed, 520 insertions(+), 457 deletions(-)
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index 77ad402..07e5612 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -30,43 +30,22 @@
namespace arrow {
namespace csv {
-static Status ParseError(const char* message) {
+using detail::DataBatch;
+using detail::ParsedValueDesc;
+
+namespace {
+
+Status ParseError(const char* message) {
return Status::Invalid("CSV parse error: ", message);
}
-static Status MismatchingColumns(int32_t expected, int32_t actual) {
+Status MismatchingColumns(int32_t expected, int32_t actual) {
char s[50];
snprintf(s, sizeof(s), "Expected %d columns, got %d", expected, actual);
return ParseError(s);
}
-static inline bool IsControlChar(uint8_t c) { return c < ' '; }
-
-int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
- const uint8_t** out_data) {
- const auto end = data + size;
- int32_t skipped_rows = 0;
- *out_data = data;
-
- for (; skipped_rows < num_rows; ++skipped_rows) {
- uint8_t c;
- do {
- while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
- ++data;
- }
- if (ARROW_PREDICT_FALSE(data == end)) {
- return skipped_rows;
- }
- c = *data++;
- } while (c != '\r' && c != '\n');
- if (c == '\r' && data < end && *data == '\n') {
- ++data;
- }
- *out_data = data;
- }
-
- return skipped_rows;
-}
+inline bool IsControlChar(uint8_t c) { return c < ' '; }
template <bool Quoting, bool Escaping>
class SpecializedOptions {
@@ -77,9 +56,9 @@ class SpecializedOptions {
// A helper class allocating the buffer for parsed values and writing into it
// without any further resizes, except at the end.
-class BlockParser::PresizedParsedWriter {
+class PresizedDataWriter {
public:
- PresizedParsedWriter(MemoryPool* pool, uint32_t size)
+ PresizedDataWriter(MemoryPool* pool, uint32_t size)
: parsed_size_(0), parsed_capacity_(size) {
parsed_buffer_ = *AllocateResizableBuffer(parsed_capacity_, pool);
parsed_ = parsed_buffer_->mutable_data();
@@ -111,51 +90,44 @@ class BlockParser::PresizedParsedWriter {
int64_t saved_parsed_size_;
};
-// A helper class handling a growable buffer for values offsets. This class is
-// used when the number of columns is not yet known and we therefore cannot
-// efficiently presize the target area for a given number of rows.
-class BlockParser::ResizableValuesWriter {
+template <typename Derived>
+class ValueDescWriter {
public:
- explicit ResizableValuesWriter(MemoryPool* pool)
- : values_size_(0), values_capacity_(256) {
- values_buffer_ = *AllocateResizableBuffer(values_capacity_ *
sizeof(*values_), pool);
- values_ = reinterpret_cast<ValueDesc*>(values_buffer_->mutable_data());
- }
-
- template <typename ParsedWriter>
- void Start(ParsedWriter& parsed_writer) {
- PushValue({static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU,
false});
- }
+ Derived* derived() { return static_cast<Derived*>(this); }
- void Finish(std::shared_ptr<Buffer>* out_values) {
- ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
- *out_values = values_buffer_;
+ template <typename DataWriter>
+ void Start(DataWriter& parsed_writer) {
+ derived()->PushValue(
+ {static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU, false});
}
void BeginLine() { saved_values_size_ = values_size_; }
+ // Rollback the state that was saved in BeginLine()
+ void RollbackLine() { values_size_ = saved_values_size_; }
+
void StartField(bool quoted) { quoted_ = quoted; }
- template <typename ParsedWriter>
- void FinishField(ParsedWriter* parsed_writer) {
- PushValue({static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU,
quoted_});
+ template <typename DataWriter>
+ void FinishField(DataWriter* parsed_writer) {
+ derived()->PushValue(
+ {static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU, quoted_});
}
- // Rollback the state that was saved in BeginLine()
- void RollbackLine() { values_size_ = saved_values_size_; }
+ void Finish(std::shared_ptr<Buffer>* out_values) {
+ ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
+ *out_values = values_buffer_;
+ }
protected:
- void PushValue(ValueDesc v) {
- if (ARROW_PREDICT_FALSE(values_size_ == values_capacity_)) {
- values_capacity_ = values_capacity_ * 2;
- ARROW_CHECK_OK(values_buffer_->Resize(values_capacity_ *
sizeof(*values_)));
- values_ = reinterpret_cast<ValueDesc*>(values_buffer_->mutable_data());
- }
- values_[values_size_++] = v;
+ ValueDescWriter(MemoryPool* pool, int64_t values_capacity)
+ : values_size_(0), values_capacity_(values_capacity) {
+ values_buffer_ = *AllocateResizableBuffer(values_capacity_ *
sizeof(*values_), pool);
+ values_ =
reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
}
std::shared_ptr<ResizableBuffer> values_buffer_;
- ValueDesc* values_;
+ ParsedValueDesc* values_;
int64_t values_size_;
int64_t values_capacity_;
bool quoted_;
@@ -163,392 +135,415 @@ class BlockParser::ResizableValuesWriter {
int64_t saved_values_size_;
};
+// A helper class handling a growable buffer for values offsets. This class is
+// used when the number of columns is not yet known and we therefore cannot
+// efficiently presize the target area for a given number of rows.
+class ResizableValueDescWriter : public
ValueDescWriter<ResizableValueDescWriter> {
+ public:
+ explicit ResizableValueDescWriter(MemoryPool* pool)
+ : ValueDescWriter(pool, /*values_capacity=*/256) {}
+
+ void PushValue(ParsedValueDesc v) {
+ if (ARROW_PREDICT_FALSE(values_size_ == values_capacity_)) {
+ values_capacity_ = values_capacity_ * 2;
+ ARROW_CHECK_OK(values_buffer_->Resize(values_capacity_ *
sizeof(*values_)));
+ values_ =
reinterpret_cast<ParsedValueDesc*>(values_buffer_->mutable_data());
+ }
+ values_[values_size_++] = v;
+ }
+};
+
// A helper class allocating the buffer for values offsets and writing into it
// without any further resizes, except at the end. This class is used once the
// number of columns is known, as it eliminates resizes and generates simpler,
// faster CSV parsing code.
-class BlockParser::PresizedValuesWriter {
+class PresizedValueDescWriter : public
ValueDescWriter<PresizedValueDescWriter> {
public:
- PresizedValuesWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
- : values_size_(0), values_capacity_(1 + num_rows * num_cols) {
- values_buffer_ = *AllocateResizableBuffer(values_capacity_ *
sizeof(*values_), pool);
- values_ = reinterpret_cast<ValueDesc*>(values_buffer_->mutable_data());
- }
-
- template <typename ParsedWriter>
- void Start(ParsedWriter& parsed_writer) {
- PushValue({static_cast<uint32_t>(parsed_writer.size()) & 0x7fffffffU,
false});
- }
-
- void Finish(std::shared_ptr<Buffer>* out_values) {
- ARROW_CHECK_OK(values_buffer_->Resize(values_size_ * sizeof(*values_)));
- *out_values = values_buffer_;
- }
+ PresizedValueDescWriter(MemoryPool* pool, int32_t num_rows, int32_t num_cols)
+ : ValueDescWriter(pool, /*values_capacity=*/1 + num_rows * num_cols) {}
- void BeginLine() { saved_values_size_ = values_size_; }
-
- void StartField(bool quoted) { quoted_ = quoted; }
-
- template <typename ParsedWriter>
- void FinishField(ParsedWriter* parsed_writer) {
- PushValue({static_cast<uint32_t>(parsed_writer->size()) & 0x7fffffffU,
quoted_});
- }
-
- // Rollback the state that was saved in BeginLine()
- void RollbackLine() { values_size_ = saved_values_size_; }
-
- protected:
- void PushValue(ValueDesc v) {
+ void PushValue(ParsedValueDesc v) {
DCHECK_LT(values_size_, values_capacity_);
values_[values_size_++] = v;
}
-
- std::shared_ptr<ResizableBuffer> values_buffer_;
- ValueDesc* values_;
- int64_t values_size_;
- const int64_t values_capacity_;
- bool quoted_;
- // Checkpointing, for when an incomplete line is encountered at end of block
- int64_t saved_values_size_;
};
-template <typename SpecializedOptions, typename ValuesWriter, typename
ParsedWriter>
-Status BlockParser::ParseLine(ValuesWriter* values_writer, ParsedWriter*
parsed_writer,
- const char* data, const char* data_end, bool
is_final,
- const char** out_data) {
- int32_t num_cols = 0;
- char c;
+} // namespace
+
+class BlockParserImpl {
+ public:
+ BlockParserImpl(MemoryPool* pool, ParseOptions options, int32_t num_cols,
+ int32_t max_num_rows)
+ : pool_(pool), options_(options), max_num_rows_(max_num_rows),
batch_(num_cols) {}
+
+ const DataBatch& parsed_batch() const { return batch_; }
- DCHECK_GT(data_end, data);
+ template <typename SpecializedOptions, typename ValueDescWriter, typename
DataWriter>
+ Status ParseLine(ValueDescWriter* values_writer, DataWriter* parsed_writer,
+ const char* data, const char* data_end, bool is_final,
+ const char** out_data) {
+ int32_t num_cols = 0;
+ char c;
- auto FinishField = [&]() { values_writer->FinishField(parsed_writer); };
+ DCHECK_GT(data_end, data);
- values_writer->BeginLine();
- parsed_writer->BeginLine();
+ auto FinishField = [&]() { values_writer->FinishField(parsed_writer); };
- // The parsing state machine
+ values_writer->BeginLine();
+ parsed_writer->BeginLine();
- // Special case empty lines: do we start with a newline separator?
- c = *data;
- if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
- if (c == '\r') {
- data++;
- if (data < data_end && *data == '\n') {
+ // The parsing state machine
+
+ // Special case empty lines: do we start with a newline separator?
+ c = *data;
+ if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
+ if (c == '\r') {
data++;
+ if (data < data_end && *data == '\n') {
+ data++;
+ }
+ goto EmptyLine;
+ }
+ if (c == '\n') {
+ data++;
+ goto EmptyLine;
}
- goto EmptyLine;
- }
- if (c == '\n') {
- data++;
- goto EmptyLine;
}
- }
-FieldStart:
- // At the start of a field
- // Quoting is only recognized at start of field
- if (SpecializedOptions::quoting && ARROW_PREDICT_FALSE(*data ==
options_.quote_char)) {
- ++data;
- values_writer->StartField(true /* quoted */);
- goto InQuotedField;
- } else {
- values_writer->StartField(false /* quoted */);
- goto InField;
- }
+ FieldStart:
+ // At the start of a field
+ // Quoting is only recognized at start of field
+ if (SpecializedOptions::quoting &&
+ ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
+ ++data;
+ values_writer->StartField(true /* quoted */);
+ goto InQuotedField;
+ } else {
+ values_writer->StartField(false /* quoted */);
+ goto InField;
+ }
-InField:
- // Inside a non-quoted part of a field
- if (ARROW_PREDICT_FALSE(data == data_end)) {
- goto AbortLine;
- }
- c = *data++;
- if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c ==
options_.escape_char)) {
+ InField:
+ // Inside a non-quoted part of a field
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
c = *data++;
- parsed_writer->PushFieldChar(c);
- goto InField;
- }
- if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
- goto FieldEnd;
- }
- if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
- if (c == '\r') {
- // In the middle of a newline separator?
- if (ARROW_PREDICT_TRUE(data < data_end) && *data == '\n') {
- data++;
+ if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c ==
options_.escape_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
}
- goto LineEnd;
+ c = *data++;
+ parsed_writer->PushFieldChar(c);
+ goto InField;
}
- if (c == '\n') {
- goto LineEnd;
+ if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
+ goto FieldEnd;
}
- }
- parsed_writer->PushFieldChar(c);
- goto InField;
+ if (ARROW_PREDICT_FALSE(IsControlChar(c))) {
+ if (c == '\r') {
+ // In the middle of a newline separator?
+ if (ARROW_PREDICT_TRUE(data < data_end) && *data == '\n') {
+ data++;
+ }
+ goto LineEnd;
+ }
+ if (c == '\n') {
+ goto LineEnd;
+ }
+ }
+ parsed_writer->PushFieldChar(c);
+ goto InField;
-InQuotedField:
- // Inside a quoted part of a field
- if (ARROW_PREDICT_FALSE(data == data_end)) {
- goto AbortLine;
- }
- c = *data++;
- if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c ==
options_.escape_char)) {
+ InQuotedField:
+ // Inside a quoted part of a field
if (ARROW_PREDICT_FALSE(data == data_end)) {
goto AbortLine;
}
c = *data++;
+ if (SpecializedOptions::escaping && ARROW_PREDICT_FALSE(c ==
options_.escape_char)) {
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
+ }
+ c = *data++;
+ parsed_writer->PushFieldChar(c);
+ goto InQuotedField;
+ }
+ if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
+ if (options_.double_quote && ARROW_PREDICT_TRUE(data < data_end) &&
+ ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
+ // Double-quoting
+ ++data;
+ } else {
+ // End of single-quoting
+ goto InField;
+ }
+ }
parsed_writer->PushFieldChar(c);
goto InQuotedField;
- }
- if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
- if (options_.double_quote && ARROW_PREDICT_TRUE(data < data_end) &&
- ARROW_PREDICT_FALSE(*data == options_.quote_char)) {
- // Double-quoting
- ++data;
- } else {
- // End of single-quoting
- goto InField;
- }
- }
- parsed_writer->PushFieldChar(c);
- goto InQuotedField;
-
-FieldEnd:
- // At the end of a field
- FinishField();
- ++num_cols;
- if (ARROW_PREDICT_FALSE(data == data_end)) {
- goto AbortLine;
- }
- goto FieldStart;
-
-LineEnd:
- // At the end of line
- FinishField();
- ++num_cols;
- if (ARROW_PREDICT_FALSE(num_cols != num_cols_)) {
- if (num_cols_ == -1) {
- num_cols_ = num_cols;
- } else {
- return MismatchingColumns(num_cols_, num_cols);
+
+ FieldEnd:
+ // At the end of a field
+ FinishField();
+ ++num_cols;
+ if (ARROW_PREDICT_FALSE(data == data_end)) {
+ goto AbortLine;
}
- }
- ++num_rows_;
- *out_data = data;
- return Status::OK();
+ goto FieldStart;
-AbortLine:
- // Not a full line except perhaps if in final block
- if (is_final) {
+ LineEnd:
+ // At the end of line
FinishField();
++num_cols;
- if (num_cols_ == -1) {
- num_cols_ = num_cols;
- } else if (num_cols != num_cols_) {
- return MismatchingColumns(num_cols_, num_cols);
+ if (ARROW_PREDICT_FALSE(num_cols != batch_.num_cols_)) {
+ if (batch_.num_cols_ == -1) {
+ batch_.num_cols_ = num_cols;
+ } else {
+ return MismatchingColumns(batch_.num_cols_, num_cols);
+ }
}
- ++num_rows_;
+ ++batch_.num_rows_;
*out_data = data;
return Status::OK();
- }
- // Truncated line at end of block, rewind parsed state
- values_writer->RollbackLine();
- parsed_writer->RollbackLine();
- return Status::OK();
-
-EmptyLine:
- if (!options_.ignore_empty_lines) {
- if (num_cols_ == -1) {
- // Consider as single value
- num_cols_ = 1;
+
+ AbortLine:
+ // Not a full line except perhaps if in final block
+ if (is_final) {
+ goto LineEnd;
}
- // Record as row of empty (null?) values
- while (num_cols++ < num_cols_) {
- values_writer->StartField(false /* quoted */);
- FinishField();
+ // Truncated line at end of block, rewind parsed state
+ values_writer->RollbackLine();
+ parsed_writer->RollbackLine();
+ return Status::OK();
+
+ EmptyLine:
+ if (!options_.ignore_empty_lines) {
+ if (batch_.num_cols_ == -1) {
+ // Consider as single value
+ batch_.num_cols_ = 1;
+ }
+ // Record as row of empty (null?) values
+ while (num_cols++ < batch_.num_cols_) {
+ values_writer->StartField(false /* quoted */);
+ FinishField();
+ }
+ ++batch_.num_rows_;
}
- ++num_rows_;
+ *out_data = data;
+ return Status::OK();
}
- *out_data = data;
- return Status::OK();
-}
-template <typename SpecializedOptions, typename ValuesWriter, typename
ParsedWriter>
-Status BlockParser::ParseChunk(ValuesWriter* values_writer, ParsedWriter*
parsed_writer,
- const char* data, const char* data_end, bool
is_final,
- int32_t rows_in_chunk, const char** out_data,
- bool* finished_parsing) {
- int32_t num_rows_deadline = num_rows_ + rows_in_chunk;
-
- while (data < data_end && num_rows_ < num_rows_deadline) {
- const char* line_end = data;
- RETURN_NOT_OK(ParseLine<SpecializedOptions>(values_writer, parsed_writer,
data,
- data_end, is_final,
&line_end));
- if (line_end == data) {
- // Cannot parse any further
- *finished_parsing = true;
- break;
+ template <typename SpecializedOptions, typename ValueDescWriter, typename
DataWriter>
+ Status ParseChunk(ValueDescWriter* values_writer, DataWriter* parsed_writer,
+ const char* data, const char* data_end, bool is_final,
+ int32_t rows_in_chunk, const char** out_data,
+ bool* finished_parsing) {
+ int32_t num_rows_deadline = batch_.num_rows_ + rows_in_chunk;
+
+ while (data < data_end && batch_.num_rows_ < num_rows_deadline) {
+ const char* line_end = data;
+ RETURN_NOT_OK(ParseLine<SpecializedOptions>(values_writer,
parsed_writer, data,
+ data_end, is_final,
&line_end));
+ if (line_end == data) {
+ // Cannot parse any further
+ *finished_parsing = true;
+ break;
+ }
+ data = line_end;
}
- data = line_end;
- }
- // Append new buffers and update size
- std::shared_ptr<Buffer> values_buffer;
- values_writer->Finish(&values_buffer);
- if (values_buffer->size() > 0) {
- values_size_ += static_cast<int32_t>(values_buffer->size() /
sizeof(ValueDesc) - 1);
- values_buffers_.push_back(std::move(values_buffer));
+ // Append new buffers and update size
+ std::shared_ptr<Buffer> values_buffer;
+ values_writer->Finish(&values_buffer);
+ if (values_buffer->size() > 0) {
+ values_size_ +=
+ static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)
- 1);
+ batch_.values_buffers_.push_back(std::move(values_buffer));
+ }
+ *out_data = data;
+ return Status::OK();
}
- *out_data = data;
- return Status::OK();
-}
-template <typename SpecializedOptions>
-Status BlockParser::DoParseSpecialized(const std::vector<util::string_view>&
views,
- bool is_final, uint32_t* out_size) {
- num_rows_ = 0;
- values_size_ = 0;
- parsed_size_ = 0;
- values_buffers_.clear();
- parsed_buffer_.reset();
- parsed_ = nullptr;
-
- size_t total_view_length = 0;
- for (const auto& view : views) {
- total_view_length += view.length();
- }
- if (total_view_length > std::numeric_limits<uint32_t>::max()) {
- return Status::Invalid("CSV block too large");
- }
+ template <typename SpecializedOptions>
+ Status ParseSpecialized(const std::vector<util::string_view>& views, bool
is_final,
+ uint32_t* out_size) {
+ batch_ = DataBatch{batch_.num_cols_};
+ values_size_ = 0;
- PresizedParsedWriter parsed_writer(pool_,
static_cast<uint32_t>(total_view_length));
- uint32_t total_parsed_length = 0;
-
- for (const auto& view : views) {
- const char* data = view.data();
- const char* data_end = view.data() + view.length();
- bool finished_parsing = false;
-
- if (num_cols_ == -1) {
- // Can't presize values when the number of columns is not known, first
parse
- // a single line
- const int32_t rows_in_chunk = 1;
- ResizableValuesWriter values_writer(pool_);
- values_writer.Start(parsed_writer);
-
- RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer,
&parsed_writer, data,
- data_end, is_final,
rows_in_chunk,
- &data, &finished_parsing));
- if (num_cols_ == -1) {
- return ParseError("Empty CSV file or block: cannot infer number of
columns");
- }
+ size_t total_view_length = 0;
+ for (const auto& view : views) {
+ total_view_length += view.length();
+ }
+ if (total_view_length > std::numeric_limits<uint32_t>::max()) {
+ return Status::Invalid("CSV block too large");
}
- while (!finished_parsing && data < data_end && num_rows_ < max_num_rows_) {
- // We know the number of columns, so can presize a values array for
- // a given number of rows
- DCHECK_GE(num_cols_, 0);
-
- int32_t rows_in_chunk;
- constexpr int32_t kTargetChunkSize = 32768;
- if (num_cols_ > 0) {
- rows_in_chunk = std::min(std::max(kTargetChunkSize / num_cols_, 512),
- max_num_rows_ - num_rows_);
- } else {
- rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ - num_rows_);
+ PresizedDataWriter parsed_writer(pool_,
static_cast<uint32_t>(total_view_length));
+ uint32_t total_parsed_length = 0;
+
+ for (const auto& view : views) {
+ const char* data = view.data();
+ const char* data_end = view.data() + view.length();
+ bool finished_parsing = false;
+
+ if (batch_.num_cols_ == -1) {
+ // Can't presize values when the number of columns is not known, first
parse
+ // a single line
+ const int32_t rows_in_chunk = 1;
+ ResizableValueDescWriter values_writer(pool_);
+ values_writer.Start(parsed_writer);
+
+ RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer,
&parsed_writer, data,
+ data_end, is_final,
rows_in_chunk,
+ &data,
&finished_parsing));
+ if (batch_.num_cols_ == -1) {
+ return ParseError("Empty CSV file or block: cannot infer number of
columns");
+ }
}
- PresizedValuesWriter values_writer(pool_, rows_in_chunk, num_cols_);
- values_writer.Start(parsed_writer);
-
- RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer,
&parsed_writer, data,
- data_end, is_final,
rows_in_chunk,
- &data, &finished_parsing));
- }
- DCHECK_GE(data, view.data());
- DCHECK_LE(data, data_end);
- total_parsed_length += static_cast<uint32_t>(data - view.data());
+ while (!finished_parsing && data < data_end && batch_.num_rows_ <
max_num_rows_) {
+ // We know the number of columns, so can presize a values array for
+ // a given number of rows
+ DCHECK_GE(batch_.num_cols_, 0);
+
+ int32_t rows_in_chunk;
+ constexpr int32_t kTargetChunkSize = 32768; // in number of values
+ if (batch_.num_cols_ > 0) {
+ rows_in_chunk = std::min(std::max(kTargetChunkSize /
batch_.num_cols_, 512),
+ max_num_rows_ - batch_.num_rows_);
+ } else {
+ rows_in_chunk = std::min(kTargetChunkSize, max_num_rows_ -
batch_.num_rows_);
+ }
+
+ PresizedValueDescWriter values_writer(pool_, rows_in_chunk,
batch_.num_cols_);
+ values_writer.Start(parsed_writer);
+
+ RETURN_NOT_OK(ParseChunk<SpecializedOptions>(&values_writer,
&parsed_writer, data,
+ data_end, is_final,
rows_in_chunk,
+ &data,
&finished_parsing));
+ }
+ DCHECK_GE(data, view.data());
+ DCHECK_LE(data, data_end);
+ total_parsed_length += static_cast<uint32_t>(data - view.data());
- if (data < data_end) {
- // Stopped early, for some reason
- break;
+ if (data < data_end) {
+ // Stopped early, for some reason
+ break;
+ }
}
- }
- parsed_writer.Finish(&parsed_buffer_);
- parsed_size_ = static_cast<int32_t>(parsed_buffer_->size());
- parsed_ = parsed_buffer_->data();
+ parsed_writer.Finish(&batch_.parsed_buffer_);
+ batch_.parsed_size_ = static_cast<int32_t>(batch_.parsed_buffer_->size());
+ batch_.parsed_ = batch_.parsed_buffer_->data();
- DCHECK_EQ(values_size_, num_rows_ * num_cols_);
- if (num_cols_ == -1) {
- DCHECK_EQ(num_rows_, 0);
- }
+ if (batch_.num_cols_ == -1) {
+ DCHECK_EQ(batch_.num_rows_, 0);
+ }
+ DCHECK_EQ(values_size_, batch_.num_rows_ * batch_.num_cols_);
#ifndef NDEBUG
- if (num_rows_ > 0) {
- DCHECK_GT(values_buffers_.size(), 0);
- auto& last_values_buffer = values_buffers_.back();
- auto last_values = reinterpret_cast<const
ValueDesc*>(last_values_buffer->data());
- auto last_values_size = last_values_buffer->size() / sizeof(ValueDesc);
- auto check_parsed_size =
- static_cast<int32_t>(last_values[last_values_size - 1].offset);
- DCHECK_EQ(parsed_size_, check_parsed_size);
- } else {
- DCHECK_EQ(parsed_size_, 0);
- }
-#endif
- *out_size = static_cast<uint32_t>(total_parsed_length);
- return Status::OK();
-}
-
-Status BlockParser::DoParse(const std::vector<util::string_view>& data, bool
is_final,
- uint32_t* out_size) {
- if (options_.quoting) {
- if (options_.escaping) {
- return DoParseSpecialized<SpecializedOptions<true, true>>(data,
is_final, out_size);
+ if (batch_.num_rows_ > 0) {
+ // Ending parsed offset should be equal to number of parsed bytes
+ DCHECK_GT(batch_.values_buffers_.size(), 0);
+ const auto& last_values_buffer = batch_.values_buffers_.back();
+ const auto last_values =
+ reinterpret_cast<const ParsedValueDesc*>(last_values_buffer->data());
+ const auto last_values_size = last_values_buffer->size() /
sizeof(ParsedValueDesc);
+ const auto check_parsed_size =
+ static_cast<int32_t>(last_values[last_values_size - 1].offset);
+ DCHECK_EQ(batch_.parsed_size_, check_parsed_size);
} else {
- return DoParseSpecialized<SpecializedOptions<true, false>>(data,
is_final,
- out_size);
+ DCHECK_EQ(batch_.parsed_size_, 0);
}
- } else {
- if (options_.escaping) {
- return DoParseSpecialized<SpecializedOptions<false, true>>(data,
is_final,
+#endif
+ *out_size = static_cast<uint32_t>(total_parsed_length);
+ return Status::OK();
+ }
+
+ Status Parse(const std::vector<util::string_view>& data, bool is_final,
+ uint32_t* out_size) {
+ if (options_.quoting) {
+ if (options_.escaping) {
+ return ParseSpecialized<SpecializedOptions<true, true>>(data,
is_final, out_size);
+ } else {
+ return ParseSpecialized<SpecializedOptions<true, false>>(data,
is_final,
out_size);
+ }
} else {
- return DoParseSpecialized<SpecializedOptions<false, false>>(data,
is_final,
+ if (options_.escaping) {
+ return ParseSpecialized<SpecializedOptions<false, true>>(data,
is_final,
+ out_size);
+ } else {
+ return ParseSpecialized<SpecializedOptions<false, false>>(data,
is_final,
out_size);
+ }
}
}
-}
+
+ protected:
+ MemoryPool* pool_;
+ const ParseOptions options_;
+ // The maximum number of rows to parse from a block
+ int32_t max_num_rows_;
+
+ // Unparsed data size
+ int32_t values_size_;
+ // Parsed data batch
+ DataBatch batch_;
+};
+
+BlockParser::BlockParser(ParseOptions options, int32_t num_cols, int32_t
max_num_rows)
+ : BlockParser(default_memory_pool(), options, num_cols, max_num_rows) {}
+
+BlockParser::BlockParser(MemoryPool* pool, ParseOptions options, int32_t
num_cols,
+ int32_t max_num_rows)
+ : impl_(new BlockParserImpl(pool, std::move(options), num_cols,
max_num_rows)) {}
+
+BlockParser::~BlockParser() {}
Status BlockParser::Parse(const std::vector<util::string_view>& data,
uint32_t* out_size) {
- return DoParse(data, false /* is_final */, out_size);
+ return impl_->Parse(data, false /* is_final */, out_size);
}
Status BlockParser::ParseFinal(const std::vector<util::string_view>& data,
uint32_t* out_size) {
- return DoParse(data, true /* is_final */, out_size);
+ return impl_->Parse(data, true /* is_final */, out_size);
}
Status BlockParser::Parse(util::string_view data, uint32_t* out_size) {
- return DoParse({data}, false /* is_final */, out_size);
+ return impl_->Parse({data}, false /* is_final */, out_size);
}
Status BlockParser::ParseFinal(util::string_view data, uint32_t* out_size) {
- return DoParse({data}, true /* is_final */, out_size);
+ return impl_->Parse({data}, true /* is_final */, out_size);
}
-BlockParser::BlockParser(MemoryPool* pool, ParseOptions options, int32_t
num_cols,
- int32_t max_num_rows)
- : pool_(pool),
- options_(options),
- num_rows_(-1),
- num_cols_(num_cols),
- max_num_rows_(max_num_rows) {}
+const DataBatch& BlockParser::parsed_batch() const { return
impl_->parsed_batch(); }
-BlockParser::BlockParser(ParseOptions options, int32_t num_cols, int32_t
max_num_rows)
- : BlockParser(default_memory_pool(), options, num_cols, max_num_rows) {}
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+ const uint8_t** out_data) {
+ const auto end = data + size;
+ int32_t skipped_rows = 0;
+ *out_data = data;
+
+ for (; skipped_rows < num_rows; ++skipped_rows) {
+ uint8_t c;
+ do {
+ while (ARROW_PREDICT_FALSE(data < end && !IsControlChar(*data))) {
+ ++data;
+ }
+ if (ARROW_PREDICT_FALSE(data == end)) {
+ return skipped_rows;
+ }
+ c = *data++;
+ } while (c != '\r' && c != '\n');
+ if (c == '\r' && data < end && *data == '\n') {
+ ++data;
+ }
+ *out_data = data;
+ }
+
+ return skipped_rows;
+}
} // namespace csv
} // namespace arrow
diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h
index ad44ef2..4fcc52f 100644
--- a/cpp/src/arrow/csv/parser.h
+++ b/cpp/src/arrow/csv/parser.h
@@ -35,8 +35,6 @@ class MemoryPool;
namespace csv {
-constexpr int32_t kMaxParserNumRows = 100000;
-
/// Skip at most num_rows from the given input. The input pointer is updated
/// and the number of actually skipped rows is returns (may be less than
/// requested if the input is too short).
@@ -44,6 +42,83 @@ ARROW_EXPORT
int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
const uint8_t** out_data);
+class BlockParserImpl;
+
+namespace detail {
+
+struct ParsedValueDesc {
+ uint32_t offset : 31;
+ bool quoted : 1;
+};
+
+class ARROW_EXPORT DataBatch {
+ public:
+ explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {}
+
+ /// \brief Return the number of parsed rows
+ int32_t num_rows() const { return num_rows_; }
+ /// \brief Return the number of parsed columns
+ int32_t num_cols() const { return num_cols_; }
+ /// \brief Return the total size in bytes of parsed data
+ uint32_t num_bytes() const { return parsed_size_; }
+
+ template <typename Visitor>
+ Status VisitColumn(int32_t col_index, Visitor&& visit) const {
+ using detail::ParsedValueDesc;
+
+ for (size_t buf_index = 0; buf_index < values_buffers_.size();
++buf_index) {
+ const auto& values_buffer = values_buffers_[buf_index];
+ const auto values = reinterpret_cast<const
ParsedValueDesc*>(values_buffer->data());
+ const auto max_pos =
+ static_cast<int32_t>(values_buffer->size() /
sizeof(ParsedValueDesc)) - 1;
+ for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) {
+ auto start = values[pos].offset;
+ auto stop = values[pos + 1].offset;
+ auto quoted = values[pos + 1].quoted;
+ ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+ }
+ }
+ return Status::OK();
+ }
+
+ template <typename Visitor>
+ Status VisitLastRow(Visitor&& visit) const {
+ using detail::ParsedValueDesc;
+
+ const auto& values_buffer = values_buffers_.back();
+ const auto values = reinterpret_cast<const
ParsedValueDesc*>(values_buffer->data());
+ const auto start_pos =
+ static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) -
+ num_cols_ - 1;
+ for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+ auto start = values[start_pos + col_index].offset;
+ auto stop = values[start_pos + col_index + 1].offset;
+ auto quoted = values[start_pos + col_index + 1].quoted;
+ ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+ }
+ return Status::OK();
+ }
+
+ protected:
+ // The number of rows in this batch
+ int32_t num_rows_ = 0;
+ // The number of columns
+ int32_t num_cols_ = 0;
+
+ // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero
bytes?
+ // It may help with null parsing...
+ std::vector<std::shared_ptr<Buffer>> values_buffers_;
+ std::shared_ptr<Buffer> parsed_buffer_;
+ const uint8_t* parsed_ = NULLPTR;
+ int32_t parsed_size_ = 0;
+
+ friend class ::arrow::csv::BlockParserImpl;
+};
+
+} // namespace detail
+
+constexpr int32_t kMaxParserNumRows = 100000;
+
/// \class BlockParser
/// \brief A reusable block-based parser for CSV data
///
@@ -62,6 +137,7 @@ class ARROW_EXPORT BlockParser {
int32_t max_num_rows = kMaxParserNumRows);
explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1,
int32_t max_num_rows = kMaxParserNumRows);
+ ~BlockParser();
/// \brief Parse a block of data
///
@@ -86,11 +162,11 @@ class ARROW_EXPORT BlockParser {
Status ParseFinal(const std::vector<util::string_view>& data, uint32_t* out_size);
/// \brief Return the number of parsed rows
- int32_t num_rows() const { return num_rows_; }
+ int32_t num_rows() const { return parsed_batch().num_rows(); }
/// \brief Return the number of parsed columns
- int32_t num_cols() const { return num_cols_; }
+ int32_t num_cols() const { return parsed_batch().num_cols(); }
/// \brief Return the total size in bytes of parsed data
- uint32_t num_bytes() const { return parsed_size_; }
+ uint32_t num_bytes() const { return parsed_batch().num_bytes(); }
/// \brief Visit parsed values in a column
///
@@ -98,82 +174,18 @@ class ARROW_EXPORT BlockParser {
/// Status(const uint8_t* data, uint32_t size, bool quoted)
template <typename Visitor>
Status VisitColumn(int32_t col_index, Visitor&& visit) const {
- for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) {
- const auto& values_buffer = values_buffers_[buf_index];
- const auto values = reinterpret_cast<const ValueDesc*>(values_buffer->data());
- const auto max_pos =
- static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) - 1;
- for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) {
- auto start = values[pos].offset;
- auto stop = values[pos + 1].offset;
- auto quoted = values[pos + 1].quoted;
- ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
- }
- }
- return Status::OK();
+ return parsed_batch().VisitColumn(col_index, std::forward<Visitor>(visit));
}
template <typename Visitor>
Status VisitLastRow(Visitor&& visit) const {
- const auto& values_buffer = values_buffers_.back();
- const auto values = reinterpret_cast<const ValueDesc*>(values_buffer->data());
- const auto start_pos =
- static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) - num_cols_ - 1;
- for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
- auto start = values[start_pos + col_index].offset;
- auto stop = values[start_pos + col_index + 1].offset;
- auto quoted = values[start_pos + col_index + 1].quoted;
- ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
- }
- return Status::OK();
+ return parsed_batch().VisitLastRow(std::forward<Visitor>(visit));
}
protected:
- ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
-
- Status DoParse(const std::vector<util::string_view>& data, bool is_final,
- uint32_t* out_size);
- template <typename SpecializedOptions>
- Status DoParseSpecialized(const std::vector<util::string_view>& data, bool is_final,
- uint32_t* out_size);
-
- template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter>
- Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
- const char* data, const char* data_end, bool is_final,
- int32_t rows_in_chunk, const char** out_data, bool* finished_parsing);
-
- // Parse a single line from the data pointer
- template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter>
- Status ParseLine(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
- const char* data, const char* data_end, bool is_final,
- const char** out_data);
-
- MemoryPool* pool_;
- const ParseOptions options_;
- // The number of rows parsed from the block
- int32_t num_rows_;
- // The number of columns (can be -1 at start)
- int32_t num_cols_;
- // The maximum number of rows to parse from this block
- int32_t max_num_rows_;
-
- // Linear scratchpad for parsed values
- struct ValueDesc {
- uint32_t offset : 31;
- bool quoted : 1;
- };
-
- // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
- // It may help with null parsing...
- std::vector<std::shared_ptr<Buffer>> values_buffers_;
- std::shared_ptr<Buffer> parsed_buffer_;
- const uint8_t* parsed_;
- int32_t values_size_;
- int32_t parsed_size_;
+ std::unique_ptr<BlockParserImpl> impl_;
- class ResizableValuesWriter;
- class PresizedValuesWriter;
- class PresizedParsedWriter;
+ const detail::DataBatch& parsed_batch() const;
};
} // namespace csv
diff --git a/cpp/src/arrow/csv/parser_benchmark.cc b/cpp/src/arrow/csv/parser_benchmark.cc
index 3012754..b279a3c 100644
--- a/cpp/src/arrow/csv/parser_benchmark.cc
+++ b/cpp/src/arrow/csv/parser_benchmark.cc
@@ -30,17 +30,46 @@
namespace arrow {
namespace csv {
-// Linter stipulates:
-// >> For a static/global string constant, use a C style string instead
-const char* one_row = "abc,\"d,f\",12.34,\n";
-const char* one_row_escaped = "abc,d\\,f,12.34,\n";
-
-const auto num_rows = static_cast<int32_t>((1024 * 64) / strlen(one_row));
-
-static std::string BuildCSVData(const std::string& row, int32_t repeat) {
+struct Example {
+ int32_t num_rows;
+ const char* csv_rows;
+};
+
+const Example quoted_example{1, "abc,\"d,f\",12.34,\n"};
+const Example escaped_example{1, "abc,d\\,f,12.34,\n"};
+
+const Example flights_example{
+ 8,
+ R"(2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,0010,0002,-8,12,0014,280,279,263,2330,0737,4,0750,0741,-9,0,0,,,,,,
+2015,1,1,4,US,840,N171US,SFO,CLT,0020,0018,-2,16,0034,286,293,266,2296,0800,11,0806,0811,5,0,0,,,,,,
+2015,1,1,4,AA,258,N3HYAA,LAX,MIA,0020,0015,-5,15,0030,285,281,258,2342,0748,8,0805,0756,-9,0,0,,,,,,
+2015,1,1,4,AS,135,N527AS,SEA,ANC,0025,0024,-1,11,0035,235,215,199,1448,0254,5,0320,0259,-21,0,0,,,,,,
+2015,1,1,4,DL,806,N3730B,SFO,MSP,0025,0020,-5,18,0038,217,230,206,1589,0604,6,0602,0610,8,0,0,,,,,,
+2015,1,1,4,NK,612,N635NK,LAS,MSP,0025,0019,-6,11,0030,181,170,154,1299,0504,5,0526,0509,-17,0,0,,,,,,
+2015,1,1,4,US,2013,N584UW,LAX,CLT,0030,0044,14,13,0057,273,249,228,2125,0745,8,0803,0753,-10,0,0,,,,,,
+2015,1,1,4,AA,1112,N3LAAA,SFO,DFW,0030,0019,-11,17,0036,195,193,173,1464,0529,3,0545,0532,-13,0,0,,,,,,
+)"};
+
+// NOTE: quoted
+const Example vehicles_example{
+ 2,
+
R"(7088743681,https://greensboro.craigslist.org/ctd/d/cary-2004-honda-element-lx-4dr-suv/7088743681.html,greensboro,https://greensboro.craigslist.org,3995,2004,honda,element,,,gas,212526,clean,automatic,5J6YH18314L006498,fwd,,SUV,orange,https://images.craigslist.org/00E0E_eAUnhFF86M4_600x450.jpg,"2004
Honda Element LX 4dr SUV Offered by: Best Import Auto Sales Inc — (919)
800-0650 — $3,995 EXCELLENT SHAPE INSIDE AND OUT FULLY SERVICED AND READY
TO GO ,RUNS AND DRIVES PERFECT [...]
+
7088744126,https://greensboro.craigslist.org/cto/d/greensboro-2011-jaguar-xf-premier/7088744126.html,greensboro,https://greensboro.craigslist.org,9500,2011,jaguar,xf,excellent,,gas,85000,clean,automatic,,,,,blue,https://images.craigslist.org/00505_f22HGItCRpc_600x450.jpg,"2011
jaguar XF premium - estate sale. Retired lady executive. Like new, garaged and
maintained. Very nice leather, heated seats, electric sunroof, metallic blue
paint. 85K miles bumper-to-bumper warranty. Premium radi [...]
+)"};
+
+const Example stocks_example{
+ 3,
+ R"(2,2010-01-27 00:00:00,002204,华锐铸钢,536498.0,135378.0,2652784.2001924426,14160629.45,5.382023337513902,5.288274712474071,5.382023337513902,5.341540976701248,,5.338025403262254,1.01364599,0.21306505690870553
+3,2010-02-05 00:00:00,600266,北京城建,1122615.0,1122615.0,8102476.086666377,57695471.0,7.236029036381633,7.025270909108382,7.170459841229955,7.095523618199466,,7.120720923193468,2.3025570905818964,0.4683513939405588
+4,2010-01-04 00:00:00,600289,亿阳信通,602926.359,602926.359,16393247.138998777,167754890.0,10.381817699665978,9.960037526145015,10.092597009251604,10.321563389162982,,10.233170315655089,4.436963485334562,0.6025431050299465
+)"};
+
+static constexpr int32_t kNumRows = 10000;
+
+static std::string BuildCSVData(const Example& example) {
std::stringstream ss;
- for (int32_t i = 0; i < repeat; ++i) {
- ss << row;
+ for (int32_t i = 0; i < kNumRows; i += example.num_rows) {
+ ss << example.csv_rows;
}
return ss.str();
}
@@ -60,7 +89,7 @@ static void BenchmarkCSVChunking(benchmark::State& state, // NOLINT non-const r
}
static void ChunkCSVQuotedBlock(benchmark::State& state) { // NOLINT non-const reference
- auto csv = BuildCSVData(one_row, num_rows);
+ auto csv = BuildCSVData(quoted_example);
auto options = ParseOptions::Defaults();
options.quoting = true;
options.escaping = false;
@@ -70,7 +99,7 @@ static void ChunkCSVQuotedBlock(benchmark::State& state) { // NOLINT non-const
}
static void ChunkCSVEscapedBlock(benchmark::State& state) { // NOLINT non-const reference
- auto csv = BuildCSVData(one_row_escaped, num_rows);
+ auto csv = BuildCSVData(escaped_example);
auto options = ParseOptions::Defaults();
options.quoting = false;
options.escaping = true;
@@ -81,7 +110,7 @@ static void ChunkCSVEscapedBlock(benchmark::State& state) { // NOLINT non-const
static void ChunkCSVNoNewlinesBlock(
benchmark::State& state) { // NOLINT non-const reference
- auto csv = BuildCSVData(one_row_escaped, num_rows);
+ auto csv = BuildCSVData(escaped_example);
auto options = ParseOptions::Defaults();
options.quoting = true;
options.escaping = false;
@@ -94,9 +123,9 @@ static void ChunkCSVNoNewlinesBlock(
}
static void BenchmarkCSVParsing(benchmark::State& state, // NOLINT non-const reference
- const std::string& csv, int32_t rows,
+ const std::string& csv, int32_t num_rows,
ParseOptions options) {
- BlockParser parser(options, -1, rows + 1);
+ BlockParser parser(options, -1, num_rows + 1);
while (state.KeepRunning()) {
uint32_t parsed_size = 0;
@@ -121,29 +150,56 @@ static void BenchmarkCSVParsing(benchmark::State& state, // NOLINT non-const re
state.SetBytesProcessed(state.iterations() * csv.size());
}
+static void BenchmarkCSVParsing(benchmark::State& state, // NOLINT non-const reference
+ const Example& example, ParseOptions options) {
+ auto csv = BuildCSVData(example);
+ BenchmarkCSVParsing(state, csv, kNumRows, options);
+}
+
static void ParseCSVQuotedBlock(benchmark::State& state) { // NOLINT non-const reference
- auto csv = BuildCSVData(one_row, num_rows);
auto options = ParseOptions::Defaults();
options.quoting = true;
options.escaping = false;
- BenchmarkCSVParsing(state, csv, num_rows, options);
+ BenchmarkCSVParsing(state, quoted_example, options);
}
static void ParseCSVEscapedBlock(benchmark::State& state) { // NOLINT non-const reference
- auto csv = BuildCSVData(one_row_escaped, num_rows);
auto options = ParseOptions::Defaults();
options.quoting = false;
options.escaping = true;
- BenchmarkCSVParsing(state, csv, num_rows, options);
+ BenchmarkCSVParsing(state, escaped_example, options);
+}
+
+static void ParseCSVFlightsExample(
+ benchmark::State& state) { // NOLINT non-const reference
+ BenchmarkCSVParsing(state, flights_example, ParseOptions::Defaults());
+}
+
+static void ParseCSVVehiclesExample(
+ benchmark::State& state) { // NOLINT non-const reference
+ auto options = ParseOptions::Defaults();
+ options.quoting = true;
+ options.escaping = false;
+
+ BenchmarkCSVParsing(state, vehicles_example, options);
+}
+
+static void ParseCSVStocksExample(
+ benchmark::State& state) { // NOLINT non-const reference
+ BenchmarkCSVParsing(state, stocks_example, ParseOptions::Defaults());
}
BENCHMARK(ChunkCSVQuotedBlock);
BENCHMARK(ChunkCSVEscapedBlock);
BENCHMARK(ChunkCSVNoNewlinesBlock);
+
BENCHMARK(ParseCSVQuotedBlock);
BENCHMARK(ParseCSVEscapedBlock);
+BENCHMARK(ParseCSVFlightsExample);
+BENCHMARK(ParseCSVVehiclesExample);
+BENCHMARK(ParseCSVStocksExample);
} // namespace csv
} // namespace arrow