pitrou commented on code in PR #39963:
URL: https://github.com/apache/arrow/pull/39963#discussion_r1482675456
##########
cpp/src/arrow/csv/reader.cc:
##########
@@ -719,66 +726,25 @@ class ReaderMixin {
     return Status::OK();
   }
 
-  struct ParseResult {
-    std::shared_ptr<BlockParser> parser;
-    int64_t parsed_bytes;
-  };
-
-  Result<ParseResult> Parse(const std::shared_ptr<Buffer>& partial,
-                            const std::shared_ptr<Buffer>& completion,
-                            const std::shared_ptr<Buffer>& block, int64_t block_index,
-                            bool is_final) {
-    static constexpr int32_t max_num_rows = std::numeric_limits<int32_t>::max();
-    auto parser = std::make_shared<BlockParser>(
-        io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows);
-
-    std::shared_ptr<Buffer> straddling;
-    std::vector<std::string_view> views;
-    if (partial->size() != 0 || completion->size() != 0) {
-      if (partial->size() == 0) {
-        straddling = completion;
-      } else if (completion->size() == 0) {
-        straddling = partial;
-      } else {
-        ARROW_ASSIGN_OR_RAISE(
-            straddling, ConcatenateBuffers({partial, completion}, io_context_.pool()));
-      }
-      views = {std::string_view(*straddling), std::string_view(*block)};
-    } else {
-      views = {std::string_view(*block)};
-    }
-    uint32_t parsed_size;
-    if (is_final) {
-      RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size));
-    } else {
-      RETURN_NOT_OK(parser->Parse(views, &parsed_size));
-    }
-    // See BlockParsingOperator for explanation.
-    const int64_t bytes_before_buffer = partial->size() + completion->size();
-    if (static_cast<int64_t>(parsed_size) < bytes_before_buffer) {
-      return Status::Invalid(
-          "CSV parser got out of sync with chunker. This can mean the data file "
-          "contains cell values spanning multiple lines; please consider enabling "
-          "the option 'newlines_in_values'.");
-    }
+  Result<ParsedBlock> Parse(const CSVBlock& block) {
+    DCHECK(parsing_operator_.has_value());
Review Comment:
IIUC, `consume_bytes` is defined for serial reads, where we can give the CSV
parser a block that's truncated at the end and keep the unprocessed data for
the next parser call. For threaded reads we cannot do that, so we use a more
elaborate chunking step instead.
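
To make the distinction concrete, here is a minimal sketch of the serial-read
pattern, in plain standard C++ rather than the Arrow API (`ToyLineParser` and
its `Parse` method are made up for illustration): the parser consumes only
complete rows and reports how many bytes it used, and the caller carries the
unconsumed tail over to the next block, which is the role `consume_bytes`
plays here.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

// Hypothetical parser: consumes whole lines only and reports how many bytes
// it actually used, analogous to what `consume_bytes` conveys.
struct ToyLineParser {
  std::vector<std::string> rows;

  // Returns the number of bytes consumed (everything up to the last newline).
  int64_t Parse(std::string_view data) {
    int64_t consumed = 0;
    size_t start = 0;
    while (true) {
      size_t nl = data.find('\n', start);
      if (nl == std::string_view::npos) break;  // incomplete trailing row
      rows.emplace_back(data.substr(start, nl - start));
      start = nl + 1;
      consumed = static_cast<int64_t>(start);
    }
    return consumed;
  }
};

int main() {
  // Two "blocks" as a serial reader might receive them; the row "c,d"
  // straddles the block boundary.
  std::vector<std::string> blocks = {"a,b\nc,", "d\ne,f\n"};

  ToyLineParser parser;
  std::string partial;  // unconsumed tail carried between blocks
  for (const auto& block : blocks) {
    std::string data = partial + block;
    int64_t consumed = parser.Parse(data);
    // Keep whatever the parser did not consume for the next iteration.
    partial = data.substr(static_cast<size_t>(consumed));
  }

  for (const auto& row : parser.rows) {
    std::cout << row << "\n";  // prints: a,b  c,d  e,f
  }
  return 0;
}
```

With threaded reads each block is parsed independently, so this carry-over is
not possible and the block boundaries have to be settled up front by the
chunking step instead.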