pitrou commented on code in PR #39963:
URL: https://github.com/apache/arrow/pull/39963#discussion_r1482675456
##########
cpp/src/arrow/csv/reader.cc:
##########
@@ -719,66 +726,25 @@ class ReaderMixin {
     return Status::OK();
   }
 
-  struct ParseResult {
-    std::shared_ptr<BlockParser> parser;
-    int64_t parsed_bytes;
-  };
-
-  Result<ParseResult> Parse(const std::shared_ptr<Buffer>& partial,
-                            const std::shared_ptr<Buffer>& completion,
-                            const std::shared_ptr<Buffer>& block, int64_t block_index,
-                            bool is_final) {
-    static constexpr int32_t max_num_rows = std::numeric_limits<int32_t>::max();
-    auto parser = std::make_shared<BlockParser>(
-        io_context_.pool(), parse_options_, num_csv_cols_, num_rows_seen_, max_num_rows);
-
-    std::shared_ptr<Buffer> straddling;
-    std::vector<std::string_view> views;
-    if (partial->size() != 0 || completion->size() != 0) {
-      if (partial->size() == 0) {
-        straddling = completion;
-      } else if (completion->size() == 0) {
-        straddling = partial;
-      } else {
-        ARROW_ASSIGN_OR_RAISE(
-            straddling, ConcatenateBuffers({partial, completion}, io_context_.pool()));
-      }
-      views = {std::string_view(*straddling), std::string_view(*block)};
-    } else {
-      views = {std::string_view(*block)};
-    }
-    uint32_t parsed_size;
-    if (is_final) {
-      RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size));
-    } else {
-      RETURN_NOT_OK(parser->Parse(views, &parsed_size));
-    }
-    // See BlockParsingOperator for explanation.
-    const int64_t bytes_before_buffer = partial->size() + completion->size();
-    if (static_cast<int64_t>(parsed_size) < bytes_before_buffer) {
-      return Status::Invalid(
-          "CSV parser got out of sync with chunker. This can mean the data file "
-          "contains cell values spanning multiple lines; please consider enabling "
-          "the option 'newlines_in_values'.");
-    }
+  Result<ParsedBlock> Parse(const CSVBlock& block) {
+    DCHECK(parsing_operator_.has_value());
Review Comment:
IIUC, `consume_bytes` is defined for serial reads, where we can give the CSV
parser a block that's truncated at the end and keep the unprocessed data for
the next parser call. For threaded reads we cannot do that, so we use a more
elaborate chunking step instead.
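
To make the distinction concrete, here is a minimal sketch of the serial-read
pattern, in plain standard C++ rather than the Arrow API (`ToyLineParser` and
its `Parse` method are made up for illustration): the parser consumes only
complete rows and reports how many bytes it used, and the caller carries the
unconsumed tail over to the next block, which is the role `consume_bytes`
plays here.

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

// Hypothetical parser: consumes whole lines only and reports how many bytes
// it actually used, analogous to what `consume_bytes` conveys.
struct ToyLineParser {
  std::vector<std::string> rows;

  // Returns the number of bytes consumed (everything up to the last newline).
  int64_t Parse(std::string_view data) {
    int64_t consumed = 0;
    size_t start = 0;
    while (true) {
      size_t nl = data.find('\n', start);
      if (nl == std::string_view::npos) break;  // incomplete trailing row
      rows.emplace_back(data.substr(start, nl - start));
      start = nl + 1;
      consumed = static_cast<int64_t>(start);
    }
    return consumed;
  }
};

int main() {
  // Two "blocks" as a serial reader might receive them; the row "c,d"
  // straddles the block boundary.
  std::vector<std::string> blocks = {"a,b\nc,", "d\ne,f\n"};

  ToyLineParser parser;
  std::string partial;  // unconsumed tail carried between blocks
  for (const auto& block : blocks) {
    std::string data = partial + block;
    int64_t consumed = parser.Parse(data);
    // Keep whatever the parser did not consume for the next iteration.
    partial = data.substr(static_cast<size_t>(consumed));
  }

  for (const auto& row : parser.rows) {
    std::cout << row << "\n";  // prints: a,b  c,d  e,f
  }
  return 0;
}
```

With threaded reads each block is parsed independently, so this carry-over is
not possible and the block boundaries have to be settled up front by the
chunking step instead.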