[GitHub] [arrow] pitrou commented on a change in pull request #10790: ARROW-12673: [C++] Add callback to handle incorrect column counts

GitBox Wed, 01 Sep 2021 11:49:34 -0700


pitrou commented on a change in pull request #10790:
URL: https://github.com/apache/arrow/pull/10790#discussion_r700473094




##########
File path: cpp/src/arrow/csv/test_common.h
##########
@@ -47,7 +47,7 @@ ARROW_TESTING_EXPORT
 void MakeColumnParser(std::vector<std::string> items, 
std::shared_ptr<BlockParser>* out);
 
 ARROW_TESTING_EXPORT
-Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(size_t num_rows, bool 
valid = true);
+Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(size_t num_rows, int 
invalid = 0);

Review comment:
       `size_t num_invalid_rows` perhaps?

##########
File path: cpp/src/arrow/csv/options.h
##########
@@ -58,6 +61,8 @@ struct ARROW_EXPORT ParseOptions {
   /// Whether empty lines are ignored.  If false, an empty line represents
   /// a single empty value (assuming a one-column CSV file).
   bool ignore_empty_lines = true;
+  /// A handler function for rows which do not have the correct number of 
columns
+  util::optional<InvalidRowHandler> invalid_row_handler;

Review comment:
       Note that `util::optional` is probably not necessary: a `std::function` 
can be "empty": see 
https://en.cppreference.com/w/cpp/utility/functional/function/operator_bool

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -324,9 +324,29 @@ class BlockParserImpl {
         if (*(end - 1) == '\r') {
           --end;
         }
-        return MismatchingColumns(batch_.num_cols_, num_cols,
-                                  first_row_ < 0 ? -1 : first_row_ + 
batch_.num_rows_,
-                                  util::string_view(start, end - start));
+        int32_t batch_row = batch_.num_rows_ + batch_.num_skipped_rows();
+        InvalidRow row{batch_.num_cols_, num_cols,
+                       first_row_ < 0 ? -1 : first_row_ + batch_row,
+                       util::string_view(start, end - start)};
+
+        if (options_.invalid_row_handler) {
+          if (options_.invalid_row_handler.value()(row) == 
InvalidRowResult::Skip) {
+            values_writer->RollbackLine();
+            parsed_writer->RollbackLine();
+            auto last_skip = batch_.skipped_rows_.rbegin();
+            if (last_skip == batch_.skipped_rows_.rend() ||
+                last_skip->second + 1 != batch_row) {
+              batch_.skipped_rows_.emplace_back(batch_row, batch_row);
+            } else {
+              last_skip->second = batch_row;
+            }

Review comment:
       Why not keep things simple and store all skipped row numbers?

##########
File path: cpp/src/arrow/csv/parser.h
##########
@@ -61,25 +62,41 @@ class ARROW_EXPORT DataBatch {
   int32_t num_cols() const { return num_cols_; }
   /// \brief Return the total size in bytes of parsed data
   uint32_t num_bytes() const { return parsed_size_; }
+  /// \brief Return the total number of rows skipped
+  int32_t num_skipped_rows() const {
+    if (ARROW_PREDICT_TRUE(skipped_rows_.empty())) {
+      return 0;
+    }
+    int32_t skipped = 0;
+    for (const auto& skip_range : skipped_rows_) {
+      skipped += skip_range.second - skip_range.first + 1;
+    }
+    return skipped;
+  }
 
   template <typename Visitor>
   Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) 
const {
     using detail::ParsedValueDesc;
 
-    int64_t row = first_row;
+    int32_t batch_row = 0;
     for (size_t buf_index = 0; buf_index < values_buffers_.size(); 
++buf_index) {
       const auto& values_buffer = values_buffers_[buf_index];
       const auto values = reinterpret_cast<const 
ParsedValueDesc*>(values_buffer->data());
       const auto max_pos =
           static_cast<int32_t>(values_buffer->size() / 
sizeof(ParsedValueDesc)) - 1;
-      for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++row) {
+      for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, 
++batch_row) {
         auto start = values[pos].offset;
         auto stop = values[pos + 1].offset;
         auto quoted = values[pos + 1].quoted;
         Status status = visit(parsed_ + start, stop - start, quoted);
         if (ARROW_PREDICT_FALSE(!status.ok())) {
           if (first_row >= 0) {

Review comment:
       Perhaps move the error handling here in a dedicated method?

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -324,9 +324,29 @@ class BlockParserImpl {
         if (*(end - 1) == '\r') {
           --end;
         }
-        return MismatchingColumns(batch_.num_cols_, num_cols,
-                                  first_row_ < 0 ? -1 : first_row_ + 
batch_.num_rows_,
-                                  util::string_view(start, end - start));
+        int32_t batch_row = batch_.num_rows_ + batch_.num_skipped_rows();

Review comment:
       `num_skipped_rows` is potentially O(n)... so, say you have one skipped 
row every two rows, parsing becomes O(n²)?

##########
File path: cpp/src/arrow/csv/test_common.h
##########
@@ -47,7 +47,7 @@ ARROW_TESTING_EXPORT
 void MakeColumnParser(std::vector<std::string> items, 
std::shared_ptr<BlockParser>* out);
 
 ARROW_TESTING_EXPORT
-Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(size_t num_rows, bool 
valid = true);
+Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(size_t num_rows, int 
invalid = 0);

Review comment:
       Or instead perhaps pass an optional predicate: `std::function<bool(int 
row_num)> is_valid`.
   This will make the test values less magical.

##########
File path: cpp/src/arrow/csv/parser.cc
##########
@@ -324,9 +324,29 @@ class BlockParserImpl {
         if (*(end - 1) == '\r') {
           --end;
         }
-        return MismatchingColumns(batch_.num_cols_, num_cols,

Review comment:
       Since the code for handling an invalid row is becoming non-trivial, 
perhaps move it into a dedicated method?
   




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #10790: ARROW-12673: [C++] Add callback to handle incorrect column counts

Reply via email to