This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a1c2753ba2 ARROW-16872: [C++] Fix CSV parser edge case (#13437)
a1c2753ba2 is described below

commit a1c2753ba21128138b7b7105b00c7c1d02eae44a
Author: Yibo Cai <[email protected]>
AuthorDate: Fri Jul 1 00:31:23 2022 +0800

    ARROW-16872: [C++] Fix CSV parser edge case (#13437)
    
    Fixes an error when there's no EOL at last line, and last field is consumed by bulk filter.
    
    Lead-authored-by: Yibo Cai <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/csv/parser.cc      |  6 ++++++
 cpp/src/arrow/csv/parser_test.cc | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index a546fc7a77..8b060df254 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -296,6 +296,9 @@ class BlockParserImpl {
     if (UseBulkFilter) {
       const char* bulk_end = RunBulkFilter(parsed_writer, data, data_end, bulk_filter);
       if (ARROW_PREDICT_FALSE(bulk_end == nullptr)) {
+        if (is_final) {
+          data = data_end;
+        }
         goto AbortLine;
       }
       data = bulk_end;
@@ -337,6 +340,9 @@ class BlockParserImpl {
     if (UseBulkFilter) {
       const char* bulk_end = RunBulkFilter(parsed_writer, data, data_end, bulk_filter);
       if (ARROW_PREDICT_FALSE(bulk_end == nullptr)) {
+        if (is_final) {
+          data = data_end;
+        }
         goto AbortLine;
       }
       data = bulk_end;
diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc
index 3eeb746afd..3fb2f11387 100644
--- a/cpp/src/arrow/csv/parser_test.cc
+++ b/cpp/src/arrow/csv/parser_test.cc
@@ -405,6 +405,26 @@ TEST(BlockParser, FinalTruncatedData) {
   ASSERT_RAISES(Invalid, st);
 }
 
+TEST(BlockParser, FinalBulkFilterNoEol) {
+  // Last field processed by bulk filter. No EOL at last line.
+  auto csv = MakeCSVData({"12345678901,12345678\n", "10987654321,87654321"});
+
+  BlockParser parser(ParseOptions::Defaults());
+  AssertParseFinal(parser, csv);
+  AssertColumnsEq(parser, {{"12345678901", "10987654321"}, {"12345678", "87654321"}});
+}
+
+TEST(BlockParser, FinalTruncatedBulkFilterNoEol) {
+  // Not enough fields at last line. Processed by bulk filter. No EOL at last line.
+  auto csv = MakeCSVData({"12345678901,12345678\n", "87654321"});
+  const char* err_msg = "Expected 2 columns, got 1: 87654321";
+
+  uint32_t out_size;
+  BlockParser parser(ParseOptions::Defaults());
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr(err_msg),
+                                  ParseFinal(parser, csv, &out_size));
+}
+
 TEST(BlockParser, QuotingSimple) {
   auto csv = MakeCSVData({"1,\",3,\",5\n"});
 

Reply via email to