This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a1c2753ba2 ARROW-16872: [C++] Fix CSV parser edge case (#13437)
a1c2753ba2 is described below
commit a1c2753ba21128138b7b7105b00c7c1d02eae44a
Author: Yibo Cai <[email protected]>
AuthorDate: Fri Jul 1 00:31:23 2022 +0800
ARROW-16872: [C++] Fix CSV parser edge case (#13437)
Fixes a parsing error that occurs when the last line has no EOL and its
last field is consumed by the bulk filter.
Lead-authored-by: Yibo Cai <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/csv/parser.cc | 6 ++++++
cpp/src/arrow/csv/parser_test.cc | 20 ++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/cpp/src/arrow/csv/parser.cc b/cpp/src/arrow/csv/parser.cc
index a546fc7a77..8b060df254 100644
--- a/cpp/src/arrow/csv/parser.cc
+++ b/cpp/src/arrow/csv/parser.cc
@@ -296,6 +296,9 @@ class BlockParserImpl {
if (UseBulkFilter) {
const char* bulk_end = RunBulkFilter(parsed_writer, data, data_end,
bulk_filter);
if (ARROW_PREDICT_FALSE(bulk_end == nullptr)) {
+ if (is_final) {
+ data = data_end;
+ }
goto AbortLine;
}
data = bulk_end;
@@ -337,6 +340,9 @@ class BlockParserImpl {
if (UseBulkFilter) {
const char* bulk_end = RunBulkFilter(parsed_writer, data, data_end,
bulk_filter);
if (ARROW_PREDICT_FALSE(bulk_end == nullptr)) {
+ if (is_final) {
+ data = data_end;
+ }
goto AbortLine;
}
data = bulk_end;
diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc
index 3eeb746afd..3fb2f11387 100644
--- a/cpp/src/arrow/csv/parser_test.cc
+++ b/cpp/src/arrow/csv/parser_test.cc
@@ -405,6 +405,26 @@ TEST(BlockParser, FinalTruncatedData) {
ASSERT_RAISES(Invalid, st);
}
+TEST(BlockParser, FinalBulkFilterNoEol) {
+ // Last field processed by bulk filter. No EOL at last line.
+ auto csv = MakeCSVData({"12345678901,12345678\n", "10987654321,87654321"});
+
+ BlockParser parser(ParseOptions::Defaults());
+ AssertParseFinal(parser, csv);
+ AssertColumnsEq(parser, {{"12345678901", "10987654321"}, {"12345678", "87654321"}});
+}
+
+TEST(BlockParser, FinalTruncatedBulkFilterNoEol) {
+ // Not enough fields at last line. Processed by bulk filter. No EOL at last line.
+ auto csv = MakeCSVData({"12345678901,12345678\n", "87654321"});
+ const char* err_msg = "Expected 2 columns, got 1: 87654321";
+
+ uint32_t out_size;
+ BlockParser parser(ParseOptions::Defaults());
+ EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, ::testing::HasSubstr(err_msg),
+ ParseFinal(parser, csv, &out_size));
+}
+
TEST(BlockParser, QuotingSimple) {
auto csv = MakeCSVData({"1,\",3,\",5\n"});