This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 01dce6a0c9 ARROW-17382: [C++] open_dataset doesn't ignore BOM in csv file when header's with quotes (#13838)
01dce6a0c9 is described below

commit 01dce6a0c95242224eab1a5331e0a2524a4b9339
Author: ZMZ91 <[email protected]>
AuthorDate: Wed Sep 14 01:51:25 2022 +0800

    ARROW-17382: [C++] open_dataset doesn't ignore BOM in csv file when header's with quotes (#13838)
    
    Lead-authored-by: Zimo Zhang <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/dataset/file_csv.cc      | 12 +++++++-----
 cpp/src/arrow/dataset/file_csv_test.cc | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index 780f845429..4cb331b0af 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -57,6 +57,12 @@ using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>(
 Result<std::unordered_set<std::string>> GetColumnNames(
     const csv::ReadOptions& read_options, const csv::ParseOptions& parse_options,
     util::string_view first_block, MemoryPool* pool) {
+  // Skip BOM when reading column names (ARROW-14644, ARROW-17382)
+  auto size = first_block.length();
+  const uint8_t* data = reinterpret_cast<const uint8_t*>(first_block.data());
+  ARROW_ASSIGN_OR_RAISE(auto data_no_bom, util::SkipUTF8BOM(data, size));
+  size = size - static_cast<uint32_t>(data_no_bom - data);
+  first_block = util::string_view(reinterpret_cast<const char*>(data_no_bom), size);
   if (!read_options.column_names.empty()) {
     std::unordered_set<std::string> column_names;
     for (const auto& s : read_options.column_names) {
@@ -98,11 +104,7 @@ Result<std::unordered_set<std::string>> GetColumnNames(
 
   RETURN_NOT_OK(
       parser.VisitLastRow([&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
-        // Skip BOM when reading column names (ARROW-14644)
-        ARROW_ASSIGN_OR_RAISE(auto data_no_bom, util::SkipUTF8BOM(data, size));
-        size = size - static_cast<uint32_t>(data_no_bom - data);
-
-        util::string_view view{reinterpret_cast<const char*>(data_no_bom), size};
+        util::string_view view{reinterpret_cast<const char*>(data), size};
         if (column_names.emplace(std::string(view)).second) {
           return Status::OK();
         }
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index 76d2153cf2..99c6494f19 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -103,6 +103,24 @@ class TestCsvFileFormat : public FileFormatFixtureMixin<CsvFormatHelper>,
   }
 };
 
+TEST_P(TestCsvFileFormat, BOMQuoteInHeader) {
+  // ARROW-17382: quoted headers after a BOM should be parsed correctly
+  auto source = GetFileSource("\xef\xbb\xbf\"ab\",\"cd\"\nef,gh\nij,kl\n");
+  auto fields = {field("ab", utf8()), field("cd", utf8())};
+  SetSchema(fields);
+  auto fragment = MakeFragment(*source);
+
+  int64_t row_count = 0;
+
+  for (auto maybe_batch : Batches(fragment.get())) {
+    ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
+    AssertSchemaEqual(batch->schema(), schema(fields));
+    row_count += batch->num_rows();
+  }
+
+  ASSERT_EQ(row_count, 2);
+}
+
 // Basic scanning tests (to exercise compression support); see the parameterized test
 // below for more comprehensive testing of scan behaviors
 TEST_P(TestCsvFileFormat, ScanRecordBatchReader) {

Reply via email to