This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 01dce6a0c9 ARROW-17382: [C++] open_dataset doesn't ignore BOM in csv
file when header's with quotes (#13838)
01dce6a0c9 is described below
commit 01dce6a0c95242224eab1a5331e0a2524a4b9339
Author: ZMZ91 <[email protected]>
AuthorDate: Wed Sep 14 01:51:25 2022 +0800
ARROW-17382: [C++] open_dataset doesn't ignore BOM in csv file when
header's with quotes (#13838)
Lead-authored-by: Zimo Zhang <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/dataset/file_csv.cc | 12 +++++++-----
cpp/src/arrow/dataset/file_csv_test.cc | 18 ++++++++++++++++++
2 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/dataset/file_csv.cc b/cpp/src/arrow/dataset/file_csv.cc
index 780f845429..4cb331b0af 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -57,6 +57,12 @@ using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>(
Result<std::unordered_set<std::string>> GetColumnNames(
const csv::ReadOptions& read_options, const csv::ParseOptions& parse_options,
util::string_view first_block, MemoryPool* pool) {
+ // Skip BOM when reading column names (ARROW-14644, ARROW-17382)
+ auto size = first_block.length();
+ const uint8_t* data = reinterpret_cast<const uint8_t*>(first_block.data());
+ ARROW_ASSIGN_OR_RAISE(auto data_no_bom, util::SkipUTF8BOM(data, size));
+ size = size - static_cast<uint32_t>(data_no_bom - data);
+ first_block = util::string_view(reinterpret_cast<const char*>(data_no_bom), size);
if (!read_options.column_names.empty()) {
std::unordered_set<std::string> column_names;
for (const auto& s : read_options.column_names) {
@@ -98,11 +104,7 @@ Result<std::unordered_set<std::string>> GetColumnNames(
RETURN_NOT_OK(
parser.VisitLastRow([&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
- // Skip BOM when reading column names (ARROW-14644)
- ARROW_ASSIGN_OR_RAISE(auto data_no_bom, util::SkipUTF8BOM(data, size));
- size = size - static_cast<uint32_t>(data_no_bom - data);
-
- util::string_view view{reinterpret_cast<const char*>(data_no_bom), size};
+ util::string_view view{reinterpret_cast<const char*>(data), size};
if (column_names.emplace(std::string(view)).second) {
return Status::OK();
}
diff --git a/cpp/src/arrow/dataset/file_csv_test.cc b/cpp/src/arrow/dataset/file_csv_test.cc
index 76d2153cf2..99c6494f19 100644
--- a/cpp/src/arrow/dataset/file_csv_test.cc
+++ b/cpp/src/arrow/dataset/file_csv_test.cc
@@ -103,6 +103,24 @@ class TestCsvFileFormat : public FileFormatFixtureMixin<CsvFormatHelper>,
}
};
+TEST_P(TestCsvFileFormat, BOMQuoteInHeader) {
+ // ARROW-17382: quoted headers after a BOM should be parsed correctly
+ auto source = GetFileSource("\xef\xbb\xbf\"ab\",\"cd\"\nef,gh\nij,kl\n");
+ auto fields = {field("ab", utf8()), field("cd", utf8())};
+ SetSchema(fields);
+ auto fragment = MakeFragment(*source);
+
+ int64_t row_count = 0;
+
+ for (auto maybe_batch : Batches(fragment.get())) {
+ ASSERT_OK_AND_ASSIGN(auto batch, maybe_batch);
+ AssertSchemaEqual(batch->schema(), schema(fields));
+ row_count += batch->num_rows();
+ }
+
+ ASSERT_EQ(row_count, 2);
+}
+
// Basic scanning tests (to exercise compression support); see the parameterized test
// below for more comprehensive testing of scan behaviors
TEST_P(TestCsvFileFormat, ScanRecordBatchReader) {