This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 37c3bd00f8 ARROW-16436: [C++][Python] Datasets should not ignore CSV
autogenerate_column_names
37c3bd00f8 is described below
commit 37c3bd00f812513fe22179ae87573893c741af51
Author: Raúl Cumplido <[email protected]>
AuthorDate: Wed May 4 15:49:52 2022 +0200
ARROW-16436: [C++][Python] Datasets should not ignore CSV
autogenerate_column_names
The added test previously failed because the `autogenerate_column_names`
read option was ignored when the dataset inspected the CSV file:
```
E pyarrow.lib.ArrowInvalid: Error creating dataset. Could not read schema
from '/tmp/pytest-of/pytest-15/test_csv_format_options_genera1/test.csv': Could
not open CSV input source
'/tmp/pytest-of/pytest-15/test_csv_format_options_genera1/test.csv': Invalid:
CSV file contained multiple columns named 1. Is this a 'csv' file?
```
The fix uses the same approach as `GenerateColumnNames` in the CSV reader:
https://github.com/apache/arrow/blob/master/cpp/src/arrow/csv/reader.cc#L637-L646
Closes #13064 from raulcd/ARROW-16436
Authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/dataset/file_csv.cc | 10 ++++++++++
python/pyarrow/tests/test_dataset.py | 16 ++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/cpp/src/arrow/dataset/file_csv.cc
b/cpp/src/arrow/dataset/file_csv.cc
index 277bab29a0..d185edf49d 100644
--- a/cpp/src/arrow/dataset/file_csv.cc
+++ b/cpp/src/arrow/dataset/file_csv.cc
@@ -85,6 +85,16 @@ Result<std::unordered_set<std::string>> GetColumnNames(
std::unordered_set<std::string> column_names;
+ if (read_options.autogenerate_column_names) {
+ column_names.reserve(parser.num_cols());
+ for (int32_t i = 0; i < parser.num_cols(); ++i) {
+ std::stringstream ss;
+ ss << "f" << i;
+ column_names.emplace(ss.str());
+ }
+ return column_names;
+ }
+
RETURN_NOT_OK(
parser.VisitLastRow([&](const uint8_t* data, uint32_t size, bool quoted)
-> Status {
// Skip BOM when reading column names (ARROW-14644)
diff --git a/python/pyarrow/tests/test_dataset.py
b/python/pyarrow/tests/test_dataset.py
index 6eda764f27..6aed7734f6 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -3033,6 +3033,22 @@ def test_csv_format_options(tempdir, dataset_reader):
pa.table({'foo': pa.array(['skipped', 'col0', 'foo', 'bar'])}))
+def test_csv_format_options_generate_columns(tempdir, dataset_reader):
+ path = str(tempdir / 'test.csv')
+ with open(path, 'w') as sink:
+ sink.write('1,a,true,1\n')
+
+ dataset = ds.dataset(path, format=ds.CsvFileFormat(
+ read_options=pa.csv.ReadOptions(autogenerate_column_names=True)))
+ result = dataset_reader.to_table(dataset)
+ expected_column_names = ["f0", "f1", "f2", "f3"]
+ assert result.column_names == expected_column_names
+ assert result.equals(pa.table({'f0': pa.array([1]),
+ 'f1': pa.array(["a"]),
+ 'f2': pa.array([True]),
+ 'f3': pa.array([1])}))
+
+
def test_csv_fragment_options(tempdir, dataset_reader):
path = str(tempdir / 'test.csv')
with open(path, 'w') as sink: