This is an automated email from the ASF dual-hosted git repository.

AlenkaF pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 3d6e138aef GH-22232: [C++][Python] Introduce optional 
default_column_type parameter (#47663)
3d6e138aef is described below

commit 3d6e138aef27741c0915fc8311872be44e9a1feb
Author: Vlad Borovtsov <[email protected]>
AuthorDate: Wed May 13 10:29:32 2026 +0200

    GH-22232: [C++][Python] Introduce optional default_column_type parameter 
(#47663)
    
    ### Rationale for this change
    
    Add an optional default_column_type parameter to the CSV reading API (C++ 
and Python) to provide a fallback type when per-column types aren’t specified, 
improving schema consistency and complementing the existing column_types logic.
    
    ### What changes are included in this PR?
    
    - c++: new convert option "default_column_type" to augment logic around 
column_types parameter
    - 3 reader tests (DefaultColumnTypePartialDefault, 
DefaultColumnTypeAllStringsWithHeader, DefaultColumnTypeAllStringsNoHeader). 
The last two tests are inspired by 
https://github.com/pandas-dev/pandas/pull/62242 and 
https://github.com/pandas-dev/pandas/issues/57666
    - python: corresponding changes to make cpp change consumable from python
    - python: extended test_convert_options test to include default_column_type
    - python: added new test "test_default_column_type" which tests how the 
field impacts the schema; the test also implicitly verifies leading-zero preservation
    - relevant documentation update for python component;
    
    ### Are these changes tested?
    
    Yes. Existing and new tests are passing.
    
    C++:
    
        > [==========] Running 3 tests from 1 test suite.
        > [----------] Global test environment set-up.
        > [----------] 3 tests from ReaderTests
        > [ RUN      ] ReaderTests.DefaultColumnTypePartialDefault
        > [       OK ] ReaderTests.DefaultColumnTypePartialDefault (3 ms)
        > [ RUN      ] ReaderTests.DefaultColumnTypeAllStringsWithHeader
        > [       OK ] ReaderTests.DefaultColumnTypeAllStringsWithHeader (0 ms)
        > [ RUN      ] ReaderTests.DefaultColumnTypeAllStringsNoHeader
        > [       OK ] ReaderTests.DefaultColumnTypeAllStringsNoHeader (0 ms)
        > [----------] 3 tests from ReaderTests (4 ms total)
        >
        > [----------] Global test environment tear-down
        > [==========] 3 tests from 1 test suite ran. (4 ms total)
        > [  PASSED  ] 3 tests.
    
        All:
    
        > [==========] 264 tests from 46 test suites ran. (452 ms total)
        > [  PASSED  ] 264 tests.
    
    pyarrow:
    New tests are passing.
    
    ### Are there any user-facing changes?
    
    I believe this change is backward compatible. The parameter is optional and its 
default value doesn't change the existing behavior; all the existing tests are 
passing.
    
    Maybe relevant: https://github.com/apache/arrow/issues/22232
    
    Relates to https://github.com/apache/arrow/issues/47502
    * GitHub Issue: #47502
    
    * GitHub Issue: #22232
    
    Lead-authored-by: Vlad Borovtsov <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: AlenkaF <[email protected]>
---
 cpp/src/arrow/csv/column_builder_test.cc | 19 +++++++
 cpp/src/arrow/csv/options.h              |  4 ++
 cpp/src/arrow/csv/reader.cc              | 10 +++-
 cpp/src/arrow/csv/reader_test.cc         | 87 ++++++++++++++++++++++++++++++++
 docs/source/python/csv.rst               |  1 +
 python/pyarrow/_csv.pyx                  | 82 ++++++++++++++++++++++++++----
 python/pyarrow/includes/libarrow.pxd     |  1 +
 python/pyarrow/tests/test_csv.py         | 72 +++++++++++++++++++++++++-
 8 files changed, 263 insertions(+), 13 deletions(-)

diff --git a/cpp/src/arrow/csv/column_builder_test.cc 
b/cpp/src/arrow/csv/column_builder_test.cc
index dddfb922e7..94de8c92d4 100644
--- a/cpp/src/arrow/csv/column_builder_test.cc
+++ b/cpp/src/arrow/csv/column_builder_test.cc
@@ -346,6 +346,25 @@ TEST_F(InferringColumnBuilderTest, SingleChunkInteger) {
                 {ArrayFromJSON(int64(), "[null, 123, 456]")});
 }
 
+TEST_F(InferringColumnBuilderTest, 
SingleChunkDefaultColumnTypeDoesNotOverrideInference) {
+  auto options = ConvertOptions::Defaults();
+  options.default_column_type = utf8();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"0000404", "0000505", "0000606"}}, options,
+                {ArrayFromJSON(int64(), "[404, 505, 606]")});
+}
+
+TEST_F(InferringColumnBuilderTest,
+       MultipleChunkDefaultColumnTypeDoesNotOverrideInference) {
+  auto options = ConvertOptions::Defaults();
+  options.default_column_type = utf8();
+  auto tg = TaskGroup::MakeSerial();
+
+  CheckInferred(tg, {{"0000404"}, {"0000505", "0000606"}}, options,
+                {ArrayFromJSON(int64(), "[404]"), ArrayFromJSON(int64(), 
"[505, 606]")});
+}
+
 TEST_F(InferringColumnBuilderTest, MultipleChunkInteger) {
   auto options = ConvertOptions::Defaults();
   auto tg = TaskGroup::MakeSerial();
diff --git a/cpp/src/arrow/csv/options.h b/cpp/src/arrow/csv/options.h
index 10e55bf838..f0b923d0f3 100644
--- a/cpp/src/arrow/csv/options.h
+++ b/cpp/src/arrow/csv/options.h
@@ -76,6 +76,10 @@ struct ARROW_EXPORT ConvertOptions {
   bool check_utf8 = true;
   /// Optional per-column types (disabling type inference on those columns)
   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+  /// Default type to use for columns not in `column_types`
+  ///
+  /// If set, this disables type inference on all columns.
+  std::shared_ptr<DataType> default_column_type;
   /// Recognized spellings for null values
   std::vector<std::string> null_values;
   /// Recognized spellings for boolean true values
diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc
index 8720331965..09e8290ba2 100644
--- a/cpp/src/arrow/csv/reader.cc
+++ b/cpp/src/arrow/csv/reader.cc
@@ -674,8 +674,14 @@ class ReaderMixin {
       // Does the named column have a fixed type?
       auto it = convert_options_.column_types.find(col_name);
       if (it == convert_options_.column_types.end()) {
-        conversion_schema_.columns.push_back(
-            ConversionSchema::InferredColumn(std::move(col_name), col_index));
+        // If not explicitly typed, respect default_column_type when provided
+        if (convert_options_.default_column_type != nullptr) {
+          conversion_schema_.columns.push_back(ConversionSchema::TypedColumn(
+              std::move(col_name), col_index, 
convert_options_.default_column_type));
+        } else {
+          conversion_schema_.columns.push_back(
+              ConversionSchema::InferredColumn(std::move(col_name), 
col_index));
+        }
       } else {
         conversion_schema_.columns.push_back(
             ConversionSchema::TypedColumn(std::move(col_name), col_index, 
it->second));
diff --git a/cpp/src/arrow/csv/reader_test.cc b/cpp/src/arrow/csv/reader_test.cc
index 23206717a1..9e0f4804b0 100644
--- a/cpp/src/arrow/csv/reader_test.cc
+++ b/cpp/src/arrow/csv/reader_test.cc
@@ -531,5 +531,92 @@ TEST(CountRowsAsync, Errors) {
                               internal::GetCpuThreadPool(), read_options, 
parse_options));
 }
 
+TEST(ReaderTests, DefaultColumnTypePartialDefault) {
+  auto table_buffer = std::make_shared<Buffer>(
+      "id,name,value,date\n"
+      "0000101,apple,0003.1400,2024-01-15\n"
+      "00102,banana,001.6180,2024-02-20\n"
+      "0003,cherry,02.71800,2024-03-25\n");
+
+  auto input = std::make_shared<io::BufferReader>(table_buffer);
+  auto read_options = ReadOptions::Defaults();
+  auto parse_options = ParseOptions::Defaults();
+  auto convert_options = ConvertOptions::Defaults();
+  convert_options.column_types["id"] = int64();
+  convert_options.default_column_type = utf8();
+
+  ASSERT_OK_AND_ASSIGN(auto reader,
+                       TableReader::Make(io::default_io_context(), input, 
read_options,
+                                         parse_options, convert_options));
+  ASSERT_OK_AND_ASSIGN(auto table, reader->Read());
+
+  auto expected_schema = schema({field("id", int64()), field("name", utf8()),
+                                 field("value", utf8()), field("date", 
utf8())});
+  AssertSchemaEqual(expected_schema, table->schema());
+
+  auto expected_table = TableFromJSON(
+      expected_schema,
+      {R"([{"id":101, "name":"apple",  "value":"0003.1400", 
"date":"2024-01-15"},
+            {"id":102, "name":"banana", "value":"001.6180", 
"date":"2024-02-20"},
+            {"id":3,   "name":"cherry", "value":"02.71800", 
"date":"2024-03-25"}])"});
+  ASSERT_TRUE(table->Equals(*expected_table));
+}
+
+TEST(ReaderTests, DefaultColumnTypeForcesTypedColumns) {
+  auto table_buffer = std::make_shared<Buffer>(
+      "id,amount,code\n"
+      "0000404,000045.6700,001\n"
+      "0000505,000000.10,010\n");
+
+  auto input = std::make_shared<io::BufferReader>(table_buffer);
+  auto read_options = ReadOptions::Defaults();
+  auto parse_options = ParseOptions::Defaults();
+  auto convert_options = ConvertOptions::Defaults();
+  convert_options.default_column_type = utf8();
+
+  ASSERT_OK_AND_ASSIGN(auto reader,
+                       TableReader::Make(io::default_io_context(), input, 
read_options,
+                                         parse_options, convert_options));
+  ASSERT_OK_AND_ASSIGN(auto table, reader->Read());
+
+  auto expected_schema =
+      schema({field("id", utf8()), field("amount", utf8()), field("code", 
utf8())});
+  AssertSchemaEqual(expected_schema, table->schema());
+
+  auto expected_table = TableFromJSON(
+      expected_schema, {R"([{"id":"0000404", "amount":"000045.6700", 
"code":"001"},
+            {"id":"0000505", "amount":"000000.10", "code":"010"}])"});
+  ASSERT_TRUE(table->Equals(*expected_table));
+}
+
+TEST(ReaderTests, DefaultColumnTypeAllStringsNoHeader) {
+  // Input without header; autogenerate column names and default all to strings
+  auto table_buffer = std::make_shared<Buffer>("AB|000388907|000045.6700\n");
+
+  auto input = std::make_shared<io::BufferReader>(table_buffer);
+  auto read_options = ReadOptions::Defaults();
+  read_options.autogenerate_column_names = true;  // treat first row as data
+  auto parse_options = ParseOptions::Defaults();
+  parse_options.delimiter = '|';
+  auto convert_options = ConvertOptions::Defaults();
+  convert_options.default_column_type = utf8();
+
+  ASSERT_OK_AND_ASSIGN(auto reader,
+                       TableReader::Make(io::default_io_context(), input, 
read_options,
+                                         parse_options, convert_options));
+  ASSERT_OK_AND_ASSIGN(auto table, reader->Read());
+
+  auto expected_schema =
+      schema({field("f0", utf8()), field("f1", utf8()), field("f2", utf8())});
+  AssertSchemaEqual(expected_schema, table->schema());
+
+  auto expected_table = TableFromJSON(expected_schema, {R"([{
+        "f0":"AB",
+        "f1":"000388907",
+        "f2":"000045.6700"
+      }])"});
+  ASSERT_TRUE(table->Equals(*expected_table));
+}
+
 }  // namespace csv
 }  // namespace arrow
diff --git a/docs/source/python/csv.rst b/docs/source/python/csv.rst
index 2bc2ccabc9..28946d1599 100644
--- a/docs/source/python/csv.rst
+++ b/docs/source/python/csv.rst
@@ -153,6 +153,7 @@ Available convert options are:
 
   ~ConvertOptions.check_utf8
   ~ConvertOptions.column_types
+  ~ConvertOptions.default_column_type
   ~ConvertOptions.null_values
   ~ConvertOptions.true_values
   ~ConvertOptions.false_values
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
index 79985530af..f2cefb8ff3 100644
--- a/python/pyarrow/_csv.pyx
+++ b/python/pyarrow/_csv.pyx
@@ -646,6 +646,9 @@ cdef class ConvertOptions(_Weakrefable):
     column_types : pyarrow.Schema or dict, optional
         Explicitly map column names to column types. Passing this argument
         disables type inference on the defined columns.
+    default_column_type : pyarrow.DataType, optional
+        Explicitly map columns not specified in column_types to a default type.
+        Passing this argument disables type inference on all columns.
     null_values : list, optional
         A sequence of strings that denote nulls in the data
         (defaults are appropriate in most cases). Note that by default,
@@ -840,6 +843,40 @@ cdef class ConvertOptions(_Weakrefable):
     fast: bool
     ----
     fast: [[true,true,false,false,null]]
+
+    Set a default column type for all columns (disables type inference):
+
+    >>> convert_options = csv.ConvertOptions(default_column_type=pa.string())
+    >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
+    pyarrow.Table
+    animals: string
+    n_legs: string
+    entry: string
+    fast: string
+    ----
+    animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
+    n_legs: [["2","4","5","100","6"]]
+    entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]]
+    fast: [["Yes","Yes","No","No",""]]
+
+    Combine default_column_type with column_types (specific column types 
override default):
+
+    >>> convert_options = csv.ConvertOptions(
+    ...                   column_types={"n_legs": pa.int64(), "fast": 
pa.bool_()},
+    ...                   default_column_type=pa.string(),
+    ...                   true_values=["Yes"],
+    ...                   false_values=["No"])
+    >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options)
+    pyarrow.Table
+    animals: string
+    n_legs: int64
+    entry: string
+    fast: bool
+    ----
+    animals: [["Flamingo","Horse","Brittle stars","Centipede",""]]
+    n_legs: [[2,4,5,100,6]]
+    entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]]
+    fast: [[true,true,false,false,null]]
     """
 
     # Avoid mistakingly creating attributes
@@ -849,7 +886,7 @@ cdef class ConvertOptions(_Weakrefable):
         self.options.reset(
             new CCSVConvertOptions(CCSVConvertOptions.Defaults()))
 
-    def __init__(self, *, check_utf8=None, column_types=None, null_values=None,
+    def __init__(self, *, check_utf8=None, column_types=None, 
default_column_type=None, null_values=None,
                  true_values=None, false_values=None, decimal_point=None,
                  strings_can_be_null=None, quoted_strings_can_be_null=None,
                  include_columns=None, include_missing_columns=None,
@@ -859,6 +896,8 @@ cdef class ConvertOptions(_Weakrefable):
             self.check_utf8 = check_utf8
         if column_types is not None:
             self.column_types = column_types
+        if default_column_type is not None:
+            self.default_column_type = default_column_type
         if null_values is not None:
             self.null_values = null_values
         if true_values is not None:
@@ -943,6 +982,27 @@ cdef class ConvertOptions(_Weakrefable):
             assert typ != NULL
             deref(self.options).column_types[tobytes(k)] = typ
 
+    @property
+    def default_column_type(self):
+        """
+        Explicitly map columns not specified in column_types to a default type.
+        """
+        if deref(self.options).default_column_type != NULL:
+            return 
pyarrow_wrap_data_type(deref(self.options).default_column_type)
+        else:
+            return None
+
+    @default_column_type.setter
+    def default_column_type(self, value):
+        cdef:
+            shared_ptr[CDataType] typ
+        if value is not None:
+            typ = pyarrow_unwrap_data_type(ensure_type(value))
+            assert typ != NULL
+            deref(self.options).default_column_type = typ
+        else:
+            deref(self.options).default_column_type.reset()
+
     @property
     def null_values(self):
         """
@@ -1104,6 +1164,7 @@ cdef class ConvertOptions(_Weakrefable):
         return (
             self.check_utf8 == other.check_utf8 and
             self.column_types == other.column_types and
+            self.default_column_type == other.default_column_type and
             self.null_values == other.null_values and
             self.true_values == other.true_values and
             self.false_values == other.false_values and
@@ -1120,17 +1181,17 @@ cdef class ConvertOptions(_Weakrefable):
         )
 
     def __getstate__(self):
-        return (self.check_utf8, self.column_types, self.null_values,
-                self.true_values, self.false_values, self.decimal_point,
-                self.timestamp_parsers, self.strings_can_be_null,
-                self.quoted_strings_can_be_null, self.auto_dict_encode,
-                self.auto_dict_max_cardinality, self.include_columns,
-                self.include_missing_columns)
+        return (self.check_utf8, self.column_types, self.default_column_type,
+                self.null_values, self.true_values, self.false_values,
+                self.decimal_point, self.timestamp_parsers,
+                self.strings_can_be_null, self.quoted_strings_can_be_null,
+                self.auto_dict_encode, self.auto_dict_max_cardinality,
+                self.include_columns, self.include_missing_columns)
 
     def __setstate__(self, state):
-        (self.check_utf8, self.column_types, self.null_values,
-         self.true_values, self.false_values, self.decimal_point,
-         self.timestamp_parsers, self.strings_can_be_null,
+        (self.check_utf8, self.column_types, self.default_column_type,
+         self.null_values, self.true_values, self.false_values,
+         self.decimal_point, self.timestamp_parsers, self.strings_can_be_null,
          self.quoted_strings_can_be_null, self.auto_dict_encode,
          self.auto_dict_max_cardinality, self.include_columns,
          self.include_missing_columns) = state
@@ -1145,6 +1206,7 @@ cdef class ConvertOptions(_Weakrefable):
         return (f"""
     check_utf8={self.check_utf8},
     column_types={self.column_types},
+    default_column_type={self.default_column_type!r},
     null_values={self.null_values},
     true_values={self.true_values},
     false_values={self.false_values},
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 8ee7784461..79522c1247 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2113,6 +2113,7 @@ cdef extern from "arrow/csv/api.h" namespace "arrow::csv" 
nogil:
     cdef cppclass CCSVConvertOptions" arrow::csv::ConvertOptions":
         c_bool check_utf8
         unordered_map[c_string, shared_ptr[CDataType]] column_types
+        shared_ptr[CDataType] default_column_type
         vector[c_string] null_values
         vector[c_string] true_values
         vector[c_string] false_values
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index d608d2bee5..ac9012ebdf 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -321,7 +321,8 @@ def test_convert_options(pickle_module):
         include_columns=['def', 'abc'],
         include_missing_columns=False,
         auto_dict_encode=True,
-        timestamp_parsers=[ISO8601, '%y-%m'])
+        timestamp_parsers=[ISO8601, '%y-%m'],
+        default_column_type=pa.int16())
 
     with pytest.raises(ValueError):
         opts.decimal_point = '..'
@@ -349,6 +350,17 @@ def test_convert_options(pickle_module):
     with pytest.raises(TypeError):
         opts.column_types = 0
 
+    assert opts.default_column_type is None
+    opts.default_column_type = pa.string()
+    assert opts.default_column_type == pa.string()
+    opts.default_column_type = 'int32'
+    assert opts.default_column_type == pa.int32()
+    opts.default_column_type = None
+    assert opts.default_column_type is None
+
+    with pytest.raises(TypeError, match='DataType expected'):
+        opts.default_column_type = 123
+
     assert isinstance(opts.null_values, list)
     assert '' in opts.null_values
     assert 'N/A' in opts.null_values
@@ -368,10 +380,12 @@ def test_convert_options(pickle_module):
     assert opts.timestamp_parsers == [ISO8601]
 
     opts = cls(column_types={'a': pa.null()},
+               default_column_type=pa.int16(),
                null_values=['N', 'nn'], true_values=['T', 'tt'],
                false_values=['F', 'ff'], auto_dict_max_cardinality=999,
                timestamp_parsers=[ISO8601, '%Y-%m-%d'])
     assert opts.column_types == {'a': pa.null()}
+    assert opts.default_column_type == pa.int16()
     assert opts.null_values == ['N', 'nn']
     assert opts.false_values == ['F', 'ff']
     assert opts.true_values == ['T', 'tt']
@@ -381,6 +395,7 @@ def test_convert_options(pickle_module):
     expected_repr_inner = ("""
     check_utf8=True,
     column_types={'a': DataType(null)},
+    default_column_type=DataType(int16),
     null_values=['N', 'nn'],
     true_values=['T', 'tt'],
     false_values=['F', 'ff'],
@@ -1381,6 +1396,61 @@ class BaseCSVTableRead(BaseTestCSV):
             'y': ['b', 'd', 'f'],
         }
 
+    def test_default_column_type(self):
+        rows = b"a,b,c,d\n001,2.5,hello,true\n4,3.14,world,false\n"
+
+        # Test with default_column_type only -
+        # all columns should use the specified type.
+        opts = ConvertOptions(default_column_type=pa.string())
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()),
+                            ('b', pa.string()),
+                            ('c', pa.string()),
+                            ('d', pa.string())])
+        assert table.schema == schema
+        assert table.to_pydict() == {
+            'a': ["001", "4"],
+            'b': ["2.5", "3.14"],
+            'c': ["hello", "world"],
+            'd': ["true", "false"],
+        }
+
+        # Test with both column_types and default_column_type
+        # Columns specified in column_types should override default_column_type
+        opts = ConvertOptions(
+            column_types={'b': pa.float64(), 'd': pa.bool_()},
+            default_column_type=pa.string()
+        )
+        table = self.read_bytes(rows, convert_options=opts)
+        schema = pa.schema([('a', pa.string()),
+                            ('b', pa.float64()),
+                            ('c', pa.string()),
+                            ('d', pa.bool_())])
+        assert table.schema == schema
+        assert table.to_pydict() == {
+            'a': ["001", "4"],
+            'b': [2.5, 3.14],
+            'c': ["hello", "world"],
+            'd': [True, False],
+        }
+
+        # Test that default_column_type disables type inference
+        opts_no_default = ConvertOptions(column_types={'b': pa.float64()})
+        table_no_default = self.read_bytes(rows, 
convert_options=opts_no_default)
+
+        opts_with_default = ConvertOptions(
+            column_types={'b': pa.float64()},
+            default_column_type=pa.string()
+        )
+        table_with_default = self.read_bytes(rows, 
convert_options=opts_with_default)
+
+        # Column 'a' should be int64 without default, string with default
+        assert table_no_default.schema.field('a').type == pa.int64()
+        assert table_with_default.schema.field('a').type == pa.string()
+        # Column 'b' should always be float64 since explicitly typed
+        assert table_no_default.schema.field('b').type == pa.float64()
+        assert table_with_default.schema.field('b').type == pa.float64()
+
     def test_no_ending_newline(self):
         # No \n after last line
         rows = b"a,b,c\n1,2,3\n4,5,6"

Reply via email to