(arrow) branch main updated: GH-47861: [Python] reduce memory usage when using to_pandas() with many extension arrays columns (#47860)

apitrou Tue, 28 Oct 2025 03:03:38 -0700

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 57f6261893 GH-47861: [Python] reduce memory usage when using 
to_pandas() with many extension arrays columns (#47860)
57f6261893 is described below

commit 57f626189326251c8144d4fad89c28220c68ca75
Author: Will Gulian <[email protected]>
AuthorDate: Tue Oct 28 06:03:26 2025 -0400

    GH-47861: [Python] reduce memory usage when using to_pandas() with many 
extension arrays columns (#47860)
    
    ### Rationale for this change
    
    See GH-47861. With this change, the extension array variation takes ~192MB 
of memory instead of 7GB.
    
    From what I can tell, this is because the `PandasOptions` struct is copied 
around frequently (for example it seems like there is an `ExtensionWriter` for 
each extension column and each `ExtensionWriter` has a copy of `PandasOptions` 
which has a set of all extension columns). I haven't fully traced the 
PandasOptions structure, but it seems to get copied and modified in some 
codepaths so I have decided to put the column sets into a `std::shared_ptr` 
rather than pass around a `shared_ptr` [...]
    
    ### What changes are included in this PR?
    
    The `PandasOptions` column sets have been swapped from 
`std::unordered_set<std::string>` to `std::shared_ptr<const 
std::unordered_set<std::string>>` and usages have been updated.
    
    ### Are these changes tested?
    
    Yes, no regression in the pytests. Also tested memory usage by hand.
    
    ### Are there any user-facing changes?
    
    All changes are internal to the pyarrow C++ binding code. There are no 
changes to the exposed Python API.
    
    * GitHub Issue: #47861
    
    Lead-authored-by: Will Gulian <[email protected]>
    Co-authored-by: Will Gulian <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 python/pyarrow/includes/libarrow_python.pxd        |  4 ++--
 python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 12 +++++------
 python/pyarrow/src/arrow/python/arrow_to_pandas.h  | 23 ++++++++++++++++++++--
 python/pyarrow/table.pxi                           |  7 ++++---
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd
index 172f734d18..4724c52ccb 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -198,8 +198,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
         c_bool self_destruct
         MapConversionType maps_as_pydicts
         c_bool decode_dictionaries
-        unordered_set[c_string] categorical_columns
-        unordered_set[c_string] extension_columns
+        shared_ptr[const unordered_set[c_string]] categorical_columns
+        shared_ptr[const unordered_set[c_string]] extension_columns
         c_bool to_numpy
 
 
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f694eb82ba..ed4f394362 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -75,7 +75,7 @@ PandasOptions MakeInnerOptions(PandasOptions options) {
   // Make sure conversion of inner dictionary arrays always returns an array,
   // not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
   options.decode_dictionaries = true;
-  options.categorical_columns.clear();
+  options.categorical_columns.reset();
   options.strings_to_categorical = false;
 
   // In ARROW-7723, we found as a result of ARROW-3789 that second
@@ -2337,7 +2337,7 @@ class ConsolidatedBlockCreator : public PandasBlockCreator {
   }
 
   Status GetBlockType(int column_index, PandasWriter::type* out) {
-    if (options_.extension_columns.count(fields_[column_index]->name())) {
+    if (options_.IsExtensionColumn(fields_[column_index]->name())) {
       *out = PandasWriter::EXTENSION;
       return Status::OK();
     } else {
@@ -2458,7 +2458,7 @@ class SplitBlockCreator : public PandasBlockCreator {
   Status GetWriter(int i, std::shared_ptr<PandasWriter>* writer) {
     PandasWriter::type output_type = PandasWriter::OBJECT;
     const DataType& type = *arrays_[i]->type();
-    if (options_.extension_columns.count(fields_[i]->name())) {
+    if (options_.IsExtensionColumn(fields_[i]->name())) {
       output_type = PandasWriter::EXTENSION;
     } else {
       // Null count needed to determine output type
@@ -2516,10 +2516,10 @@ Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arr
     return Status::OK();
   };
 
-  if (!options.categorical_columns.empty()) {
+  if (options.HasCategoricalColumns()) {
     for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
       if ((*arrays)[i]->type()->id() != Type::DICTIONARY &&
-          options.categorical_columns.count((*fields)[i]->name())) {
+          options.IsCategoricalColumn((*fields)[i]->name())) {
         columns_to_encode.push_back(i);
       }
     }
@@ -2625,7 +2625,7 @@ Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table>
 
   PandasOptions modified_options = options;
   modified_options.strings_to_categorical = false;
-  modified_options.categorical_columns.clear();
+  modified_options.categorical_columns.reset();
 
   if (options.split_blocks) {
     modified_options.allow_zero_copy_blocks = true;
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
index 82e0a60051..b4e91e6cf5 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -49,6 +49,22 @@ enum class MapConversionType {
 };
 
 struct PandasOptions {
+  bool HasCategoricalColumns() const {
+    return categorical_columns && !categorical_columns->empty();
+  }
+
+  bool IsCategoricalColumn(const std::string& name) const {
+    return categorical_columns && categorical_columns->count(name);
+  }
+
+  bool HasExtensionColumns() const {
+    return extension_columns && !extension_columns->empty();
+  }
+
+  bool IsExtensionColumn(const std::string& name) const {
+    return extension_columns && extension_columns->count(name);
+  }
+
   /// arrow::MemoryPool to use for memory allocations
   MemoryPool* pool = default_memory_pool();
 
@@ -112,11 +128,14 @@ struct PandasOptions {
   bool decode_dictionaries = false;
 
   // Columns that should be casted to categorical
-  std::unordered_set<std::string> categorical_columns;
+  //
+  // This is wrapped in a shared_ptr because this struct is copied internally for
+  // each column or nested field (see GH-47861).
+  std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;
 
   // Columns that should be passed through to be converted to
   // ExtensionArray/Block
-  std::unordered_set<std::string> extension_columns;
+  std::shared_ptr<const std::unordered_set<std::string>> extension_columns;
 
   // Used internally to decipher between to_numpy() and to_pandas() when
   // the expected output differs
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9904c587ae..9136f25298 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -4083,10 +4083,11 @@ def table_to_blocks(options, Table table, categories, extension_columns):
         PandasOptions c_options = _convert_pandas_options(options)
 
     if categories is not None:
-        c_options.categorical_columns = {tobytes(cat) for cat in categories}
+        c_options.categorical_columns = make_shared[unordered_set[c_string]](
+            unordered_set[c_string]({tobytes(cat) for cat in categories}))
     if extension_columns is not None:
-        c_options.extension_columns = {tobytes(col)
-                                       for col in extension_columns}
+        c_options.extension_columns = make_shared[unordered_set[c_string]](
+            unordered_set[c_string]({tobytes(col) for col in extension_columns}))
 
     if pandas_api.is_v1():
         # ARROW-3789: Coerce date/timestamp types to datetime64[ns]

(arrow) branch main updated: GH-47861: [Python] reduce memory usage when using to_pandas() with many extension arrays columns (#47860)

Reply via email to