This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 57f6261893 GH-47861: [Python] reduce memory usage when using
to_pandas() with many extension arrays columns (#47860)
57f6261893 is described below
commit 57f626189326251c8144d4fad89c28220c68ca75
Author: Will Gulian <[email protected]>
AuthorDate: Tue Oct 28 06:03:26 2025 -0400
GH-47861: [Python] reduce memory usage when using to_pandas() with many
extension arrays columns (#47860)
### Rationale for this change
See GH-47861. With this change, the extension array variation takes ~192MB
of memory instead of 7GB.
From what I can tell, this is because the `PandasOptions` struct is copied
around frequently (for example it seems like there is an `ExtensionWriter` for
each extension column and each `ExtensionWriter` has a copy of `PandasOptions`
which has a set of all extension columns). I haven't fully traced the
PandasOptions structure, but it seems to get copied and modified in some
codepaths so I have decided to put the column sets into a `std::shared_ptr`
rather than pass around a `shared_ptr` [...]
### What changes are included in this PR?
The `PandasOptions` column sets have been swapped from
`std::unordered_set<std::string>` to `std::shared_ptr<const
std::unordered_set<std::string>>` and usages have been updated.
### Are these changes tested?
Yes, no regression in the pytests. Also tested memory usage by hand.
### Are there any user-facing changes?
All changes are internal to the pyarrow C++ binding code. There are no
changes to the exposed Python API.
* GitHub Issue: #47861
Lead-authored-by: Will Gulian <[email protected]>
Co-authored-by: Will Gulian <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
python/pyarrow/includes/libarrow_python.pxd | 4 ++--
python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 12 +++++------
python/pyarrow/src/arrow/python/arrow_to_pandas.h | 23 ++++++++++++++++++++--
python/pyarrow/table.pxi | 7 ++++---
4 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/python/pyarrow/includes/libarrow_python.pxd
b/python/pyarrow/includes/libarrow_python.pxd
index 172f734d18..4724c52ccb 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -198,8 +198,8 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py"
nogil:
c_bool self_destruct
MapConversionType maps_as_pydicts
c_bool decode_dictionaries
- unordered_set[c_string] categorical_columns
- unordered_set[c_string] extension_columns
+ shared_ptr[const unordered_set[c_string]] categorical_columns
+ shared_ptr[const unordered_set[c_string]] extension_columns
c_bool to_numpy
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f694eb82ba..ed4f394362 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -75,7 +75,7 @@ PandasOptions MakeInnerOptions(PandasOptions options) {
// Make sure conversion of inner dictionary arrays always returns an array,
// not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
options.decode_dictionaries = true;
- options.categorical_columns.clear();
+ options.categorical_columns.reset();
options.strings_to_categorical = false;
// In ARROW-7723, we found as a result of ARROW-3789 that second
@@ -2337,7 +2337,7 @@ class ConsolidatedBlockCreator : public
PandasBlockCreator {
}
Status GetBlockType(int column_index, PandasWriter::type* out) {
- if (options_.extension_columns.count(fields_[column_index]->name())) {
+ if (options_.IsExtensionColumn(fields_[column_index]->name())) {
*out = PandasWriter::EXTENSION;
return Status::OK();
} else {
@@ -2458,7 +2458,7 @@ class SplitBlockCreator : public PandasBlockCreator {
Status GetWriter(int i, std::shared_ptr<PandasWriter>* writer) {
PandasWriter::type output_type = PandasWriter::OBJECT;
const DataType& type = *arrays_[i]->type();
- if (options_.extension_columns.count(fields_[i]->name())) {
+ if (options_.IsExtensionColumn(fields_[i]->name())) {
output_type = PandasWriter::EXTENSION;
} else {
// Null count needed to determine output type
@@ -2516,10 +2516,10 @@ Status ConvertCategoricals(const PandasOptions&
options, ChunkedArrayVector* arr
return Status::OK();
};
- if (!options.categorical_columns.empty()) {
+ if (options.HasCategoricalColumns()) {
for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
if ((*arrays)[i]->type()->id() != Type::DICTIONARY &&
- options.categorical_columns.count((*fields)[i]->name())) {
+ options.IsCategoricalColumn((*fields)[i]->name())) {
columns_to_encode.push_back(i);
}
}
@@ -2625,7 +2625,7 @@ Status ConvertTableToPandas(const PandasOptions& options,
std::shared_ptr<Table>
PandasOptions modified_options = options;
modified_options.strings_to_categorical = false;
- modified_options.categorical_columns.clear();
+ modified_options.categorical_columns.reset();
if (options.split_blocks) {
modified_options.allow_zero_copy_blocks = true;
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
index 82e0a60051..b4e91e6cf5 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -49,6 +49,22 @@ enum class MapConversionType {
};
struct PandasOptions {
+ bool HasCategoricalColumns() const {
+ return categorical_columns && !categorical_columns->empty();
+ }
+
+ bool IsCategoricalColumn(const std::string& name) const {
+ return categorical_columns && categorical_columns->count(name);
+ }
+
+ bool HasExtensionColumns() const {
+ return extension_columns && !extension_columns->empty();
+ }
+
+ bool IsExtensionColumn(const std::string& name) const {
+ return extension_columns && extension_columns->count(name);
+ }
+
/// arrow::MemoryPool to use for memory allocations
MemoryPool* pool = default_memory_pool();
@@ -112,11 +128,14 @@ struct PandasOptions {
bool decode_dictionaries = false;
// Columns that should be casted to categorical
- std::unordered_set<std::string> categorical_columns;
+ //
+ // This is wrapped in a shared_ptr because this struct is copied internally
for
+ // each column or nested field (see GH-47861).
+ std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;
// Columns that should be passed through to be converted to
// ExtensionArray/Block
- std::unordered_set<std::string> extension_columns;
+ std::shared_ptr<const std::unordered_set<std::string>> extension_columns;
// Used internally to decipher between to_numpy() and to_pandas() when
// the expected output differs
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 9904c587ae..9136f25298 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -4083,10 +4083,11 @@ def table_to_blocks(options, Table table, categories,
extension_columns):
PandasOptions c_options = _convert_pandas_options(options)
if categories is not None:
- c_options.categorical_columns = {tobytes(cat) for cat in categories}
+ c_options.categorical_columns = make_shared[unordered_set[c_string]](
+ unordered_set[c_string]({tobytes(cat) for cat in categories}))
if extension_columns is not None:
- c_options.extension_columns = {tobytes(col)
- for col in extension_columns}
+ c_options.extension_columns = make_shared[unordered_set[c_string]](
+ unordered_set[c_string]({tobytes(col) for col in
extension_columns}))
if pandas_api.is_v1():
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]