This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-5599-0d8f9a7b8a210447364c43cbfe8aa3fe0d1f8cd5 in repository https://gitbox.apache.org/repos/asf/texera.git
commit 17e7a36ce238dbeae3a6a4d46017f1ca15e37fcf Author: Matthew B. <[email protected]> AuthorDate: Fri Jun 12 13:15:59 2026 -0700 perf(pyamber): avoid per-read deepcopy in Tuple.as_dict() (#5599) ### What changes were proposed in this PR? - Replace the per-read `deepcopy` in `Tuple.as_dict()` (`amber/src/main/python/core/models/tuple.py`) with a shallow copy, so reading a tuple no longer recursively clones every field value; cost now scales with field count instead of total field byte size. - This path is hot: `as_dict()` backs `as_series()` (per-tuple in the batch operator path) and `as_key_value_pairs()`; a tuple carrying a large binary field previously duplicated that whole payload on every read. - The deepcopy's isolation was unnecessary: `as_dict()` has no callers outside `Tuple`, its two users immediately build a new container, and the Tuple's mutators only reassign dict slots (never mutate a value in place), so a shallow copy preserves the independent-dict contract. - Remove the now-unused `from copy import deepcopy` import and document why the shallow copy is safe. ### Any related issues, documentation, discussions? Closes: #5598 ### How was this PR tested? - Existing tests only, no behavior change. Run `cd amber/src/main/python && python -m pytest ../../test/python/core/models/test_tuple.py -q`, expect 23 passed (covers `as_dict`/`as_series`/`as_key_value_pairs`). - Run `cd amber/src/main/python && python -m pytest ../../test/python/core/runnables/test_main_loop.py ../../test/python/core/architecture/managers/test_tuple_processing_manager.py -q`, expect 22 passed (exercises the batch read path that calls `as_series`). ### Was this PR authored or co-authored using generative AI tooling? Generated-by: Claude Opus 4.8 --------- Co-authored-by: Yicong Huang <[email protected]> --- amber/src/main/python/core/models/tuple.py | 20 ++++++++++++-------- amber/src/test/python/core/models/test_tuple.py | 15 +++++++++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/amber/src/main/python/core/models/tuple.py b/amber/src/main/python/core/models/tuple.py index 916301406f..1493ec0033 100644 --- a/amber/src/main/python/core/models/tuple.py +++ b/amber/src/main/python/core/models/tuple.py @@ -22,7 +22,7 @@ import pyarrow import struct import typing from collections import OrderedDict -from copy import deepcopy +from copy import deepcopy as _deepcopy from loguru import logger from pandas._libs.missing import checknull from pympler import asizeof @@ -232,16 +232,20 @@ class Tuple: """Convert the tuple to Pandas series format""" return pandas.Series(self.as_dict()) - def as_dict(self) -> "OrderedDict[str, Field]": + def _evaluate_all_fields(self) -> None: + # resolve any lazy field accessors in place + for i in self.get_field_names(): + self.__getitem__(i) + + def as_dict(self, deepcopy: bool = False) -> "OrderedDict[str, Field]": """ - Return a dictionary copy of this tuple. - Fields will be fetched from accessor if absent. + Return this tuple's field data as a dict, fetching lazy fields first. + + :param deepcopy: if True, deep copy values; else shallow copy (default) :return: dict with all the fields """ - # evaluate all the fields now - for i in self.get_field_names(): - self.__getitem__(i) - return deepcopy(self._field_data) + self._evaluate_all_fields() + return _deepcopy(self._field_data) if deepcopy else self._field_data.copy() def as_key_value_pairs(self) -> List[typing.Tuple[str, Field]]: return [(k, v) for k, v in self.as_dict().items()] diff --git a/amber/src/test/python/core/models/test_tuple.py b/amber/src/test/python/core/models/test_tuple.py index efb4fdf5c7..f786e88a19 100644 --- a/amber/src/test/python/core/models/test_tuple.py +++ b/amber/src/test/python/core/models/test_tuple.py @@ -46,6 +46,21 @@ class TestTuple: def test_tuple_as_dict(self, target_tuple): assert target_tuple.as_dict() == {"x": 1, "y": "a"} + def test_tuple_as_dict_deepcopy_isolates_values(self, target_tuple): + d = target_tuple.as_dict(deepcopy=True) + d["scores"] = [1, 2] + d["scores"].append(3) + # mutating the deep-copied dict must not affect the tuple + assert "scores" not in target_tuple.as_dict() + + def test_tuple_as_dict_is_shallow_by_default(self, target_tuple): + d = target_tuple.as_dict() + assert d == {"x": 1, "y": "a"} + # reassigning keys on the copy must not leak back into the tuple + d["x"] = 999 + del d["y"] + assert target_tuple.as_dict() == {"x": 1, "y": "a"} + def test_tuple_as_series(self, target_tuple): assert (target_tuple.as_series() == pandas.Series({"x": 1, "y": "a"})).all()
