This is an automated email from the ASF dual-hosted git repository.
zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d073c953e8a [Fix](pyudf) Convert nested map value correctly (#63907)
d073c953e8a is described below
commit d073c953e8a44abfadbcde8b16cc8d1903c42fd7
Author: linrrarity <[email protected]>
AuthorDate: Tue Jun 2 11:04:08 2026 +0800
[Fix](pyudf) Convert nested map value correctly (#63907)
Problem Summary:
Fix Python UDF nested complex type conversion when `MAP` appears inside
`ARRAY`, `STRUCT`, or vectorized inputs.
Previously, Python UDF argument conversion mostly relied on PyArrow's
default conversions (`Scalar.as_py()`, `Array.to_pylist()`,
`Array.to_pandas()`). Those APIs convert a top-level Arrow `MAP` into
Python-friendly values in some paths, but nested `MAP` values are
exposed as lists of key/value tuples. For example, `ARRAY<MAP<STRING, INT>>`
could arrive in Python as `[[('a', 1)]]` instead of `[{'a': 1}]`, so
user UDF code saw nested maps as `list` instead of `dict`.
This PR introduces a recursive Arrow-value conversion helper and applies
it consistently across Python UDF argument conversion paths. The helper
manually reconstructs Python values according to the Arrow type:
- `MAP` -> `dict`
- `LIST` / `LARGE_LIST` -> `list`
- `STRUCT` -> `dict`
Before:
```sql
CREATE FUNCTION py_deep_nested_debug(ARRAY<MAP<STRING, ARRAY<INT>>> )
RETURNS STRING
PROPERTIES (
"type" = "PYTHON_UDF",
"symbol" = "evaluate",
"runtime_version" = "3.12.11",
"always_nullable" = "true"
)
AS $$
def evaluate(arr):
if arr is None:
return 'None'
return 'outer_type={}, outer_repr={}'.format(type(arr).__name__,
repr(arr))
$$;
SELECT py_deep_nested_debug([{'a': [1, 2], 'b': [3]}, {'c': [4, 5, 6]}]);
+-------------------------------------------------------------------------------+
| py_deep_nested_debug([{'a': [1, 2], 'b': [3]}, {'c': [4, 5, 6]}])             |
+-------------------------------------------------------------------------------+
| outer_type=list, outer_repr=[[('a', [1, 2]), ('b', [3])], [('c', [4, 5, 6])]] |
+-------------------------------------------------------------------------------+
```
Now:
```text
SELECT py_deep_nested_debug([{'a': [1, 2], 'b': [3]}, {'c': [4, 5, 6]}]);
+-------------------------------------------------------------------------+
| py_deep_nested_debug([{'a': [1, 2], 'b': [3]}, {'c': [4, 5, 6]}])       |
+-------------------------------------------------------------------------+
| outer_type=list, outer_repr=[{'a': [1, 2], 'b': [3]}, {'c': [4, 5, 6]}] |
+-------------------------------------------------------------------------+
```
---
be/src/udf/python/python_server.py | 85 +++-
.../test_pythonudf_nested_complex_type.out | 20 +
.../test_pythonudf_nested_complex_type.groovy | 445 +++++++++++++++++++++
3 files changed, 541 insertions(+), 9 deletions(-)
diff --git a/be/src/udf/python/python_server.py
b/be/src/udf/python/python_server.py
index ecdbef691b9..c0683149ea2 100644
--- a/be/src/udf/python/python_server.py
+++ b/be/src/udf/python/python_server.py
@@ -252,11 +252,6 @@ def convert_arrow_field_to_python(field,
column_metadata=None):
if field is None:
return None
- if pa.types.is_map(field.type):
- # pyarrow.lib.MapScalar's as_py() returns a list of tuples, convert to
dict
- list_of_tuples = field.as_py()
- return dict(list_of_tuples) if list_of_tuples is not None else None
-
# Check if we should apply special IP type conversion based on metadata
if column_metadata:
# Arrow metadata keys can be either bytes or str depending on how they
were created
@@ -300,8 +295,64 @@ def convert_arrow_field_to_python(field,
column_metadata=None):
)
return value
return None
-
- return field.as_py()
+
+ return convert_arrow_value_to_python(field.as_py(), field.type)
+
+
+def convert_arrow_value_to_python(value, arrow_type):
+ """
+ Recursively convert Arrow nested values to Doris Python UDF values.
+
+ PyArrow exposes MapScalar.as_py() as a list of key/value tuples. If the
map is
+ nested under ARRAY or STRUCT, the top-level scalar is no longer MapScalar,
so
+ field.as_py() alone would leak list-of-tuples to user UDF code.
+ """
+ if value is None:
+ return None
+
+ if pa.types.is_map(arrow_type):
+ key_type = arrow_type.key_type
+ item_type = arrow_type.item_type
+ return {
+ convert_arrow_value_to_python(k, key_type):
convert_arrow_value_to_python(
+ v, item_type
+ )
+ for k, v in value
+ }
+
+ if pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type):
+ element_type = arrow_type.value_type
+ return [convert_arrow_value_to_python(v, element_type) for v in value]
+
+ if pa.types.is_struct(arrow_type):
+ return {
+ arrow_type[i].name: convert_arrow_value_to_python(
+ value.get(arrow_type[i].name), arrow_type[i].type
+ )
+ for i in range(len(arrow_type))
+ }
+
+ return value
+
+
+def needs_nested_python_normalization(arrow_type):
+ """
+ Return True when Arrow default Python conversion can leak nested MAP
values as
+ list-of-tuples and therefore needs recursive normalization.
+ """
+ if pa.types.is_map(arrow_type):
+ return True
+
+ if pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type):
+ return needs_nested_python_normalization(arrow_type.value_type)
+
+ if pa.types.is_struct(arrow_type):
+ return any(
+ needs_nested_python_normalization(arrow_type[i].type)
+ for i in range(len(arrow_type))
+ )
+
+ return False
def convert_python_to_arrow_value(value, output_type=None):
@@ -563,8 +614,24 @@ class AdaptivePythonUDF:
Convert a pa.Array to an instance of the specified VectorType.
"""
if vec_type == VectorType.LIST:
- return arrow_array.to_pylist()
+ values = arrow_array.to_pylist()
+ if not needs_nested_python_normalization(arrow_array.type):
+ return values
+ return [
+ convert_arrow_value_to_python(value, arrow_array.type)
+ for value in values
+ ]
elif vec_type == VectorType.PANDAS_SERIES:
+ if needs_nested_python_normalization(arrow_array.type):
+ # Some pyarrow builds cannot materialize nested map-containing
arrays
+ # through to_pandas() (for example list<map<...>>). Normalize
through
+ # Python objects first, then build an object Series explicitly.
+ values = arrow_array.to_pylist()
+ converted = [
+ convert_arrow_value_to_python(value, arrow_array.type)
+ for value in values
+ ]
+ return pd.Series(converted, dtype=object)
return arrow_array.to_pandas()
else:
raise ValueError(f"Unsupported vector type: {vec_type}")
@@ -665,7 +732,7 @@ class AdaptivePythonUDF:
# instead of converting to list
pylist = arrow_col.to_pylist()
if len(pylist) > 0:
- converted = pylist[0]
+ converted = convert_arrow_value_to_python(pylist[0],
arrow_col.type)
logging.info(
"Converted %s to scalar (first value): %s",
param.name,
diff --git
a/regression-test/data/pythonudf_p0/test_pythonudf_nested_complex_type.out
b/regression-test/data/pythonudf_p0/test_pythonudf_nested_complex_type.out
new file mode 100644
index 00000000000..4fe7cb82953
--- /dev/null
+++ b/regression-test/data/pythonudf_p0/test_pythonudf_nested_complex_type.out
@@ -0,0 +1,20 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !scalar_constant_nested_complex --
+[{a:1,b:2},{c:3}]|{left:[{x:10}],right:[{y:20},{z:30}]}|(const,[{s:7}],{empty:[],nums:[1,2]})|{const_key:(constTag,[{cm:11}])}
+[]|{empty:[]}|(empty,[],{none:NULL})|{empty:(emptyTag,[])}
+NULL|NULL|NULL|NULL
+
+-- !vector_list_nested_complex --
+[{a:1,b:2},{c:3}]|{left:[{x:10}],right:[{y:20},{z:30}]}|(row1,[{s:7},{t:8}],{empty:[],nums:[1,2]})|{first:(tagA,[{m:1},{n:2}]),second:(tagB,[])}
+[]|{empty:[]}|(row2,[],{none:NULL})|{empty:(tagEmpty,[])}
+NULL|NULL|NULL|NULL
+
+-- !vector_series_nested_complex --
+[{a:1,b:2},{c:3}]|{left:[{x:10}],right:[{y:20},{z:30}]}|(row1,[{s:7},{t:8}],{empty:[],nums:[1,2]})|{first:(tagA,[{m:1},{n:2}]),second:(tagB,[])}
+[]|{empty:[]}|(row2,[],{none:NULL})|{empty:(tagEmpty,[])}
+NULL|NULL|NULL|NULL
+
+-- !vector_mixed_scalar_nested_complex --
+1|{first:(tagA,[{m:1},{n:2}]),second:(tagB,[])}
+2|{first:(tagA,[{m:1},{n:2}]),second:(tagB,[])}
+3|{first:(tagA,[{m:1},{n:2}]),second:(tagB,[])}
diff --git
a/regression-test/suites/pythonudf_p0/test_pythonudf_nested_complex_type.groovy
b/regression-test/suites/pythonudf_p0/test_pythonudf_nested_complex_type.groovy
new file mode 100644
index 00000000000..ca626292375
--- /dev/null
+++
b/regression-test/suites/pythonudf_p0/test_pythonudf_nested_complex_type.groovy
@@ -0,0 +1,445 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_pythonudf_nested_complex_type") {
+ def runtime_version = getPythonUdfRuntimeVersion()
+
+ sql """ DROP TABLE IF EXISTS test_pythonudf_nested_complex_type; """
+ sql """
+ CREATE TABLE test_pythonudf_nested_complex_type (
+ id INT,
+ array_map ARRAY<MAP<STRING, INT>>,
+ map_array_map MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ struct_nested STRUCT<
+ label: STRING,
+ maps: ARRAY<MAP<STRING, INT>>,
+ attrs: MAP<STRING, ARRAY<INT>>
+ >,
+ map_struct_nested MAP<STRING, STRUCT<
+ tag: STRING,
+ metrics: ARRAY<MAP<STRING, INT>>
+ >>
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES("replication_num" = "1");
+ """
+
+ sql """
+ INSERT INTO test_pythonudf_nested_complex_type VALUES
+ (
+ 1,
+ [{'a': 1, 'b': 2}, {'c': 3}],
+ {'left': [{'x': 10}], 'right': [{'y': 20}, {'z': 30}]},
+ {'row1', [{'s': 7}, {'t': 8}], {'nums': [1, 2], 'empty': []}},
+ {'first': {'tagA', [{'m': 1}, {'n': 2}]}, 'second': {'tagB', []}}
+ ),
+ (
+ 2,
+ [],
+ {'empty': []},
+ {'row2', [], {'none': NULL}},
+ {'empty': {'tagEmpty', []}}
+ ),
+ (
+ 3,
+ NULL,
+ NULL,
+ NULL,
+ NULL
+ );
+ """
+
+ sql """
+ DROP FUNCTION IF EXISTS py_nested_complex_scalar(
+ ARRAY<MAP<STRING, INT>>,
+ MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs:
MAP<STRING, ARRAY<INT>>>,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ );
+ """
+ sql """
+ CREATE FUNCTION py_nested_complex_scalar(
+ ARRAY<MAP<STRING, INT>>,
+ MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs:
MAP<STRING, ARRAY<INT>>>,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+def format_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{m[k]}' for k in sorted(m)) + '}'
+
+def format_array_map(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(format_map(item) for item in arr) + ']'
+
+def format_map_array_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP_ARRAY_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{format_array_map(m[k])}' for k in sorted(m))
+ '}'
+
+def format_attrs(attrs):
+ if attrs is None:
+ return 'NULL'
+ if not isinstance(attrs, dict):
+ return 'BAD_ATTRS:' + type(attrs).__name__
+ parts = []
+ for key in sorted(attrs):
+ val = attrs[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ else:
+ parts.append(f'{key}:[' + ','.join(str(x) for x in val) + ']')
+ return '{' + ','.join(parts) + '}'
+
+def format_struct(s):
+ if s is None:
+ return 'NULL'
+ if not isinstance(s, dict):
+ return 'BAD_STRUCT:' + type(s).__name__
+ return '(' + str(s.get('label')) + ',' + format_array_map(s.get('maps')) +
',' + format_attrs(s.get('attrs')) + ')'
+
+def format_map_struct_nested(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP_STRUCT:' + type(m).__name__
+ parts = []
+ for key in sorted(m):
+ val = m[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ elif not isinstance(val, dict):
+ parts.append(f'{key}:BAD_STRUCT:' + type(val).__name__)
+ else:
+ parts.append(f'{key}:(' + str(val.get('tag')) + ',' +
format_array_map(val.get('metrics')) + ')')
+ return '{' + ','.join(parts) + '}'
+
+def evaluate(array_map, map_array_map, struct_nested, map_struct_nested):
+ return '|'.join([
+ format_array_map(array_map),
+ format_map_array_map(map_array_map),
+ format_struct(struct_nested),
+ format_map_struct_nested(map_struct_nested),
+ ])
+\$\$;
+ """
+
+ qt_scalar_constant_nested_complex """
+ SELECT result
+ FROM (
+ SELECT 1 AS ord, py_nested_complex_scalar(
+ CAST([{'a': 1, 'b': 2}, {'c': 3}] AS ARRAY<MAP<STRING, INT>>),
+ CAST({'left': [{'x': 10}], 'right': [{'y': 20}, {'z': 30}]} AS
MAP<STRING, ARRAY<MAP<STRING, INT>>>),
+ CAST({'const', [{'s': 7}], {'nums': [1, 2], 'empty': []}} AS
STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs: MAP<STRING,
ARRAY<INT>>>),
+ CAST({'const_key': {'constTag', [{'cm': 11}]}} AS MAP<STRING,
STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>)
+ ) AS result
+ UNION ALL
+ SELECT 2 AS ord, py_nested_complex_scalar(
+ CAST([] AS ARRAY<MAP<STRING, INT>>),
+ CAST({'empty': []} AS MAP<STRING, ARRAY<MAP<STRING, INT>>>),
+ CAST({'empty', [], {'none': NULL}} AS STRUCT<label: STRING,
maps: ARRAY<MAP<STRING, INT>>, attrs: MAP<STRING, ARRAY<INT>>>),
+ CAST({'empty': {'emptyTag', []}} AS MAP<STRING, STRUCT<tag:
STRING, metrics: ARRAY<MAP<STRING, INT>>>>)
+ ) AS result
+ UNION ALL
+ SELECT 3 AS ord, py_nested_complex_scalar(
+ CAST(NULL AS ARRAY<MAP<STRING, INT>>),
+ CAST(NULL AS MAP<STRING, ARRAY<MAP<STRING, INT>>>),
+ CAST(NULL AS STRUCT<label: STRING, maps: ARRAY<MAP<STRING,
INT>>, attrs: MAP<STRING, ARRAY<INT>>>),
+ CAST(NULL AS MAP<STRING, STRUCT<tag: STRING, metrics:
ARRAY<MAP<STRING, INT>>>>)
+ ) AS result
+ ) ordered_result
+ ORDER BY ord;
+ """
+
+ sql """
+ DROP FUNCTION IF EXISTS py_nested_complex_vector_list(
+ ARRAY<MAP<STRING, INT>>,
+ MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs:
MAP<STRING, ARRAY<INT>>>,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ );
+ """
+ sql """
+ CREATE FUNCTION py_nested_complex_vector_list(
+ ARRAY<MAP<STRING, INT>>,
+ MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs:
MAP<STRING, ARRAY<INT>>>,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+def format_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{m[k]}' for k in sorted(m)) + '}'
+
+def format_array_map(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(format_map(item) for item in arr) + ']'
+
+def format_map_array_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP_ARRAY_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{format_array_map(m[k])}' for k in sorted(m))
+ '}'
+
+def format_attrs(attrs):
+ if attrs is None:
+ return 'NULL'
+ if not isinstance(attrs, dict):
+ return 'BAD_ATTRS:' + type(attrs).__name__
+ parts = []
+ for key in sorted(attrs):
+ val = attrs[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ else:
+ parts.append(f'{key}:[' + ','.join(str(x) for x in val) + ']')
+ return '{' + ','.join(parts) + '}'
+
+def format_struct(s):
+ if s is None:
+ return 'NULL'
+ if not isinstance(s, dict):
+ return 'BAD_STRUCT:' + type(s).__name__
+ return '(' + str(s.get('label')) + ',' + format_array_map(s.get('maps')) +
',' + format_attrs(s.get('attrs')) + ')'
+
+def format_map_struct_nested(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP_STRUCT:' + type(m).__name__
+ parts = []
+ for key in sorted(m):
+ val = m[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ elif not isinstance(val, dict):
+ parts.append(f'{key}:BAD_STRUCT:' + type(val).__name__)
+ else:
+ parts.append(f'{key}:(' + str(val.get('tag')) + ',' +
format_array_map(val.get('metrics')) + ')')
+ return '{' + ','.join(parts) + '}'
+
+def evaluate(array_maps: list, map_array_maps: list, struct_nesteds: list,
map_struct_nesteds: list):
+ result = []
+ for array_map, map_array_map, struct_nested, map_struct_nested in
zip(array_maps, map_array_maps, struct_nesteds, map_struct_nesteds):
+ result.append('|'.join([
+ format_array_map(array_map),
+ format_map_array_map(map_array_map),
+ format_struct(struct_nested),
+ format_map_struct_nested(map_struct_nested),
+ ]))
+ return result
+\$\$;
+ """
+
+ qt_vector_list_nested_complex """
+ SELECT py_nested_complex_vector_list(array_map, map_array_map,
struct_nested, map_struct_nested) AS result
+ FROM test_pythonudf_nested_complex_type
+ ORDER BY id;
+ """
+
+ sql """
+ DROP FUNCTION IF EXISTS py_nested_complex_vector_series(
+ ARRAY<MAP<STRING, INT>>,
+ MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs:
MAP<STRING, ARRAY<INT>>>,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ );
+ """
+ sql """
+ CREATE FUNCTION py_nested_complex_vector_series(
+ ARRAY<MAP<STRING, INT>>,
+ MAP<STRING, ARRAY<MAP<STRING, INT>>>,
+ STRUCT<label: STRING, maps: ARRAY<MAP<STRING, INT>>, attrs:
MAP<STRING, ARRAY<INT>>>,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def format_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{m[k]}' for k in sorted(m)) + '}'
+
+def format_array_map(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(format_map(item) for item in arr) + ']'
+
+def format_map_array_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP_ARRAY_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{format_array_map(m[k])}' for k in sorted(m))
+ '}'
+
+def format_attrs(attrs):
+ if attrs is None:
+ return 'NULL'
+ if not isinstance(attrs, dict):
+ return 'BAD_ATTRS:' + type(attrs).__name__
+ parts = []
+ for key in sorted(attrs):
+ val = attrs[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ else:
+ parts.append(f'{key}:[' + ','.join(str(x) for x in val) + ']')
+ return '{' + ','.join(parts) + '}'
+
+def format_struct(s):
+ if s is None:
+ return 'NULL'
+ if not isinstance(s, dict):
+ return 'BAD_STRUCT:' + type(s).__name__
+ return '(' + str(s.get('label')) + ',' + format_array_map(s.get('maps')) +
',' + format_attrs(s.get('attrs')) + ')'
+
+def format_map_struct_nested(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP_STRUCT:' + type(m).__name__
+ parts = []
+ for key in sorted(m):
+ val = m[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ elif not isinstance(val, dict):
+ parts.append(f'{key}:BAD_STRUCT:' + type(val).__name__)
+ else:
+ parts.append(f'{key}:(' + str(val.get('tag')) + ',' +
format_array_map(val.get('metrics')) + ')')
+ return '{' + ','.join(parts) + '}'
+
+def evaluate(array_maps: pd.Series, map_array_maps: pd.Series, struct_nesteds:
pd.Series, map_struct_nesteds: pd.Series) -> pd.Series:
+ if not all(isinstance(arg, pd.Series) for arg in [array_maps,
map_array_maps, struct_nesteds, map_struct_nesteds]):
+ return pd.Series(['BAD_VECTOR_ARGS'] * len(array_maps))
+ result = []
+ for array_map, map_array_map, struct_nested, map_struct_nested in
zip(array_maps, map_array_maps, struct_nesteds, map_struct_nesteds):
+ result.append('|'.join([
+ format_array_map(array_map),
+ format_map_array_map(map_array_map),
+ format_struct(struct_nested),
+ format_map_struct_nested(map_struct_nested),
+ ]))
+ return pd.Series(result)
+\$\$;
+ """
+
+ qt_vector_series_nested_complex """
+ SELECT py_nested_complex_vector_series(array_map, map_array_map,
struct_nested, map_struct_nested) AS result
+ FROM test_pythonudf_nested_complex_type
+ ORDER BY id;
+ """
+
+ sql """
+ DROP FUNCTION IF EXISTS py_nested_complex_vector_mixed(
+ INT,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ );
+ """
+ sql """
+ CREATE FUNCTION py_nested_complex_vector_mixed(
+ INT,
+ MAP<STRING, STRUCT<tag: STRING, metrics: ARRAY<MAP<STRING, INT>>>>
+ )
+ RETURNS STRING
+ PROPERTIES (
+ "type" = "PYTHON_UDF",
+ "symbol" = "evaluate",
+ "runtime_version" = "${runtime_version}",
+ "always_nullable" = "true"
+ )
+ AS \$\$
+import pandas as pd
+
+def format_map(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MAP:' + type(m).__name__
+ return '{' + ','.join(f'{k}:{m[k]}' for k in sorted(m)) + '}'
+
+def format_array_map(arr):
+ if arr is None:
+ return 'NULL'
+ return '[' + ','.join(format_map(item) for item in arr) + ']'
+
+def format_map_struct_nested(m):
+ if m is None:
+ return 'NULL'
+ if not isinstance(m, dict):
+ return 'BAD_MIXED_SCALAR:' + type(m).__name__
+ parts = []
+ for key in sorted(m):
+ val = m[key]
+ if val is None:
+ parts.append(f'{key}:NULL')
+ elif not isinstance(val, dict):
+ parts.append(f'{key}:BAD_STRUCT:' + type(val).__name__)
+ else:
+ parts.append(f'{key}:(' + str(val.get('tag')) + ',' +
format_array_map(val.get('metrics')) + ')')
+ return '{' + ','.join(parts) + '}'
+
+def evaluate(ids: pd.Series, mixed_map_struct_nested) -> pd.Series:
+ if not isinstance(ids, pd.Series):
+ return pd.Series(['BAD_VECTOR_ARG'])
+ formatted = format_map_struct_nested(mixed_map_struct_nested)
+ return pd.Series([str(id_value) + '|' + formatted for id_value in ids])
+\$\$;
+ """
+
+ qt_vector_mixed_scalar_nested_complex """
+ SELECT py_nested_complex_vector_mixed(id, map_struct_nested) AS result
+ FROM test_pythonudf_nested_complex_type
+ ORDER BY id;
+ """
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]