rudolfbyker commented on issue #19157:
URL: https://github.com/apache/arrow/issues/19157#issuecomment-3625174048

   This is what I have been using as a workaround in Python:
   
   ```py
   from logging import getLogger
   from types import NoneType
   from typing import (
       Sequence,
       Any,
       List,
       Dict,
       Type,
       Mapping,
       Tuple,
       Callable,
       Iterable,
   )
   
   from ordered_set import OrderedSet
   from pandas import Series  # Optional. Only required for type checking.
   from pyarrow import Array, array, UnionArray, int8, int32, ExtensionType
   
   logger = getLogger(__name__)
   
   
   def create_arrow_array(
       *,
       values: Sequence[Any] | "Series[Any]",
       extensions: (
           Mapping[Type[Any], Tuple[ExtensionType, Callable[[Any], Any]]] | None
       ) = None,
       warn_about_heterogeneous_types: bool = True,
   ) -> Array:
       """
       Create an Arrow array from values of unknown, possibly heterogeneous
       types.
   
       A single non-null type (optionally mixed with `None`) is converted
       directly. A mix of only `int`, `float` and `None` is handed to Arrow so
       it can choose the numeric type. Any other mix becomes a dense union
       (slow).
   
       Args:
           values:
               The values to convert to an Arrow array. Must support `len()`
               and be iterable more than once, so a one-shot iterator will
               not work.
           extensions:
               A mapping of types to tuples, where each tuple contains:
                   - An Arrow `ExtensionType` to use for that type.
                   - A function that can serialize values of that type to
                     the storage type of the `ExtensionType`. It is not
                     called for `None`; nulls are passed through unchanged.
           warn_about_heterogeneous_types:
               Whether to log a warning if the values are of heterogeneous
               types. This is True by default, because creating dense unions
               is a lot slower than creating arrow arrays with homogeneous
               types.
   
       Returns: An Arrow Array.
       """
       extensions = extensions or {}
   
       if len(values) == 0:
           return array([])
   
       def create_array_maybe_extension(
           *, vv: Iterable[Any], t: Type[Any]
       ) -> Array:
           # The first registered type that `t` is a subclass of wins.
           for extension_t, (ext, serialize) in extensions.items():
               if issubclass(t, extension_t):
                   # Keep nulls as nulls instead of feeding None to the
                   # user-supplied serializer.
                   return array(
                       [None if v is None else serialize(v) for v in vv],
                       type=ext,
                   )
   
           return array(vv)
   
       types = [type(v) for v in values]
       unique_types: OrderedSet[Type[Any]] = OrderedSet(types)
       non_null_types = [t for t in unique_types if t is not NoneType]
   
       if len(non_null_types) < 2:
           # A single data type (possibly with nulls). This is the fast path.
           # Pick the non-null type explicitly: taking "the last type seen"
           # would select NoneType for input such as [x, None] and skip the
           # extension lookup.
           only_type = non_null_types[0] if non_null_types else NoneType
           return create_array_maybe_extension(vv=values, t=only_type)
   
       if set(unique_types) <= {NoneType, int, float}:
           # Only numeric types. Let Arrow choose the type.
           return array(values)
   
       if warn_about_heterogeneous_types:
           logger.warning(
               "Heterogeneous data types detected: %s. "
               "Creating a dense union, which is slower than an array with "
               "homogeneous types.",
               ", ".join(t.__name__ for t in unique_types),
           )
   
       # Dense union layout: for every value, record which child array it
       # lives in (type id) and its position inside that child (offset).
       type_id_of: Dict[Type[Any], int] = {
           t: i for i, t in enumerate(unique_types)
       }
       type_ids: List[int] = []
       value_offsets: List[int] = []
       values_by_type: Dict[Type[Any], List[Any]] = {
           t: [] for t in unique_types
       }
   
       for v, t in zip(values, types):
           bucket = values_by_type[t]
           value_offsets.append(len(bucket))
           bucket.append(v)
           type_ids.append(type_id_of[t])
   
       return UnionArray.from_dense(
           array(type_ids, type=int8()),
           array(value_offsets, type=int32()),
           [
               create_array_maybe_extension(vv=values_by_type[t], t=t)
               for t in unique_types
           ],
       )
   ```
   
   This automatically creates the schema. If we want to specify the schema, we 
could create a schema from `unique_types` and check if it's compatible with the 
user-provided schema before creating the `UnionArray`.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to