jorisvandenbossche commented on code in PR #14804: URL: https://github.com/apache/arrow/pull/14804#discussion_r1049359192
########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: + """ + Handles PyArrow Arrays and ChunkedArrays. + """ + # Store the column as a private attribute + if isinstance(column, pa.ChunkedArray): + if column.num_chunks == 1: + column = column.chunk(0) + else: + if not allow_copy: + raise RuntimeError( + "Chunks will be combined and a copy is required which " + "is forbidden by allow_copy=False" + ) + column = column.combine_chunks() + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + return len(self._col) + + @property + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return self._col.offset + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.type + try: + bit_width = dtype.bit_width + except ValueError: # in case of a variable-length strings + bit_width = 8 + + if pa.types.is_large_string(dtype): + # format string needs to be changed from "U" to "u" + # in case of large strings (make an issue in pandas?) + kind = DtypeKind.STRING + return kind, bit_width, "u", Endianness.NATIVE + elif pa.types.is_timestamp(dtype): + kind = DtypeKind.DATETIME + ts = dtype.unit[0] + tz = dtype.tz if dtype.tz else "" + f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz) + return kind, bit_width, f_string, Endianness.NATIVE + elif pa.types.is_dictionary(dtype): + kind = DtypeKind.CATEGORICAL + f_string = "L" + return kind, bit_width, f_string, Endianness.NATIVE + else: + return self._dtype_from_arrowdtype(dtype, bit_width) Review Comment: Could we put all the above logic together in the `_dtype_from_arrowdtype` helper? And maybe I would also already call that inside the `__init__` (so this gets called only once and not on every access to `.dtype`, and so you also already get the "NotImplementedError" for unsupported data types in the init). In the init we can assign it to a private `_dtype` attribute on the class, and then just return that one in the `dtype` property here. ########## python/pyarrow/interchange/dataframe.py: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations +from typing import ( + Any, + Iterable, + Optional, + Sequence, +) + +import pyarrow as pa + +from pyarrow.interchange.column import _PyArrowColumn + + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pa.Table.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or + # ``NaT``). This currently has no effect; once support for nullable + # extension dtypes is added, this value should be propagated to + # columns. Review Comment: I suppose this comment is a leftover from the pandas implementation? Although I can imagine this PR also doesn't yet support the keyword? (if that's the case, should we raise an error or warning about it if the users passes `True`?) ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict Review Comment: Is this a third party package that needs to be installed? ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] Review Comment: Why is this commented out? ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), Review Comment: pandas can consume large string type? ########## python/pyarrow/interchange/dataframe.py: ########## @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations +from typing import ( + Any, + Iterable, + Optional, + Sequence, +) + +import pyarrow as pa + +from pyarrow.interchange.column import _PyArrowColumn + + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `pa.Table.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with ``NaN`` (or + # ``NaT``). This currently has no effect; once support for nullable + # extension dtypes is added, this value should be propagated to + # columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + return _PyArrowDataFrame(self._df, nan_as_null, allow_copy) + + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + # The metadata for the data frame, as a dictionary with string keys. + # Add schema metadata here (pandas metadata or custom metadata) + if self._df.schema.metadata: + schema_metadata = {"pyarrow." + k.decode('utf8'): v.decode('utf8') + for k, v in self._df.schema.metadata.items()} + return schema_metadata + else: + return {} + + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + return self._df.num_columns + + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + return self._df.num_rows + + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + return self._df.column(0).num_chunks Review Comment: With the current implementation, this is always 1 ? (it's also not super clear what the "number of chunks" for a DataFrame actually _is_, if each column can have a different number of chunks ..) ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: + """ + Handles PyArrow Arrays and ChunkedArrays. + """ + # Store the column as a private attribute + if isinstance(column, pa.ChunkedArray): + if column.num_chunks == 1: + column = column.chunk(0) + else: + if not allow_copy: + raise RuntimeError( + "Chunks will be combined and a copy is required which " + "is forbidden by allow_copy=False" + ) + column = column.combine_chunks() + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + return len(self._col) + + @property + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return self._col.offset + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.type + try: + bit_width = dtype.bit_width + except ValueError: # in case of a variable-length strings + bit_width = 8 + + if pa.types.is_large_string(dtype): + # format string needs to be changed from "U" to "u" + # in case of large strings (make an issue in pandas?) + kind = DtypeKind.STRING + return kind, bit_width, "u", Endianness.NATIVE + elif pa.types.is_timestamp(dtype): + kind = DtypeKind.DATETIME + ts = dtype.unit[0] + tz = dtype.tz if dtype.tz else "" + f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz) + return kind, bit_width, f_string, Endianness.NATIVE + elif pa.types.is_dictionary(dtype): + kind = DtypeKind.CATEGORICAL + f_string = "L" + return kind, bit_width, f_string, Endianness.NATIVE + else: + return self._dtype_from_arrowdtype(dtype, bit_width) + + def _dtype_from_arrowdtype( + self, dtype, bit_width + ) -> Tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) + + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) + if kind is None: + raise ValueError( + f"Data type {dtype} not supported by interchange protocol") + + return kind, bit_width, f_string, Endianness.NATIVE + + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + Raises TypeError if the dtype is not categorical + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + TBD: are there any other in-memory representations that are needed? + """ + arr = self._col + if not pa.types.is_dictionary(arr.type): + raise TypeError( + "describe_categorical only works on a column with " + "categorical dtype!" + ) + + return { + "is_ordered": self._col.type.ordered, + "is_dictionary": True, + "categories": _PyArrowColumn(arr.dictionary), + } + + @property + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + # In case of no missing values, we need to set ColumnNullType to + # non nullable as in the current __dataframe__ protocol bit/byte masks + # can not be None + if self.null_count == 0: + return ColumnNullType.NON_NULLABLE, None + else: + return ColumnNullType.USE_BITMASK, 0 + + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + return self._col.null_count + + @property + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + chunk_size = self.size // n_chunks + if self.size % n_chunks != 0: + chunk_size += 1 + + array = self._col + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield _PyArrowColumn( + array.slice(start, chunk_size), self._allow_copy + ) + i += 1 + + elif isinstance(self._col, pa.ChunkedArray): Review Comment: Above you combined the chunks when passing a ChunkedArray to the constructor, so that means `self._col` can currently never be a ChunkedArray? But maybe we should try to support that and not always upfront call `combine_chunks`? ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes Review Comment: ```suggestion Integer enum for data types. Attributes ``` (it seems that in general the docstrings you copy pasted from dataframe-api have lost their whitespace lines) ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: + """ + Handles PyArrow Arrays and ChunkedArrays. + """ + # Store the column as a private attribute + if isinstance(column, pa.ChunkedArray): + if column.num_chunks == 1: + column = column.chunk(0) + else: + if not allow_copy: + raise RuntimeError( + "Chunks will be combined and a copy is required which " + "is forbidden by allow_copy=False" + ) + column = column.combine_chunks() + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + return len(self._col) + + @property + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return self._col.offset + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.type + try: + bit_width = dtype.bit_width + except ValueError: # in case of a variable-length strings + bit_width = 8 Review Comment: Ah, no, I see this is just for the actual string data, considering that as an array of bytes (so the 8 is for 1 byte) ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: + """ + Handles PyArrow Arrays and ChunkedArrays. + """ + # Store the column as a private attribute + if isinstance(column, pa.ChunkedArray): + if column.num_chunks == 1: + column = column.chunk(0) + else: + if not allow_copy: + raise RuntimeError( + "Chunks will be combined and a copy is required which " + "is forbidden by allow_copy=False" + ) + column = column.combine_chunks() + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + return len(self._col) + + @property + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return self._col.offset + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.type + try: + bit_width = dtype.bit_width + except ValueError: # in case of a variable-length strings + bit_width = 8 + + if pa.types.is_large_string(dtype): + # format string needs to be changed from "U" to "u" + # in case of large strings (make an issue in pandas?) + kind = DtypeKind.STRING + return kind, bit_width, "u", Endianness.NATIVE + elif pa.types.is_timestamp(dtype): + kind = DtypeKind.DATETIME + ts = dtype.unit[0] + tz = dtype.tz if dtype.tz else "" + f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz) + return kind, bit_width, f_string, Endianness.NATIVE + elif pa.types.is_dictionary(dtype): + kind = DtypeKind.CATEGORICAL + f_string = "L" + return kind, bit_width, f_string, Endianness.NATIVE + else: + return self._dtype_from_arrowdtype(dtype, bit_width) + + def _dtype_from_arrowdtype( + self, dtype, bit_width + ) -> Tuple[DtypeKind, int, str, str]: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) + + kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) + if kind is None: + raise ValueError( + f"Data type {dtype} not supported by interchange protocol") + + return kind, bit_width, f_string, Endianness.NATIVE + + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + Raises TypeError if the dtype is not categorical + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + TBD: are there any other in-memory representations that are needed? + """ + arr = self._col + if not pa.types.is_dictionary(arr.type): + raise TypeError( + "describe_categorical only works on a column with " + "categorical dtype!" + ) + + return { + "is_ordered": self._col.type.ordered, + "is_dictionary": True, + "categories": _PyArrowColumn(arr.dictionary), + } + + @property + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + # In case of no missing values, we need to set ColumnNullType to + # non nullable as in the current __dataframe__ protocol bit/byte masks + # can not be None + if self.null_count == 0: + return ColumnNullType.NON_NULLABLE, None + else: + return ColumnNullType.USE_BITMASK, 0 + + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + return self._col.null_count + + @property + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + pass + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + if n_chunks and n_chunks > 1: + chunk_size = self.size // n_chunks + if self.size % n_chunks != 0: + chunk_size += 1 + + array = self._col + i = 0 + for start in range(0, chunk_size * n_chunks, chunk_size): + yield _PyArrowColumn( + array.slice(start, chunk_size), self._allow_copy + ) + i += 1 + + elif isinstance(self._col, pa.ChunkedArray): + return [ + _PyArrowColumn(chunk, self._allow_copy) + for chunk in self._col.chunks + ] + else: + yield self + + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + The returned dictionary has the following contents: + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers: ColumnBuffers = { + "data": self._get_data_buffer(), + "validity": None, + "offsets": None, + } + + try: + buffers["validity"] = self._get_validity_buffer() + except NoBufferPresent: + pass + + try: + buffers["offsets"] = self._get_offsets_buffer() + except NoBufferPresent: + pass + + return buffers + + def _get_data_buffer( + self, + ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple + """ + Return the buffer containing the data and the buffer's + associated dtype. + """ + array = self._col + dtype = self.dtype + + # In case of boolean arrays, cast to uint8 array + # as bit packed buffers are not supported + if pa.types.is_boolean(array.type): + array = pc.cast(array, pa.uint8()) + dtype = _PyArrowColumn(array).dtype + # In case of dictionary arrays, use indices + # to define a buffer, codes are transferred through + # describe_categorical() + if pa.types.is_dictionary(array.type): + array = array.indices + dtype = _PyArrowColumn(array).dtype + + n = len(array.buffers()) + if n == 2: + return _PyArrowBuffer(array.buffers()[1]), dtype + elif n == 3: + return _PyArrowBuffer(array.buffers()[2]), dtype + + def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: + """ + Return the buffer containing the mask values indicating missing data + and the buffer's associated dtype. + Raises NoBufferPresent if null representation is not a bit or byte + mask. + """ + # Define the dtype of the returned buffer + dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE) + array = self._col + buff = array.buffers()[0] + if buff: + return _PyArrowBuffer(buff), dtype + else: + raise NoBufferPresent( + "There are no missing values so " + "does not have a separate mask") + + def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + Raises NoBufferPresent if the data buffer does not have an associated + offsets buffer. + """ + array = self._col + n = len(array.buffers()) + if n == 2: + raise NoBufferPresent( + "This column has a fixed-length dtype so " + "it does not have an offsets buffer" + ) + elif n == 3: + # Define the dtype of the returned buffer + dtype = self._col.type + if pa.types.is_large_string(dtype): + dtype = (DtypeKind.INT, 64, "i", Endianness.NATIVE) Review Comment: ```suggestion dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE) ``` ? (to match the bit width) ########## python/pyarrow/interchange/column.py: ########## @@ -0,0 +1,507 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +import enum +from typing import ( + Any, + Dict, + Iterable, + Optional, + Tuple, +) + +import sys +if sys.version_info >= (3, 8): + from typing import TypedDict +else: + from typing_extensions import TypedDict + +import pyarrow as pa +import pyarrow.compute as pc +from pyarrow.interchange.buffer import _PyArrowBuffer + + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype + + +_PYARROW_KINDS = { + pa.int8(): (DtypeKind.INT, "c"), + pa.int16(): (DtypeKind.INT, "s"), + pa.int32(): (DtypeKind.INT, "i"), + pa.int64(): (DtypeKind.INT, "l"), + pa.uint8(): (DtypeKind.UINT, "C"), + pa.uint16(): (DtypeKind.UINT, "S"), + pa.uint32(): (DtypeKind.UINT, "I"), + pa.uint64(): (DtypeKind.UINT, "L"), + pa.float16(): (DtypeKind.FLOAT, "e"), + pa.float32(): (DtypeKind.FLOAT, "f"), + pa.float64(): (DtypeKind.FLOAT, "g"), + pa.bool_(): (DtypeKind.BOOL, "b"), + pa.string(): (DtypeKind.STRING, "u"), # utf-8 + pa.large_string(): (DtypeKind.STRING, "U"), +} + + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. + """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + + +class ColumnBuffers(TypedDict): + # first element is a buffer containing the column data; + # second element is the data buffer's associated dtype + data: Tuple[_PyArrowBuffer, Dtype] + + # first element is a buffer containing mask values indicating missing data; + # second element is the mask value buffer's associated dtype. + # None if the null representation is not a bit or byte mask + validity: Optional[Tuple[_PyArrowBuffer, Dtype]] + + # first element is a buffer containing the offset values for + # variable-size binary data (e.g., variable-length strings); + # second element is the offsets buffer's associated dtype. + # None if the data buffer does not have an associated offsets buffer + offsets: Optional[Tuple[_PyArrowBuffer, Dtype]] + + +class CategoricalDescription(TypedDict): + # whether the ordering of dictionary indices is semantically meaningful + is_ordered: bool + # whether a dictionary-style mapping of categorical values to other objects + # exists + is_dictionary: bool + # Python-level only (e.g. ``{int: str}``). + # None if not a dictionary-style categorical. + # categories: Optional[Column] + + +class Endianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + + def __init__( + self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True + ) -> None: + """ + Handles PyArrow Arrays and ChunkedArrays. + """ + # Store the column as a private attribute + if isinstance(column, pa.ChunkedArray): + if column.num_chunks == 1: + column = column.chunk(0) + else: + if not allow_copy: + raise RuntimeError( + "Chunks will be combined and a copy is required which " + "is forbidden by allow_copy=False" + ) + column = column.combine_chunks() + self._col = column + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + return len(self._col) + + @property + def offset(self) -> int: + """ + Offset of first element. + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + return self._col.offset + + @property + def dtype(self) -> Tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.type + try: + bit_width = dtype.bit_width + except ValueError: # in case of a variable-length strings + bit_width = 8 Review Comment: For variable length strings, is this bit width the one for the offsets? (but in which case it depends on string vs large_string?) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
