This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch release-7.0.0 in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 9c5f9a100aed89bf1e442e02dbb1ca7e159fbdd3 Author: Krisztián Szűcs <[email protected]> AuthorDate: Fri Jan 21 09:27:37 2022 +0100 Revert "ARROW-12735: [C++] Write GDB plugin" This reverts commit a3efe72f99b1b9f23b1d11afc648f4306d32e330. --- ci/scripts/python_test.sh | 1 - cpp/gdb_arrow.py | 1894 -------------------- cpp/src/arrow/ipc/json_simple.cc | 14 - cpp/src/arrow/ipc/json_simple.h | 6 - cpp/src/arrow/ipc/json_simple_test.cc | 19 - cpp/src/arrow/python/CMakeLists.txt | 1 - cpp/src/arrow/python/gdb.cc | 449 ----- cpp/src/arrow/python/gdb.h | 29 - cpp/src/arrow/testing/gtest_util.cc | 8 +- .../python_test.sh => python/pyarrow/_engine.pyx | 32 +- python/pyarrow/engine.py | 0 python/pyarrow/includes/libarrow.pxd | 4 - python/pyarrow/includes/libarrow_compute.pxd | 56 + python/pyarrow/includes/libarrow_engine.pxd | 37 + python/pyarrow/lib.pyx | 4 - python/pyarrow/tests/test_gdb.py | 854 --------- 16 files changed, 118 insertions(+), 3290 deletions(-) diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 4eb4bd1..6e05af8 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -25,7 +25,6 @@ export ARROW_SOURCE_DIR=${arrow_dir} export ARROW_TEST_DATA=${arrow_dir}/testing/data export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -export ARROW_GDB_SCRIPT=${arrow_dir}/cpp/gdb_arrow.py # Enable some checks inside Python itself export PYTHONDEVMODE=1 diff --git a/cpp/gdb_arrow.py b/cpp/gdb_arrow.py deleted file mode 100644 index bdcef84..0000000 --- a/cpp/gdb_arrow.py +++ /dev/null @@ -1,1894 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from collections import namedtuple -from collections.abc import Sequence -import decimal -import enum -from functools import lru_cache, partial -import struct -import sys -import warnings - -import gdb -from gdb.types import get_basic_type - -# gdb API docs at https://sourceware.org/gdb/onlinedocs/gdb/Python-API.html#Python-API - -# TODO check guidelines here: https://sourceware.org/gdb/onlinedocs/gdb/Writing-a-Pretty_002dPrinter.html -# TODO investigate auto-loading: https://sourceware.org/gdb/onlinedocs/gdb/Auto_002dloading-extensions.html#Auto_002dloading-extensions - - -_type_ids = [ - 'NA', 'BOOL', 'UINT8', 'INT8', 'UINT16', 'INT16', 'UINT32', 'INT32', - 'UINT64', 'INT64', 'HALF_FLOAT', 'FLOAT', 'DOUBLE', 'STRING', 'BINARY', - 'FIXED_SIZE_BINARY', 'DATE32', 'DATE64', 'TIMESTAMP', 'TIME32', 'TIME64', - 'INTERVAL_MONTHS', 'INTERVAL_DAY_TIME', 'DECIMAL128', 'DECIMAL256', - 'LIST', 'STRUCT', 'SPARSE_UNION', 'DENSE_UNION', 'DICTIONARY', 'MAP', - 'EXTENSION', 'FIXED_SIZE_LIST', 'DURATION', 'LARGE_STRING', - 'LARGE_BINARY', 'LARGE_LIST', 'INTERVAL_MONTH_DAY_NANO'] - -# Mirror the C++ Type::type enum -Type = enum.IntEnum('Type', _type_ids, start=0) - - -@lru_cache() -def byte_order(): - """ - Get the target program (not the GDB host's) endianness. 
- """ - s = gdb.execute("show endian", to_string=True).strip() - if 'big' in s: - return 'big' - elif 'little' in s: - return 'little' - warnings.warn('Could not determine target endianness ' - f'from GDB\'s response:\n"""{s}"""') - # Fall back to host endianness - return sys.byteorder - - -def for_evaluation(val, ty=None): - """ - Return a parsable form of gdb.Value `val`, optionally with gdb.Type `ty`. - """ - if ty is None: - ty = get_basic_type(val.type) - if ty.code == gdb.TYPE_CODE_PTR: - # It's already a pointer, can represent it directly - return f"(({ty}) ({val}))" - if val.address is None: - raise ValueError(f"Cannot further evaluate rvalue: {val}") - return f"(* ({ty}*) ({val.address}))" - - -def is_char_star(ty): - # Note that "const char*" can have TYPE_CODE_INT as target type... - ty = get_basic_type(ty) - return (ty.code == gdb.TYPE_CODE_PTR and - get_basic_type(ty.target()).code - in (gdb.TYPE_CODE_CHAR, gdb.TYPE_CODE_INT)) - - -def deref(val): - """ - Dereference a raw or smart pointer. - """ - ty = get_basic_type(val.type) - if ty.code == gdb.TYPE_CODE_PTR: - return val.dereference() - if ty.name.startswith('std::'): - if "shared" in ty.name: - return SharedPtr(val).value - if "unique" in ty.name: - return UniquePtr(val).value - raise TypeError(f"Cannot dereference value of type '{ty.name}'") - - -_string_literal_mapping = { - ord('\\'): r'\\', - ord('\n'): r'\n', - ord('\r'): r'\r', - ord('\t'): r'\t', - ord('"'): r'\"', -} - -for c in range(0, 32): - if c not in _string_literal_mapping: - _string_literal_mapping[c] = f"\\x{c:02x}" - - -def string_literal(s): - """ - Format a Python string or gdb.Value for display as a literal. 
- """ - max_len = 50 - if isinstance(s, gdb.Value): - s = s.string() - if len(s) > max_len: - s = s[:max_len] - return '"' + s.translate(_string_literal_mapping) + '" [continued]' - else: - return '"' + s.translate(_string_literal_mapping) + '"' - - -def bytes_literal(val, size=None): - """ - Format a gdb.Value for display as a literal containing possibly - unprintable characters. - """ - return val.lazy_string(length=size).value() - - -def utf8_literal(val, size=None): - """ - Format a gdb.Value for display as a utf-8 literal. - """ - if size is None: - s = val.string(encoding='utf8', errors='backslashreplace') - elif size != 0: - s = val.string(encoding='utf8', errors='backslashreplace', length=size) - else: - s = "" - return string_literal(s) - - -def half_float_value(val): - """ - Return a Python float of the given half-float (represented as a uint64_t - gdb.Value). - """ - buf = gdb.selected_inferior().read_memory(val.address, 2) - return struct.unpack("e", buf)[0] - - -def load_atomic(val): - """ - Load a std::atomic<T>'s value. - """ - valty = val.type.template_argument(0) - # XXX This assumes std::atomic<T> has the same layout as a raw T. - return val.address.reinterpret_cast(valty.pointer()).dereference() - - -def load_null_count(val): - """ - Load a null count from a gdb.Value of an integer (either atomic or not). - """ - if get_basic_type(val.type).code != gdb.TYPE_CODE_INT: - val = load_atomic(val) - return val - - -def format_null_count(val): - """ - Format a null count value. - """ - if not isinstance(val, int): - null_count = int(load_null_count(val)) - return (f"null count {null_count}" if null_count != -1 - else "unknown null count") - - -def short_time_unit(val): - return ['s', 'ms', 'us', 'ns'][int(val)] - - -def format_month_interval(val): - """ - Format a MonthInterval value. 
- """ - return f"{int(val)}M" - - -def cast_to_concrete(val, ty): - return (val.reference_value().reinterpret_cast(ty.reference()) - .referenced_value()) - - -def scalar_class_from_type(name): - """ - Given a DataTypeClass class name (such as "BooleanType"), return the - corresponding Scalar class name. - """ - assert name.endswith("Type") - return name[:-4] + "Scalar" - - -def array_class_from_type(name): - """ - Given a DataTypeClass class name (such as "BooleanType"), return the - corresponding Array class name. - """ - assert name.endswith("Type") - return name[:-4] + "Array" - - -class CString: - """ - A `const char*` or similar value. - """ - - def __init__(self, val): - self.val = val - - def __bool__(self): - return int(data) != 0 and int(data[0]) != 0 - - @property - def data(self): - return self.val - - def bytes_literal(self): - return self.val.lazy_string().value() - - def string_literal(self): - # XXX use lazy_string() as well? - return string_literal(self.val) - - def string(self): - return self.val.string() - - def __format__(self, fmt): - return str(self.bytes_literal()) - - -# NOTE: gdb.parse_and_eval() is *slow* and calling it multiple times -# may add noticeable latencies. For standard C++ classes, we therefore -# try to fetch their properties from libstdc++ internals (which hopefully -# are stable), before falling back on calling the public API methods. - -class SharedPtr: - """ - A `std::shared_ptr<T>` value. - """ - - def __init__(self, val): - self.val = val - try: - # libstdc++ internals - self._ptr = val['_M_ptr'] - except gdb.error: - # fallback for other C++ standard libraries - self._ptr = gdb.parse_and_eval(f"{for_evaluation(val)}.get()") - - def get(self): - """ - Return the underlying pointer (a T*). - """ - return self._ptr - - @property - def value(self): - """ - The underlying value (a T). - """ - return self._ptr.dereference() - - -class UniquePtr: - """ - A `std::unique_ptr<T>` value. 
- """ - - def __init__(self, val): - self.val = val - ty = self.val.type.template_argument(0) - # XXX This assumes that the embedded T* pointer lies at the start - # of std::unique_ptr<T>. - self._ptr = self.val.address.reinterpret_cast(ty.pointer().pointer()) - - def get(self): - """ - Return the underlying pointer (a T*). - """ - return self._ptr - - @property - def value(self): - """ - The underlying value (a T). - """ - return self._ptr.dereference() - - -class Variant: - """ - A arrow::util::Variant<...>. - """ - - def __init__(self, val): - self.val = val - self.index = int(self.val['index_']) - try: - self.value_type = self.val.type.template_argument(self.index) - except RuntimeError: - # Index out of bounds - self.value_type = None - - @property - def value(self): - if self.value_type is None: - return None - ptr = self.val.address - if ptr is not None: - return ptr.reinterpret_cast(self.value_type.pointer() - ).dereference() - return None - - -class StdString: - """ - A `std::string` (or possibly `string_view`) value. - """ - - def __init__(self, val): - self.val = val - try: - # libstdc++ internals - self._data = val['_M_dataplus']['_M_p'] - self._size = val['_M_string_length'] - except gdb.error: - # fallback for other C++ standard libraries - self._data = gdb.parse_and_eval(f"{for_evaluation(val)}.c_str()") - self._size = gdb.parse_and_eval(f"{for_evaluation(val)}.size()") - - def __bool__(self): - return self._size != 0 - - @property - def data(self): - return self._data - - @property - def size(self): - return self._size - - def bytes_literal(self): - return self._data.lazy_string(length=self._size).value() - - def string_literal(self): - # XXX use lazy_string() as well? - return string_literal(self._data) - - def string(self): - return self._data.string() - - def __format__(self, fmt): - return str(self.bytes_literal()) - - -class StdVector(Sequence): - """ - A `std::vector<T>` value. 
- """ - - def __init__(self, val): - self.val = val - try: - # libstdc++ internals - impl = self.val['_M_impl'] - self._data = impl['_M_start'] - self._size = int(impl['_M_finish'] - self._data) - except gdb.error: - # fallback for other C++ standard libraries - self._data = int(gdb.parse_and_eval( - f"{for_evaluation(self.val)}.data()")) - self._size = int(gdb.parse_and_eval( - f"{for_evaluation(self.val)}.size()")) - - def _check_index(self, index): - if index < 0 or index >= self._size: - raise IndexError( - f"Index {index} out of bounds (should be in [0, {self._size - 1}])") - - def __len__(self): - return self._size - - def __getitem__(self, index): - self._check_index(index) - return self._data[index] - - def eval_at(self, index, eval_format): - """ - Run `eval_format` with the value at `index`. - - For example, if `eval_format` is "{}.get()", this will evaluate - "{self[0]}.get()". - """ - self._check_index(index) - return gdb.parse_and_eval( - eval_format.format(for_evaluation(self._data[index]))) - - def iter_eval(self, eval_format): - data_eval = for_evaluation(self._data) - for i in range(self._size): - yield gdb.parse_and_eval( - eval_format.format(f"{data_eval}[{i}]")) - - @property - def size(self): - return self._size - - -class StdPtrVector(StdVector): - - def __getitem__(self, index): - return deref(super().__getitem__(index)) - - -class FieldVector(StdVector): - - def __getitem__(self, index): - """ - Dereference the Field object at this index. - """ - return Field(deref(super().__getitem__(index))) - - def __str__(self): - l = [str(self[i]) for i in range(len(self))] - return "{" + ", ".join(l) + "}" - - -class Field: - """ - A arrow::Field value. 
- """ - - def __init__(self, val): - self.val = val - - @property - def name(self): - return StdString(self.val['name_']) - - @property - def type(self): - return deref(self.val['type_']) - - @property - def nullable(self): - return bool(self.val['nullable_']) - - def __str__(self): - return str(self.val) - - -class FieldPtr(Field): - """ - A std::shared_ptr<arrow::Field> value. - """ - - def __init__(self, val): - super().__init__(deref(val)) - - -class Buffer: - """ - A arrow::Buffer value. - """ - - def __init__(self, val): - self.val = val - self.size = int(val['size_']) - - @property - def data(self): - return self.val['data_'] - - def bytes_literal(self): - if self.size > 0: - return self.val['data_'].lazy_string(length=self.size).value() - else: - return '""' - - -class BufferPtr: - """ - A arrow::Buffer* value (possibly null). - """ - - def __init__(self, val): - self.val = val - ptr = int(self.val) - self.buf = Buffer(val.dereference()) if ptr != 0 else None - - @property - def data(self): - if self.buf is None: - return None - return self.buf.data - - @property - def size(self): - if self.buf is None: - return None - return self.buf.size - - def bytes_literal(self): - if self.buf is None: - return None - return self.buf.bytes_literal() - - -KeyValue = namedtuple('KeyValue', ('key', 'value')) - - -class Metadata(Sequence): - """ - A arrow::KeyValueMetadata value. - """ - - def __init__(self, val): - self.val = val - self.keys = StdVector(self.val['keys_']) - self.values = StdVector(self.val['values_']) - - def __len__(self): - return len(self.keys) - - def __getitem__(self, i): - return KeyValue(StdString(self.keys[i]), StdString(self.values[i])) - - -class MetadataPtr(Sequence): - """ - A shared_ptr<arrow::KeyValueMetadata> value, possibly null. 
- """ - - def __init__(self, val): - self.ptr = SharedPtr(val).get() - self.is_null = int(self.ptr) == 0 - self.md = None if self.is_null else Metadata(self.ptr.dereference()) - - def __len__(self): - return 0 if self.is_null else len(self.md) - - def __getitem__(self, i): - if self.is_null: - raise IndexError - return self.md[i] - - -DecimalTraits = namedtuple('DecimalTraits', ('nbits', 'struct_format_le')) - -decimal_traits = { - 128: DecimalTraits(128, 'Qq'), - 256: DecimalTraits(256, 'QQQq'), -} - -class Decimal: - """ - A arrow::BasicDecimal{128,256...} value. - """ - - def __init__(self, traits, val): - self.val = val - self.traits = traits - - @classmethod - def from_bits(cls, nbits, *args, **kwargs): - return cls(decimal_traits[nbits], *args, **kwargs) - - @property - def words(self): - """ - The decimal words, from least to most significant. - """ - mem = gdb.selected_inferior().read_memory( - self.val['array_'].address, self.traits.nbits // 8) - fmt = self.traits.struct_format_le - if byte_order() == 'big': - fmt = fmt[::-1] - words = struct.unpack(f"={fmt}", mem) - if byte_order() == 'big': - words = words[::-1] - return words - - def __int__(self): - """ - The underlying bigint value. - """ - v = 0 - words = self.words - bits_per_word = self.traits.nbits // len(words) - for w in reversed(words): - v = (v << bits_per_word) + w - return v - - def format(self, precision, scale): - """ - Format as a decimal number with the given precision and scale. - """ - v = int(self) - with decimal.localcontext() as ctx: - ctx.prec = precision - ctx.capitals = False - return str(decimal.Decimal(v).scaleb(-scale)) - - -Decimal128 = partial(Decimal.from_bits, 128) -Decimal256 = partial(Decimal.from_bits, 256) - -decimal_type_to_class = { - 'Decimal128Type': Decimal128, - 'Decimal256Type': Decimal256, -} - - -class ExtensionType: - """ - A arrow::ExtensionType. 
- """ - - def __init__(self, val): - self.val = val - - @property - def storage_type(self): - return deref(self.val['storage_type_']) - - def to_string(self): - """ - The result of calling ToString(). - """ - return StdString(gdb.parse_and_eval( - f"{for_evaluation(self.val)}.ToString()")) - - -class Schema: - """ - A arrow::Schema. - """ - - def __init__(self, val): - self.val = val - impl = deref(self.val['impl_']) - self.fields = FieldVector(impl['fields_']) - self.metadata = MetadataPtr(impl['metadata_']) - - -class RecordBatch: - """ - A arrow::RecordBatch. - """ - - def __init__(self, val): - # XXX this relies on RecordBatch always being a SimpleRecordBatch - # under the hood. What if users create their own RecordBatch - # implementation? - self.val = cast_to_concrete(val, - gdb.lookup_type("arrow::SimpleRecordBatch")) - self.schema = Schema(deref(self.val['schema_'])) - self.columns = StdPtrVector(self.val['columns_']) - - @property - def num_rows(self): - return self.val['num_rows_'] - - -class Table: - """ - A arrow::Table. - """ - - def __init__(self, val): - # XXX this relies on Table always being a SimpleTable under the hood. - # What if users create their own Table implementation? 
- self.val = cast_to_concrete(val, - gdb.lookup_type("arrow::SimpleTable")) - self.schema = Schema(deref(self.val['schema_'])) - self.columns = StdPtrVector(self.val['columns_']) - - @property - def num_rows(self): - return self.val['num_rows_'] - - -type_reprs = { - 'NullType': 'null', - 'BooleanType': 'boolean', - 'UInt8Type': 'uint8', - 'Int8Type': 'int8', - 'UInt16Type': 'uint16', - 'Int16Type': 'int16', - 'UInt32Type': 'uint32', - 'Int32Type': 'int32', - 'UInt64Type': 'uint64', - 'Int64Type': 'int64', - 'HalfFloatType': 'float16', - 'FloatType': 'float32', - 'DoubleType': 'float64', - 'Date32Type': 'date32', - 'Date64Type': 'date64', - 'Time32Type': 'time32', - 'Time64Type': 'time64', - 'TimestampType': 'timestamp', - 'MonthIntervalType': 'month_interval', - 'DayTimeIntervalType': 'day_time_interval', - 'MonthDayNanoIntervalType': 'month_day_nano_interval', - 'DurationType': 'duration', - 'Decimal128Type': 'decimal128', - 'Decimal256Type': 'decimal256', - 'StringType': 'utf8', - 'LargeStringType': 'large_utf8', - 'BinaryType': 'binary', - 'LargeBinaryType': 'large_binary', - 'FixedSizeBinaryType': 'fixed_size_binary', - 'ListType': 'list', - 'LargeListType': 'large_list', - 'FixedSizeListType': 'fixed_size_list', - 'MapType': 'map', - 'StructType': 'struct_', - 'SparseUnionType': 'sparse_union', - 'DenseUnionType': 'dense_union', - 'DictionaryType': 'dictionary', - } - - -class TypePrinter: - """ - Pretty-printer for arrow::DataTypeClass and subclasses. - """ - - def __init__(self, name, val): - self.name = name - # Cast to concrete type class to access all derived methods - # and properties. 
- self.type = gdb.lookup_type(f"arrow::{name}") - self.val = cast_to_concrete(val, self.type) - - @property - def fields(self): - return FieldVector(self.val['children_']) - - def _format_type(self): - r = type_reprs.get(self.name, self.name) - return f"arrow::{r}" - - def _for_evaluation(self): - return for_evaluation(self.val, self.type) - - -class PrimitiveTypePrinter(TypePrinter): - """ - Pretty-printer for non-parametric types. - """ - - def to_string(self): - return f"{self._format_type()}()" - - -class TimeTypePrinter(TypePrinter): - """ - Pretty-printer for time and duration types. - """ - - def _get_unit(self): - return self.val['unit_'] - - def to_string(self): - return f"{self._format_type()}({self._get_unit()})" - - -class TimestampTypePrinter(TimeTypePrinter): - """ - Pretty-printer for timestamp types. - """ - - def to_string(self): - tz = StdString(self.val['timezone_']) - if tz: - return f'{self._format_type()}({self._get_unit()}, {tz})' - else: - return f'{self._format_type()}({self._get_unit()})' - - -class FixedSizeBinaryTypePrinter(TypePrinter): - """ - Pretty-printer for fixed-size binary types. - """ - - def to_string(self): - width = int(self.val['byte_width_']) - return f"{self._format_type()}({width})" - - -class DecimalTypePrinter(TypePrinter): - """ - Pretty-printer for decimal types. - """ - - def to_string(self): - precision = int(self.val['precision_']) - scale = int(self.val['scale_']) - return f"{self._format_type()}({precision}, {scale})" - - -class ListTypePrinter(TypePrinter): - """ - Pretty-printer for list types. - """ - - def _get_value_type(self): - fields = self.fields - if len(fields) != 1: - return None - return fields[0].type - - def to_string(self): - child = self._get_value_type() - if child is None: - return f"{self._format_type()}<uninitialized or corrupt>" - else: - return f"{self._format_type()}({child})" - - -class FixedSizeListTypePrinter(ListTypePrinter): - """ - Pretty-printer for fixed-size list type. 
- """ - - def to_string(self): - child = self._get_value_type() - if child is None: - return f"{self._format_type()}<uninitialized or corrupt>" - list_size = int(self.val['list_size_']) - return f"{self._format_type()}({child}, {list_size})" - - -class MapTypePrinter(ListTypePrinter): - """ - Pretty-printer for map types. - """ - - def to_string(self): - struct_type = self._get_value_type() - if struct_type is None: - return f"{self._format_type()}<uninitialized or corrupt>" - struct_children = FieldVector(struct_type['children_']) - if len(struct_children) != 2: - return f"{self._format_type()}<uninitialized or corrupt>" - key_type = struct_children[0].type - item_type = struct_children[1].type - return (f"{self._format_type()}({key_type}, {item_type}, " - f"keys_sorted={self.val['keys_sorted_']})") - - -class DictionaryTypePrinter(TypePrinter): - """ - Pretty-printer for dictionary types. - """ - - def to_string(self): - index_type = deref(self.val['index_type_']) - value_type = deref(self.val['value_type_']) - ordered = self.val['ordered_'] - return (f"{self._format_type()}({index_type}, {value_type}, " - f"ordered={ordered})") - - -class StructTypePrinter(TypePrinter): - """ - Pretty-printer for struct types. - """ - - def to_string(self): - return f"{self._format_type()}({self.fields})" - - -class UnionTypePrinter(TypePrinter): - """ - Pretty-printer for union types. - """ - - def to_string(self): - type_codes = StdVector(self.val['type_codes_']) - type_codes = "{" + ", ".join(str(x.cast(gdb.lookup_type('int'))) - for x in type_codes) + "}" - return f"{self._format_type()}(fields={self.fields}, type_codes={type_codes})" - - -class ExtensionTypePrinter(TypePrinter): - """ - Pretty-printer for extension types. 
- """ - - def to_string(self): - ext_type = ExtensionType(self.val) - return (f"{self._format_type()} {ext_type.to_string().string_literal()} " - f"with storage type {ext_type.storage_type}") - - -class ScalarPrinter: - """ - Pretty-printer for arrow::Scalar and subclasses. - """ - - def __new__(cls, val): - # Lookup actual (derived) class to instantiate - type_id = int(deref(val['type'])['id_']) - type_class = lookup_type_class(type_id) - if type_class is not None: - cls = type_class.scalar_printer - assert issubclass(cls, ScalarPrinter) - self = object.__new__(cls) - self.type_class = type_class - self.type_name = type_class.name - self.name = scalar_class_from_type(self.type_name) - self.type_id = type_id - # Cast to concrete Scalar class to access derived attributes. - concrete_type = gdb.lookup_type(f"arrow::{self.name}") - self.val = cast_to_concrete(val, concrete_type) - self.is_valid = bool(self.val['is_valid']) - return self - - @property - def type(self): - """ - The concrete DataTypeClass instance. - """ - concrete_type = gdb.lookup_type(f"arrow::{self.type_name}") - return cast_to_concrete(deref(self.val['type']), - concrete_type) - - def _format_type(self): - return f"arrow::{self.name}" - - def _format_null(self): - if self.type_class.is_parametric: - return f"{self._format_type()} of type {self.type}, null value" - else: - return f"{self._format_type()} of null value" - - def _for_evaluation(self): - return for_evaluation(self.val) - - -class NullScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::NullScalar. - """ - - def to_string(self): - return self._format_type() - - -class NumericScalarPrinter(ScalarPrinter): - """ - Pretty-printer for numeric Arrow scalars. 
- """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - value = self.val['value'] - if self.type_name == "HalfFloatType": - return (f"{self._format_type()} " - f"of value {half_float_value(value)} [{value}]") - if self.type_name in ("UInt8Type", "Int8Type"): - value = value.cast(gdb.lookup_type('int')) - return f"{self._format_type()} of value {value}" - - -class TimeScalarPrinter(ScalarPrinter): - """ - Pretty-printer for Arrow time-like scalars. - """ - - def to_string(self): - unit = short_time_unit(self.type['unit_']) - if not self.is_valid: - return f"{self._format_type()} of null value [{unit}]" - value = self.val['value'] - return f"{self._format_type()} of value {value}{unit}" - - -class Date32ScalarPrinter(TimeScalarPrinter): - """ - Pretty-printer for arrow::Date32Scalar. - """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - value = self.val['value'] - return f"{self._format_type()} of value {value}d" - - -class Date64ScalarPrinter(TimeScalarPrinter): - """ - Pretty-printer for arrow::Date64Scalar. - """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - value = self.val['value'] - return f"{self._format_type()} of value {value}ms" - - -class TimestampScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::TimestampScalar. - """ - - def to_string(self): - unit = short_time_unit(self.type['unit_']) - tz = StdString(self.type['timezone_']) - tz = tz.string_literal() if tz.size != 0 else "no timezone" - if not self.is_valid: - return f"{self._format_type()} of null value [{unit}, {tz}]" - value = self.val['value'] - return f"{self._format_type()} of value {value}{unit} [{tz}]" - - -class MonthIntervalScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::MonthIntervalScalarPrinter. 
- """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - value = self.val['value'] - return f"{self._format_type()} of value {format_month_interval(value)}" - - -class DecimalScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::DecimalScalar and subclasses. - """ - - @property - def decimal_class(self): - return decimal_type_to_class[self.type_name] - - def to_string(self): - ty = self.type - precision = int(ty['precision_']) - scale = int(ty['scale_']) - suffix = f"[precision={precision}, scale={scale}]" - if not self.is_valid: - return f"{self._format_type()} of null value {suffix}" - value = self.decimal_class(self.val['value']).format(precision, scale) - return f"{self._format_type()} of value {value} {suffix}" - - -class BaseBinaryScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::BaseBinaryScalar and subclasses. - """ - - def _format_buf(self, bufptr): - if 'String' in self.type_name: - return utf8_literal(bufptr.data, bufptr.size) - else: - return bufptr.bytes_literal() - - def to_string(self): - if not self.is_valid: - return self._format_null() - bufptr = BufferPtr(SharedPtr(self.val['value']).get()) - size = bufptr.size - if size is None: - return f"{self._format_type()} of value <unallocated>" - return (f"{self._format_type()} of size {size}, " - f"value {self._format_buf(bufptr)}") - - -class FixedSizeBinaryScalarPrinter(BaseBinaryScalarPrinter): - """ - Pretty-printer for arrow::FixedSizeBinaryScalar. - """ - - def to_string(self): - size = self.type['byte_width_'] - if not self.is_valid: - return f"{self._format_type()} of size {size}, null value" - bufptr = BufferPtr(SharedPtr(self.val['value']).get()) - if bufptr.data is None: - return f"{self._format_type()} of size {size}, <unallocated>" - return (f"{self._format_type()} of size {size}, " - f"value {self._format_buf(bufptr)}") - - -class DictionaryScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::DictionaryScalar. 
- """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - index = deref(self.val['value']['index']) - dictionary = deref(self.val['value']['dictionary']) - return (f"{self._format_type()} of index {index}, " - f"dictionary {dictionary}") - - -class BaseListScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::BaseListScalar and subclasses. - """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - value = deref(self.val['value']) - return f"{self._format_type()} of value {value}" - - -class StructScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::StructScalar. - """ - - def display_hint(self): - return 'map' - - def children(self): - eval_fields = StdVector(self.type['children_']) - eval_values = StdVector(self.val['value']) - for field, value in zip(eval_fields, eval_values): - name = StdString(deref(field)['name_']).string_literal() - yield ("name", name) - yield ("value", deref(value)) - - def to_string(self): - if not self.is_valid: - return self._format_null() - return f"{self._format_type()}" - - -class UnionScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::UnionScalar and subclasses. - """ - - def to_string(self): - type_code = self.val['type_code'].cast(gdb.lookup_type('int')) - if not self.is_valid: - return (f"{self._format_type()} of type {self.type}, " - f"type code {type_code}, null value") - value = deref(self.val['value']) - return (f"{self._format_type()} of type code {type_code}, " - f"value {value}") - - -class MapScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::MapScalar. 
- """ - - def to_string(self): - if not self.is_valid: - return self._format_null() - - array = deref(self.val['value']) - data = deref(array['data_']) - data_printer = ArrayDataPrinter("arrow::ArrayData", data) - return (f"{self._format_type()} of type {self.type}, " - f"value {data_printer._format_contents()}") - - -class ExtensionScalarPrinter(ScalarPrinter): - """ - Pretty-printer for arrow::ExtensionScalar. - """ - - def to_string(self): - ext_type = ExtensionType(self.type) - if not self.is_valid: - return (f"{self._format_type()} of type " - f"{ext_type.to_string().string_literal()}, null value") - value = deref(self.val['value']) - return (f"{self._format_type()} of type " - f"{ext_type.to_string().string_literal()}, value {value}") - - -class ArrayDataPrinter: - """ - Pretty-printer for arrow::ArrayData. - """ - - def __new__(cls, name, val): - # Lookup actual (derived) class to instantiate - type_id = int(deref(val['type'])['id_']) - type_class = lookup_type_class(type_id) - if type_class is not None: - cls = type_class.array_data_printer - assert issubclass(cls, ArrayDataPrinter) - self = object.__new__(cls) - self.name = name - self.type_class = type_class - self.type_name = type_class.name - self.type_id = type_id - self.val = val - return self - - @property - def type(self): - """ - The concrete DataTypeClass instance. - """ - concrete_type = gdb.lookup_type(f"arrow::{self.type_name}") - return cast_to_concrete(deref(self.val['type']), concrete_type) - - def _format_contents(self): - return (f"length {self.val['length']}, " - f"{format_null_count(self.val['null_count'])}") - - def to_string(self): - ty = self.type - return (f"{self.name} of type {ty}, " - f"{self._format_contents()}") - - -class ArrayPrinter: - """ - Pretty-printer for arrow::Array and subclasses. 
- """ - - def __init__(self, val): - data = deref(val['data_']) - self.data_printer = ArrayDataPrinter("arrow::ArrayData", data) - self.name = array_class_from_type(self.data_printer.type_name) - - def _format_contents(self): - return self.data_printer._format_contents() - - def to_string(self): - if self.data_printer.type_class.is_parametric: - ty = self.data_printer.type - return f"arrow::{self.name} of type {ty}, {self._format_contents()}" - else: - return f"arrow::{self.name} of {self._format_contents()}" - - -class ChunkedArrayPrinter: - """ - Pretty-printer for arrow::ChunkedArray. - """ - - def __init__(self, name, val): - self.name = name - self.val = val - self.chunks = StdVector(self.val['chunks_']) - - def display_hint(self): - return "array" - - def children(self): - for i, chunk in enumerate(self.chunks): - printer = ArrayPrinter(deref(chunk)) - yield str(i), printer._format_contents() - - def to_string(self): - ty = deref(self.val['type_']) - return (f"{self.name} of type {ty}, length {self.val['length_']}, " - f"{format_null_count(self.val['null_count_'])} " - f"with {len(self.chunks)} chunks") - - -class DataTypeClass: - - array_data_printer = ArrayDataPrinter - - def __init__(self, name): - self.name = name - - -class NullTypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = NullScalarPrinter - - -class NumericTypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = NumericScalarPrinter - - -class Date32TypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = Date32ScalarPrinter - - -class Date64TypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = Date64ScalarPrinter - - -class TimeTypeClass(DataTypeClass): - is_parametric = True - type_printer = TimeTypePrinter - scalar_printer = TimeScalarPrinter - - -class TimestampTypeClass(DataTypeClass): - 
is_parametric = True - type_printer = TimestampTypePrinter - scalar_printer = TimestampScalarPrinter - - -class DurationTypeClass(DataTypeClass): - is_parametric = True - type_printer = TimeTypePrinter - scalar_printer = TimeScalarPrinter - - -class MonthIntervalTypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = MonthIntervalScalarPrinter - - -class DayTimeIntervalTypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = NumericScalarPrinter - - -class MonthDayNanoIntervalTypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = NumericScalarPrinter - - -class DecimalTypeClass(DataTypeClass): - is_parametric = True - type_printer = DecimalTypePrinter - scalar_printer = DecimalScalarPrinter - - -class BaseBinaryTypeClass(DataTypeClass): - is_parametric = False - type_printer = PrimitiveTypePrinter - scalar_printer = BaseBinaryScalarPrinter - - -class FixedSizeBinaryTypeClass(DataTypeClass): - is_parametric = True - type_printer = FixedSizeBinaryTypePrinter - scalar_printer = FixedSizeBinaryScalarPrinter - - -class BaseListTypeClass(DataTypeClass): - is_parametric = True - type_printer = ListTypePrinter - scalar_printer = BaseListScalarPrinter - - -class FixedSizeListTypeClass(DataTypeClass): - is_parametric = True - type_printer = FixedSizeListTypePrinter - scalar_printer = BaseListScalarPrinter - - -class MapTypeClass(DataTypeClass): - is_parametric = True - type_printer = MapTypePrinter - scalar_printer = MapScalarPrinter - - -class StructTypeClass(DataTypeClass): - is_parametric = True - type_printer = StructTypePrinter - scalar_printer = StructScalarPrinter - - -class UnionTypeClass(DataTypeClass): - is_parametric = True - type_printer = UnionTypePrinter - scalar_printer = UnionScalarPrinter - - -class DictionaryTypeClass(DataTypeClass): - is_parametric = True - type_printer = DictionaryTypePrinter - 
scalar_printer = DictionaryScalarPrinter - - -class ExtensionTypeClass(DataTypeClass): - is_parametric = True - type_printer = ExtensionTypePrinter - scalar_printer = ExtensionScalarPrinter - - -DataTypeTraits = namedtuple('DataTypeTraits', ('factory', 'name')) - - -type_traits_by_id = { - Type.NA: DataTypeTraits(NullTypeClass, 'NullType'), - - Type.BOOL: DataTypeTraits(NumericTypeClass, 'BooleanType'), - Type.UINT8: DataTypeTraits(NumericTypeClass, 'UInt8Type'), - Type.INT8: DataTypeTraits(NumericTypeClass, 'Int8Type'), - Type.UINT16: DataTypeTraits(NumericTypeClass, 'UInt16Type'), - Type.INT16: DataTypeTraits(NumericTypeClass, 'Int16Type'), - Type.UINT32: DataTypeTraits(NumericTypeClass, 'UInt32Type'), - Type.INT32: DataTypeTraits(NumericTypeClass, 'Int32Type'), - Type.UINT64: DataTypeTraits(NumericTypeClass, 'UInt64Type'), - Type.INT64: DataTypeTraits(NumericTypeClass, 'Int64Type'), - Type.HALF_FLOAT: DataTypeTraits(NumericTypeClass, 'HalfFloatType'), - Type.FLOAT: DataTypeTraits(NumericTypeClass, 'FloatType'), - Type.DOUBLE: DataTypeTraits(NumericTypeClass, 'DoubleType'), - - Type.STRING: DataTypeTraits(BaseBinaryTypeClass, 'StringType'), - Type.BINARY: DataTypeTraits(BaseBinaryTypeClass, 'BinaryType'), - Type.LARGE_STRING: DataTypeTraits(BaseBinaryTypeClass, 'LargeStringType'), - Type.LARGE_BINARY: DataTypeTraits(BaseBinaryTypeClass, 'LargeBinaryType'), - - Type.FIXED_SIZE_BINARY: DataTypeTraits(FixedSizeBinaryTypeClass, - 'FixedSizeBinaryType'), - - Type.DATE32: DataTypeTraits(Date32TypeClass, 'Date32Type'), - Type.DATE64: DataTypeTraits(Date64TypeClass, 'Date64Type'), - Type.TIMESTAMP: DataTypeTraits(TimestampTypeClass, 'TimestampType'), - Type.TIME32: DataTypeTraits(TimeTypeClass, 'Time32Type'), - Type.TIME64: DataTypeTraits(TimeTypeClass, 'Time64Type'), - Type.DURATION: DataTypeTraits(DurationTypeClass, 'DurationType'), - Type.INTERVAL_MONTHS: DataTypeTraits(MonthIntervalTypeClass, - 'MonthIntervalType'), - Type.INTERVAL_DAY_TIME: 
DataTypeTraits(DayTimeIntervalTypeClass, - 'DayTimeIntervalType'), - Type.INTERVAL_MONTH_DAY_NANO: DataTypeTraits(MonthDayNanoIntervalTypeClass, - 'MonthDayNanoIntervalType'), - - Type.DECIMAL128: DataTypeTraits(DecimalTypeClass, 'Decimal128Type'), - Type.DECIMAL256: DataTypeTraits(DecimalTypeClass, 'Decimal256Type'), - - Type.LIST: DataTypeTraits(BaseListTypeClass, 'ListType'), - Type.LARGE_LIST: DataTypeTraits(BaseListTypeClass, 'LargeListType'), - Type.FIXED_SIZE_LIST: DataTypeTraits(FixedSizeListTypeClass, - 'FixedSizeListType'), - Type.MAP: DataTypeTraits(MapTypeClass, 'MapType'), - - Type.STRUCT: DataTypeTraits(StructTypeClass, 'StructType'), - Type.SPARSE_UNION: DataTypeTraits(UnionTypeClass, 'SparseUnionType'), - Type.DENSE_UNION: DataTypeTraits(UnionTypeClass, 'DenseUnionType'), - - Type.DICTIONARY: DataTypeTraits(DictionaryTypeClass, 'DictionaryType'), - Type.EXTENSION: DataTypeTraits(ExtensionTypeClass, 'ExtensionType'), -} - -max_type_id = len(type_traits_by_id) - 1 - - -def lookup_type_class(type_id): - """ - Lookup a type class (an instance of DataTypeClass) by its type id. - """ - traits = type_traits_by_id.get(type_id) - if traits is not None: - return traits.factory(traits.name) - return None - - -class StatusPrinter: - """ - Pretty-printer for arrow::Status. 
- """ - _status_codes_by_id = { - 0: 'OK', - 1: 'OutOfMemory', - 2: 'KeyError', - 3: 'TypeError', - 4: 'Invalid', - 5: 'IOError', - 6: 'CapacityError', - 7: 'IndexError', - 8: 'Cancelled', - 9: 'UnknownError', - 10: 'NotImplemented', - 11: 'SerializationError', - 13: 'RError', - 40: 'CodeGenError', - 41: 'ExpressionValidationError', - 42: 'ExecutionError', - 45: 'AlreadyExists', - } - - def __init__(self, name, val): - self.val = val - - def _format_detail(self, state): - detail_ptr = SharedPtr(state['detail']).get() - if int(detail_ptr) == 0: - return None - detail_id = CString(gdb.parse_and_eval( - f"{for_evaluation(detail_ptr)}->type_id()")) - # Cannot use StdString as ToString() returns a rvalue - detail_msg = CString(gdb.parse_and_eval( - f"{for_evaluation(detail_ptr)}->ToString().c_str()")) - return f"[{detail_id.string()}] {detail_msg.string_literal()}" - - def _format_error(self, state): - code = int(state['code']) - codename = self._status_codes_by_id.get(code) - if codename is not None: - s = f"arrow::Status::{codename}(" - else: - s = f"arrow::Status(<unknown code {code}>, " - s += StdString(state['msg']).string_literal() - detail_msg = self._format_detail(state) - if detail_msg is not None: - return s + f", detail={detail_msg})" - else: - return s + ")" - - def to_string(self): - state_ptr = self.val['state_'] - if int(state_ptr) == 0: - return "arrow::Status::OK()" - return self._format_error(state_ptr.dereference()) - - -class ResultPrinter(StatusPrinter): - """ - Pretty-printer for arrow::Result<T>. 
- """ - - def to_string(self): - data_type = self.val.type.template_argument(0) - state_ptr = self.val['status_']['state_'] - if int(state_ptr) != 0: - inner = self._format_error(state_ptr) - else: - data_ptr = self.val['storage_']['data_'].address - assert data_ptr - inner = data_ptr.reinterpret_cast( - data_type.pointer()).dereference() - return f"arrow::Result<{data_type}>({inner})" - - -class StringViewPrinter: - """ - Pretty-printer for arrow::util::string_view. - """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - size = int(self.val['size_']) - if size == 0: - return f"arrow::util::string_view of size 0" - else: - data = bytes_literal(self.val['data_'], size) - return f"arrow::util::string_view of size {size}, {data}" - - -class OptionalPrinter: - """ - Pretty-printer for arrow::util::optional. - """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - data_type = self.val.type.template_argument(0) - # XXX We rely on internal details of our vendored optional<T> - # implementation, as inlined methods may not be callable from gdb. - if not self.val['has_value_']: - inner = "nullopt" - else: - data_ptr = self.val['contained']['data'].address - assert data_ptr - inner = data_ptr.reinterpret_cast( - data_type.pointer()).dereference() - return f"arrow::util::optional<{data_type}>({inner})" - - -class VariantPrinter: - """ - Pretty-printer for arrow::util::Variant. - """ - - def __init__(self, name, val): - self.val = val - self.variant = Variant(val) - - def to_string(self): - if self.variant.value_type is None: - return "arrow::util::Variant (uninitialized or corrupt)" - type_desc = (f"arrow::util::Variant of index {self.variant.index} " - f"(actual type {self.variant.value_type})") - - value = self.variant.value - if value is None: - return (f"{type_desc}, unavailable value") - else: - return (f"{type_desc}, value {value}") - - -class FieldPrinter: - """ - Pretty-printer for arrow::Field. 
- """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - f = Field(self.val) - nullable = f.nullable - if nullable: - return f'arrow::field({f.name}, {f.type})' - else: - return f'arrow::field({f.name}, {f.type}, nullable=false)' - - -class MetadataPrinter: - """ - Pretty-printer for arrow::KeyValueMetadata. - """ - - def __init__(self, name, val): - self.val = val - self.metadata = Metadata(self.val) - - def display_hint(self): - return 'map' - - def children(self): - for k, v in self.metadata: - yield ("key", k.bytes_literal()) - yield ("value", v.bytes_literal()) - - def to_string(self): - return f"arrow::KeyValueMetadata of size {len(self.metadata)}" - - -class SchemaPrinter: - """ - Pretty-printer for arrow::Schema. - """ - - def __init__(self, name, val): - self.val = val - self.schema = Schema(val) - # TODO endianness - - def display_hint(self): - return 'map' - - def children(self): - for field in self.schema.fields: - yield ("name", field.name.string_literal()) - yield ("type", field.type) - - def to_string(self): - num_fields = len(self.schema.fields) - md_items = len(self.schema.metadata) - if md_items > 0: - return (f"arrow::Schema with {num_fields} fields " - f"and {md_items} metadata items") - else: - return f"arrow::Schema with {num_fields} fields" - - -class BaseColumnarPrinter: - - def __init__(self, name, val, columnar): - self.name = name - self.val = val - self.columnar = columnar - self.schema = self.columnar.schema - - def display_hint(self): - return 'map' - - def children(self): - for field, col in zip(self.schema.fields, - self.columnar.columns): - yield ("name", field.name.string_literal()) - yield ("value", col) - - def to_string(self): - num_fields = len(self.schema.fields) - num_rows = self.columnar.num_rows - md_items = len(self.schema.metadata) - if md_items > 0: - return (f"arrow::{self.name} with {num_fields} columns, " - f"{num_rows} rows, {md_items} metadata items") - else: - return 
(f"arrow::{self.name} with {num_fields} columns, " - f"{num_rows} rows") - - -class RecordBatchPrinter(BaseColumnarPrinter): - """ - Pretty-printer for arrow::RecordBatch. - """ - - def __init__(self, name, val): - BaseColumnarPrinter.__init__(self, "RecordBatch", val, RecordBatch(val)) - - -class TablePrinter(BaseColumnarPrinter): - """ - Pretty-printer for arrow::Table. - """ - - def __init__(self, name, val): - BaseColumnarPrinter.__init__(self, "Table", val, Table(val)) - - -class DatumPrinter: - """ - Pretty-printer for arrow::Datum. - """ - - def __init__(self, name, val): - self.val = val - self.variant = Variant(val['value']) - - def to_string(self): - if self.variant.index == 0: - # Datum::NONE - return "arrow::Datum (empty)" - if self.variant.value_type is None: - return "arrow::Datum (uninitialized or corrupt?)" - # All non-empty Datums contain a shared_ptr<T> - value = deref(self.variant.value) - return f"arrow::Datum of value {value}" - - -class BufferPrinter: - """ - Pretty-printer for arrow::Buffer and subclasses. - """ - - def __init__(self, name, val): - self.name = name - self.val = val - - def to_string(self): - if bool(self.val['is_mutable_']): - mutable = 'mutable' - else: - mutable = 'read-only' - size = int(self.val['size_']) - if size == 0: - return f"arrow::{self.name} of size 0, {mutable}" - if not self.val['is_cpu_']: - return f"arrow::{self.name} of size {size}, {mutable}, not on CPU" - data = bytes_literal(self.val['data_'], size) - return f"arrow::{self.name} of size {size}, {mutable}, {data}" - - -class DayMillisecondsPrinter: - """ - Pretty-printer for arrow::DayTimeIntervalType::DayMilliseconds. - """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - return f"{self.val['days']}d{self.val['milliseconds']}ms" - - -class MonthDayNanosPrinter: - """ - Pretty-printer for arrow::MonthDayNanoIntervalType::MonthDayNanos. 
- """ - - def __init__(self, name, val): - self.val = val - - def to_string(self): - return (f"{self.val['months']}M{self.val['days']}d" - f"{self.val['nanoseconds']}ns") - - -class DecimalPrinter: - """ - Pretty-printer for Arrow decimal values. - """ - - def __init__(self, nbits, name, val): - self.name = name - self.val = val - self.nbits = nbits - - def to_string(self): - dec = Decimal.from_bits(self.nbits, self.val) - return f"{self.name}({int(dec)})" - - -printers = { - "arrow::ArrayData": ArrayDataPrinter, - "arrow::BasicDecimal128": partial(DecimalPrinter, 128), - "arrow::BasicDecimal256": partial(DecimalPrinter, 256), - "arrow::ChunkedArray": ChunkedArrayPrinter, - "arrow::Datum": DatumPrinter, - "arrow::DayTimeIntervalType::DayMilliseconds": DayMillisecondsPrinter, - "arrow::Decimal128": partial(DecimalPrinter, 128), - "arrow::Decimal256": partial(DecimalPrinter, 256), - "arrow::MonthDayNanoIntervalType::MonthDayNanos": MonthDayNanosPrinter, - "arrow::Field": FieldPrinter, - "arrow::KeyValueMetadata": MetadataPrinter, - "arrow::RecordBatch": RecordBatchPrinter, - "arrow::Result": ResultPrinter, - "arrow::Schema": SchemaPrinter, - "arrow::SimpleRecordBatch": RecordBatchPrinter, - "arrow::SimpleTable": TablePrinter, - "arrow::Status": StatusPrinter, - "arrow::Table": TablePrinter, - "arrow::util::optional": OptionalPrinter, - "arrow::util::string_view": StringViewPrinter, - "arrow::util::Variant": VariantPrinter, - "nonstd::optional_lite::optional": OptionalPrinter, - "nonstd::sv_lite::basic_string_view": StringViewPrinter, -} - -def arrow_pretty_print(val): - name = val.type.strip_typedefs().name - if name is None: - return - name = name.partition('<')[0] # Remove template parameters - printer = printers.get(name) - if printer is not None: - return printer(name, val) - - if not name.startswith("arrow::"): - return - arrow_name = name[len("arrow::"):] - - if arrow_name.endswith("Buffer"): - try: - val['data_'] - except Exception: - # Not a Buffer? 
- pass - else: - return BufferPrinter(arrow_name, val) - - elif arrow_name.endswith("Type"): - # Look up dynamic type, as it may be hidden behind a DataTypeClass - # pointer or reference. - try: - type_id = int(val['id_']) - except Exception: - # Not a DataTypeClass? - pass - else: - type_class = lookup_type_class(type_id) - if type_class is not None: - return type_class.type_printer(type_class.name, val) - - elif arrow_name.endswith("Array"): - return ArrayPrinter(val) - - elif arrow_name.endswith("Scalar"): - try: - val['is_valid'] - except Exception: - # Not a Scalar? - pass - else: - return ScalarPrinter(val) - - -gdb.pretty_printers.append(arrow_pretty_print) diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 9e51ef8..8347b87 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -29,7 +29,6 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" #include "arrow/array/builder_union.h" -#include "arrow/chunked_array.h" #include "arrow/ipc/json_simple.h" #include "arrow/scalar.h" #include "arrow/type_traits.h" @@ -932,19 +931,6 @@ Status ArrayFromJSON(const std::shared_ptr<DataType>& type, const char* json_str return ArrayFromJSON(type, util::string_view(json_string), out); } -Status ChunkedArrayFromJSON(const std::shared_ptr<DataType>& type, - const std::vector<std::string>& json_strings, - std::shared_ptr<ChunkedArray>* out) { - ArrayVector out_chunks; - out_chunks.reserve(json_strings.size()); - for (const std::string& chunk_json : json_strings) { - out_chunks.emplace_back(); - RETURN_NOT_OK(ArrayFromJSON(type, chunk_json, &out_chunks.back())); - } - *out = std::make_shared<ChunkedArray>(std::move(out_chunks), type); - return Status::OK(); -} - Status DictArrayFromJSON(const std::shared_ptr<DataType>& type, util::string_view indices_json, util::string_view dictionary_json, std::shared_ptr<Array>* out) { diff --git a/cpp/src/arrow/ipc/json_simple.h 
b/cpp/src/arrow/ipc/json_simple.h index e831d45..8269bd6 100644 --- a/cpp/src/arrow/ipc/json_simple.h +++ b/cpp/src/arrow/ipc/json_simple.h @@ -23,7 +23,6 @@ #include <string> #include "arrow/status.h" -#include "arrow/type_fwd.h" #include "arrow/util/string_view.h" #include "arrow/util/visibility.h" @@ -49,11 +48,6 @@ Status ArrayFromJSON(const std::shared_ptr<DataType>&, const char* json, std::shared_ptr<Array>* out); ARROW_EXPORT -Status ChunkedArrayFromJSON(const std::shared_ptr<DataType>& type, - const std::vector<std::string>& json_strings, - std::shared_ptr<ChunkedArray>* out); - -ARROW_EXPORT Status DictArrayFromJSON(const std::shared_ptr<DataType>&, util::string_view indices_json, util::string_view dictionary_json, std::shared_ptr<Array>* out); diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 97cbff6..c6f66d0 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -34,7 +34,6 @@ #include "arrow/array/builder_nested.h" #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" -#include "arrow/chunked_array.h" #include "arrow/ipc/json_simple.h" #include "arrow/scalar.h" #include "arrow/testing/builder.h" @@ -1350,24 +1349,6 @@ TEST(TestDictArrayFromJSON, Errors) { &array)); // dict value isn't string } -TEST(TestChunkedArrayFromJSON, Basics) { - auto type = int32(); - std::shared_ptr<ChunkedArray> chunked_array; - ASSERT_OK(ChunkedArrayFromJSON(type, {}, &chunked_array)); - ASSERT_OK(chunked_array->ValidateFull()); - ASSERT_EQ(chunked_array->num_chunks(), 0); - AssertTypeEqual(type, chunked_array->type()); - - ASSERT_OK(ChunkedArrayFromJSON(type, {"[1, 2]", "[3, null, 4]"}, &chunked_array)); - ASSERT_OK(chunked_array->ValidateFull()); - ASSERT_EQ(chunked_array->num_chunks(), 2); - std::shared_ptr<Array> expected_chunk; - ASSERT_OK(ArrayFromJSON(type, "[1, 2]", &expected_chunk)); - AssertArraysEqual(*expected_chunk, *chunked_array->chunk(0), 
/*verbose=*/true); - ASSERT_OK(ArrayFromJSON(type, "[3, null, 4]", &expected_chunk)); - AssertArraysEqual(*expected_chunk, *chunked_array->chunk(1), /*verbose=*/true); -} - TEST(TestScalarFromJSON, Basics) { // Sanity check for common types (not exhaustive) std::shared_ptr<Scalar> scalar; diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index 835eaca..2c63b66 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -34,7 +34,6 @@ set(ARROW_PYTHON_SRCS decimal.cc deserialize.cc extension_type.cc - gdb.cc helpers.cc inference.cc init.cc diff --git a/cpp/src/arrow/python/gdb.cc b/cpp/src/arrow/python/gdb.cc deleted file mode 100644 index 7c629b5..0000000 --- a/cpp/src/arrow/python/gdb.cc +++ /dev/null @@ -1,449 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include <cerrno> -#include <csignal> -#include <cstdlib> -#include <memory> -#include <utility> - -#include "arrow/array.h" -#include "arrow/chunked_array.h" -#include "arrow/datum.h" -#include "arrow/extension_type.h" -#include "arrow/ipc/json_simple.h" -#include "arrow/python/gdb.h" -#include "arrow/record_batch.h" -#include "arrow/scalar.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/util/decimal.h" -#include "arrow/util/key_value_metadata.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/optional.h" -#include "arrow/util/string_view.h" -#include "arrow/util/variant.h" - -namespace arrow { - -using ipc::internal::json::ArrayFromJSON; -using ipc::internal::json::ChunkedArrayFromJSON; -using ipc::internal::json::ScalarFromJSON; - -namespace gdb { -namespace { - -void Trap() { - // XXX Perhaps vendor - // https://github.com/nemequ/portable-snippets/blob/master/debug-trap/debug-trap.h ? -#if defined(_MSC_VER) - __debugbreak(); -#elif defined(SIGTRAP) - raise(SIGTRAP); -#else - std::abort(); -#endif -} - -class CustomStatusDetail : public StatusDetail { - public: - const char* type_id() const override { return "custom-detail-id"; } - std::string ToString() const override { return "This is a detail"; } -}; - -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override { - return std::make_shared<ExtensionArray>(data); - } - - Result<std::shared_ptr<DataType>> Deserialize( - std::shared_ptr<DataType> storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return "uuid-serialized"; } 
-}; - -} // namespace - -void TestSession() { - // We define local variables for all types for which we want to test - // pretty-printing. - // Then, at the end of this function, we trap to the debugger, so that - // test instrumentation can print values from this frame by interacting - // with the debugger. - // The test instrumentation is in pyarrow/tests/test_gdb.py - -#ifdef __clang__ - _Pragma("clang diagnostic push"); - _Pragma("clang diagnostic ignored \"-Wunused-variable\""); -#elif defined(__GNUC__) - _Pragma("GCC diagnostic push"); - _Pragma("GCC diagnostic ignored \"-Wunused-variable\""); -#endif - - // Status & Result - auto ok_status = Status::OK(); - auto error_status = Status::IOError("This is an error"); - auto error_detail_status = - error_status.WithDetail(std::make_shared<CustomStatusDetail>()); - auto ok_result = Result<int>(42); - auto error_result = Result<int>(error_status); - auto error_detail_result = Result<int>(error_detail_status); - - // Optionals - util::optional<int> int_optional{42}; - util::optional<int> null_int_optional{}; - - // Variants - using VariantType = util::Variant<int, bool, std::string>; - - VariantType int_variant{42}; - VariantType bool_variant{false}; - VariantType string_variant{std::string("hello")}; - - // String views - util::string_view string_view_empty{}; - util::string_view string_view_abc{"abc"}; - std::string special_chars = std::string("foo\"bar") + '\x00' + "\r\n\t\x1f"; - util::string_view string_view_special_chars(special_chars); - std::string very_long = "abc" + std::string(5000, 'K') + "xyz"; - util::string_view string_view_very_long(very_long); - - // Buffers - Buffer buffer_null{nullptr, 0}; - Buffer buffer_abc{string_view_abc}; - Buffer buffer_special_chars{string_view_special_chars}; - char mutable_array[3] = {'a', 'b', 'c'}; - MutableBuffer buffer_mutable{reinterpret_cast<uint8_t*>(mutable_array), 3}; - auto heap_buffer = std::make_shared<Buffer>(string_view_abc); - auto heap_buffer_mutable = 
*AllocateBuffer(buffer_abc.size()); - memcpy(heap_buffer_mutable->mutable_data(), buffer_abc.data(), buffer_abc.size()); - - // KeyValueMetadata - auto empty_metadata = key_value_metadata({}, {}); - auto metadata = key_value_metadata( - {"key_text", "key_binary"}, {"some value", std::string("z") + '\x00' + "\x1f\xff"}); - - // Decimals - arrow::Decimal128 decimal128_zero{}; - arrow::Decimal128 decimal128_pos{"98765432109876543210987654321098765432"}; - arrow::Decimal128 decimal128_neg{"-98765432109876543210987654321098765432"}; - arrow::BasicDecimal128 basic_decimal128_zero{}; - arrow::BasicDecimal128 basic_decimal128_pos{decimal128_pos.native_endian_array()}; - arrow::BasicDecimal128 basic_decimal128_neg{decimal128_neg.native_endian_array()}; - arrow::Decimal256 decimal256_zero{}; - arrow::Decimal256 decimal256_pos{ - "9876543210987654321098765432109876543210987654321098765432109876543210987654"}; - arrow::Decimal256 decimal256_neg{ - "-9876543210987654321098765432109876543210987654321098765432109876543210987654"}; - arrow::BasicDecimal256 basic_decimal256_zero{}; - arrow::BasicDecimal256 basic_decimal256_pos{decimal256_pos.native_endian_array()}; - arrow::BasicDecimal256 basic_decimal256_neg{decimal256_neg.native_endian_array()}; - - // Data types - NullType null_type; - auto heap_null_type = null(); - BooleanType bool_type; - auto heap_bool_type = boolean(); - - Date32Type date32_type; - Date64Type date64_type; - Time32Type time_type_s(TimeUnit::SECOND); - Time32Type time_type_ms(TimeUnit::MILLI); - Time64Type time_type_us(TimeUnit::MICRO); - Time64Type time_type_ns(TimeUnit::NANO); - auto heap_time_type_ns = time64(TimeUnit::NANO); - - TimestampType timestamp_type_s(TimeUnit::SECOND); - TimestampType timestamp_type_ms_timezone(TimeUnit::MILLI, "Europe/Paris"); - TimestampType timestamp_type_us(TimeUnit::MICRO); - TimestampType timestamp_type_ns_timezone(TimeUnit::NANO, "Europe/Paris"); - auto heap_timestamp_type_ns_timezone = timestamp(TimeUnit::NANO, 
"Europe/Paris"); - - DayTimeIntervalType day_time_interval_type; - MonthIntervalType month_interval_type; - MonthDayNanoIntervalType month_day_nano_interval_type; - - DurationType duration_type_s(TimeUnit::SECOND); - DurationType duration_type_ns(TimeUnit::NANO); - - BinaryType binary_type; - StringType string_type; - LargeBinaryType large_binary_type; - LargeStringType large_string_type; - FixedSizeBinaryType fixed_size_binary_type(10); - auto heap_fixed_size_binary_type = fixed_size_binary(10); - - Decimal128Type decimal128_type(16, 5); - Decimal256Type decimal256_type(42, 12); - auto heap_decimal128_type = decimal128(16, 5); - - ListType list_type(uint8()); - LargeListType large_list_type(large_utf8()); - auto heap_list_type = list(uint8()); - auto heap_large_list_type = large_list(large_utf8()); - - FixedSizeListType fixed_size_list_type(float64(), 3); - auto heap_fixed_size_list_type = fixed_size_list(float64(), 3); - - DictionaryType dict_type_unordered(int16(), utf8()); - DictionaryType dict_type_ordered(int16(), utf8(), /*ordered=*/true); - auto heap_dict_type = dictionary(int16(), utf8()); - - MapType map_type_unsorted(utf8(), binary()); - MapType map_type_sorted(utf8(), binary(), /*keys_sorted=*/true); - auto heap_map_type = map(utf8(), binary()); - - StructType struct_type_empty({}); - StructType struct_type( - {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); - auto heap_struct_type = - struct_({field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); - - std::vector<int8_t> union_type_codes({7, 42}); - FieldVector union_fields( - {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); - SparseUnionType sparse_union_type(union_fields, union_type_codes); - DenseUnionType dense_union_type(union_fields, union_type_codes); - - UuidType uuid_type{}; - std::shared_ptr<DataType> heap_uuid_type = std::make_shared<UuidType>(); - - // Schema - auto schema_empty = schema({}); - auto schema_non_empty = 
schema({field("ints", int8()), field("strs", utf8())}); - auto schema_with_metadata = schema_non_empty->WithMetadata( - key_value_metadata({"key1", "key2"}, {"value1", "value2"})); - - // Fields - Field int_field("ints", int64()); - Field float_field("floats", float32(), /*nullable=*/false); - auto heap_int_field = field("ints", int64()); - - // Scalars - NullScalar null_scalar; - auto heap_null_scalar = MakeNullScalar(null()); - - BooleanScalar bool_scalar_null{}; - BooleanScalar bool_scalar{true}; - auto heap_bool_scalar = *MakeScalar(boolean(), true); - - Int8Scalar int8_scalar_null{}; - UInt8Scalar uint8_scalar_null{}; - Int64Scalar int64_scalar_null{}; - UInt64Scalar uint64_scalar_null{}; - Int8Scalar int8_scalar{-42}; - UInt8Scalar uint8_scalar{234}; - Int64Scalar int64_scalar{-9223372036854775807LL - 1}; - UInt64Scalar uint64_scalar{18446744073709551615ULL}; - HalfFloatScalar half_float_scalar{48640}; // -1.5 - FloatScalar float_scalar{1.25f}; - DoubleScalar double_scalar{2.5}; - - Time32Scalar time_scalar_s{100, TimeUnit::SECOND}; - Time32Scalar time_scalar_ms{1000, TimeUnit::MILLI}; - Time64Scalar time_scalar_us{10000, TimeUnit::MICRO}; - Time64Scalar time_scalar_ns{100000, TimeUnit::NANO}; - Time64Scalar time_scalar_null{time64(TimeUnit::NANO)}; - - DurationScalar duration_scalar_s{-100, TimeUnit::SECOND}; - DurationScalar duration_scalar_ms{-1000, TimeUnit::MILLI}; - DurationScalar duration_scalar_us{-10000, TimeUnit::MICRO}; - DurationScalar duration_scalar_ns{-100000, TimeUnit::NANO}; - DurationScalar duration_scalar_null{duration(TimeUnit::NANO)}; - - TimestampScalar timestamp_scalar_s{12345, timestamp(TimeUnit::SECOND)}; - TimestampScalar timestamp_scalar_ms{-123456, timestamp(TimeUnit::MILLI)}; - TimestampScalar timestamp_scalar_us{1234567, timestamp(TimeUnit::MICRO)}; - TimestampScalar timestamp_scalar_ns{-12345678, timestamp(TimeUnit::NANO)}; - TimestampScalar timestamp_scalar_null{timestamp(TimeUnit::NANO)}; - - TimestampScalar 
timestamp_scalar_s_tz{12345, - timestamp(TimeUnit::SECOND, "Europe/Paris")}; - TimestampScalar timestamp_scalar_ms_tz{-123456, - timestamp(TimeUnit::MILLI, "Europe/Paris")}; - TimestampScalar timestamp_scalar_us_tz{1234567, - timestamp(TimeUnit::MICRO, "Europe/Paris")}; - TimestampScalar timestamp_scalar_ns_tz{-12345678, - timestamp(TimeUnit::NANO, "Europe/Paris")}; - TimestampScalar timestamp_scalar_null_tz{timestamp(TimeUnit::NANO, "Europe/Paris")}; - - MonthIntervalScalar month_interval_scalar{23}; - MonthIntervalScalar month_interval_scalar_null{}; - DayTimeIntervalScalar day_time_interval_scalar{{23, -456}}; - DayTimeIntervalScalar day_time_interval_scalar_null{}; - MonthDayNanoIntervalScalar month_day_nano_interval_scalar{{1, 23, -456}}; - MonthDayNanoIntervalScalar month_day_nano_interval_scalar_null{}; - - Date32Scalar date32_scalar{23}; - Date32Scalar date32_scalar_null{}; - Date64Scalar date64_scalar{45 * 86000000LL}; - Date64Scalar date64_scalar_null{}; - - Decimal128Scalar decimal128_scalar_pos_scale_pos{Decimal128("1234567"), - decimal128(10, 4)}; - Decimal128Scalar decimal128_scalar_pos_scale_neg{Decimal128("-1234567"), - decimal128(10, 4)}; - Decimal128Scalar decimal128_scalar_neg_scale_pos{Decimal128("1234567"), - decimal128(10, -4)}; - Decimal128Scalar decimal128_scalar_neg_scale_neg{Decimal128("-1234567"), - decimal128(10, -4)}; - Decimal128Scalar decimal128_scalar_null{decimal128(10, 4)}; - auto heap_decimal128_scalar = *MakeScalar(decimal128(10, 4), Decimal128("1234567")); - - Decimal256Scalar decimal256_scalar_pos_scale_pos{ - Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, 4)}; - Decimal256Scalar decimal256_scalar_pos_scale_neg{ - Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, 4)}; - Decimal256Scalar decimal256_scalar_neg_scale_pos{ - Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, -4)}; - Decimal256Scalar decimal256_scalar_neg_scale_neg{ - 
Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, -4)}; - Decimal256Scalar decimal256_scalar_null{decimal256(50, 4)}; - auto heap_decimal256_scalar = *MakeScalar( - decimal256(50, 4), Decimal256("1234567890123456789012345678901234567890123456")); - - BinaryScalar binary_scalar_null{}; - BinaryScalar binary_scalar_unallocated{std::shared_ptr<Buffer>{nullptr}}; - BinaryScalar binary_scalar_empty{Buffer::FromString("")}; - BinaryScalar binary_scalar_abc{Buffer::FromString("abc")}; - BinaryScalar binary_scalar_bytes{ - Buffer::FromString(std::string() + '\x00' + "\x1f\xff")}; - - StringScalar string_scalar_null{}; - StringScalar string_scalar_unallocated{std::shared_ptr<Buffer>{nullptr}}; - StringScalar string_scalar_empty{Buffer::FromString("")}; - StringScalar string_scalar_hehe{Buffer::FromString("héhé")}; - StringScalar string_scalar_invalid_chars{ - Buffer::FromString(std::string("abc") + '\x00' + "def\xffghi")}; - - LargeBinaryScalar large_binary_scalar_abc{Buffer::FromString("abc")}; - LargeStringScalar large_string_scalar_hehe{Buffer::FromString("héhé")}; - - FixedSizeBinaryScalar fixed_size_binary_scalar{Buffer::FromString("abc"), - fixed_size_binary(3)}; - FixedSizeBinaryScalar fixed_size_binary_scalar_null{fixed_size_binary(3)}; - - std::shared_ptr<Array> dict_array; - ARROW_CHECK_OK(ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])", &dict_array)); - DictionaryScalar dict_scalar{{std::make_shared<Int8Scalar>(42), dict_array}, - dictionary(int8(), utf8())}; - DictionaryScalar dict_scalar_null{dictionary(int8(), utf8())}; - - std::shared_ptr<Array> list_value_array; - ARROW_CHECK_OK(ArrayFromJSON(int32(), R"([4, 5, 6])", &list_value_array)); - ListScalar list_scalar{list_value_array}; - ListScalar list_scalar_null{list(int32())}; - LargeListScalar large_list_scalar{list_value_array}; - LargeListScalar large_list_scalar_null{large_list(int32())}; - FixedSizeListScalar fixed_size_list_scalar{list_value_array}; - FixedSizeListScalar 
fixed_size_list_scalar_null{fixed_size_list(int32(), 3)}; - - auto struct_scalar_type = struct_({field("ints", int32()), field("strs", utf8())}); - StructScalar struct_scalar{ - ScalarVector{MakeScalar(int32_t(42)), MakeScalar("some text")}, struct_scalar_type}; - StructScalar struct_scalar_null{struct_scalar_type}; - - auto sparse_union_scalar_type = - sparse_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42}); - auto dense_union_scalar_type = - dense_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42}); - SparseUnionScalar sparse_union_scalar{MakeScalar(int32_t(43)), 7, - sparse_union_scalar_type}; - SparseUnionScalar sparse_union_scalar_null{7, sparse_union_scalar_type}; - DenseUnionScalar dense_union_scalar{MakeScalar(int32_t(43)), 7, - dense_union_scalar_type}; - DenseUnionScalar dense_union_scalar_null{7, dense_union_scalar_type}; - - auto extension_scalar_type = std::make_shared<UuidType>(); - ExtensionScalar extension_scalar{ - std::make_shared<FixedSizeBinaryScalar>(Buffer::FromString("0123456789abcdef"), - extension_scalar_type->storage_type()), - extension_scalar_type}; - ExtensionScalar extension_scalar_null{extension_scalar_type}; - - std::shared_ptr<Scalar> heap_map_scalar; - ARROW_CHECK_OK( - ScalarFromJSON(map(utf8(), int32()), R"([["a", 5], ["b", 6]])", &heap_map_scalar)); - auto heap_map_scalar_null = MakeNullScalar(heap_map_scalar->type); - - // Array and ArrayData - std::shared_ptr<Array> heap_int32_array; - ARROW_CHECK_OK(ArrayFromJSON(int32(), "[-5, 6, null, 42]", &heap_int32_array)); - ArrayData int32_array_data{*heap_int32_array->data()}; - Int32Array int32_array{heap_int32_array->data()->Copy()}; - - std::shared_ptr<Array> heap_list_array; - ARROW_CHECK_OK(ArrayFromJSON(list(int64()), "[[1, 2], null, []]", &heap_list_array)); - ListArray list_array{heap_list_array->data()}; - - // ChunkedArray - ArrayVector array_chunks(2); - ARROW_CHECK_OK(ArrayFromJSON(int32(), "[1, 2]", &array_chunks[0])); - 
ARROW_CHECK_OK(ArrayFromJSON(int32(), "[3, null, 4]", &array_chunks[1])); - ChunkedArray chunked_array{array_chunks}; - - // RecordBatch - auto batch_schema = schema({field("ints", int32()), field("strs", utf8())}); - ArrayVector batch_columns{2}; - ARROW_CHECK_OK(ArrayFromJSON(int32(), "[1, 2, 3]", &batch_columns[0])); - ARROW_CHECK_OK(ArrayFromJSON(utf8(), R"(["abc", null, "def"])", &batch_columns[1])); - auto batch = RecordBatch::Make(batch_schema, /*num_rows=*/3, batch_columns); - auto batch_with_metadata = batch->ReplaceSchemaMetadata( - key_value_metadata({"key1", "key2", "key3"}, {"value1", "value2", "value3"})); - - // Table - ChunkedArrayVector table_columns{2}; - ARROW_CHECK_OK( - ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5]"}, &table_columns[0])); - ARROW_CHECK_OK(ChunkedArrayFromJSON( - utf8(), {R"(["abc", null])", R"(["def"])", R"(["ghi", "jkl"])"}, - &table_columns[1])); - auto table = Table::Make(batch_schema, table_columns); - - // Datum - Datum empty_datum{}; - Datum scalar_datum{MakeNullScalar(boolean())}; - Datum array_datum{heap_int32_array}; - Datum chunked_array_datum{chunked_array}; - Datum batch_datum{batch}; - Datum table_datum{table}; - -#ifdef __clang__ - _Pragma("clang diagnostic pop"); -#elif defined(__GNUC__) - _Pragma("GCC diagnostic pop"); -#endif - - // Hook into debugger - Trap(); -} - -} // namespace gdb -} // namespace arrow diff --git a/cpp/src/arrow/python/gdb.h b/cpp/src/arrow/python/gdb.h deleted file mode 100644 index 1ddcbb5..0000000 --- a/cpp/src/arrow/python/gdb.h +++ /dev/null @@ -1,29 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "arrow/python/visibility.h" - -namespace arrow { -namespace gdb { - -ARROW_PYTHON_EXPORT -void TestSession(); - -} // namespace gdb -} // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index d0e1be9..56ba94d 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -423,9 +423,11 @@ std::shared_ptr<Array> DictArrayFromJSON(const std::shared_ptr<DataType>& type, std::shared_ptr<ChunkedArray> ChunkedArrayFromJSON(const std::shared_ptr<DataType>& type, const std::vector<std::string>& json) { - std::shared_ptr<ChunkedArray> out; - ABORT_NOT_OK(ipc::internal::json::ChunkedArrayFromJSON(type, json, &out)); - return out; + ArrayVector out_chunks; + for (const std::string& chunk_json : json) { + out_chunks.push_back(ArrayFromJSON(type, chunk_json)); + } + return std::make_shared<ChunkedArray>(std::move(out_chunks), type); } std::shared_ptr<RecordBatch> RecordBatchFromJSON(const std::shared_ptr<Schema>& schema, diff --git a/ci/scripts/python_test.sh b/python/pyarrow/_engine.pyx old mode 100755 new mode 100644 similarity index 52% copy from ci/scripts/python_test.sh copy to python/pyarrow/_engine.pyx index 4eb4bd1..ceae725 --- a/ci/scripts/python_test.sh +++ b/python/pyarrow/_engine.pyx @@ -1,5 +1,3 @@ -#!/usr/bin/env bash -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information @@ -17,17 +15,27 @@ # specific language governing permissions and limitations # under the License. -set -ex +# cython: language_level = 3 + +"""Engine is currently unstable. APIs subject to change without notice.""" + +from cython.operator cimport dereference as deref + +from pyarrow.lib cimport * + -arrow_dir=${1} +def valami(): + return True -export ARROW_SOURCE_DIR=${arrow_dir} -export ARROW_TEST_DATA=${arrow_dir}/testing/data -export PARQUET_TEST_DATA=${arrow_dir}/cpp/submodules/parquet-testing/data -export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -export ARROW_GDB_SCRIPT=${arrow_dir}/cpp/gdb_arrow.py -# Enable some checks inside Python itself -export PYTHONDEVMODE=1 +# from pyarrow.lib import ArrowTypeError, frombytes, tobytes +# from pyarrow.includes.libarrow_dataset cimport * +# from pyarrow._fs cimport FileSystem, FileInfo, FileSelector +# from pyarrow._csv cimport ( +# ConvertOptions, ParseOptions, ReadOptions, WriteOptions) +# from pyarrow.util import _is_iterable, _is_path_like, _stringify_path -pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow +# from pyarrow._parquet cimport ( +# _create_writer_properties, _create_arrow_writer_properties, +# FileMetaData, RowGroupMetaData, ColumnChunkMetaData +# ) diff --git a/python/pyarrow/engine.py b/python/pyarrow/engine.py new file mode 100644 index 0000000..e69de29 diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 514aaef..7a9115e 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2763,10 +2763,6 @@ cdef extern from "arrow/c/bridge.h" namespace "arrow" nogil: CResult[shared_ptr[CRecordBatchReader]] ImportRecordBatchReader( ArrowArrayStream*) - -cdef extern from "arrow/python/gdb.h" namespace "arrow::gdb" nogil: - void GdbTestSession "arrow::gdb::TestSession"() - cdef extern from "arrow/util/byte_size.h" namespace "arrow::util" nogil: 
CResult[int64_t] ReferencedBufferSize(const CArray& array_data) CResult[int64_t] ReferencedBufferSize(const CRecordBatch& record_batch) diff --git a/python/pyarrow/includes/libarrow_compute.pxd b/python/pyarrow/includes/libarrow_compute.pxd new file mode 100644 index 0000000..05a8b01 --- /dev/null +++ b/python/pyarrow/includes/libarrow_compute.pxd @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# distutils: language = c++ + +from pyarrow.includes.common cimport * + + +cdef extern from "arrow/compute/exec/exec_plan.h" namespace "arrow::compute" nogil: + + cdef cppclass CExecNode "arrow::compute::CExecNode": + int num_inputs() const + int num_outputs() const + + const vector[c_string]& input_labels() const + const shared_ptr[CSchema]& output_schema() const + + CExecPlan* plan() + + const c_string& label() const + void SetLabel(c_string label) + + CStatus Validate() const + + cdef cppclass CExecPlan "arrow::compute::CExecPlan": + @staticmethod + CResult[shared_ptr[CExecPlan]] Make(ExecContext*); + + CExecContext* exec_context() const + CStatus Validate() + CStatus StartProducing() + void CStopProducing() + c_string ToString() const + + # CExecNode* AddNode(unique_ptr[CExecNode] node) + # const NodeVector& sources() const + # const NodeVector& sinks() const + # CFuture[] finished(); + + cdef cppclass CDeclaration "arrow::compute::CDeclaration": + + CResult[CExecNode*] AddToPlan(CExecPlan* plan) const diff --git a/python/pyarrow/includes/libarrow_engine.pxd b/python/pyarrow/includes/libarrow_engine.pxd new file mode 100644 index 0000000..537db5e --- /dev/null +++ b/python/pyarrow/includes/libarrow_engine.pxd @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * + + +cdef extern from "arrow/engine/api.h" namespace "arrow::engine" nogil: + + # CResult[c_vector[CDeclaration]] ConvertPlan(const Buffer&) + # CResult[shared_ptr[CBuffer]] SerializeType(const CDataType&, CExtensionSet*) + + CResult[shared_ptr[CBuffer]] SubstraitFromJSON(c_string type_name, + c_string json) + + CResult[shared_ptr[CBuffer]] SerializeSchema(const CSchema&) + CResult[shared_ptr[CBuffer]] SerializeExpression(const CExpression&) + + CResult[shared_ptr[CDataType]] DeserializeType(const CBuffer&) + CResult[shared_ptr[CSchema]] DeserializeSchema(const CBuffer&) + CResult[CExpression] DeserializeExpression(const CBuffer&) diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 6e45af6..0c9cbcc 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -123,10 +123,6 @@ def _pc(): return pc -def _gdb_test_session(): - GdbTestSession() - - # Assorted compatibility helpers include "compat.pxi" diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py deleted file mode 100644 index 3f44d1d..0000000 --- a/python/pyarrow/tests/test_gdb.py +++ /dev/null @@ -1,854 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from functools import lru_cache -import os -import re -import shutil -import subprocess -import sys - -import pytest - - -here = os.path.dirname(os.path.abspath(__file__)) - -# The GDB script may be found in the source tree (if available) -# or in another location given by the ARROW_GDB_SCRIPT environment variable. -gdb_script = (os.environ.get('ARROW_GDB_SCRIPT') or - os.path.join(here, "../../../cpp/gdb_arrow.py")) - -gdb_command = ["gdb", "--nx"] - - -@lru_cache() -def is_gdb_available(): - try: - proc = subprocess.run(gdb_command + ["--version"], - stdin=subprocess.DEVNULL, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - except FileNotFoundError: - return False - return proc.returncode == 0 - - -@lru_cache() -def python_executable(): - path = shutil.which("python3") - assert path is not None, "Couldn't find python3 executable" - return path - - -def skip_if_gdb_unavailable(): - if not is_gdb_available(): - pytest.skip("gdb command unavailable") - - -class GdbSession: - proc = None - verbose = True - - def __init__(self, *args, **env): - # Let stderr through to let pytest display it separately on errors - self.proc = subprocess.Popen(gdb_command + list(args), - env=env, bufsize=0, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE) - self.last_stdout = [] - self.last_stdout_line = b"" - - def wait_until_ready(self): - """ - Record output until the gdb prompt displays. Return recorded output. - """ - # TODO: add timeout? 
- while (not self.last_stdout_line.startswith(b"(gdb) ") and - self.proc.poll() is None): - block = self.proc.stdout.read(4096) - if self.verbose: - sys.stdout.buffer.write(block) - sys.stdout.buffer.flush() - block, sep, last_line = block.rpartition(b"\n") - if sep: - self.last_stdout.append(self.last_stdout_line) - self.last_stdout.append(block + sep) - self.last_stdout_line = last_line - else: - assert block == b"" - self.last_stdout_line += last_line - - if self.proc.poll() is not None: - raise IOError("gdb session terminated unexpectedly") - - out = b"".join(self.last_stdout).decode('utf-8') - self.last_stdout = [] - self.last_stdout_line = b"" - return out - - def issue_command(self, line): - line = line.encode('utf-8') + b"\n" - if self.verbose: - sys.stdout.buffer.write(line) - sys.stdout.buffer.flush() - self.proc.stdin.write(line) - self.proc.stdin.flush() - - def run_command(self, line): - self.issue_command(line) - return self.wait_until_ready() - - def print_value(self, expr): - """ - Ask gdb to print the value of an expression and return the result. - """ - out = self.run_command(f"p {expr}") - out, n = re.subn(r"^\$\d+ = ", "", out) - assert n == 1, out - # gdb may add whitespace depending on result width, remove it - return out.strip() - - def select_frame(self, func_name): - """ - Select the innermost frame with the given function name. - """ - # Ideally, we would use the "frame function" command, - # but it's not available on old GDB versions (such as 8.1.1), - # so instead parse the stack trace for a matching frame number. 
- out = self.run_command("info stack") - pat = r"(?mi)^#(\d+)\s+.* in " + re.escape(func_name) + " " - m = re.search(pat, out) - if m is None: - pytest.fail(f"Could not select frame for function {func_name}") - - frame_num = int(m[1]) - out = self.run_command(f"frame {frame_num}") - assert f"in {func_name}" in out - - def join(self): - if self.proc is not None: - self.proc.stdin.close() - self.proc.stdout.close() # avoid ResourceWarning - self.proc.kill() - self.proc.wait() - self.proc = None - - def __del__(self): - self.join() - - -@pytest.fixture(scope='session') -def gdb(): - skip_if_gdb_unavailable() - gdb = GdbSession("-q", python_executable()) - try: - gdb.wait_until_ready() - gdb.run_command("set confirm off") - gdb.run_command("set print array-indexes on") - # Make sure gdb formatting is not terminal-dependent - gdb.run_command("set width unlimited") - gdb.run_command("set charset UTF-8") - yield gdb - finally: - gdb.join() - - -@pytest.fixture(scope='session') -def gdb_arrow(gdb): - assert os.path.exists(gdb_script), "GDB script not found" - gdb.run_command(f"source {gdb_script}") - code = "from pyarrow.lib import _gdb_test_session; _gdb_test_session()" - out = gdb.run_command(f"run -c '{code}'") - assert ("Trace/breakpoint trap" in out or - "received signal" in out), out - gdb.select_frame("arrow::gdb::TestSession") - return gdb - - -def test_gdb_session(gdb): - out = gdb.run_command("show version") - assert out.startswith("GNU gdb ("), out - - -def test_gdb_arrow(gdb_arrow): - s = gdb_arrow.print_value("42 + 1") - assert s == "43" - - -def check_stack_repr(gdb, expr, expected): - """ - Check printing a stack-located value. - """ - s = gdb.print_value(expr) - if isinstance(expected, re.Pattern): - assert expected.match(s), s - else: - assert s == expected - - -def check_heap_repr(gdb, expr, expected): - """ - Check printing a heap-located value, given its address.
- """ - s = gdb.print_value(f"*{expr}") - # GDB may prefix the value with an adress or type specification - if s != expected: - assert s.endswith(f" {expected}") - - -def test_status(gdb_arrow): - check_stack_repr(gdb_arrow, "ok_status", "arrow::Status::OK()") - check_stack_repr(gdb_arrow, "error_status", - 'arrow::Status::IOError("This is an error")') - check_stack_repr( - gdb_arrow, "error_detail_status", - 'arrow::Status::IOError("This is an error", ' - 'detail=[custom-detail-id] "This is a detail")') - - check_stack_repr(gdb_arrow, "ok_result", "arrow::Result<int>(42)") - check_stack_repr( - gdb_arrow, "error_result", - 'arrow::Result<int>(arrow::Status::IOError("This is an error"))') - check_stack_repr( - gdb_arrow, "error_detail_result", - 'arrow::Result<int>(arrow::Status::IOError("This is an error", ' - 'detail=[custom-detail-id] "This is a detail"))') - - -def test_string_view(gdb_arrow): - check_stack_repr(gdb_arrow, "string_view_empty", - "arrow::util::string_view of size 0") - check_stack_repr(gdb_arrow, "string_view_abc", - 'arrow::util::string_view of size 3, "abc"') - check_stack_repr( - gdb_arrow, "string_view_special_chars", - r'arrow::util::string_view of size 12, "foo\"bar\000\r\n\t\037"') - check_stack_repr( - gdb_arrow, "string_view_very_long", - 'arrow::util::string_view of size 5006, ' - '"abc", \'K\' <repeats 5000 times>...') - - -def test_buffer_stack(gdb_arrow): - check_stack_repr(gdb_arrow, "buffer_null", - "arrow::Buffer of size 0, read-only") - check_stack_repr(gdb_arrow, "buffer_abc", - 'arrow::Buffer of size 3, read-only, "abc"') - check_stack_repr( - gdb_arrow, "buffer_special_chars", - r'arrow::Buffer of size 12, read-only, "foo\"bar\000\r\n\t\037"') - check_stack_repr(gdb_arrow, "buffer_mutable", - 'arrow::MutableBuffer of size 3, mutable, "abc"') - - -def test_buffer_heap(gdb_arrow): - check_heap_repr(gdb_arrow, "heap_buffer", - 'arrow::Buffer of size 3, read-only, "abc"') - check_heap_repr(gdb_arrow, "heap_buffer_mutable.get()", 
- 'arrow::Buffer of size 3, mutable, "abc"') - - -def test_optionals(gdb_arrow): - check_stack_repr(gdb_arrow, "int_optional", - "arrow::util::optional<int>(42)") - check_stack_repr(gdb_arrow, "null_int_optional", - "arrow::util::optional<int>(nullopt)") - - -def test_variants(gdb_arrow): - check_stack_repr( - gdb_arrow, "int_variant", - "arrow::util::Variant of index 0 (actual type int), value 42") - check_stack_repr( - gdb_arrow, "bool_variant", - "arrow::util::Variant of index 1 (actual type bool), value false") - check_stack_repr( - gdb_arrow, "string_variant", - re.compile(r'^arrow::util::Variant of index 2 \(actual type ' - r'std::.*string.*\), value .*"hello".*')) - - -def test_decimals(gdb_arrow): - v128 = "98765432109876543210987654321098765432" - check_stack_repr(gdb_arrow, "decimal128_zero", "arrow::Decimal128(0)") - check_stack_repr(gdb_arrow, "decimal128_pos", - f"arrow::Decimal128({v128})") - check_stack_repr(gdb_arrow, "decimal128_neg", - f"arrow::Decimal128(-{v128})") - check_stack_repr(gdb_arrow, "basic_decimal128_zero", - "arrow::BasicDecimal128(0)") - check_stack_repr(gdb_arrow, "basic_decimal128_pos", - f"arrow::BasicDecimal128({v128})") - check_stack_repr(gdb_arrow, "basic_decimal128_neg", - f"arrow::BasicDecimal128(-{v128})") - - v256 = ("9876543210987654321098765432109876543210" - "987654321098765432109876543210987654") - check_stack_repr(gdb_arrow, "decimal256_zero", "arrow::Decimal256(0)") - check_stack_repr(gdb_arrow, "decimal256_pos", - f"arrow::Decimal256({v256})") - check_stack_repr(gdb_arrow, "decimal256_neg", - f"arrow::Decimal256(-{v256})") - check_stack_repr(gdb_arrow, "basic_decimal256_zero", - "arrow::BasicDecimal256(0)") - check_stack_repr(gdb_arrow, "basic_decimal256_pos", - f"arrow::BasicDecimal256({v256})") - check_stack_repr(gdb_arrow, "basic_decimal256_neg", - f"arrow::BasicDecimal256(-{v256})") - - -def test_metadata(gdb_arrow): - check_heap_repr(gdb_arrow, "empty_metadata.get()", - "arrow::KeyValueMetadata of size 0") - 
check_heap_repr( - gdb_arrow, "metadata.get()", - ('arrow::KeyValueMetadata of size 2 = {' - '["key_text"] = "some value", ["key_binary"] = "z\\000\\037\\377"}')) - - -def test_types_stack(gdb_arrow): - check_stack_repr(gdb_arrow, "null_type", "arrow::null()") - check_stack_repr(gdb_arrow, "bool_type", "arrow::boolean()") - - check_stack_repr(gdb_arrow, "date32_type", "arrow::date32()") - check_stack_repr(gdb_arrow, "date64_type", "arrow::date64()") - check_stack_repr(gdb_arrow, "time_type_s", - "arrow::time32(arrow::TimeUnit::SECOND)") - check_stack_repr(gdb_arrow, "time_type_ms", - "arrow::time32(arrow::TimeUnit::MILLI)") - check_stack_repr(gdb_arrow, "time_type_us", - "arrow::time64(arrow::TimeUnit::MICRO)") - check_stack_repr(gdb_arrow, "time_type_ns", - "arrow::time64(arrow::TimeUnit::NANO)") - check_stack_repr(gdb_arrow, "timestamp_type_s", - "arrow::timestamp(arrow::TimeUnit::SECOND)") - check_stack_repr( - gdb_arrow, "timestamp_type_ms_timezone", - 'arrow::timestamp(arrow::TimeUnit::MILLI, "Europe/Paris")') - check_stack_repr(gdb_arrow, "timestamp_type_us", - "arrow::timestamp(arrow::TimeUnit::MICRO)") - check_stack_repr( - gdb_arrow, "timestamp_type_ns_timezone", - 'arrow::timestamp(arrow::TimeUnit::NANO, "Europe/Paris")') - - check_stack_repr(gdb_arrow, "day_time_interval_type", - "arrow::day_time_interval()") - check_stack_repr(gdb_arrow, "month_interval_type", - "arrow::month_interval()") - check_stack_repr(gdb_arrow, "month_day_nano_interval_type", - "arrow::month_day_nano_interval()") - check_stack_repr(gdb_arrow, "duration_type_s", - "arrow::duration(arrow::TimeUnit::SECOND)") - check_stack_repr(gdb_arrow, "duration_type_ns", - "arrow::duration(arrow::TimeUnit::NANO)") - - check_stack_repr(gdb_arrow, "decimal128_type", - "arrow::decimal128(16, 5)") - check_stack_repr(gdb_arrow, "decimal256_type", - "arrow::decimal256(42, 12)") - - check_stack_repr(gdb_arrow, "binary_type", "arrow::binary()") - check_stack_repr(gdb_arrow, "string_type", 
"arrow::utf8()") - check_stack_repr(gdb_arrow, "large_binary_type", "arrow::large_binary()") - check_stack_repr(gdb_arrow, "large_string_type", "arrow::large_utf8()") - check_stack_repr(gdb_arrow, "fixed_size_binary_type", - "arrow::fixed_size_binary(10)") - - check_stack_repr(gdb_arrow, "list_type", - "arrow::list(arrow::uint8())") - check_stack_repr(gdb_arrow, "large_list_type", - "arrow::large_list(arrow::large_utf8())") - check_stack_repr(gdb_arrow, "fixed_size_list_type", - "arrow::fixed_size_list(arrow::float64(), 3)") - check_stack_repr( - gdb_arrow, "map_type_unsorted", - "arrow::map(arrow::utf8(), arrow::binary(), keys_sorted=false)") - check_stack_repr( - gdb_arrow, "map_type_sorted", - "arrow::map(arrow::utf8(), arrow::binary(), keys_sorted=true)") - - check_stack_repr(gdb_arrow, "struct_type_empty", - "arrow::struct_({})") - check_stack_repr( - gdb_arrow, "struct_type", - ('arrow::struct_({arrow::field("ints", arrow::int8()), ' - 'arrow::field("strs", arrow::utf8(), nullable=false)})')) - - check_stack_repr( - gdb_arrow, "sparse_union_type", - ('arrow::sparse_union(fields={arrow::field("ints", arrow::int8()), ' - 'arrow::field("strs", arrow::utf8(), nullable=false)}, ' - 'type_codes={7, 42})')) - check_stack_repr( - gdb_arrow, "dense_union_type", - ('arrow::dense_union(fields={arrow::field("ints", arrow::int8()), ' - 'arrow::field("strs", arrow::utf8(), nullable=false)}, ' - 'type_codes={7, 42})')) - - check_stack_repr( - gdb_arrow, "dict_type_unordered", - "arrow::dictionary(arrow::int16(), arrow::utf8(), ordered=false)") - check_stack_repr( - gdb_arrow, "dict_type_ordered", - "arrow::dictionary(arrow::int16(), arrow::utf8(), ordered=true)") - - check_stack_repr( - gdb_arrow, "uuid_type", - ('arrow::ExtensionType "extension<uuid>" ' - 'with storage type arrow::fixed_size_binary(16)')) - - -def test_types_heap(gdb_arrow): - check_heap_repr(gdb_arrow, "heap_null_type", "arrow::null()") - check_heap_repr(gdb_arrow, "heap_bool_type", "arrow::boolean()") - 
- check_heap_repr(gdb_arrow, "heap_time_type_ns", - "arrow::time64(arrow::TimeUnit::NANO)") - check_heap_repr( - gdb_arrow, "heap_timestamp_type_ns_timezone", - 'arrow::timestamp(arrow::TimeUnit::NANO, "Europe/Paris")') - - check_heap_repr(gdb_arrow, "heap_decimal128_type", - "arrow::decimal128(16, 5)") - - check_heap_repr(gdb_arrow, "heap_list_type", - "arrow::list(arrow::uint8())") - check_heap_repr(gdb_arrow, "heap_large_list_type", - "arrow::large_list(arrow::large_utf8())") - check_heap_repr(gdb_arrow, "heap_fixed_size_list_type", - "arrow::fixed_size_list(arrow::float64(), 3)") - check_heap_repr( - gdb_arrow, "heap_map_type", - "arrow::map(arrow::utf8(), arrow::binary(), keys_sorted=false)") - - check_heap_repr( - gdb_arrow, "heap_struct_type", - ('arrow::struct_({arrow::field("ints", arrow::int8()), ' - 'arrow::field("strs", arrow::utf8(), nullable=false)})')) - - check_heap_repr( - gdb_arrow, "heap_dict_type", - "arrow::dictionary(arrow::int16(), arrow::utf8(), ordered=false)") - - check_heap_repr( - gdb_arrow, "heap_uuid_type", - ('arrow::ExtensionType "extension<uuid>" ' - 'with storage type arrow::fixed_size_binary(16)')) - - -def test_fields_stack(gdb_arrow): - check_stack_repr(gdb_arrow, "int_field", - 'arrow::field("ints", arrow::int64())') - check_stack_repr( - gdb_arrow, "float_field", - 'arrow::field("floats", arrow::float32(), nullable=false)') - - -def test_fields_heap(gdb_arrow): - check_heap_repr(gdb_arrow, "heap_int_field", - 'arrow::field("ints", arrow::int64())') - - -def test_scalars_stack(gdb_arrow): - check_stack_repr(gdb_arrow, "null_scalar", "arrow::NullScalar") - check_stack_repr(gdb_arrow, "bool_scalar", - "arrow::BooleanScalar of value true") - check_stack_repr(gdb_arrow, "bool_scalar_null", - "arrow::BooleanScalar of null value") - check_stack_repr(gdb_arrow, "int8_scalar", - "arrow::Int8Scalar of value -42") - check_stack_repr(gdb_arrow, "uint8_scalar", - "arrow::UInt8Scalar of value 234") - check_stack_repr(gdb_arrow, 
"int64_scalar", - "arrow::Int64Scalar of value -9223372036854775808") - check_stack_repr(gdb_arrow, "uint64_scalar", - "arrow::UInt64Scalar of value 18446744073709551615") - check_stack_repr(gdb_arrow, "half_float_scalar", - "arrow::HalfFloatScalar of value -1.5 [48640]") - check_stack_repr(gdb_arrow, "float_scalar", - "arrow::FloatScalar of value 1.25") - check_stack_repr(gdb_arrow, "double_scalar", - "arrow::DoubleScalar of value 2.5") - - check_stack_repr(gdb_arrow, "time_scalar_s", - "arrow::Time32Scalar of value 100s") - check_stack_repr(gdb_arrow, "time_scalar_ms", - "arrow::Time32Scalar of value 1000ms") - check_stack_repr(gdb_arrow, "time_scalar_us", - "arrow::Time64Scalar of value 10000us") - check_stack_repr(gdb_arrow, "time_scalar_ns", - "arrow::Time64Scalar of value 100000ns") - check_stack_repr(gdb_arrow, "time_scalar_null", - "arrow::Time64Scalar of null value [ns]") - - check_stack_repr(gdb_arrow, "duration_scalar_s", - "arrow::DurationScalar of value -100s") - check_stack_repr(gdb_arrow, "duration_scalar_ms", - "arrow::DurationScalar of value -1000ms") - check_stack_repr(gdb_arrow, "duration_scalar_us", - "arrow::DurationScalar of value -10000us") - check_stack_repr(gdb_arrow, "duration_scalar_ns", - "arrow::DurationScalar of value -100000ns") - check_stack_repr(gdb_arrow, "duration_scalar_null", - "arrow::DurationScalar of null value [ns]") - - check_stack_repr( - gdb_arrow, "timestamp_scalar_s", - "arrow::TimestampScalar of value 12345s [no timezone]") - check_stack_repr( - gdb_arrow, "timestamp_scalar_ms", - "arrow::TimestampScalar of value -123456ms [no timezone]") - check_stack_repr( - gdb_arrow, "timestamp_scalar_us", - "arrow::TimestampScalar of value 1234567us [no timezone]") - check_stack_repr( - gdb_arrow, "timestamp_scalar_ns", - "arrow::TimestampScalar of value -12345678ns [no timezone]") - check_stack_repr( - gdb_arrow, "timestamp_scalar_null", - "arrow::TimestampScalar of null value [ns, no timezone]") - - check_stack_repr( - 
gdb_arrow, "timestamp_scalar_s_tz", - 'arrow::TimestampScalar of value 12345s ["Europe/Paris"]') - check_stack_repr( - gdb_arrow, "timestamp_scalar_ms_tz", - 'arrow::TimestampScalar of value -123456ms ["Europe/Paris"]') - check_stack_repr( - gdb_arrow, "timestamp_scalar_us_tz", - 'arrow::TimestampScalar of value 1234567us ["Europe/Paris"]') - check_stack_repr( - gdb_arrow, "timestamp_scalar_ns_tz", - 'arrow::TimestampScalar of value -12345678ns ["Europe/Paris"]') - check_stack_repr( - gdb_arrow, "timestamp_scalar_null_tz", - 'arrow::TimestampScalar of null value [ns, "Europe/Paris"]') - - check_stack_repr(gdb_arrow, "month_interval_scalar", - "arrow::MonthIntervalScalar of value 23M") - check_stack_repr(gdb_arrow, "month_interval_scalar_null", - "arrow::MonthIntervalScalar of null value") - check_stack_repr(gdb_arrow, "day_time_interval_scalar", - "arrow::DayTimeIntervalScalar of value 23d-456ms") - check_stack_repr(gdb_arrow, "day_time_interval_scalar_null", - "arrow::DayTimeIntervalScalar of null value") - check_stack_repr( - gdb_arrow, "month_day_nano_interval_scalar", - "arrow::MonthDayNanoIntervalScalar of value 1M23d-456ns") - check_stack_repr( - gdb_arrow, "month_day_nano_interval_scalar_null", - "arrow::MonthDayNanoIntervalScalar of null value") - - check_stack_repr(gdb_arrow, "date32_scalar", - "arrow::Date32Scalar of value 23d") - check_stack_repr(gdb_arrow, "date32_scalar_null", - "arrow::Date32Scalar of null value") - check_stack_repr(gdb_arrow, "date64_scalar", - "arrow::Date64Scalar of value 3870000000ms") - check_stack_repr(gdb_arrow, "date64_scalar_null", - "arrow::Date64Scalar of null value") - - check_stack_repr( - gdb_arrow, "decimal128_scalar_null", - "arrow::Decimal128Scalar of null value [precision=10, scale=4]") - check_stack_repr( - gdb_arrow, "decimal128_scalar_pos_scale_pos", - "arrow::Decimal128Scalar of value 123.4567 [precision=10, scale=4]") - check_stack_repr( - gdb_arrow, "decimal128_scalar_pos_scale_neg", - "arrow::Decimal128Scalar 
of value -123.4567 [precision=10, scale=4]") - check_stack_repr( - gdb_arrow, "decimal128_scalar_neg_scale_pos", - ("arrow::Decimal128Scalar of value 1.234567e+10 " - "[precision=10, scale=-4]")) - check_stack_repr( - gdb_arrow, "decimal128_scalar_neg_scale_neg", - ("arrow::Decimal128Scalar of value -1.234567e+10 " - "[precision=10, scale=-4]")) - - check_stack_repr( - gdb_arrow, "decimal256_scalar_null", - "arrow::Decimal256Scalar of null value [precision=50, scale=4]") - check_stack_repr( - gdb_arrow, "decimal256_scalar_pos_scale_pos", - ("arrow::Decimal256Scalar of value " - "123456789012345678901234567890123456789012.3456 " - "[precision=50, scale=4]")) - check_stack_repr( - gdb_arrow, "decimal256_scalar_pos_scale_neg", - ("arrow::Decimal256Scalar of value " - "-123456789012345678901234567890123456789012.3456 " - "[precision=50, scale=4]")) - check_stack_repr( - gdb_arrow, "decimal256_scalar_neg_scale_pos", - ("arrow::Decimal256Scalar of value " - "1.234567890123456789012345678901234567890123456e+49 " - "[precision=50, scale=-4]")) - check_stack_repr( - gdb_arrow, "decimal256_scalar_neg_scale_neg", - ("arrow::Decimal256Scalar of value " - "-1.234567890123456789012345678901234567890123456e+49 " - "[precision=50, scale=-4]")) - - check_stack_repr( - gdb_arrow, "binary_scalar_null", - "arrow::BinaryScalar of null value") - check_stack_repr( - gdb_arrow, "binary_scalar_unallocated", - "arrow::BinaryScalar of value <unallocated>") - check_stack_repr( - gdb_arrow, "binary_scalar_empty", - 'arrow::BinaryScalar of size 0, value ""') - check_stack_repr( - gdb_arrow, "binary_scalar_abc", - 'arrow::BinaryScalar of size 3, value "abc"') - check_stack_repr( - gdb_arrow, "binary_scalar_bytes", - r'arrow::BinaryScalar of size 3, value "\000\037\377"') - check_stack_repr( - gdb_arrow, "large_binary_scalar_abc", - 'arrow::LargeBinaryScalar of size 3, value "abc"') - - check_stack_repr( - gdb_arrow, "string_scalar_null", - "arrow::StringScalar of null value") - check_stack_repr( 
- gdb_arrow, "string_scalar_unallocated", - "arrow::StringScalar of value <unallocated>") - check_stack_repr( - gdb_arrow, "string_scalar_empty", - 'arrow::StringScalar of size 0, value ""') - check_stack_repr( - gdb_arrow, "string_scalar_hehe", - 'arrow::StringScalar of size 6, value "héhé"') - # FIXME: excessive escaping ('\\xff' vs. '\x00') - check_stack_repr( - gdb_arrow, "string_scalar_invalid_chars", - r'arrow::StringScalar of size 11, value "abc\x00def\\xffghi"') - check_stack_repr( - gdb_arrow, "large_string_scalar_hehe", - 'arrow::LargeStringScalar of size 6, value "héhé"') - - check_stack_repr( - gdb_arrow, "fixed_size_binary_scalar", - 'arrow::FixedSizeBinaryScalar of size 3, value "abc"') - check_stack_repr( - gdb_arrow, "fixed_size_binary_scalar_null", - 'arrow::FixedSizeBinaryScalar of size 3, null value') - - check_stack_repr( - gdb_arrow, "dict_scalar", - re.compile( - (r'^arrow::DictionaryScalar of index ' - r'arrow::Int8Scalar of value 42, ' - r'dictionary arrow::StringArray '))) - check_stack_repr( - gdb_arrow, "dict_scalar_null", - ('arrow::DictionaryScalar of type ' - 'arrow::dictionary(arrow::int8(), arrow::utf8(), ordered=false), ' - 'null value')) - - check_stack_repr( - gdb_arrow, "list_scalar", - ('arrow::ListScalar of value arrow::Int32Array of ' - 'length 3, null count 0')) - check_stack_repr( - gdb_arrow, "list_scalar_null", - 'arrow::ListScalar of type arrow::list(arrow::int32()), null value') - check_stack_repr( - gdb_arrow, "large_list_scalar", - ('arrow::LargeListScalar of value arrow::Int32Array of ' - 'length 3, null count 0')) - check_stack_repr( - gdb_arrow, "large_list_scalar_null", - ('arrow::LargeListScalar of type arrow::large_list(arrow::int32()), ' - 'null value')) - check_stack_repr( - gdb_arrow, "fixed_size_list_scalar", - ('arrow::FixedSizeListScalar of value arrow::Int32Array of ' - 'length 3, null count 0')) - check_stack_repr( - gdb_arrow, "fixed_size_list_scalar_null", - ('arrow::FixedSizeListScalar of type ' - 
'arrow::fixed_size_list(arrow::int32(), 3), null value')) - - check_stack_repr( - gdb_arrow, "struct_scalar", - ('arrow::StructScalar = {["ints"] = arrow::Int32Scalar of value 42, ' - '["strs"] = arrow::StringScalar of size 9, value "some text"}')) - check_stack_repr( - gdb_arrow, "struct_scalar_null", - ('arrow::StructScalar of type arrow::struct_(' - '{arrow::field("ints", arrow::int32()), ' - 'arrow::field("strs", arrow::utf8())}), null value')) - - check_stack_repr( - gdb_arrow, "sparse_union_scalar", - ('arrow::SparseUnionScalar of type code 7, ' - 'value arrow::Int32Scalar of value 43')) - check_stack_repr( - gdb_arrow, "sparse_union_scalar_null", re.compile( - r'^arrow::SparseUnionScalar of type arrow::sparse_union\(.*\), ' - r'type code 7, null value$')) - check_stack_repr( - gdb_arrow, "dense_union_scalar", - ('arrow::DenseUnionScalar of type code 7, ' - 'value arrow::Int32Scalar of value 43')) - check_stack_repr( - gdb_arrow, "dense_union_scalar_null", re.compile( - r'^arrow::DenseUnionScalar of type arrow::dense_union\(.*\), ' - r'type code 7, null value$')) - - check_stack_repr( - gdb_arrow, "extension_scalar", - ('arrow::ExtensionScalar of type "extension<uuid>", ' - 'value arrow::FixedSizeBinaryScalar of size 16, ' - 'value "0123456789abcdef"')) - check_stack_repr( - gdb_arrow, "extension_scalar_null", - 'arrow::ExtensionScalar of type "extension<uuid>", null value') - - -def test_scalars_heap(gdb_arrow): - check_heap_repr(gdb_arrow, "heap_null_scalar", "arrow::NullScalar") - check_heap_repr(gdb_arrow, "heap_bool_scalar", - "arrow::BooleanScalar of value true") - check_heap_repr( - gdb_arrow, "heap_decimal128_scalar", - "arrow::Decimal128Scalar of value 123.4567 [precision=10, scale=4]") - check_heap_repr( - gdb_arrow, "heap_decimal256_scalar", - ("arrow::Decimal256Scalar of value " - "123456789012345678901234567890123456789012.3456 " - "[precision=50, scale=4]")) - - check_heap_repr( - gdb_arrow, "heap_map_scalar", - ('arrow::MapScalar of type 
arrow::map(arrow::utf8(), arrow::int32(), ' - 'keys_sorted=false), value length 2, null count 0')) - check_heap_repr( - gdb_arrow, "heap_map_scalar_null", - ('arrow::MapScalar of type arrow::map(arrow::utf8(), arrow::int32(), ' - 'keys_sorted=false), null value')) - - -def test_array_data(gdb_arrow): - check_stack_repr( - gdb_arrow, "int32_array_data", - "arrow::ArrayData of type arrow::int32(), length 4, null count 1") - - -def test_arrays_stack(gdb_arrow): - check_stack_repr( - gdb_arrow, "int32_array", - "arrow::Int32Array of length 4, null count 1") - check_stack_repr( - gdb_arrow, "list_array", - ("arrow::ListArray of type arrow::list(arrow::int64()), " - "length 3, null count 1")) - - -def test_arrays_heap(gdb_arrow): - check_heap_repr( - gdb_arrow, "heap_int32_array", - "arrow::Int32Array of length 4, null count 1") - check_heap_repr( - gdb_arrow, "heap_list_array", - ("arrow::ListArray of type arrow::list(arrow::int64()), " - "length 3, null count 1")) - - -def test_schema(gdb_arrow): - check_heap_repr(gdb_arrow, "schema_empty", - "arrow::Schema with 0 fields") - check_heap_repr( - gdb_arrow, "schema_non_empty", - ('arrow::Schema with 2 fields = {["ints"] = arrow::int8(), ' - '["strs"] = arrow::utf8()}')) - check_heap_repr( - gdb_arrow, "schema_with_metadata", - ('arrow::Schema with 2 fields and 2 metadata items = ' - '{["ints"] = arrow::int8(), ["strs"] = arrow::utf8()}')) - - -def test_chunked_array(gdb_arrow): - check_stack_repr( - gdb_arrow, "chunked_array", - ("arrow::ChunkedArray of type arrow::int32(), length 5, null count 1 " - "with 2 chunks = {[0] = length 2, null count 0, " - "[1] = length 3, null count 1}")) - - -def test_record_batch(gdb_arrow): - expected_batch = ( - 'arrow::RecordBatch with 2 columns, 3 rows = {' - '["ints"] = arrow::ArrayData of type arrow::int32(), ' - 'length 3, null count 0, ' - '["strs"] = arrow::ArrayData of type arrow::utf8(), ' - 'length 3, null count 1}') - - # Representations may differ between those two because of 
- # RecordBatch (base class) vs. SimpleRecordBatch (concrete class). - check_heap_repr(gdb_arrow, "batch", expected_batch) - check_heap_repr(gdb_arrow, "batch.get()", expected_batch) - - expected_batch_with_metadata = ( - 'arrow::RecordBatch with 2 columns, 3 rows, 3 metadata items = {' - '["ints"] = arrow::ArrayData of type arrow::int32(), ' - 'length 3, null count 0, ' - '["strs"] = arrow::ArrayData of type arrow::utf8(), ' - 'length 3, null count 1}') - - check_heap_repr(gdb_arrow, "batch_with_metadata", - expected_batch_with_metadata) - - -def test_table(gdb_arrow): - expected_table = ( - 'arrow::Table with 2 columns, 5 rows = {' - '["ints"] = arrow::ChunkedArray of type arrow::int32(), ' - 'length 5, null count 0 with 2 chunks = ' - '{[0] = length 3, null count 0, [1] = length 2, null count 0}, ' - '["strs"] = arrow::ChunkedArray of type arrow::utf8(), ' - 'length 5, null count 1 with 3 chunks = ' - '{[0] = length 2, null count 1, [1] = length 1, null count 0, ' - '[2] = length 2, null count 0}}') - - # Same as RecordBatch above (Table vs. SimpleTable) - check_heap_repr(gdb_arrow, "table", expected_table) - check_heap_repr(gdb_arrow, "table.get()", expected_table) - - -def test_datum(gdb_arrow): - check_stack_repr(gdb_arrow, "empty_datum", "arrow::Datum (empty)") - check_stack_repr( - gdb_arrow, "scalar_datum", - "arrow::Datum of value arrow::BooleanScalar of null value") - check_stack_repr( - gdb_arrow, "array_datum", - re.compile(r"^arrow::Datum of value arrow::ArrayData of type ")) - check_stack_repr( - gdb_arrow, "chunked_array_datum", - re.compile(r"^arrow::Datum of value arrow::ChunkedArray of type ")) - check_stack_repr( - gdb_arrow, "batch_datum", - re.compile(r"^arrow::Datum of value arrow::RecordBatch " - r"with 2 columns, 3 rows ")) - check_stack_repr( - gdb_arrow, "table_datum", - re.compile(r"^arrow::Datum of value arrow::Table " - r"with 2 columns, 5 rows "))
