pitrou commented on a change in pull request #12092: URL: https://github.com/apache/arrow/pull/12092#discussion_r788987815
# ########## File path: cpp/gdb_arrow.py ##########
# @@ -0,0 +1,1894 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from collections import namedtuple
from collections.abc import Sequence
import decimal
import enum
from functools import lru_cache, partial
import struct
import sys
import warnings

import gdb
from gdb.types import get_basic_type

# gdb API docs at https://sourceware.org/gdb/onlinedocs/gdb/Python-API.html#Python-API

# TODO check guidelines here: https://sourceware.org/gdb/onlinedocs/gdb/Writing-a-Pretty_002dPrinter.html
# TODO investigate auto-loading: https://sourceware.org/gdb/onlinedocs/gdb/Auto_002dloading-extensions.html#Auto_002dloading-extensions


_type_ids = [
    'NA', 'BOOL', 'UINT8', 'INT8', 'UINT16', 'INT16', 'UINT32', 'INT32',
    'UINT64', 'INT64', 'HALF_FLOAT', 'FLOAT', 'DOUBLE', 'STRING', 'BINARY',
    'FIXED_SIZE_BINARY', 'DATE32', 'DATE64', 'TIMESTAMP', 'TIME32', 'TIME64',
    'INTERVAL_MONTHS', 'INTERVAL_DAY_TIME', 'DECIMAL128', 'DECIMAL256',
    'LIST', 'STRUCT', 'SPARSE_UNION', 'DENSE_UNION', 'DICTIONARY', 'MAP',
    'EXTENSION', 'FIXED_SIZE_LIST', 'DURATION', 'LARGE_STRING',
    'LARGE_BINARY', 'LARGE_LIST', 'INTERVAL_MONTH_DAY_NANO']

# Mirror the C++ Type::type enum
Type = enum.IntEnum('Type', _type_ids, start=0)


@lru_cache()
def byte_order():
    """
    Get the target program (not the GDB host's) endianness.

    Returns 'big' or 'little'.  The answer is parsed from GDB's
    "show endian" output and cached by `lru_cache`, so the command is
    only executed once.  If the output mentions neither word, a warning
    is emitted and the host's `sys.byteorder` is returned instead.
    """
    s = gdb.execute("show endian", to_string=True).strip()
    if 'big' in s:
        return 'big'
    elif 'little' in s:
        return 'little'
    warnings.warn('Could not determine target endianness '
                  f'from GDB\'s response:\n"""{s}"""')
    # Fall back to host endianness
    return sys.byteorder


def for_evaluation(val, ty=None):
    """
    Return a parsable form of gdb.Value `val`, optionally with gdb.Type `ty`.

    The result is a C++ expression string meant to be spliced into a
    larger expression (see its use with `gdb.parse_and_eval` below).
    If `ty` is omitted, the basic type of `val` is used.

    Raises ValueError if `val` is neither a pointer nor addressable.
    """
    if ty is None:
        ty = get_basic_type(val.type)
    if ty.code == gdb.TYPE_CODE_PTR:
        # It's already a pointer, can represent it directly
        return f"(({ty}) ({val}))"
    if val.address is None:
        raise ValueError(f"Cannot further evaluate rvalue: {val}")
    # Re-read the object through its address, cast to the requested type
    return f"(* ({ty}*) ({val.address}))"


def is_char_star(ty):
    """
    Return whether gdb.Type `ty` is a pointer to a char-like type.
    """
    # Note that "const char*" can have TYPE_CODE_INT as target type...
    ty = get_basic_type(ty)
    return (ty.code == gdb.TYPE_CODE_PTR and
            get_basic_type(ty.target()).code
            in (gdb.TYPE_CODE_CHAR, gdb.TYPE_CODE_INT))


def deref(val):
    """
    Dereference a raw or smart pointer.

    Raw pointers are dereferenced directly.  Types whose name starts with
    "std::" and contains "shared" or "unique" are unwrapped through the
    `SharedPtr` / `UniquePtr` helpers.

    Raises TypeError for any other type.
    """
    ty = get_basic_type(val.type)
    if ty.code == gdb.TYPE_CODE_PTR:
        return val.dereference()
    # gdb.Type.name is None for unnamed types; treat that as "no match"
    # so that the TypeError below is raised rather than an AttributeError.
    name = ty.name or ""
    if name.startswith('std::'):
        if "shared" in name:
            return SharedPtr(val).value
        if "unique" in name:
            return UniquePtr(val).value
    raise TypeError(f"Cannot dereference value of type '{ty.name}'")


# Translation table (code point -> replacement text) used by
# string_literal() to escape characters inside a double-quoted literal.
_string_literal_mapping = {
    ord('\\'): r'\\',
    ord('\n'): r'\n',
    ord('\r'): r'\r',
    ord('\t'): r'\t',
    ord('"'): r'\"',
}

# Remaining control characters below 0x20 are shown as \xNN escapes
for c in range(0, 32):
    if c not in _string_literal_mapping:
        _string_literal_mapping[c] = f"\\x{c:02x}"


def string_literal(s):
    """
    Format a Python string or gdb.Value for display as a literal.

    The text is wrapped in double quotes with special characters escaped.
    Strings longer than 50 characters are cut to their first 50
    characters and suffixed with " [continued]".
    """
    max_len = 50
    if isinstance(s, gdb.Value):
        s = s.string()
    if len(s) > max_len:
        s = s[:max_len]
        return '"' + s.translate(_string_literal_mapping) + '" [continued]'
    else:
        return '"' + s.translate(_string_literal_mapping) + '"'


def bytes_literal(val, size=None):
    """
    Format a gdb.Value for display as a literal containing possibly
    unprintable characters.

    `size` is forwarded as the `length` argument of
    `gdb.Value.lazy_string`.
    """
    return val.lazy_string(length=size).value()


def utf8_literal(val, size=None):
    """
    Format a gdb.Value for display as a utf-8 literal.

    Undecodable bytes are rendered with backslash escapes.  With
    `size=None` no explicit length is passed to `gdb.Value.string`;
    `size=0` yields an empty literal without reading the value.
    """
    if size is None:
        s = val.string(encoding='utf8', errors='backslashreplace')
    elif size != 0:
        s = val.string(encoding='utf8', errors='backslashreplace', length=size)
    else:
        s = ""
    return string_literal(s)


def half_float_value(val):
    """
    Return a Python float of the given half-float (represented as a uint64_t
    gdb.Value).

    Only the 2 bytes located at `val.address` are read from the inferior
    and decoded as a half-precision float.
    NOTE(review): the "uint64_t" wording is inherited from the original
    docstring; the storage type is not visible here -- TODO confirm.
    """
    buf = gdb.selected_inferior().read_memory(val.address, 2)
    # The bytes come from the target, so decode them with the target's
    # byte order rather than the host's native one.
    fmt = "<e" if byte_order() == 'little' else ">e"
    return struct.unpack(fmt, buf)[0]


def load_atomic(val):
    """
    Load a std::atomic<T>'s value.

    `T` is taken from the first template argument of `val`'s type.
    """
    valty = val.type.template_argument(0)
    # XXX This assumes std::atomic<T> has the same layout as a raw T.
    return val.address.reinterpret_cast(valty.pointer()).dereference()


def load_null_count(val):
    """
    Load a null count from a gdb.Value of an integer (either atomic or not).

    Values whose basic type is not a plain integer are read through
    load_atomic().
    """
    if get_basic_type(val.type).code != gdb.TYPE_CODE_INT:
        val = load_atomic(val)
    return val


def format_null_count(val):
    """
    Format a null count value.

    `val` may be a Python int or a gdb.Value accepted by
    load_null_count().  A count of -1 is displayed as
    "unknown null count".
    """
    # Plain ints are used as-is; anything else is loaded from the debuggee.
    null_count = val if isinstance(val, int) else int(load_null_count(val))
    return (f"null count {null_count}" if null_count != -1
            else "unknown null count")


def short_time_unit(val):
    """
    Return the abbreviation ('s', 'ms', 'us' or 'ns') for the time unit
    whose integer value is `val`.
    """
    return ['s', 'ms', 'us', 'ns'][int(val)]


def format_month_interval(val):
    """
    Format a MonthInterval value.
    """
    return f"{int(val)}M"


def cast_to_concrete(val, ty):
    """
    Reinterpret gdb.Value `val` as gdb.Type `ty`.

    The cast goes through references: a reference to `val` is
    reinterpreted as a reference to `ty`, and the referenced value is
    returned.
    """
    return (val.reference_value().reinterpret_cast(ty.reference())
            .referenced_value())


def scalar_class_from_type(name):
    """
    Given a DataTypeClass class name (such as "BooleanType"), return the
    corresponding Scalar class name.
    """
    assert name.endswith("Type")
    return name[:-4] + "Scalar"


def array_class_from_type(name):
    """
    Given a DataTypeClass class name (such as "BooleanType"), return the
    corresponding Array class name.
    """
    assert name.endswith("Type")
    return name[:-4] + "Array"


class CString:
    """
    A `const char*` or similar value.
    """

    def __init__(self, val):
        self.val = val

    def __bool__(self):
        # True only for a non-null pointer to a non-empty string
        return int(self.data) != 0 and int(self.data[0]) != 0

    @property
    def data(self):
        return self.val

    def bytes_literal(self):
        return self.val.lazy_string().value()

    def string_literal(self):
        # XXX use lazy_string() as well?
        return string_literal(self.val)

    def string(self):
        return self.val.string()

    def __format__(self, fmt):
        # The format spec is ignored; always render the bytes literal
        return str(self.bytes_literal())


# NOTE: gdb.parse_and_eval() is *slow* and calling it multiple times
# may add noticeable latencies.  For standard C++ classes, we therefore
# try to fetch their properties from libstdc++ internals (which hopefully
# are stable), before falling back on calling the public API methods.

class SharedPtr:
    """
    A `std::shared_ptr<T>` value.
    """

    def __init__(self, val):
        self.val = val
        try:
            # libstdc++ internals
            self._ptr = val['_M_ptr']
        except gdb.error:
            # fallback for other C++ standard libraries
            self._ptr = gdb.parse_and_eval(f"{for_evaluation(val)}.get()")

    # (The quoted diff stops at the line under review; the remainder of
    # this class and of the file is not part of this message.)


# Review comment: Hmm, as you can see in the `StdVector` constructor, there
# can be a bit more divergence, so I'm not sure a simple get/fallback function
# would suffice.
#
# -- This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
