Repository: incubator-impala Updated Branches: refs/heads/master 4c9c74dd3 -> a59408b57
IMPALA-3489: Add script to extract breakpad symbols from binaries Change-Id: I3ee0972efcb50609407b04cd6f4309b244a84861 Reviewed-on: http://gerrit.cloudera.org:8080/2961 Reviewed-by: Lars Volker <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/12799fae Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/12799fae Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/12799fae Branch: refs/heads/master Commit: 12799fae6c01d2ca29fcab37d6d7f2a8a6a409df Parents: 4c9c74d Author: Lars Volker <[email protected]> Authored: Thu May 5 14:37:10 2016 +0200 Committer: Tim Armstrong <[email protected]> Committed: Tue May 17 01:30:11 2016 -0700 ---------------------------------------------------------------------- bin/dump_breakpad_symbols.py | 275 ++++++++++++++++++++++++++++++++ infra/python/deps/requirements.txt | 1 + 2 files changed, 276 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12799fae/bin/dump_breakpad_symbols.py ---------------------------------------------------------------------- diff --git a/bin/dump_breakpad_symbols.py b/bin/dump_breakpad_symbols.py new file mode 100755 index 0000000..a164783 --- /dev/null +++ b/bin/dump_breakpad_symbols.py @@ -0,0 +1,275 @@ +#!/usr/bin/env impala-python +# Copyright 2016 Cloudera Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script can be used to dump symbols using the 'dump_syms' binary, which is contained +# in Google Breakpad. It supports collecting binary files from different sources: +# +# - Scan an Impala build dir for ELF files +# - Read files from stdin +# - Process a list of one or more explicitly specified files +# - Extract an Impala rpm and corresponding debuginfo rpm file, scan for ELF files, and +# process them together with their respective .debug file. +# +# Dependencies: +# - rpm2cpio (sudo apt-get -y install rpm2cpio) +# - cpio (sudo apt-get -y install cpio) +# - Google Breakpad, either installed via the Impala toolchain or separately +# +# Usage: dump_breakpad_symbols.py -h +# +# Typical usage patterns: +# ----------------------- +# +# * Extract symbols from an rpm file and its debuginfo counterpart: +# ./dump_breakpad_symbols.py -d /tmp/syms \ +# -r tmp/impala-2.5.0+cdh5.7.0+0-1.cdh5.7.0.p0.147.el6.x86_64.rpm \ +# -s tmp/impala-debuginfo-2.5.0+cdh5.7.0+0-1.cdh5.7.0.p0.147.el6.x86_64.rpm +# +# Note that this will process all ELF binaries in the rpm, including both debug and +# release builds. Files are identified by hashes, so you don't need to worry about +# collisions and you can expect it to 'just work'. 
+# +# * Scan an impalad build directory and extract Breakpad symbols from all binaries: +# ./dump_breakpad_symbols.py -d /tmp/syms -b be/build/debug +# +# * Use the 'minidump_stackwalk' tool after symbol extraction to process a minidump file: +# $IMPALA_TOOLCHAIN/breakpad-*/bin/minidump_stackwalk \ +# /tmp/impala-minidumps/impalad/03c0ee26-bfd1-cf3e-43fa49ca-1a6aae25.dmp /tmp/syms + +import errno +import logging +import glob +import magic +import os +import shutil +import subprocess +import sys +import tempfile + +from argparse import ArgumentParser +from collections import namedtuple + +logging.basicConfig(level=logging.INFO) + +BinaryDebugInfo = namedtuple('BinaryDebugInfo', 'path, debug_path') + + +def die(msg=''): + """End the process, optionally after printing the passed error message.""" + logging.error('ERROR: %s\n' % msg) + sys.exit(1) + + +def find_dump_syms_binary(): + """Locate the 'dump_syms' binary from Breakpad. + + We try to locate the package in the Impala toolchain folder. + TODO: Look up the binary in the system path. Not urgent, since the user can specify the + path as a command line switch. 
+ """ + toolchain = os.environ.get('IMPALA_TOOLCHAIN') + if toolchain: + if not os.path.isdir(toolchain): + die('Could not find toolchain directory') + breakpad_version = os.environ.get('IMPALA_BREAKPAD_VERSION') + if not breakpad_version: + die('Could not determine breakpad version from toolchain') + breakpad_dir = 'breakpad-%s' % breakpad_version + dump_syms = os.path.join(toolchain, breakpad_dir, 'bin', 'dump_syms') + if not os.path.isfile(dump_syms): + die('Could not find dump_syms executable at %s' % dump_syms) + return dump_syms + return '' + + +def parse_args(): + """Parse command line arguments and perform sanity checks.""" + parser = ArgumentParser() + parser.add_argument('-d', '--dest_dir', required=True, help="""The target directory, + below which to place extracted symbol files""") + parser.add_argument('--dump_syms', help='Path to the dump_syms binary from Breakpad') + # Options controlling how to find input files. + parser.add_argument('-b', '--build_dir', help="""Path to a directory containing results + from an Impala build, e.g. be/build/debug""") + parser.add_argument('-f', '--binary_files', nargs='+', metavar="FILE", + help='List of binary files to process') + parser.add_argument('-i', '--stdin_files', action='store_true', help="""Read the list + of files to process from stdin""") + parser.add_argument('-r', '--rpm', help="""RPM file containing the binaries to process, + use with --debuginfo_rpm""") + parser.add_argument('-s', '--debuginfo_rpm', help="""RPM file containing the debug + symbols matching the binaries in --rpm""") + args = parser.parse_args() + + # Post processing checks + # Check that either both rpm and debuginfo_rpm are specified, or none. 
+ if bool(args.rpm) != bool(args.debuginfo_rpm): + parser.print_usage() + die('Either both --rpm and --debuginfo_rpm have to be specified, or none') + input_flags = [args.build_dir, args.binary_files, args.stdin_files, args.rpm] + if sum(1 for flag in input_flags if flag) != 1: + die('You need to specify exactly one way to locate input files (-b/-f/-i/-r,-s)') + + return args + + +def ensure_dir_exists(path): + """Make sure the directory 'path' exists in a thread-safe way.""" + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST or not os.path.isdir(path): + raise e + + +def walk_path(path): + for dirpath, dirnames, filenames in os.walk(path): + for name in filenames: + yield os.path.join(dirpath, name) + + +def is_regular_file(path): + """Check whether 'path' is a regular file, especially not a symlink.""" + return os.path.isfile(path) and not os.path.islink(path) + + +def is_elf_file(path): + """Check whether 'path' is an ELF file.""" + return is_regular_file(path) and 'ELF' in magic.from_file(path) + + +def find_elf_files(path): + """Walk 'path' and return a generator over all ELF files below.""" + return (f for f in walk_path(path) if is_elf_file(f)) + + +def extract_rpm(rpm, out_dir): + """Extract 'rpm' into 'out_dir'.""" + assert os.path.isdir(out_dir) + cmd = 'rpm2cpio %s | cpio -id' % rpm + subprocess.check_call(cmd, shell=True, cwd=out_dir) + + +def assert_file_exists(path): + if not os.path.isfile(path): + die('File does not exists: %s' % path) + + +def enumerate_rpm_files(rpm, debuginfo_rpm): + """Return a generator over BinaryDebugInfo tuples for all ELF files in 'rpm'. + + This function extracts both RPM files, then walks the binary rpm directory to enumerate + all ELF files, matches them to the location of their respective .debug file and yields + all tuples thereof. We use a generator here to keep the temporary directory and its + contents around until the consumer of the generator has finished its processing. 
+ """ + IMPALA_BINARY_BASE = os.path.join('usr', 'lib', 'impala') + IMPALA_DEBUGINFO_BASE = os.path.join('usr', 'lib', 'debug', IMPALA_BINARY_BASE) + assert_file_exists(rpm) + assert_file_exists(debuginfo_rpm) + tmp_dir = tempfile.mkdtemp() + try: + # Extract rpm + logging.info('Extracting: %s' % rpm) + extract_rpm(os.path.abspath(rpm), tmp_dir) + # Extract debuginfo_rpm + logging.info('Extracting: %s' % debuginfo_rpm) + extract_rpm(os.path.abspath(debuginfo_rpm), tmp_dir) + # Walk rpm path and find elf files + binary_base = os.path.join(tmp_dir, IMPALA_BINARY_BASE) + debuginfo_base = os.path.join(tmp_dir, IMPALA_DEBUGINFO_BASE) + # Find folder with .debug file in debuginfo_rpm path + for binary_path in find_elf_files(binary_base): + # Add tuple to output + rel_dir = os.path.relpath(os.path.dirname(binary_path), binary_base) + debug_dir = os.path.join(debuginfo_base, rel_dir) + yield BinaryDebugInfo(binary_path, debug_dir) + finally: + shutil.rmtree(tmp_dir) + + +def enumerate_binaries(args): + """Enumerate all BinaryDebugInfo tuples, from which symbols should be extracted. + + This function returns iterables, either lists or generators. + """ + if args.binary_files: + return (BinaryDebugInfo(f, None) for f in args.binary_files) + elif args.stdin_files: + return (BinaryDebugInfo(f, None) for f in sys.stdin.read().splitlines()) + elif args.rpm: + return enumerate_rpm_files(args.rpm, args.debuginfo_rpm) + elif args.build_dir: + return (BinaryDebugInfo(f, None) for f in find_elf_files(args.build_dir)) + die('No input method provided') + + +def process_binary(dump_syms, binary, out_dir): + """Dump symbols of a single binary file and move the result. + + Symbols will be extracted to a temporary file and moved into place afterwards. Required + directories will be created if necessary. + """ + logging.info('Processing binary file: %s' % binary.path) + ensure_dir_exists(out_dir) + # tmp_fd will be closed when the file object created by os.fdopen() below gets + # destroyed. 
+ tmp_fd, tmp_file = tempfile.mkstemp(dir=out_dir, suffix='.sym') + try: + # Run dump_syms on the binary. + args = [dump_syms, binary.path] + if binary.debug_path: + args.append(binary.debug_path) + proc = subprocess.Popen(args, stdout=os.fdopen(tmp_fd, 'wb'), stderr=subprocess.PIPE) + _, stderr = proc.communicate() + if proc.returncode != 0: + sys.stderr.write('Failed to dump symbols from %s, return code %s\n' % + (binary.path, proc.returncode)) + sys.stderr.write(stderr) + os.remove(tmp_file) + return False + # Parse the temporary file to determine the full target path. + with open(tmp_file, 'r') as f: + header = f.readline().strip() + # Format of header is: MODULE os arch binary_id binary + _, _, _, binary_id, binary = header.split(' ') + out_path = os.path.join(out_dir, binary, binary_id) + ensure_dir_exists(out_path) + # Move the temporary file to its final destination. + shutil.move(tmp_file, os.path.join(out_path, '%s.sym' % binary)) + except Exception as e: + # Only need to clean up in case of errors. + try: + os.remove(tmp_file) + except EnvironmentError: + pass + raise e + return True + + +def main(): + args = parse_args() + dump_syms = args.dump_syms or find_dump_syms_binary() + assert dump_syms + status = 0 + ensure_dir_exists(args.dest_dir) + for binary in enumerate_binaries(args): + if not process_binary(dump_syms, binary, args.dest_dir): + status = 1 + sys.exit(status) + + +if __name__ == '__main__': + main() http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/12799fae/infra/python/deps/requirements.txt ---------------------------------------------------------------------- diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt index 9ed842c..2725344 100644 --- a/infra/python/deps/requirements.txt +++ b/infra/python/deps/requirements.txt @@ -46,6 +46,7 @@ pytest == 2.7.2 py == 1.4.30 pytest-random == 0.02 pytest-xdist == 1.12 +python-magic == 0.4.11 pywebhdfs == 0.3.2 pbr == 1.8.1 requests == 2.7.0
