Hi all

I recently wrote a utility that adds a $node->gdb_backends() method to
PostgresNode instances - figured I'd share it here in case anyone finds it
useful, or wants to adopt it into the features of the TAP tools.

This method provides a one-line way to dump stacks for all running
backends, along with the values of various global variables that are
potentially of interest, either to per-pid files or to the main test log.
A default set of globals is dumped for each backend, and the caller can
specify additional expressions of interest.

If requested, cores will be dumped for each running backend.

A subset of backends may be passed by pid instead, so you can easily target
specific backends you're interested in.

I initially wrote this to help debug a variety of issues with shutdown,
where I hacked the PostgresNode stop() method to trap failed shutdowns and
report stacks for all surviving processes + the postmaster in my wrapper
class for PostgresNode:

# Stop the node via PostgresNode::stop(). If the shutdown fails, collect
# backtraces and core files for the node's processes with gdb_backends()
# and then re-throw the original error.
#
# $mode is passed through to PostgresNode::stop() unchanged.
sub stop {
    my ($self, $mode) = @_;
    local $@;
    eval {
        PostgresNode::stop($self, $mode);
    };
    if ($@) {
        # Save the error before doing anything else: any eval executed
        # while collecting diagnostics would overwrite $@.
        my $stop_error = $@;

        # Collecting diagnostics is best-effort. A failure here must not
        # replace the shutdown error the caller actually cares about.
        eval { $self->gdb_backends(want_cores => 1); };
        print("gdb_backends failed: $@\n") if ($@);

        die $stop_error;
    }
}
#
# This is an excerpt from a subclass of PostgresNode
#

# Generate backtraces, dumps of interesting globals and optionally core
# files for processes belonging to this node. Requires gdb to be present,
# and pgrep when no explicit pid list is passed.
#
# Keyword arguments (all optional):
#
#   pids                 Arrayref of pids, or a scalar string of
#                        whitespace/newline separated pids. When omitted,
#                        all children of the postmaster are probed with
#                        pgrep and the postmaster itself is added.
#   print_exprs          Arrayref of extra gdb expressions to print in
#                        addition to the built-in list.
#   want_cores           If true, write a core for each process with gcore.
#                        Default 0.
#   core_name_pattern    Path pattern for core files. Default 'core.{{pid}}'.
#   gdb_logfile_pattern  Path pattern for per-process gdb output. Default ''
#                        which leaves gdb's output unredirected.
#   backtrace_timeout_s  Seconds allowed for gdb per process. Default 60.
#   core_timeout_s       Extra seconds allowed per process when want_cores
#                        is set. Default 60.
#
# Patterns may contain {{pid}}, {{pmpid}} (postmaster pid) and {{name}}
# (node name). Missing parent directories are created.
#
# Dies if 'pids' is neither undef, an arrayref nor a plain scalar. A gdb
# failure or timeout for one process is reported and the remaining
# processes are still examined.
sub gdb_backends {
    my ($self, %kwargs) = @_;
    $kwargs{backtrace_timeout_s} //= '60';
    $kwargs{core_timeout_s} //= '60';
    $kwargs{want_cores} //= 0;
    $kwargs{core_name_pattern} //= 'core.{{pid}}';
    $kwargs{gdb_logfile_pattern} //= '';

    my $postmaster_pid = $self->{_pid};
    my $pgname = $self->name;
    # Used in messages and pattern expansion, where undef would warn.
    my $pmpid_text = defined($postmaster_pid) ? $postmaster_pid : 'unknown';

    # Globals
    # TODO make these conditional on an expression to filter them.
    # TODO handle statics that vary across files
    # TODO add typecasts for when we don't have debuginfo
    # TODO useful GUCs
    #
    my @print_exprs = (
        # All backends
        'IsPostmasterEnvironment',
        'IsUnderPostmaster',
        'PostmasterPid',
        'LocalRecoveryInProgress',
        '*MyProc',
        'MyAuxProcType',
        '*XLogCtl',
        '*ControlFile',

        # Generic signal handling
        'InterruptPending',
        'ProcDiePending',
        'ShutdownRequestPending',
        'ConfigReloadPending',

        # user backend / postgres
        'xact_started',
        'doing_extended_query_message',
        'ignore_till_sync',

        # startup process
        'ThisTimeLineID',
        'LastRec',
        'ArchiveRecoveryRequested',
        'InArchiveRecovery',
        'PrimaryConnInfo',
        'PrimarySlotName',
        'StandbyMode',

        # autovac
        'am_autovacuum_launcher',
        'am_autovacuum_worker',
        'got_SIGHUP',
        'got_SIGUSR2',
        'got_SIGTERM',
        "'autovacuum.c':got_SIGTERM",

        # for walsenders
        'am_walsender',
        'am_cascading_walsender',
        'am_db_walsender',
        '*MyWalSnd',
        '*xlogreader',
        'sendTimeLine',
        'sentPtr',
        'streamingDoneSending',
        'streamingDoneReceiving',
        "'walsender.c':got_SIGTERM",
        'got_STOPPING',
        'got_SIGUSR2',
        'replication_active',
        '*logical_decoding_ctx',
        'logical_startptr',

        # walreceiver
        'recvFileTLI',
        '*wrconn',

        # checkpointer
        '*CheckpointerShmem',
        'last_checkpoint_time',
        'ckpt_active',

        # for bgworkers
        'IsBackgroundWorker',

        # for pgl backends
        '*MyPGLogicalWorker',
        '*MyPGLSubscription',

        # for bdr backends
        '*MyBdrSubscription',

        # postmaster
        'pmState',
    );

    # Add your own print expressions by passing
    # print_exprs => ['var1', 'var2']
    push @print_exprs, @{$kwargs{print_exprs}}
        if (defined($kwargs{print_exprs}));

    # Some expressions are listed under more than one heading above (and
    # callers may repeat built-ins); print each only once, in first-seen
    # order.
    my %seen_expr;
    @print_exprs = grep { !$seen_expr{$_}++ } @print_exprs;

    my @pids;
    if (defined($kwargs{pids})) {
        if (ref($kwargs{pids}) eq 'ARRAY') {
            # arrayref pid-list
            @pids = @{$kwargs{pids}};
        } elsif (ref($kwargs{pids}) eq '') {
            # Scalar pid-list. Splitting on whitespace copes with CRLF
            # and blank lines without producing empty entries.
            @pids = split(' ', $kwargs{pids});
        } else {
            die("keyword argument 'pids' must be undef, an arrayref, or a scalar string of pids");
        }
    } else {
        # Probe all children. Default if no pid list passed.
        #
        # We can't rely on querying the db because it might be shutting
        # down, so we don't want to use pg_stat_activity and
        # pg_stat_replication. Use the postmaster pid instead, with pgrep.
        if (!defined($postmaster_pid)) {
            print("No postmaster pid is known for node $pgname, cannot find its child processes.\n");
            return;
        }
        my ($stdout, $stderr);
        IPC::Run::run(['pgrep', '--parent', $postmaster_pid],
            '>', \$stdout, '2>', \$stderr);
        print("raw pid list: $stdout\n");
        @pids = split(' ', $stdout);
        if (scalar(@pids) == 0) {
            print("Failed to find child processes for pid $postmaster_pid. pgrep produced stdout \"$stdout\" and stderr \"$stderr\".\n");
            return;
        }
        # Include postmaster itself in the list
        push @pids, $postmaster_pid;
    }
    print("getting backtraces of children of postmaster $pmpid_text for node $pgname: @pids\n");

    # Writing a core can take much longer than a backtrace, so allow the
    # extra core_timeout_s on top when cores were requested.
    my $timeout_s = $kwargs{backtrace_timeout_s};
    $timeout_s += $kwargs{core_timeout_s} if ($kwargs{want_cores});

    foreach my $pid (@pids) {
        my %pattern_vars = (
            pmpid => $pmpid_text,
            pid   => $pid,
            name  => $pgname,
        );
        my $core_path =
            _gdb_expand_pattern($kwargs{core_name_pattern}, %pattern_vars);
        my $gdb_logfile =
            _gdb_expand_pattern($kwargs{gdb_logfile_pattern}, %pattern_vars);

        # dirname() yields "." for a bare file name, so this is a no-op
        # unless the pattern actually names a directory.
        make_path(dirname($core_path)) if (length($core_path));
        make_path(dirname($gdb_logfile)) if (length($gdb_logfile));

        my $gdbcmds = _gdb_build_script($pid, \@print_exprs,
            $kwargs{want_cores} ? $core_path : undef);

        # TODO: recursively expand some of the target vars
        #
        # Hint: if you want to run REALLY REALLY SLOW you can also get
        # state of globals. We should possibly do this but it's not easy
        # to filter out the libc stuff etc, and requires some
        # back-and-forth with gdb.
        #
        my @log_arg = ();
        if ($gdb_logfile) {
            print("Writing gdb log to ${gdb_logfile}\n");
            @log_arg = ('&>', $gdb_logfile);
        }

        print("--BACKTRACE-START-- $pid\n") unless ($gdb_logfile);
        my $h;
        my $ran_ok = eval {
            $h = IPC::Run::start(['gdb'], '<', \$gdbcmds, @log_arg,
                IPC::Run::timeout($timeout_s));
            # finish() feeds the script to gdb and waits for it to exit;
            # it throws if the timeout expires first.
            $h->finish;
            1;
        };
        my $gdb_error = $@;
        print("--BACKTRACE-END-- $pid\n") unless ($gdb_logfile);

        if (!$ran_ok) {
            print("gdb failed for pid $pid: $gdb_error\n");
            # Don't leave a hung gdb attached to the backend.
            eval { $h->kill_kill; } if (defined($h));
            next;
        }

        my $rc = $h->result(0);
        print("gdb exited with $rc\n") unless ($rc == 0);
    }
    print("all backtraces and (if requested) cores have been collected.\n");
}

# Replace each {{key}} placeholder in $pattern with the matching value from
# %vars. Unknown placeholders are left untouched.
sub _gdb_expand_pattern {
    my ($pattern, %vars) = @_;
    $pattern =~ s/\{\{(\w+)\}\}/exists($vars{$1}) ? $vars{$1} : "{{$1}}"/ge;
    return $pattern;
}

# Build the gdb command script that attaches to $pid, prints process info,
# backtraces and every expression in @$print_exprs, and writes a core to
# $core_path when that is defined.
sub _gdb_build_script {
    my ($pid, $print_exprs, $core_path) = @_;

    # Single-quoted on purpose: "\n" and "$_exitsignal" must reach gdb
    # literally rather than being interpreted by perl.
    my @cmds = (
        'set prompt',
        'set style enabled off',
        'set pagination off',
        'set print pretty on',
        'set print max-depth 20',
        'set print frame-arguments all',
        'set print frame-info source-and-location',
        'set print entry-values if-needed',
        'set print symbol-filename on',
        'set print symbol-loading full',
        'set print type typedefs on',
        'set print symbol on',
        'set print array on',
        'set print array-indexes on',
        'set print elements 100',
        'set print null-stop on',
        "attach $pid",
        'info proc',
        'if ($_exitsignal)',
        'printf "Exited with signal: %d\n", $_exitsignal',
        'end',
        'printf "application_name = %s\n", application_name',
        'printf "debug_query_string = %s\n", debug_query_string',
        'echo \nbacktrace (short):\n',
        'bt',
        'echo \nbacktrace (extended):\n',
        'thread apply all bt full',
        'printf "\n\nEXPRESSIONS:\n"',
    );

    for my $print_expr (@{$print_exprs}) {
        # The label goes through gdb's printf, so escape the characters
        # that would terminate the string or be read as a format spec.
        (my $label = $print_expr) =~ s/(["\\])/\\$1/g;
        $label =~ s/%/%%/g;

        # This prints annoying $nn convenience variable labels, but
        # there's not much to be done about that.
        push @cmds, qq[printf "$label: "], "p $print_expr";
    }

    if (defined($core_path)) {
        push @cmds, 'printf "\n\n"', "gcore $core_path";
    }
    push @cmds, 'quit 0';

    return join("\n", @cmds) . "\n";
}

Reply via email to