#!/usr/bin/perl
# This script is a combined perl and systemtap script to collect information
# on a system stalling in writeback. Ordinarily, one would expect that all
# information be collected in a STAP script. Unfortunately, in practice the
# stack unwinder in systemtap may not work with a current kernel version,
# have trouble collecting all the data necessary or some other oddities.
# Hence this hack. A systemtap script is run and as it records interesting
# events, the remaining information is collected from the script. This means
# that the data is *never* exact but can be better than nothing and easier
# than a fully manual check
#
# Copyright Mel Gorman <mgorman@suse.de> 2011

use File::Temp qw/mkstemp/;
use File::Find;
use FindBin qw($Bin);
use Getopt::Long;
use strict;

my @trace_functions = (
	# "get_request_wait" is now special cased unfortunately
	"wait_for_completion",
	"wait_on_page_bit",
	"wait_on_page_bit_killable",
	"try_to_free_pages",
	"shrink_zone");

my @trace_conditional = (
	"sync_page",
	"sync_buffer",
	"sleep_on_buffer",
	"try_to_compact_pages",
	"balance_dirty_pages_ratelimited_nr",
	"balance_dirty_pages",
	"jbd2_log_wait_commit",
	"__jbd2_log_wait_for_space",
	"log_wait_commit",
	"__log_wait_for_space");

# Information on each stall is gathered and stored in a hash table for
# dumping later. Define some constants for the table lookup to avoid
# blinding headaches
use constant VMSTAT_AT_STALL       => 0;
use constant VMSTAT_AT_COMPLETE    => 1;
use constant BLOCKSTAT_AT_STALL    => 2;
use constant BLOCKSTAT_AT_COMPLETE => 3;
use constant PROCNAME              => 4;
use constant STACKTRACE            => 5;
use constant STALLFUNCTION         => 6;

use constant NR_WRITEBACK => 0;
use constant NR_DIRTY     => 2;
use constant VMSCAN_WRITE => 1;

sub usage() {
	print("In general, this script is not supported and that includes help.\n");
	exit(0);
}

# Option variables
my $opt_help;
my $opt_output;
my $opt_traceout;
GetOptions(
	'help|h'		=> \$opt_help,
	'output|o=s'		=> \$opt_output,
	'traceout|t=s'		=> \$opt_traceout,
);

usage() if $opt_help;
if ($opt_output) {
	open(OUTPUT, ">$opt_output") || die("Failed to open $opt_output for writing");
}
if ($opt_traceout) {
	open(TRACEOUT, ">$opt_traceout") || die("Failed to open $opt_traceout for writing");
}

my @symbols;
my @addresses;

# Handle cleanup of temp files
my $stappid;
my ($handle, $stapscript) = mkstemp("/tmp/stapdXXXXX");
sub cleanup {
	if (defined($stappid)) {
		kill INT => $stappid;
	}
	if (defined($opt_output)) {
		close(OUTPUT);
	}
	unlink($stapscript);
}
sub sigint_handler {
	close(STAP);
	cleanup();
	exit(0);
}
$SIG{INT} = "sigint_handler";

# Build a list of stat files to read. Obviously this is not great if device
# hotplug occurs but that is not expected for the moment and this is lighter
# than running find every time
my @block_iostat_files;
sub d {
	my $file = $File::Find::name;
	return if $file eq "/sys/block";
	push(@block_iostat_files, "$file/stat");
}
find(\&d, ("/sys/block/"));

sub binsearch {
	my $val = $_[0];
	my $arr = $_[1];
	my $s = 0;
	my $e = $#{$arr};
	my $m;

	while ($s < $e) {
		$m = ($s + $e + 1) / 2;
		if (${$arr}[$m] < $val) {
			$s = $m;
		} elsif (${$arr}[$m] > $val) {
			$e = $m - 1;
		} else {
			return $m;
		}
	}
	return $e;
}

##
# Read the stack trace from the trace buffer
sub read_stacktrace {
	my $stack;
	my $index;
	my $addr;
	my @line;
	my $fh = shift;

	while (<$fh>) {
		log_trace($_);
		chomp;
		@line = split/ +/;
		if ($line[5] eq "--") {
			last;
		}
		$addr = hex($line[5]);
		$index = binsearch($addr, \@addresses);
		if ($index > 0) {
			$stack .= sprintf("<%016lx> %s\n",
					  $addr, $symbols[$index]);
		}
	}
	return $stack;
}

##
# Read information of relevant from /proc/vmstat
sub read_vmstat {
	if (!open(VMSTAT, "/proc/vmstat")) {
		cleanup();
		die("Failed to read /proc/vmstat");
	}

	my $vmstat;
	my ($key, $value);
	my @values;
	while (!eof(VMSTAT)) {
		$vmstat = <VMSTAT>;
		($key, $value) = split(/\s+/, $vmstat);
		chomp($value);

		if ($key eq "nr_writeback") {
			$values[NR_WRITEBACK] = $value;
		}
		if ($key eq "nr_dirty") {
			$values[NR_DIRTY] = $value;
		}
		if ($key eq "nr_vmscan_write") {
			$values[VMSCAN_WRITE] = $value;
		}
	}

	return \@values;
}

##
# Read information from all /sys/block stat files
sub read_blockstat($) {
	my $prefix = $_[0];
	my $stat;
	my $ret;
	
	foreach $stat (@block_iostat_files) {
		if (open(STAT, $stat)) {
			$ret .= sprintf "%s%20s %s", $prefix, $stat, <STAT>;
			close(STAT);
		}
	}
	return $ret;
}

##
# Record a line of output
sub log_printf {
	if (defined($opt_output)) {
		printf OUTPUT @_;
	}
	printf @_;
}

sub log_trace {
	if (defined($opt_traceout)) {
		print TRACEOUT @_;
	}
}

# Read kernel symbols and add conditional trace functions if they exist
open(KALLSYMS, "/proc/kallsyms") || die("Failed to open /proc/kallsyms");
my $found_get_request_wait = 0;
while (<KALLSYMS>) {
	my ($addr, $type, $symbol) = split(/\s+/, $_);
	my $conditional;

	push(@symbols, $symbol);
	push(@addresses, hex($addr));
	if ($symbol eq "get_request_wait") {
		push(@trace_functions, $symbol);
		$found_get_request_wait = 1;
		next;
	}
	foreach $conditional (@trace_conditional) {
		if ($symbol eq $conditional) {
			push(@trace_functions, $symbol);
			last;
		}
	}
}
close(KALLSYMS);

if (!$found_get_request_wait) {
	push(@trace_functions, "get_request");
}

# Extract the framework script and fill in the rest
open(SELF, "$0") || die("Failed to open running script");
while (<SELF>) {
	chomp($_);
	if ($_ ne "__END__") {
		next;
	}
	while (<SELF>) {
		print $handle $_;
	}
}
foreach(@trace_functions) {
	print $handle "probe kprobe.function(\"$_\")
{ 
	t=tid()
	stalled_at[t]=time()
	name[t]=execname()
	where[t]=\"$_\"
	delete stalled[t]
}";
}

foreach(@trace_functions) {
	print $handle "probe kprobe.function(\"$_\").return
{
	t=tid()

	if ([t] in stalled) %{ {
		char *where = _stp_map_get_is(global.s_where, l->t);
		int i;
		unsigned long stack_entries[MAX_STACK_ENTRIES];
		struct stack_trace trace = {
			.skip = 6,
			.max_entries = MAX_STACK_ENTRIES,
			.entries = stack_entries
		};

		trace_printk(\"C %s\\n\", where ? : \"\");
		save_stack_trace(&trace);
		for (i = 0; i < trace.nr_entries; i++)
			if (stack_entries[i] != ULONG_MAX)
				trace_printk(\"%lx\\n\", stack_entries[i]);
		trace_printk(\"--\\n\");
		0;
	} %}

	delete stalled[t]
	delete stalled_at[t]
	delete name[t]
	delete where[t]
}"
}

close($handle);

# Contact
$stappid = open(STAP, "stap -g $stapscript|");
if (!defined($stappid)) {
	die("Failed to execute stap script");
}

open(TRACE, "<", "/sys/kernel/debug/tracing/trace_pipe") || die("Cannot open trace pipe!");

# Collect information until interrupted
my %stalled;
while (1) {
	my $input = <TRACE>;
	log_trace($input);
        #              proc-pid CPU        state? time              func   data...
	if ($input =~ / +[^ ]+ +\[[0-9]*\].* ([0-9]+\.[0-9]+): probe_[0-9]+: S ([0-9]*) \((.*)\) ([0-9]*) ms (.*)/) {
		my $pid      = $2;
		my $name     = $3;
		my $stallbegin = $1*1000 - $4;
		my $where    = $5;
	
		if (defined($stalled{$pid}->{NAME})) {
			cleanup();
			print("Apparently recursing stalls! This should not happen.\n");
			print("Process:  $pid ($name)\n");
			print("Stalled:  " . $stalled{$pid}->{STALLFUNCTION} . "\n");
			print($stalled{$pid}->{STACKTRACE});
			print("Stalling: $where\n");
			exit(-1);
		}
		$stalled{$pid}->{NAME} = $name;
		$stalled{$pid}->{VMSTAT_AT_STALL} = read_vmstat();
		$stalled{$pid}->{BLOCKSTAT_AT_STALL} = read_blockstat("-");
		$stalled{$pid}->{STALLFUNCTION} = $where;
		$stalled{$pid}->{STALLBEGIN} = $stallbegin;
	} elsif ($input =~ / +[^ ]+-([0-9]+) +\[[0-9]*\].* ([0-9]+\.[0-9]+): probe_[0-9]+: C (.*)/) {
		my $pid      = $1;
		my $stallend = $2*1000;
		my $where    = $3;

		if ($where ne $stalled{$pid}->{STALLFUNCTION}) {
			cleanup();
			die("The stalling function teleported.");
		}

		$stalled{$pid}->{STACKTRACE} = read_stacktrace(\*TRACE);
		$stalled{$pid}->{VMSTAT_AT_COMPLETE} = read_vmstat();
		$stalled{$pid}->{BLOCKSTAT_AT_COMPLETE} = read_blockstat("+");
		my $delta_writeback    = $stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_WRITEBACK] - $stalled{$pid}->{VMSTAT_AT_STALL}[NR_WRITEBACK];
		my $delta_dirty        = $stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_DIRTY]     - $stalled{$pid}->{VMSTAT_AT_STALL}[NR_DIRTY];
		my $delta_vmscan_write = $stalled{$pid}->{VMSTAT_AT_COMPLETE}[VMSCAN_WRITE] - $stalled{$pid}->{VMSTAT_AT_STALL}[VMSCAN_WRITE];

		# Blind stab in the dark as to what is going on
		my $status;
		if ($where eq "balance_dirty_pages") {
			$status = "DirtyThrottled";
		} else {
			$status = "IO";
		}
		if ($delta_writeback < 0) {
			$status = "${status}_WritebackInProgress";
		}
		if ($delta_writeback > 0) {
			$status = "${status}_WritebackSlow";
		}

		log_printf("time %d %u (%s) Stalled: %u ms: %s\n", time(),
			   $pid, $stalled{$pid}->{NAME},
			   $stallend - $stalled{$pid}->{STALLBEGIN}, $where);
		log_printf("Guessing: %s\n", $status);
		log_printf("-%-15s %12d\n", "nr_dirty",        $stalled{$pid}->{VMSTAT_AT_STALL}[NR_DIRTY]);
		log_printf("-%-15s %12d\n", "nr_writeback",    $stalled{$pid}->{VMSTAT_AT_STALL}[NR_WRITEBACK]);
		log_printf("-%-15s %12d\n", "nr_vmscan_write", $stalled{$pid}->{VMSTAT_AT_STALL}[VMSCAN_WRITE]);
		log_printf("%s", $stalled{$pid}->{BLOCKSTAT_AT_STALL});
		log_printf("+%-15s %12d %12d\n", "nr_dirty",
			$stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_DIRTY], $delta_dirty);
		log_printf("+%-15s %12d %12d\n", "nr_writeback",
			$stalled{$pid}->{VMSTAT_AT_COMPLETE}[NR_WRITEBACK], $delta_writeback);
		log_printf("+%-15s %12d %12d\n", "nr_vmscan_write",
			$stalled{$pid}->{VMSTAT_AT_COMPLETE}[VMSCAN_WRITE],
			$delta_vmscan_write);
		log_printf("%s", $stalled{$pid}->{BLOCKSTAT_AT_COMPLETE});
		log_printf($stalled{$pid}->{STACKTRACE});

		delete($stalled{$pid});
	} else {
		cleanup();
		die("Failed to parse input from stap script:\n".$input);
	}
}

cleanup();
exit(0);
__END__
%{
#include <linux/kernel.h>
#include <linux/stacktrace.h>

#define MAX_STACK_ENTRIES 32
%}

function time () { return gettimeofday_ms() }
global stall_threshold = 1000
global stalled_at, stalled, where
global name

probe timer.profile {
	foreach (tid+ in stalled_at) {
		if ([tid] in stalled) continue

		stall_time = time() - stalled_at[tid]
		if (stall_time >= stall_threshold) {
			%{ {
			char *where = _stp_map_get_is(global.s_where, l->tid);
			char *pname = _stp_map_get_is(global.s_name, l->tid);

			trace_printk("S %lld (%s) %lld ms %s\n", l->tid, pname ? : "", l->stall_time, where ? : "");
			0;
			} %}
			stalled[tid] = 1 # defer further reports to wakeup
		}
	}
}

