Lots of changes to how we handle traces - adds robustness & speed

From:  Alan D. Brunelle <[EMAIL PROTECTED]>
Signed-off-by: Alan D. Brunelle <[EMAIL PROTECTED]>

This large patch contains the following changes to the trace handling
aspects of btt:

1. Use larger buffers for output options.

2. Use mmap to handle the input of trace data.

3. More precise btt statistics are output at the end.

4. Added in (under DEBUG) the display of unhandled traces. I was running
into the problem where traces were not being connected, and the rb trees
would get quite large. This would slow things down considerably. (See
below for details on why traces weren't being handled.)

5. Sprinkled some ASSERTs (under DEBUG).

6. Added a new btt-specific trace type: "links" - since 'A' (remaps)
contain two separate pieces of information, I broke them up into a link
and a remap trace. [Thus, it is easy to find either end of the remap.]

7. Added in the notion of retries of completes (and requeues). I'm finding
some discrepancies in the time stamps; in order to make btt handle these
better, I've added the notion of keeping the trace around for a bit,
to see if it gets linked up later.

Note there are still circumstances under which I just punt: for example,
if the 'C' comes out much sooner than the associated 'D'. Here's a
snippet to give you an idea. This was done on a dual-Athlon 64 workstation
with a single disk.

 22,0    1      174    10.508091327  5867  C   R 442499106 + 8 [0]
 22,0    0       31    10.508929343  8300  Q   R 442499106 + 8 [cp]
 22,0    0       32    10.508931125  8300  G   R 442499106 + 8 [cp]
 22,0    0       33    10.508931674  8300  P   R [cp]
 22,0    0       34    10.508931962  8300  I   R 442499106 + 8 (     837) [cp]
 22,0    0       35    10.508933662  8300  D   R 442499106 + 8 (    1700) [cp]

We see almost a 900 microsecond discrepancy in time on the two CPUs. [At
least the 'C' for the 'D' event is coming out with a timestamp 842
microseconds earlier.] I have also seen this on IA64's, which means the
problem is probably architecture-independent.

8. Separated trace streams into: simple IOs and remapped IOs. Due to the
various issues with incorrect timelines, it was just getting too hard to
combine the two. Of course, the real fix is to fix the time stamping -
and when that is done, we can then get back to where we were: having a
notion of all the IOs that are split off from one.

oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo

This work was done in order to handle some of these large data sets, and
I've found that the performance is reasonable - here are some stats for
very large files (the largest of which used to take well over 20 minutes;
now it takes about 5 1/2 minutes - and a lot of that is just getting the
18GiB of data read in):

Size  Real     User     System
----- -------- -------- -------
 7GiB 123.445s  80.188s 11.392s
10GiB 179.148s 137.456s 16.680s
13GiB 237.561s 156.992s 21.968s
16GiB 283.262s 187.468s 26.748s
18GiB 336.345s 225.084s 31.200s

This set of patches seems to triple performance (we were maxing out in
the low 300 K traces per second, now I'm seeing on the order of 850-900
K traces per second).
---

btt/Makefile         |    3 -
btt/args.c           |    8 +-
btt/bt_timeline.c    |   47 +++++++----
btt/devs.c           |   60 +++++++++++++-
btt/dip_rb.c         |    5 +
btt/globals.h        |   85 ++++++++++----------
btt/inlines.h        |  149 ++++++++++++++++++++++++++++------
btt/misc.c           |   34 --------
btt/mmap.c           |  144 +++++++++++++++++++++++++++++++++
btt/trace.c          |   88 ++++++++------------
btt/trace_complete.c |  217 ++++++++++++++++++++++++++++----------------------
btt/trace_im.c       |   94 ++++++++++++++--------
btt/trace_issue.c    |  104 ++++++++++++------------
btt/trace_queue.c    |   89 +++++++++------------
btt/trace_remap.c    |  145 ++++++++++++++++-----------------
btt/trace_requeue.c  |   91 +++++++++++++++------
16 files changed, 852 insertions(+), 511 deletions(-)

diff --git a/btt/Makefile b/btt/Makefile
index df65723..9c90668 100644
--- a/btt/Makefile
+++ b/btt/Makefile
@@ -8,7 +8,8 @@ #PLIBS  = -lpthread
LIBS    = $(PLIBS) $(ELIBS)
OBJS    = args.o bt_timeline.o devmap.o devs.o dip_rb.o iostat.o latency.o \
  misc.o output.o proc.o seek.o trace.o trace_complete.o trace_im.o \
-         trace_issue.o trace_queue.o trace_remap.o trace_requeue.o rbtree.o
+         trace_issue.o trace_queue.o trace_remap.o trace_requeue.o rbtree.o \
+         mmap.o

all: depend $(PROGS)

diff --git a/btt/args.c b/btt/args.c
index b0351e8..e6df5e7 100644
--- a/btt/args.c
+++ b/btt/args.c
@@ -235,11 +235,7 @@ void handle_args(int argc, char *argv[])
        exit(1);
}

-       ifd = open(input_name, O_RDONLY);
-       if (ifd < 0) {
-               perror(input_name);
-               exit(1);
-       }
+       setup_ifile(input_name);

if (output_name == NULL)
        ranges_ofp = avgs_ofp = stdout;
@@ -273,6 +269,7 @@ void handle_args(int argc, char *argv[])
                perror(iostat_name);
                exit(1);
        }
+               setbuffer(iostat_ofp, malloc(64 * 1024), 64 * 1024);
}

if (per_io_name != NULL) {
@@ -281,5 +278,6 @@ void handle_args(int argc, char *argv[])
                perror(per_io_name);
                exit(1);
        }
+               setbuffer(per_io_ofp, malloc(64 * 1024), 64 * 1024);
}
}
diff --git a/btt/bt_timeline.c b/btt/bt_timeline.c
index e6522c9..36d9c74 100644
--- a/btt/bt_timeline.c
+++ b/btt/bt_timeline.c
@@ -21,6 +21,8 @@
#include <stdio.h>
#include <string.h>
#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
#include "globals.h"

char bt_timeline_version[] = "0.99";
@@ -28,7 +30,7 @@ char bt_timeline_version[] = "0.99";
char *devices, *exes, *input_name, *output_name, *seek_name;
char *d2c_name, *q2c_name, *per_io_name;
FILE *ranges_ofp, *avgs_ofp, *per_io_ofp;
-int ifd, verbose, done, time_bounded;
+int verbose, done, time_bounded;
double t_astart, t_aend;
unsigned long n_traces;
struct avgs_info all_avgs;
@@ -40,6 +42,7 @@ LIST_HEAD(free_ios);

double range_delta = 0.1;
__u64 last_q = (__u64)-1;
+__u64 next_retry_check = 0;

struct region_info all_regions = {
.qranges = LIST_HEAD_INIT(all_regions.qranges),
@@ -48,6 +51,10 @@ struct region_info all_regions = {
.cr_cur = NULL
};

+#if defined(DEBUG)
+int rb_tree_size;
+#endif
+
int process(void);

int main(int argc, char *argv[])
@@ -62,28 +69,28 @@ int main(int argc, char *argv[])
return 0;
}

+static inline double tv2dbl(struct timeval *tv)
+{
+       return (double)tv->tv_sec + (((double)tv->tv_usec) / (1000.0 * 1000.0));
+}
+
int process(void)
{
int ret = 0;
-       struct blk_io_trace *t;
struct io *iop = io_alloc();
+       struct timeval tvs, tvi, tve;

genesis = last_vtrace = time(NULL);
-       while (!done && !do_read(ifd, &iop->t, sizeof(struct blk_io_trace))) {
-               t = convert_to_cpu(&iop->t);
-               if (t->pdu_len > 0) {
-                       iop->pdu = malloc(t->pdu_len);
-                       if (do_read(ifd, iop->pdu, t->pdu_len)) {
-                               ret = 1;
-                               break;
-                       }
-               }
+       gettimeofday(&tvs, NULL);
+       while (!done && next_trace(&iop->t, &iop->pdu)) {
        add_trace(iop);
        iop = io_alloc();
}

io_release(iop);
-       do_retries();
+       do_retries(0);
+
+       gettimeofday(&tvi, NULL);

if (iostat_ofp) {
        fprintf(iostat_ofp, "\n");
@@ -92,11 +99,21 @@ int process(void)

seek_clean();
latency_clean();
+       gettimeofday(&tve, NULL);

if (verbose) {
-               double tps = (double)n_traces / 
-                                       (double)((time(NULL) + 1) - genesis);
-               printf("%10lu traces @ %.1lf Ktps\n", n_traces, tps/1000.0);
+               double tps, dt_input = tv2dbl(&tvi) - tv2dbl(&tvs);
+               
+               tps = (double)n_traces / dt_input;
+               printf("%10lu traces @ %.1lf Ktps\t%.6lf+%.6lf=%.6lf\n", 
+                       n_traces, tps/1000.0,
+                       dt_input, tv2dbl(&tve) - tv2dbl(&tvi),
+                       tv2dbl(&tve) - tv2dbl(&tvs));
+#if defined(DEBUG)
+               printf("\ttree = |%d|\n", rb_tree_size);
+               if (rb_tree_size > 0)
+                       dump_rb_trees();
+#endif
}

return ret;
diff --git a/btt/devs.c b/btt/devs.c
index d2df095..5eba802 100644
--- a/btt/devs.c
+++ b/btt/devs.c
@@ -25,6 +25,55 @@ #define N_DEV_HASH   128
#define DEV_HASH(dev)   ((MAJOR(dev) ^ MINOR(dev)) & (N_DEV_HASH - 1))
struct list_head        dev_heads[N_DEV_HASH];

+#if defined(DEBUG)
+void __dump_rb_node(struct rb_node *n)
+{
+       struct io *iop = rb_entry(n, struct io, rb_node);
+
+       dbg_ping();
+       if (iop->type == IOP_A)
+               __dump_iop2(stdout, iop, iop->down_iop);
+       else
+               __dump_iop(stdout, iop, 0);
+       if (n->rb_left)
+               __dump_rb_node(n->rb_left);
+       if (n->rb_right)
+               __dump_rb_node(n->rb_right);
+}
+
+void __dump_rb_tree(struct d_info *dip, enum iop_type type)
+{
+       struct rb_root *roots = dip->heads;
+       struct rb_root *root = &roots[type];
+       struct rb_node *n = root->rb_node;
+
+       if (n) {
+               printf("\tIOP_%c\n", type2c(type));
+               __dump_rb_node(n);
+       }
+}
+
+void dump_rb_trees(void)
+{
+       int i;
+       enum iop_type type;
+       struct d_info *dip;
+       struct list_head *p;
+
+       for (i = 0; i < N_DEV_HASH; i++) {
+               __list_for_each(p, &dev_heads[i]) {
+                       dip = list_entry(p, struct d_info, hash_head);
+                       printf("Trees for %3d,%-3d\n", MAJOR(dip->device),
+                              MINOR(dip->device));
+                       for (type = IOP_Q; type < N_IOP_TYPES; type++) {
+                               if (type != IOP_L)
+                                       __dump_rb_tree(dip, type);
+                       }
+               }
+       }
+}
+#endif
+
void init_dev_heads(void)
{
int i;
@@ -34,8 +83,8 @@ void init_dev_heads(void)

struct d_info *__dip_find(__u32 device)
{
-       struct list_head *p;
struct d_info *dip;
+       struct list_head *p;

__list_for_each(p, &dev_heads[DEV_HASH(device)]) {
        dip = list_entry(p, struct d_info, hash_head);
@@ -67,12 +116,19 @@ struct d_info *dip_add(__u32 device, str
}

iop->linked = dip_rb_ins(dip, iop);
+#if defined(DEBUG)
+       if (iop->linked) 
+               rb_tree_size++;
+#endif
return dip;
}

void dip_rem(struct io *iop)
{
-       dip_rb_rem(iop);
+       if (iop->linked) {
+               dip_rb_rem(iop);
+               iop->linked = 0;
+       }
}

void dip_foreach(struct io *iop, enum iop_type type, 
diff --git a/btt/dip_rb.c b/btt/dip_rb.c
index cfc607f..13861c7 100644
--- a/btt/dip_rb.c
+++ b/btt/dip_rb.c
@@ -76,7 +76,10 @@ void rb_foreach(struct rb_node *n, struc

        if ((iop_s <= this_s) && (this_e <= iop_e)) {
                if (fnc) fnc(iop, this);
-                       if (head) list_add_tail(&this->f_head, head);
+                       if (head) {
+                               ASSERT(this->f_head.next == LIST_POISON1);
+                               list_add_tail(&this->f_head, head);
+                       }
        }
        if (iop_s < this_s)
                rb_foreach(n->rb_left, iop, fnc, head);
diff --git a/btt/globals.h b/btt/globals.h
index 802ff95..56c10a6 100644
--- a/btt/globals.h
+++ b/btt/globals.h
@@ -52,9 +52,9 @@ #else
#define ASSERT(truth)
#define DBG_PING()
#define LIST_DEL(hp)    do {                                            \
-                               if (((hp)->next != NULL) &&             \
-                                   ((hp)->next != LIST_POISON1))       \
-                                       list_del(hp);                   \
+                               ASSERT((hp)->next != NULL);             \
+                               ASSERT(!list_empty(hp));                \
+                               list_del(hp);                           \
                } while (0)
#endif

@@ -62,14 +62,14 @@ enum iop_type {
IOP_Q = 0,
IOP_X = 1,
IOP_A = 2,
-       IOP_L = 3,      // Betwen-device linkage
+       IOP_I = 3,
IOP_M = 4,
-       IOP_I = 5,
-       IOP_D = 6,
-       IOP_C = 7,
-       IOP_R = 8,
+       IOP_D = 5,
+       IOP_C = 6,
+       IOP_R = 7,
+       IOP_L = 8,      // IOP_A -> IOP_A + IOP_L
};
-#define N_IOP_TYPES    (IOP_R + 1)
+#define N_IOP_TYPES    (IOP_L + 1)

struct file_info {
struct file_info *next;
@@ -158,18 +158,16 @@ struct d_info {
};

struct io {
+       struct io *up_iop, *down_iop;
struct rb_node rb_node;
-       struct list_head f_head;
+       struct list_head f_head, down_head, up_head, links, c_pending, retry;
struct d_info *dip;
struct p_info *pip;
-       struct blk_io_trace t;
void *pdu;
-       enum iop_type type;
-
-       struct list_head down_head, up_head, c_pending, retry;
-       struct list_head down_list, up_list;
+       struct blk_io_trace t;
__u64 bytes_left;
-       int run_ready, linked, self_remap, displayed, on_retry_list;
+       int linked, on_retry_list;
+       enum iop_type type;
};

/* bt_timeline.c */
@@ -178,17 +176,20 @@ extern char bt_timeline_version[], *devi
extern char *seek_name, *iostat_name, *d2c_name, *q2c_name, *per_io_name;
extern double range_delta;
extern FILE *ranges_ofp, *avgs_ofp, *iostat_ofp, *per_io_ofp;;
-extern int verbose, ifd, dump_level, done, time_bounded;
+extern int verbose, done, time_bounded;
extern unsigned int n_devs;
extern unsigned long n_traces;
extern struct list_head all_devs, all_procs, retries;
extern struct avgs_info all_avgs;
-extern __u64 last_q;
+extern __u64 last_q, next_retry_check;
extern struct region_info all_regions;
extern struct list_head free_ios;
extern __u64 iostat_interval, iostat_last_stamp;
extern time_t genesis, last_vtrace;
extern double t_astart, t_aend;
+#if defined(DEBUG)
+extern int rb_tree_size;
+#endif

/* args.c */
void handle_args(int argc, char *argv[]);
@@ -198,6 +199,9 @@ int dev_map_read(char *fname);
struct devmap *dev_map_find(__u32 device);

/* devs.c */
+#if defined(DEBUG)
+void dump_rb_trees(void);
+#endif
void init_dev_heads(void);
struct d_info *dip_add(__u32 device, struct io *iop);
void dip_rem(struct io *iop);
@@ -232,13 +236,16 @@ void latency_d2c(struct d_info *dip, __u
void latency_q2c(struct d_info *dip, __u64 tstamp, __u64 latency);

/* misc.c */
-struct blk_io_trace *convert_to_cpu(struct blk_io_trace *t);
int in_devices(struct blk_io_trace *t);
-unsigned int do_read(int ifd, void *buf, int len);
void add_file(struct file_info **fipp, FILE *fp, char *oname);
void clean_files(struct file_info **fipp);
void dbg_ping(void);

+/* mmap.c */
+void setup_ifile(char *fname);
+void cleanup_ifile(void);
+int next_trace(struct blk_io_trace *t, void **pdu);
+
/* output.c */
int output_avgs(FILE *ofp);
int output_ranges(FILE *ofp);
@@ -260,46 +267,42 @@ long long seeki_median(void *handle);
int seeki_mode(void *handle, struct mode *mp);

/* trace.c */
-void dump_iop(FILE *ofp, struct io *to_iop, struct io *from_iop, int indent);
-void release_iops(struct list_head *del_head);
+void __dump_iop(FILE *ofp, struct io *iop, int extra_nl);
+void __dump_iop2(FILE *ofp, struct io *a_iop, struct io *l_iop);
+void release_iops(struct list_head *rmhd);
void add_trace(struct io *iop);
-void do_retries(void);
+void do_retries(__u64 now);

/* trace_complete.c */
void trace_complete(struct io *c_iop);
-int retry_complete(struct io *c_iop);
-int ready_complete(struct io *c_iop, struct io *top);
-void run_complete(struct io *c_iop);
+void retry_complete(struct io *c_iop, __u64 now);

/* trace_im.c */
+void run_im(struct io *im_iop, struct io *c_iop, struct list_head *rmhd);
+void run_unim(struct io *im_iop, struct list_head *rmhd);
+int ready_im(struct io *im_iop, struct io *c_iop);
void trace_insert(struct io *i_iop);
void trace_merge(struct io *m_iop);
-int ready_im(struct io *im_iop, struct io *top);
-void run_im(struct io *im_iop, struct io *top, struct list_head *del_head);
-void run_unim(struct io *im_iop, struct list_head *del_head);

/* trace_issue.c */
+void run_issue(struct io *d_iop, struct io *c_iop, struct list_head *rmhd);
+void run_unissue(struct io *d_iop, struct list_head *rmhd);
+int ready_issue(struct io *d_iop, struct io *c_iop);
void trace_issue(struct io *d_iop);
-int ready_issue(struct io *d_iop, struct io *top);
-void run_issue(struct io *d_iop, struct io *top, struct list_head *del_head);
-void run_unissue(struct io *d_iop, struct list_head *del_head);

/* trace_queue.c */
+void run_queue(struct io *q_iop, struct io *c_iop, struct list_head *rmhd);
+int ready_queue(struct io *q_iop, struct io *c_iop);
void trace_queue(struct io *q_iop);
-int ready_queue(struct io *q_iop, struct io *top);
-void run_queue(struct io *q_iop, struct io *top, struct list_head *del_head);
-void run_unqueue(struct io *q_iop, struct list_head *del_head);

/* trace_remap.c */
+void run_remap(struct io *a_iop, struct io *c_iop, struct list_head *rmhd);
+int ready_remap(struct io *a_iop, struct io *c_iop);
void trace_remap(struct io *a_iop);
-int ready_remap(struct io *a_iop, struct io *top);
-void run_remap(struct io *a_iop, struct io *top, struct list_head *del_head);
-void run_unremap(struct io *a_iop, struct list_head *del_head);

/* trace_requeue.c */
-void trace_requeue(struct io *r_iop);
-int retry_requeue(struct io *r_iop);
-int ready_requeue(struct io *r_iop, struct io *top);
void run_requeue(struct io *r_iop);
+void retry_requeue(struct io *r_iop, __u64 now);
+void trace_requeue(struct io *r_iop);

#include "inlines.h"
diff --git a/btt/inlines.h b/btt/inlines.h
index cfc7160..ee16b1e 100644
--- a/btt/inlines.h
+++ b/btt/inlines.h
@@ -105,6 +105,15 @@ static inline struct io *io_alloc(void)
        iop = malloc(sizeof(struct io));

memset(iop, 0, sizeof(struct io));
+       INIT_LIST_HEAD(&iop->links);
+
+#if defined(DEBUG)
+       iop->f_head.next = LIST_POISON1;
+       iop->down_head.next = LIST_POISON1;
+       iop->up_head.next = LIST_POISON1;
+       iop->c_pending.next = LIST_POISON1;
+       iop->retry.next = LIST_POISON1;
+#endif

return iop;
}
@@ -123,8 +132,6 @@ static inline int io_setup(struct io *io
iop->dip = dip_add(iop->t.device, iop);
if (iop->linked) {
        iop->pip = find_process(iop->t.pid, NULL);
-               INIT_LIST_HEAD(&iop->down_list);
-               INIT_LIST_HEAD(&iop->up_list);
        iop->bytes_left = iop->t.bytes;
}

@@ -133,13 +140,71 @@ static inline int io_setup(struct io *io

static inline void io_release(struct io *iop)
{
+       ASSERT(list_empty(&iop->links));
+       ASSERT(iop->type == IOP_A || iop->down_iop == NULL);
+
+       ASSERT(iop->f_head.next == LIST_POISON1);
+       ASSERT(iop->down_head.next == LIST_POISON1);
+       ASSERT(iop->up_head.next == LIST_POISON1);
+       ASSERT(iop->c_pending.next == LIST_POISON1);
+       ASSERT(iop->retry.next == LIST_POISON1);
+
if (iop->linked)
        dip_rem(iop);
if (iop->pdu) 
        free(iop->pdu);
+
io_free(iop);
}

+static inline void io_link_up(struct io *down, struct io *up)
+{
+       list_add_tail(&down->up_head, &up->links);
+       down->up_iop = up;
+}
+
+static inline void io_unlink_up(struct io *down)
+{
+       LIST_DEL(&down->up_head);
+       down->up_iop = NULL;
+}
+
+static inline void io_link_down(struct io *down, struct io *up)
+{
+       list_add_tail(&up->down_head, &down->links);
+       up->down_iop = down;
+}
+
+static inline void io_unlink_down(struct io *up)
+{
+       LIST_DEL(&up->down_head);
+       up->down_iop = NULL;
+}
+
+static inline struct io *io_link_first_down(struct io *iop)
+{
+       if (list_empty(&iop->links))
+               return NULL;
+       return list_entry(iop->links.next, struct io, up_head);
+}
+
+static inline struct io *io_link_first_up(struct io *iop)
+{
+       if (list_empty(&iop->links))
+               return NULL;
+       return list_entry(iop->links.next, struct io, down_head);
+}
+
+static inline void link_queue(struct io *q_iop, struct io *lim_iop)
+{
+       io_link_down(q_iop, lim_iop);
+}
+
+static inline void unlink_queue(struct io *lim_iop)
+{
+       io_unlink_down(lim_iop);
+}
+
#define UPDATE_AVGS(_avg, _iop, _pip, _time) do {                       \
        avg_update(&all_avgs. _avg , _time);                    \
        avg_update(&_iop->dip->avgs. _avg , _time);             \
@@ -167,6 +232,11 @@ static inline void update_q2i(struct io 
UPDATE_AVGS(q2i, iop, iop->pip, i_time);
}

+static inline void unupdate_q2i(struct io *iop, __u64 i_time)
+{
+       UNUPDATE_AVGS(q2i, iop, iop->pip, i_time);
+}
+
static inline void update_i2d(struct io *iop, __u64 d_time)
{
UPDATE_AVGS(i2d, iop, iop->pip, d_time);
@@ -212,6 +282,9 @@ static inline int dip_rb_ins(struct d_in
static inline void dip_rb_rem(struct io *iop)
{
rb_erase(&iop->rb_node, __get_root(iop->dip, iop->type));
+#if defined(DEBUG)
+       rb_tree_size--;
+#endif
}

static inline void dip_rb_fe(struct d_info *dip, enum iop_type type, 
@@ -228,37 +301,17 @@ static inline struct io *dip_rb_find_sec
return rb_find_sec(__get_root(dip, type), sec);
}

-static inline struct io *list_first_down(struct io *iop)
-{
-       struct list_head *p = list_first(&iop->down_list);
-       return p ? list_entry(p, struct io, up_head) : NULL;
-}
-
-static inline struct io *list_first_up(struct io *iop)
-{
-       struct list_head *p = list_first(&iop->up_list);
-       return p ? list_entry(p, struct io, down_head) : NULL;
-}
-
-static inline int list_empty_up(struct io *iop)
+static inline void bump_retry(__u64 now)
{
-       return list_empty(&iop->up_list);
-}
-
-static inline void __link(struct io *down_iop, struct io *up_iop)
-{
-       list_add_tail(&down_iop->up_head, &up_iop->down_list);
-       list_add_tail(&up_iop->down_head, &down_iop->up_list);
-}
-
-static inline void __unlink(struct io *down_iop, struct io *up_iop)
-{
-       LIST_DEL(&down_iop->up_head);
-       LIST_DEL(&up_iop->down_head);
+       if (!list_empty(&retries))
+               next_retry_check = now + (100 * 1000); // 100 usec
+       else 
+               next_retry_check = 0;
}

static inline void add_retry(struct io *iop)
{
+       bump_retry(iop->t.time);
if (!iop->on_retry_list) {
        list_add_tail(&iop->retry, &retries);
        iop->on_retry_list = 1;
@@ -271,6 +324,7 @@ static inline void del_retry(struct io *
        LIST_DEL(&iop->retry);
        iop->on_retry_list = 0;
}
+       bump_retry(iop->t.time);
}

static inline __u64 tdelta(struct io *iop1, struct io *iop2)
@@ -279,3 +333,42 @@ static inline __u64 tdelta(struct io *io
__u64 t2 = iop2->t.time;
return (t1 < t2) ? (t2 - t1) : 1;
}
+
+static inline int remapper_dev(__u32 dev)
+{
+       int mjr = MAJOR(dev);
+       return mjr == 9 || mjr == 254;  // 253? That's what is in blkparse.c
+                                       // But I see 254...
+}
+
+static inline void dump_iop(struct io *iop, int extra_nl)
+{
+       if (per_io_ofp) 
+               __dump_iop(per_io_ofp, iop, extra_nl);
+}
+
+static inline void dump_iop2(struct io *a_iop, struct io *l_iop)
+{
+       if (per_io_ofp) 
+               __dump_iop2(per_io_ofp, a_iop, l_iop);
+}
+
+static inline int type2c(enum iop_type type)
+{
+       int c;
+
+       switch (type) {
+       case IOP_Q: c = 'Q'; break;
+       case IOP_X: c = 'X'; break;
+       case IOP_A: c = 'A'; break;
+       case IOP_I: c = 'I'; break;
+       case IOP_M: c = 'M'; break;
+       case IOP_D: c = 'D'; break;
+       case IOP_C: c = 'C'; break;
+       case IOP_R: c = 'R'; break;
+       case IOP_L: c = 'L'; break;
+       default   : c = '?'; break;
+       }
+
+       return c;
+}
diff --git a/btt/misc.c b/btt/misc.c
index 1ec05cb..022cef5 100644
--- a/btt/misc.c
+++ b/btt/misc.c
@@ -27,21 +27,6 @@ #include <unistd.h>
#define INLINE_DECLARE
#include "globals.h"

-int data_is_native = -1;
-
-struct blk_io_trace *convert_to_cpu(struct blk_io_trace *t)
-{
-       if (data_is_native == -1)
-               check_data_endianness(t->magic);
-
-       trace_to_cpu(t);
-
-       ASSERT(CHECK_MAGIC(t));
-       ASSERT((t->magic & 0xff) == SUPPORTED_VERSION);
-
-       return t;
-}
-
int in_devices(struct blk_io_trace *t)
{
int i;
@@ -66,25 +51,6 @@ int in_devices(struct blk_io_trace *t)
return 0;
}

-unsigned int do_read(int ifd, void *buf, int len)
-{
-       int n;
-
-       n = read(ifd, buf, len);
-       if (n < 0) {
-               perror(input_name);
-               return 1;
-       }
-       else if (0 < n && n < len) {
-               fprintf(stderr,"Short read on %s\n", input_name);
-               return 1;
-       }
-       else if (n == 0) /* EOF */
-               return 1;
-
-       return 0;
-}
-
void add_file(struct file_info **fipp, FILE *fp, char *oname)
{
struct file_info *fip;
diff --git a/btt/mmap.c b/btt/mmap.c
new file mode 100644
index 0000000..3718817
--- /dev/null
+++ b/btt/mmap.c
@@ -0,0 +1,144 @@
+/*
+ * blktrace output analysis: generate a timeline & gather statistics
+ *
+ * Copyright (C) 2006 Alan D. Brunelle <[EMAIL PROTECTED]>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#include "blktrace.h"
+
+#define DEF_LEN        (16 * 1024 * 1024)
+
+static int fd;
+static void *cur_map = MAP_FAILED;
+static off_t cur_min, cur, cur_max, total_size;
+static size_t len;
+static struct blk_io_trace *next_t;
+static long pgsz;
+
+int data_is_native = -1;
+
+static inline size_t min_len(size_t a, size_t b) { return a < b ? a : b; }
+
+static inline size_t convert_to_cpu(struct blk_io_trace *t, 
+                                    struct blk_io_trace *tp, 
+                                   void **pdu)
+{
+       if (data_is_native == -1)
+               check_data_endianness(t->magic);
+
+       if (data_is_native)
+               memcpy(tp, t, sizeof(*tp));
+       else {
+               tp->magic       = be32_to_cpu(t->magic);
+               tp->sequence    = be32_to_cpu(t->sequence);
+               tp->time        = be64_to_cpu(t->time);
+               tp->sector      = be64_to_cpu(t->sector);
+               tp->bytes       = be32_to_cpu(t->bytes);
+               tp->action      = be32_to_cpu(t->action);
+               tp->pid         = be32_to_cpu(t->pid);
+               tp->device      = be32_to_cpu(t->device);
+               tp->cpu         = be16_to_cpu(t->cpu);
+               tp->error       = be16_to_cpu(t->error);
+               tp->pdu_len     = be16_to_cpu(t->pdu_len);
+       }
+
+       assert(CHECK_MAGIC(tp));
+       assert((tp->magic & 0xff) == SUPPORTED_VERSION);
+
+       if (tp->pdu_len) {
+               *pdu = malloc(tp->pdu_len);
+               memcpy(*pdu, t+1, tp->pdu_len);
+       }
+       else
+               *pdu = NULL;
+
+       return sizeof(*tp) + tp->pdu_len;
+}
+
+static int move_map(void)
+{
+       if (cur_map != MAP_FAILED)
+               munmap(cur_map, len);
+
+       cur_min = (cur & ~(pgsz-1));
+       len = min_len(DEF_LEN, total_size - cur_min);
+       if (len < sizeof(*next_t))
+               return 0;
+
+       cur_map = mmap(NULL, len, PROT_READ, MAP_SHARED, fd,
+                      cur_min);
+       if (cur_map == MAP_FAILED) {
+               perror("mmap");
+               exit(1);
+       }
+
+       cur_max = cur_min + len;
+       return (cur < cur_max);
+}
+
+void setup_ifile(char *fname)
+{
+       struct stat buf;
+
+       pgsz = sysconf(_SC_PAGESIZE);
+
+       fd = open(fname, O_RDONLY);
+       if (fd < 0) {
+               perror(fname);
+               exit(1);
+       }
+       if (fstat(fd, &buf) < 0) {
+               perror(fname);
+               exit(1);
+       }
+       total_size = buf.st_size;
+
+       if (!move_map())
+               exit(0);
+}
+
+void cleanup_ifile(void)
+{
+       if (cur_map != MAP_FAILED)
+               munmap(cur_map, len);
+       close(fd);
+}
+
+int next_trace(struct blk_io_trace *t, void **pdu)
+{
+       size_t this_len;
+
+       if ((cur + 512) > cur_max)
+               if (!move_map())
+                       return 0;
+
+       next_t = cur_map + (cur - cur_min);
+       this_len = convert_to_cpu(next_t, t, pdu);
+       cur += this_len;
+
+       return 1;
+}
diff --git a/btt/trace.c b/btt/trace.c
index 50de1bc..93f5b84 100644
--- a/btt/trace.c
+++ b/btt/trace.c
@@ -23,69 +23,40 @@ #include "globals.h"
int dump_level;
LIST_HEAD(retries);

-static inline void dump_dev(FILE *ofp, __u32 dev)
+void __dump_iop(FILE *ofp, struct io *iop, int extra_nl)
{
-       fprintf(ofp, "%3d,%-3d ", MAJOR(dev), MINOR(dev));
+       fprintf(ofp, "%5d.%09lu %3d,%-3d %c %10llu+%-4u\n",
+               (int)SECONDS(iop->t.time),
+               (unsigned long)NANO_SECONDS(iop->t.time),
+               MAJOR(iop->t.device), MINOR(iop->t.device), type2c(iop->type),
+               (unsigned long long)iop->t.sector, t_sec(&iop->t));
+       if (extra_nl) fprintf(ofp, "\n");
}

-static inline void dump_desc(FILE *ofp, struct io *iop)
+void __dump_iop2(FILE *ofp, struct io *a_iop, struct io *l_iop)
{
-       fprintf(ofp, "%10llu+%-4u ", (unsigned long long)iop->t.sector, 
-               t_sec(&iop->t));
+       fprintf(ofp, "%5d.%09lu %3d,%-3d %c %10llu+%-4u <- (%3d,%-3d) %10llu\n",
+               (int)SECONDS(a_iop->t.time),
+               (unsigned long)NANO_SECONDS(a_iop->t.time),
+               MAJOR(a_iop->t.device), MINOR(a_iop->t.device), 
+               type2c(a_iop->type), (unsigned long long)a_iop->t.sector, 
+               t_sec(&a_iop->t), MAJOR(l_iop->t.device), 
+               MINOR(l_iop->t.device), (unsigned long long)l_iop->t.sector);
}

-void dump_iop(FILE *ofp, struct io *to_iop, struct io *from_iop, int indent)
-{
-       int i, c;
-
-       if (!ofp) return;
-       if (to_iop->displayed) return;
-
-       fprintf(ofp, "%5d.%09lu ", (int)SECONDS(to_iop->t.time),
-               (unsigned long)NANO_SECONDS(to_iop->t.time));
-
-       for (i = 0; i < ((dump_level * 4) + indent); i++)
-               fprintf(ofp, " ");
-
-       dump_dev(ofp, to_iop->t.device);
-
-       switch (to_iop->type) {
-       case IOP_Q: c = 'Q'; break;
-       case IOP_L: c = 'L'; break;
-       case IOP_A: c = 'A'; break;
-       case IOP_I: c = 'I'; break;
-       case IOP_M: c = 'M'; break;
-       case IOP_D: c = 'D'; break;
-       case IOP_C: c = 'C'; break;
-       default   : c = '?'; break;
-       }
-
-       fprintf(ofp, "%c ", c);
-       dump_desc(ofp, to_iop);
-       if (from_iop) {
-               fprintf(ofp, "<- ");
-               dump_dev(ofp, from_iop->t.device);
-               dump_desc(ofp, from_iop);
-       }
-               
-       fprintf(ofp, "\n");
-
-       to_iop->displayed = 1;
-}
-
-void release_iops(struct list_head *del_head)
+void release_iops(struct list_head *rmhd)
{
struct io *x_iop;
struct list_head *p, *q;

-       list_for_each_safe(p, q, del_head) {
+       list_for_each_safe(p, q, rmhd) {
        x_iop = list_entry(p, struct io, f_head);
        LIST_DEL(&x_iop->f_head);
        io_release(x_iop);
}
}

-void do_retries(void)
+void do_retries(__u64 now)
{
struct io *iop;
struct list_head *p, *q;
@@ -94,21 +65,33 @@ void do_retries(void)
        iop = list_entry(p, struct io, retry);
        // iop could be gone after call...
        if (iop->type == IOP_C) 
-                       retry_complete(iop);
+                       retry_complete(iop, now);
        else
-                       retry_requeue(iop);
+                       retry_requeue(iop, now);
}
}

+static inline int retry_check_time(__u64 t)
+{
+       return next_retry_check && (t > next_retry_check);
+}
+
static void __add_trace(struct io *iop)
{
time_t now = time(NULL);
+       __u64 tstamp = iop->t.time;
+       int run_retry = retry_check_time(iop->t.time);

n_traces++;
iostat_check_time(iop->t.time);

if (verbose && ((now - last_vtrace) > 0)) {
+#if defined(DEBUG)
+               printf("%10lu t\tretries=|%10d|\ttree size=|%10d|\r", 
+                       n_traces, list_len(&retries), rb_tree_size);
+#else
        printf("%10lu t\r", n_traces);
+#endif
        if ((n_traces % 1000000) == 0) printf("\n");
        fflush(stdout);
        last_vtrace = now;
@@ -128,9 +111,10 @@ static void __add_trace(struct io *iop)
        return;
}

-       if (((iop->t.action & 0xffff) != __BLK_TA_REQUEUE) && 
-                                               !list_empty(&retries))
-               do_retries();
+       if (run_retry && !list_empty(&retries)) {
+               do_retries(tstamp);
+               bump_retry(tstamp);
+       }
}

void add_trace(struct io *iop)
diff --git a/btt/trace_complete.c b/btt/trace_complete.c
index 32c43b5..ae2e2ac 100644
--- a/btt/trace_complete.c
+++ b/btt/trace_complete.c
@@ -17,140 +17,169 @@
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
+ * --------------------------------------------------------------------------
+ *  links used below (D or Q's)
+ *  up_iop, down_iop and down_head not used
*/
#include "globals.h"

LIST_HEAD(pending_cs);

-static void gen_c_list(struct io *c_iop, struct list_head *c_head)
+static inline void __run_dev_mapper(struct io *c_iop, struct list_head *rmhd)
{
-       struct io *iop;
-       struct list_head *p;
-
-       __list_for_each(p, &pending_cs) {
-               iop = list_entry(p, struct io, c_pending);
-               if (iop->t.device == c_iop->t.device)
-                       continue;
-               if (dip_find_sec(iop->dip, IOP_D, BIT_START(iop)) == NULL)
-                       continue;
+       struct io *a_iop;
+       struct list_head *p, *q;

-               __link(iop, c_iop);
-               if (ready_complete(iop, c_iop))
-                       list_add_tail(&iop->f_head, c_head);
-               __unlink(iop, c_iop);
+       list_for_each_safe(p, q, &c_iop->links) {
+               a_iop = list_entry(p, struct io, up_head);
+               run_remap(a_iop, c_iop, rmhd);
+               io_unlink_up(a_iop);
}
}

-static void run_comp(struct io *c_iop, struct io *top, struct list_head *rmhd)
+static inline void __run_self_mapper(struct io *c_iop, struct list_head *rmhd)
{
-       struct io *d_iop = dip_find_sec(c_iop->dip, IOP_D, BIT_START(c_iop));
+       struct io *d_iop = io_link_first_down(c_iop);

-       update_blks(c_iop);
-       if (d_iop) {
-               __u64 d2c = tdelta(d_iop, c_iop);
+       run_issue(d_iop, c_iop, rmhd);
+       io_unlink_up(d_iop);
+}

-               update_d2c(d_iop, d2c);
-               latency_d2c(d_iop->dip, c_iop->t.time, d2c);
-               iostat_complete(d_iop, c_iop);
+static inline void __run_complete(struct io *c_iop)
+{
+       LIST_HEAD(rmhd);

-               __link(d_iop, c_iop);
-               run_issue(d_iop, top, rmhd);
-               __unlink(d_iop, c_iop);
-       }
-       else {
-               LIST_HEAD(head);
-               struct io *iop;
-               struct list_head *p, *q;
-
-               gen_c_list(c_iop, &head);
-               list_for_each_safe(p, q, &head) {
-                       iop = list_entry(p, struct io, f_head);
-                       LIST_DEL(&iop->f_head);
-
-                       dump_level++;
-                       __link(iop, c_iop);
-                       run_comp(iop, top, rmhd);
-                       __unlink(iop, c_iop);
-                       dump_level--;
-               }
-       }
+       if (remapper_dev(c_iop->t.device))
+               __run_dev_mapper(c_iop, &rmhd);
+       else
+               __run_self_mapper(c_iop, &rmhd);

-       dump_iop(per_io_ofp, c_iop, NULL, 0);
+       dump_iop(c_iop, 1);

LIST_DEL(&c_iop->c_pending);
del_retry(c_iop);
-       list_add_tail(&c_iop->f_head, rmhd);
+
+       list_add_tail(&c_iop->f_head, &rmhd);
+       release_iops(&rmhd);
}

-static int ready_comp(struct io *c_iop, 
-                               __attribute__((__unused__)) struct io *top)
+static int ready_complete_remapper(struct io *c_iop)
{
+       int rval = -1;
LIST_HEAD(head);
-       struct io *iop;
struct list_head *p, *q;
-       __u64 bl = c_iop->bytes_left;
+       struct io *l_iop, *a_iop;

-       gen_c_list(c_iop, &head);
+       dip_foreach_list(c_iop, IOP_L, &head);
list_for_each_safe(p, q, &head) {
-               iop = list_entry(p, struct io, f_head);
-               LIST_DEL(&iop->f_head);
+               l_iop = list_entry(p, struct io, f_head);
+               LIST_DEL(&l_iop->f_head);

-               __link(iop, c_iop);
-               if (ready_complete(iop, c_iop))
-                       bl -= iop->bytes_left;
-               __unlink(iop, c_iop);
-       }
+               dip_rem(l_iop);
+               if (l_iop->up_iop == NULL)
+                       continue;

-       return bl == 0;
-}
+               a_iop = l_iop->up_iop;
+               if (!a_iop->up_iop)
+                       io_link_up(a_iop, c_iop);

-void trace_complete(struct io *c_iop)
-{
-       if (!io_setup(c_iop, IOP_C)) {
-               io_release(c_iop);
-               return;
+               if (!ready_remap(a_iop, c_iop))
+                       rval = 0;
+               else if (rval < 0)
+                       rval = 1;
}

-       list_add_tail(&c_iop->c_pending, &pending_cs);
-       if (ready_complete(c_iop, c_iop)) {
-               dump_level = 0;
-               run_complete(c_iop);
-       }
-       else 
-               add_retry(c_iop);
+       return rval;
}

-int retry_complete(struct io *c_iop)
+int ready_complete(struct io *c_iop)
{
-       if (!ready_complete(c_iop, c_iop))
-               return 0;
+       struct io *d_iop;
+
+       if (remapper_dev(c_iop->t.device))
+               return ready_complete_remapper(c_iop);
+
+       d_iop = io_link_first_down(c_iop);
+       if (!d_iop) {
+               __u64 d2c;
+
+               d_iop = dip_find_sec(c_iop->dip, IOP_D, BIT_START(c_iop));
+               if (!d_iop)
+                       return -1;
+
+               if (c_iop->t.bytes != d_iop->t.bytes) {
+                       fprintf(stderr, 
+                               "\nFATAL: Probable time anomaly detected\n");
+                       fprintf(stderr, 
+                               "D @ %15.9lf missing C, later C @ %15.9lf\n", 
+                               BIT_TIME(d_iop->t.time), 
+                               BIT_TIME(c_iop->t.time));
+                       exit(1);
+               }
+               c_iop->bytes_left = 0;
+
+               d2c = tdelta(d_iop, c_iop);
+               update_d2c(d_iop, d2c);
+               latency_d2c(d_iop->dip, c_iop->t.time, d2c);
+               iostat_complete(d_iop, c_iop);
+
+               io_link_up(d_iop, c_iop);
+
+               dip_rem(d_iop);
+       }

-       run_complete(c_iop);
-       return 1;
+       return ready_issue(d_iop, c_iop);
}

-int ready_complete(struct io *c_iop, struct io *top)
+void trace_complete(struct io *c_iop)
{
-       struct io *d_iop = dip_find_sec(c_iop->dip, IOP_D, BIT_START(c_iop));
-
-       if (d_iop) {
-               ASSERT(d_iop->t.bytes == c_iop->bytes_left);
-               return ready_issue(d_iop, top);
+       if (io_setup(c_iop, IOP_C)) {
+               update_blks(c_iop);
+               update_cregion(&all_regions, c_iop->t.time);
+               update_cregion(&c_iop->dip->regions, c_iop->t.time);
+               if (c_iop->pip)
+                       update_cregion(&c_iop->pip->regions, c_iop->t.time);
+
+               list_add_tail(&c_iop->c_pending, &pending_cs);
+               switch (ready_complete(c_iop)) {
+               case  1: 
+                       __run_complete(c_iop); 
+                       break;
+               case  0: 
+                       add_retry(c_iop); 
+                       break;
+               case -1: 
+                       LIST_DEL(&c_iop->c_pending);
+                       del_retry(c_iop);
+                       io_release(c_iop);
+                       break;
+               }
}
else 
-               return ready_comp(c_iop, top);
+               io_release(c_iop);
}

-void run_complete(struct io *c_iop)
+void retry_complete(struct io *c_iop, __u64 now)
{
-       LIST_HEAD(rmhd);
-
-       update_cregion(&all_regions, c_iop->t.time);
-       update_cregion(&c_iop->dip->regions, c_iop->t.time);
-       if (c_iop->pip)
-               update_cregion(&c_iop->pip->regions, c_iop->t.time);
-
-       run_comp(c_iop, c_iop, &rmhd);
-       if (per_io_ofp) fprintf(per_io_ofp, "\n");
-       release_iops(&rmhd);
+       double tc = BIT_TIME(c_iop->t.time);
+
+       switch (ready_complete(c_iop)) {
+       case  1: 
+#if defined(DEBUG)
+               fprintf(stderr, "Retried %15.9lf success!\n", tc);
+#endif
+               __run_complete(c_iop); 
+               break;
+       case  0:
+               if (now == 0 || ((BIT_TIME(now) - tc) < 1.0))
+                       break;
+               if (!list_empty(&c_iop->links))
+                       break;
+               /*FALLTHROUGH*/
+       case -1: 
+               LIST_DEL(&c_iop->c_pending);
+               del_retry(c_iop);
+               io_release(c_iop);
+               break;
+       }
}
diff --git a/btt/trace_im.c b/btt/trace_im.c
index eecd32a..8c2eb58 100644
--- a/btt/trace_im.c
+++ b/btt/trace_im.c
@@ -17,61 +17,85 @@
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
+ * --------------------------------------------------------------------------
+ * down_iop points to Q
+ * down_head links with Q
+ * up_head used to link to D
+ * up_iop & links not used
*/
#include "globals.h"

-void trace_insert(struct io *i_iop)
+static inline void __run_im(struct io *im_iop, struct io *c_iop, 
+                                       struct list_head *rmhd, int run_un)
{
-       if (!io_setup(i_iop, IOP_I)) {
-               io_release(i_iop);
-               return;
+       struct io *q_iop = im_iop->down_iop;
+
+       if (!run_un) {
+               run_queue(q_iop, c_iop, rmhd);
+               dump_iop(im_iop, 0);
}
-       iostat_insert(i_iop);
+       else {
+               if (q_iop->bytes_left == 0) {
+                       q_iop->linked = dip_rb_ins(q_iop->dip, q_iop);
+                       ASSERT(q_iop->linked);
+#if defined(DEBUG)
+                       rb_tree_size++;
+#endif
+               }
+               q_iop->bytes_left += im_iop->t.bytes;
+               unupdate_q2i(q_iop, tdelta(q_iop, im_iop));
+       }
+
+       unlink_queue(im_iop);
+       list_add_tail(&im_iop->f_head, rmhd);
}

-void trace_merge(struct io *m_iop)
+void run_im(struct io *im_iop, struct io *c_iop, struct list_head *rmhd)
{
-       if (!io_setup(m_iop, IOP_M)) {
-               io_release(m_iop);
-               return;
-       }
-       iostat_merge(m_iop);
+       __run_im(im_iop, c_iop, rmhd, 0);
}

-int ready_im(struct io *im_iop, struct io *top)
+void run_unim(struct io *im_iop, struct list_head *rmhd)
{
-       struct io *q_iop = dip_find_sec(im_iop->dip, IOP_Q, BIT_START(im_iop));
-
-       if (q_iop) {
-               ASSERT(q_iop->bytes_left >= im_iop->bytes_left);
-               return ready_queue(q_iop, top);
-       }
-
-       return 0;
+       __run_im(im_iop, NULL, rmhd, 1);
}

-void run_im(struct io *im_iop, struct io *top, struct list_head *del_head)
+int ready_im(struct io *im_iop, struct io *c_iop)
{
-       struct io *q_iop = dip_find_sec(im_iop->dip, IOP_Q, BIT_START(im_iop));
+       struct io *q_iop = im_iop->down_iop;
+
+       if (!q_iop) {
+               q_iop = dip_find_sec(im_iop->dip, IOP_Q, BIT_START(im_iop));
+               if (!q_iop)
+                       return 0;

-       ASSERT(q_iop);
-       update_q2i(q_iop, tdelta(q_iop, im_iop));
+               ASSERT(q_iop->t.bytes >= im_iop->t.bytes);
+               im_iop->bytes_left = 0;

-       __link(q_iop, im_iop);
-       run_queue(q_iop, top, del_head);
-       __unlink(q_iop, im_iop);
+               update_q2i(q_iop, tdelta(q_iop, im_iop));
+               link_queue(q_iop, im_iop);

-       dump_iop(per_io_ofp, im_iop, NULL, 0);
-       list_add_tail(&im_iop->f_head, del_head);
+               q_iop->bytes_left -= im_iop->t.bytes;
+               if (q_iop->bytes_left == 0)
+                       dip_rem(q_iop);
+       }
+
+       return ready_queue(q_iop, c_iop);
}

-void run_unim(struct io *im_iop, struct list_head *del_head)
+void trace_insert(struct io *i_iop)
{
-       struct io *q_iop = dip_find_sec(im_iop->dip, IOP_Q, BIT_START(im_iop));
+       if (io_setup(i_iop, IOP_I))
+               iostat_insert(i_iop);
+       else
+               io_release(i_iop);

-       __link(q_iop, im_iop);
-       run_unqueue(q_iop, del_head);
-       __unlink(q_iop, im_iop);
+}

-       list_add_tail(&im_iop->f_head, del_head);
+void trace_merge(struct io *m_iop)
+{
+       if (io_setup(m_iop, IOP_M))
+               iostat_merge(m_iop);
+       else
+               io_release(m_iop);
}
diff --git a/btt/trace_issue.c b/btt/trace_issue.c
index 17bfae0..1cb5c48 100644
--- a/btt/trace_issue.c
+++ b/btt/trace_issue.c
@@ -17,49 +17,60 @@
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
+ * --------------------------------------------------------------------------
+ * links used below for I and M
+ * up_head used to link to C (or R)
+ * up_iop points to upper C (or R)
+ * down_head and down_iop not used
*/
#include "globals.h"

-void trace_issue(struct io *d_iop)
-{
-       if (!io_setup(d_iop, IOP_D)) {
-               io_release(d_iop);
-               return;
-       }
-
-       if (seek_name)
-               seeki_add(d_iop->dip->seek_handle, d_iop);
-       iostat_issue(d_iop);
-       d_iop->dip->n_ds++;
-}
-
-int ready_issue(struct io *d_iop, struct io *top)
+static inline void __run_issue(struct io *d_iop, struct io *c_iop, 
+                                       struct list_head *rmhd, int run_un)
{
LIST_HEAD(head);
struct io *im_iop;
struct list_head *p, *q;
-       __u64 bl = d_iop->bytes_left;

-       dip_foreach_list(d_iop, IOP_I, &head);
-       dip_foreach_list(d_iop, IOP_M, &head);
-       list_for_each_safe(p, q, &head) {
-               im_iop = list_entry(p, struct io, f_head);
-               LIST_DEL(&im_iop->f_head);
+       list_for_each_safe(p, q, &d_iop->links) {
+               im_iop = list_entry(p, struct io, up_head);

-               if (!ready_im(im_iop, top))
-                       return 0;
+               if (run_un) {
+                       unupdate_i2d(im_iop, tdelta(im_iop, d_iop));
+                       run_unim(im_iop, rmhd);
+               }
+               else {
+                       update_i2d(im_iop, tdelta(im_iop, d_iop));
+                       run_im(im_iop, c_iop, rmhd);
+               }

-               bl -= im_iop->bytes_left;
+               io_unlink_up(im_iop);
}

-       return bl == 0;
+       if (!run_un)
+               dump_iop(d_iop, 0);
+       list_add_tail(&d_iop->f_head, rmhd);
+}
+
+void run_issue(struct io *d_iop, struct io *c_iop, struct list_head *rmhd)
+{
+       __run_issue(d_iop, c_iop, rmhd, 0);
+}
+
+void run_unissue(struct io *d_iop, struct list_head *rmhd)
+{
+       __run_issue(d_iop, NULL, rmhd, 1);
}

-void run_issue(struct io *d_iop, struct io *top, struct list_head *del_head)
+int ready_issue(struct io *d_iop, struct io *c_iop)
{
+       int ready = 1;
+       struct io *im_iop;
LIST_HEAD(head);
struct list_head *p, *q;
-       struct io *im_iop;
+
+       if (d_iop->bytes_left == 0)
+               return 1;

dip_foreach_list(d_iop, IOP_I, &head);
dip_foreach_list(d_iop, IOP_M, &head);
@@ -67,36 +78,29 @@ void run_issue(struct io *d_iop, struct 
        im_iop = list_entry(p, struct io, f_head);
        LIST_DEL(&im_iop->f_head);

-               update_i2d(im_iop, tdelta(im_iop, d_iop));
+               ASSERT(!im_iop->up_iop);
+               io_link_up(im_iop, d_iop);
+               dip_rem(im_iop);

-               __link(im_iop, d_iop);
-               run_im(im_iop, top, del_head);
-               __unlink(im_iop, d_iop);
+               ASSERT(d_iop->bytes_left >= im_iop->t.bytes);
+               d_iop->bytes_left -= im_iop->t.bytes;
+
+               if (!ready_im(im_iop, c_iop))
+                       ready = 0;
}

-       dump_iop(per_io_ofp, d_iop, NULL, 0);
-       list_add_tail(&d_iop->f_head, del_head);
+       return ready && d_iop->bytes_left == 0;
}

-void run_unissue(struct io *d_iop, struct list_head *del_head)
+void trace_issue(struct io *d_iop)
{
-       LIST_HEAD(head);
-       struct io *im_iop;
-       struct list_head *p, *q;
-
-       iostat_unissue(d_iop);
-
-       dip_foreach_list(d_iop, IOP_I, &head);
-       dip_foreach_list(d_iop, IOP_M, &head);
-       list_for_each_safe(p, q, &head) {
-               im_iop = list_entry(p, struct io, f_head);
-               LIST_DEL(&im_iop->f_head);
-
-               __link(im_iop, d_iop);
-               unupdate_i2d(im_iop, tdelta(im_iop, d_iop));
-               run_unim(im_iop, del_head);
-               __unlink(im_iop, d_iop);
+       if (io_setup(d_iop, IOP_D)) {
+               if (seek_name)
+                       seeki_add(d_iop->dip->seek_handle, d_iop);
+               iostat_issue(d_iop);
+               d_iop->dip->n_ds++;
}
+       else
+               io_release(d_iop);

-       list_add_tail(&d_iop->f_head, del_head);
}
diff --git a/btt/trace_queue.c b/btt/trace_queue.c
index d32e159..6ded1e5 100644
--- a/btt/trace_queue.c
+++ b/btt/trace_queue.c
@@ -17,76 +17,69 @@
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
+ * ---------------------------------------------------------------------------
+ * down_iop points to A (if there)
+ * If remapper device
+ *     up_head and up_iop linked to C
+ *     links and down_head not used
+ * else
+ *     links used with L, I and M's above
+ *     up_iop, up_head and down_head not used
*/
#include "globals.h"

-void trace_queue(struct io *q_iop)
+static inline void __update_q2c(struct io *q_iop, struct io *c_iop)
{
-       if (!io_setup(q_iop, IOP_Q)) {
-               io_release(q_iop);
-               return;
-       }
+       __u64 q2c = tdelta(q_iop, c_iop);

-       update_lq(&last_q, &all_avgs.q2q, q_iop->t.time);
-       update_qregion(&all_regions, q_iop->t.time);
-       dip_update_q(q_iop->dip, q_iop);
-       pip_update_q(q_iop);
+       update_q2c(q_iop, q2c);
+       latency_q2c(q_iop->dip, q_iop->t.time, q2c);
}

-int ready_queue(struct io *q_iop, struct io *top)
+void run_queue(struct io *q_iop, struct io *c_iop, struct list_head *rmhd)
{
-       struct io *a_iop = dip_find_sec(q_iop->dip, IOP_A, BIT_START(q_iop));
+       struct io *a_iop;

+       a_iop = q_iop->down_iop;
if (a_iop) {
-               ASSERT(a_iop->bytes_left == q_iop->bytes_left);
-               return ready_remap(a_iop, top);
+               run_remap(a_iop, c_iop, rmhd);
+               q_iop->down_iop = NULL;
}

-       return q_iop->t.device == top->t.device &&
-              BIT_START(top) <= BIT_START(q_iop) &&
-                                BIT_END(q_iop) <= BIT_END(top);
+       __update_q2c(q_iop, c_iop);
+       dump_iop(q_iop, 0);
+
+       list_add_tail(&q_iop->f_head, rmhd);
}

-void run_queue(struct io *q_iop, struct io *top, struct list_head *del_head)
+int ready_queue(struct io *q_iop, struct io *c_iop)
{
-       struct io *iop;
-       struct io *a_iop = dip_find_sec(q_iop->dip, IOP_A, BIT_START(q_iop));
-
-       if (a_iop) {
-               __link(a_iop, q_iop);
-               run_remap(a_iop, top, del_head);
-               __unlink(a_iop, q_iop);
-       }
-
-       for (iop = q_iop; iop != NULL; iop = list_first_up(iop)) {
-               if (iop->type == IOP_C && iop->t.device == q_iop->t.device) {
-                       __u64 q2c = tdelta(q_iop, iop);
+       struct io *a_iop = q_iop->down_iop;

-                       update_q2c(q_iop, q2c);
-                       latency_q2c(q_iop->dip, q_iop->t.time, q2c);
+       if (!a_iop) {
+               a_iop = dip_find_sec(q_iop->dip, IOP_A, BIT_START(q_iop));
+               if (a_iop) {
+                       ASSERT(q_iop->t.bytes == a_iop->t.bytes);
+                       q_iop->down_iop = a_iop;

-                       dump_iop(per_io_ofp, q_iop, NULL, 
-                                (q_iop->t.device == top->t.device) ? -4 : 0);
-
-                       break;
+                       dip_rem(a_iop);
        }
}

-       iop = list_first_up(q_iop);
-       q_iop->bytes_left -= iop->bytes_left;
-       if (q_iop->bytes_left == 0)
-               list_add_tail(&q_iop->f_head, del_head);
+       if (a_iop && !ready_remap(a_iop, c_iop))
+               return 0;
+
+       return 1;
}

-void run_unqueue(struct io *q_iop, struct list_head *del_head)
+void trace_queue(struct io *q_iop)
{
-       struct io *a_iop = dip_find_sec(q_iop->dip, IOP_A, BIT_START(q_iop));
-
-       if (a_iop) {
-               __link(a_iop, q_iop);
-               run_unremap(a_iop, del_head);
-               __unlink(a_iop, q_iop);
+       if (io_setup(q_iop, IOP_Q)) {
+               update_lq(&last_q, &all_avgs.q2q, q_iop->t.time);
+               update_qregion(&all_regions, q_iop->t.time);
+               dip_update_q(q_iop->dip, q_iop);
+               pip_update_q(q_iop);
}
-
-       list_add_tail(&q_iop->f_head, del_head);
+       else
+               io_release(q_iop);
}
diff --git a/btt/trace_remap.c b/btt/trace_remap.c
index cc7c643..f132a71 100644
--- a/btt/trace_remap.c
+++ b/btt/trace_remap.c
@@ -17,109 +17,100 @@
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
+ * ---------------------------------------------------------------------------
+ * up_iop, down_iop, up_head, down_head, and links not used
*/
#include "globals.h"

-void trace_remap(struct io *a_iop)
+void run_remap(struct io *a_iop, struct io *c_iop, struct list_head *rmhd)
{
-       struct io *l_iop;
-       struct blk_io_trace_remap *rp = a_iop->pdu;
-       __u32 remap_dev = be32_to_cpu(rp->device);
-       __u64 remap_sec = be64_to_cpu(rp->sector);
+       struct io *l_iop = a_iop->down_iop;

-       if (!io_setup(a_iop, IOP_A)) {
-               io_release(a_iop);
-               return;
+       if (l_iop->down_iop) {
+               run_queue(l_iop->down_iop, c_iop, rmhd);
+               unlink_queue(l_iop);
}

-       /* 
-        * Create a fake LINK trace
-        */
-       l_iop = io_alloc();
-       memcpy(&l_iop->t, &a_iop->t, sizeof(a_iop->t));
-       l_iop->t.device = remap_dev;
-       l_iop->t.sector = remap_sec;
+       dump_iop2(a_iop, l_iop);

-       if (!io_setup(l_iop, IOP_L)) {
-               io_release(l_iop);
-               io_release(a_iop);
-               return;
+#if defined(DEBUG)
+       l_iop->up_iop = NULL;
+#else
+       l_iop->up_iop = a_iop->down_iop = NULL;
+#endif
+       list_add_tail(&l_iop->f_head, rmhd);
+       list_add_tail(&a_iop->f_head, rmhd);
+}
+
+int ready_dev_remap(struct io *l_iop, struct io *c_iop)
+{
+       struct io *q_iop = l_iop->down_iop;
+       
+       if (!q_iop) {
+               q_iop = dip_find_sec(l_iop->dip, IOP_Q, l_iop->t.sector);
+               if (!q_iop)
+                       return 0;
+
+               ASSERT(l_iop->t.bytes <= q_iop->t.bytes);
+               update_q2a(q_iop, tdelta(q_iop, l_iop));
+               link_queue(q_iop, l_iop);
+
+               q_iop->bytes_left -= l_iop->t.bytes;
+               if (q_iop->bytes_left == 0)
+                       dip_rem(q_iop);
}

-       __link(l_iop, a_iop);
-       l_iop->self_remap = (MAJOR(a_iop->t.device) == MAJOR(remap_dev));
+       return ready_queue(q_iop, c_iop);
}

-
-int ready_remap(struct io *a_iop, struct io *top)
+int ready_self_remap(struct io *l_iop)
{
-       struct io *l_iop = list_first_down(a_iop);
-       struct blk_io_trace_remap *rp = a_iop->pdu;
-       __u64 remap_sec = be64_to_cpu(rp->sector);
+       struct io *a_iop = dip_find_sec(l_iop->dip, IOP_A, l_iop->t.sector);

-       if (l_iop->self_remap) {
-               struct io *a_iop = dip_find_sec(l_iop->dip, IOP_A, remap_sec);
-               if (a_iop)
-                       return ready_remap(a_iop, top);
-       }
-       else {
-               struct io *q_iop = dip_find_sec(l_iop->dip, IOP_Q, remap_sec);
-               if (q_iop)
-                       return ready_queue(q_iop, top);
+       if (a_iop) {
+               update_q2a(a_iop, tdelta(a_iop, l_iop));
+               dip_rem(a_iop);
}

-       return 0;
+       return 1;
}

-void run_remap(struct io *a_iop, struct io *top, struct list_head *del_head)
+int ready_remap(struct io *a_iop, struct io *c_iop)
{
-       struct io *l_iop = list_first_down(a_iop);
-       struct blk_io_trace_remap *rp = a_iop->pdu;
-       __u64 remap_sec = be64_to_cpu(rp->sector);
-
-       if (l_iop->self_remap) {
-               struct io *a2_iop = dip_find_sec(l_iop->dip, IOP_A, remap_sec);
-               ASSERT(a2_iop);
-               __link(a2_iop, l_iop);
-               run_remap(a2_iop, top, del_head);
-               __unlink(a2_iop, l_iop);
-       }
-       else {
-               struct io *q_iop = dip_find_sec(l_iop->dip, IOP_Q, remap_sec);
-               ASSERT(q_iop);
-               __link(q_iop, l_iop);
-               update_q2a(q_iop, tdelta(q_iop, a_iop));
-               run_queue(q_iop, top, del_head);
-               __unlink(q_iop, l_iop);
-       }
+       struct io *l_iop = a_iop->down_iop;

-       dump_iop(per_io_ofp, a_iop, l_iop, 0);
-
-       __unlink(l_iop, a_iop);
-       list_add_tail(&l_iop->f_head, del_head);
-       list_add_tail(&a_iop->f_head, del_head);
+       if (remapper_dev(l_iop->t.device))
+               return ready_dev_remap(l_iop, c_iop);
+       else
+               return ready_self_remap(l_iop);
}

-void run_unremap(struct io *a_iop, struct list_head *del_head)
+void trace_remap(struct io *a_iop)
{
-       struct io *l_iop = list_first_down(a_iop);
+       struct io *l_iop;
struct blk_io_trace_remap *rp = a_iop->pdu;
-       __u64 remap_sec = be64_to_cpu(rp->sector);

-       if (l_iop->self_remap) {
-               struct io *a_iop = dip_find_sec(l_iop->dip, IOP_A, remap_sec);
-               __link(a_iop, l_iop);
-               run_unremap(a_iop, del_head);
-               __unlink(a_iop, l_iop);
+       a_iop->t.device = be32_to_cpu(rp->device_from);
+       if (!io_setup(a_iop, IOP_A)) {
+               io_release(a_iop);
+               return;
+       }
+
+       l_iop = io_alloc();
+       memcpy(&l_iop->t, &a_iop->t, sizeof(a_iop->t));
+       if (l_iop->t.pdu_len) {
+               l_iop->pdu = malloc(l_iop->t.pdu_len);
+               memcpy(l_iop->pdu, a_iop->pdu, l_iop->t.pdu_len);
}
-       else {
-               struct io *q_iop = dip_find_sec(l_iop->dip, IOP_Q, remap_sec);
-               __link(q_iop, l_iop);
-               run_unqueue(q_iop, del_head);
-               __unlink(q_iop, l_iop);
+
+       l_iop->t.device = be32_to_cpu(rp->device);
+       l_iop->t.sector = be64_to_cpu(rp->sector);
+       if (!io_setup(l_iop, IOP_L)) {
+               io_release(l_iop);
+               io_release(a_iop);
+               return;
}

-       __unlink(l_iop, a_iop);
-       list_add_tail(&l_iop->f_head, del_head);
-       list_add_tail(&a_iop->f_head, del_head);
+       a_iop->down_iop = l_iop;
+       l_iop->up_iop = a_iop;
}
diff --git a/btt/trace_requeue.c b/btt/trace_requeue.c
index 55bb3f4..cfcdb86 100644
--- a/btt/trace_requeue.c
+++ b/btt/trace_requeue.c
@@ -17,49 +17,84 @@
*  along with this program; if not, write to the Free Software
*  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*
+ * ---------------------------------------------------------------------------
+ *  links used below for D
+ *  down_head, up_head and down_iop not used
*/
#include "globals.h"

-void trace_requeue(struct io *r_iop)
+void run_requeue(struct io *r_iop)
{
-       if (!io_setup(r_iop, IOP_R)) {
-               io_release(r_iop);
-               return;
-       }
+       LIST_HEAD(rmhd);
+       struct io *d_iop = io_link_first_down(r_iop);

-       if (ready_requeue(r_iop, r_iop))
-               run_requeue(r_iop);
-       else 
-               add_retry(r_iop);
+       run_unissue(d_iop, &rmhd);
+       io_unlink_up(d_iop);
+
+       del_retry(r_iop);
+       list_add_tail(&r_iop->f_head, &rmhd);
+       release_iops(&rmhd);
}

-int retry_requeue(struct io *r_iop)
+int ready_requeue(struct io *r_iop)
{
-       if (!ready_requeue(r_iop, r_iop))
-               return 0;
+       struct io *d_iop = io_link_first_down(r_iop);
+
+       if (!d_iop) {
+               d_iop = dip_find_sec(r_iop->dip, IOP_D, BIT_START(r_iop));
+               if (!d_iop)
+                       return 0;
+
+               io_link_up(d_iop, r_iop);
+               dip_rem(d_iop);
+       }

-       run_requeue(r_iop);
-       return 1;
+       return ready_issue(d_iop, r_iop);
}

-int ready_requeue(struct io *r_iop, struct io *top)
+void retry_requeue(struct io *r_iop, __u64 now)
{
-       struct io *d_iop = dip_find_sec(r_iop->dip, IOP_D, BIT_START(r_iop));
-       if (d_iop)
-               return ready_issue(d_iop, top);
-       return 0;
+       double tc = BIT_TIME(r_iop->t.time);
+
+       switch (ready_requeue(r_iop)) {
+       case 1:
+#if defined(DEBUG)
+               fprintf(stderr, "Requeue %15.9lf retried!\n", tc);
+#endif
+               run_requeue(r_iop);
+               break;
+
+       case 0:
+               if (now == 0 || ((BIT_TIME(now) - tc) < 1.0))
+                       break;
+               if (!list_empty(&r_iop->links))
+                       break;
+               /*FALLTHROUGH*/
+       case -1:
+               del_retry(r_iop);
+               io_release(r_iop);
+               break;
+       }
}

-void run_requeue(struct io *r_iop)
+void trace_requeue(struct io *r_iop)
{
-       LIST_HEAD(del_head);
-       struct io *d_iop = dip_find_sec(r_iop->dip, IOP_D, BIT_START(r_iop));
+       if (io_setup(r_iop, IOP_R)) {
+               switch (ready_requeue(r_iop)) {
+               case 1:
+                       run_requeue(r_iop);
+                       break;

-       __link(d_iop, r_iop);
-       run_unissue(d_iop, &del_head);
-       __unlink(d_iop, r_iop);
+               case 0:
+                       add_retry(r_iop);
+                       break;
+               case -1:
+                       del_retry(r_iop);
+                       io_release(r_iop);
+                       break;
+               }
+       }
+       else
+               io_release(r_iop);

-       del_retry(r_iop);
-       list_add_tail(&r_iop->f_head, &del_head);
-       release_iops(&del_head);
}

Reply via email to