/* buf.c: copyright relinquished by Kragen Sitaker, sole author */

/*
 * My first mail-message-buffering strategy turned out fiendishly
 * complex.  So I thought maybe I'd make a better design with a couple
 * of layers: one that doesn't know anything about messages and From_
 * lines, and just gives you a sort of sliding window on a file, and a
 * message layer that controls the lower layer to frame successive
 * messages.
 */

/* 
 * My next thought was:
 *   So you can construct a "window" on a file descriptor.  Then you
 *   can advance the leading and trailing edges of the window, or ask
 *   for an address relative to the leading edge... or something.  I
 *   dunno.
 * I couldn't figure out how to make that simple, either.
 */

/* So instead I defined an abstract input stream type that lets me
 * "read up to" some delimiter.  My new do_messages function ended up
 * keeping nearly the same data as before, but the operations on it
 * seem a little cleaner.  It relies (directly) only on
 * read_until_delimiter, discard_char, is_eof, and instream_init.
 */

/*
 * I'm still not happy with the cleanliness of this code.  I feel
 * really stupid that it is so large and complicated, but it isn't
 * obvious to me how to avoid this level of complication.  The
 * following 100+ lines of C are what mmap was saving me before.
 */

/* 
 * So here I have a mailbox buffering system which seems to work
 * reasonably well.  My sample program still uses several times as
 * much CPU as "grep" to print out the From_ lines in an mbox file,
 * and it only uses 20% of the CPU on my PIII/600 --- it's I/O-bound
 * at a CPU speed of 32 megabytes per second, compiled with gcc 3.3.3
 * prerelease with -ftracer -O3.  Unlike the previous mmap-based
 * mailbox indexers, this one doesn't cause my system to page out
 * everything essential so it can paw through a 600MB mailbox, and it
 * should be able to paw through mailboxes of any arbitrary size.
 *
 * Still haven't grafted it together with the mailbox indexing code to
 * get a fast, sequential-I/O, inverted indexer for arbitrarily large
 * mailboxes.
 */

#define _GNU_SOURCE
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <stdio.h>
#include "buf.h"

#define MIN_READ_SIZE 262144   /* 1/4MiB */
#define MAX_READ_SIZE 2097152  /* 2MiB */
#define DEBUG 0

static char *current_pointer(instream *is) { return is->buf + is->cur; }
static int is_eof(instream *is) { return is->eof; }
static int data_length(instream *is) { return is->buffered - is->cur; }

/* buf_find_delim's definition is a little tricky: it returns NULL if
 *   we need to read more, but points at the end of the buffer in case
 *   of EOF, or the found delimiter otherwise. */

static char *buf_find_delim(instream *is, char *delim) {
  char *rv = memmem(current_pointer(is), data_length(is),
                    delim, strlen(delim));
  if (rv) return rv;
  if (is_eof(is)) return is->buf + is->buffered;
  return NULL;
}

/* Move live space to the beginning of a buffer.  Usually newspace is
 * is->buf. */
static void compact(instream *is, char *newspace) {
  is->buffered -= is->cur;
  if (DEBUG) fprintf(stderr, "COMPACTING\n");
  memmove(newspace, is->buf + is->cur, is->buffered);
  if (DEBUG) fprintf(stderr, "DONE\n");
  is->cur = 0;
}

static int expand_space(instream *is) {
  int new_size = is->buflen * 2 + 1;
  char *new_space = malloc(new_size);
  if (!new_space) return 0;
  compact(is, new_space);
  memset(is->buf, '%', is->buflen);
  free(is->buf);
  is->buf = new_space;
  is->buflen = new_size;
  return 1;
}

static void dump_instream(instream *is, FILE *fh, char *msg) {
  char *buf = "???";
  if (DEBUG) {
    if (is->buf) buf = is->buf;
    fprintf(fh, "(#%d%s (%c%c%c...) %d/%d (@%d)) %s\n", is->fd, 
            (is->eof ? " (closed)" : ""),
            buf[0], buf[1], buf[2], 
            is->buffered, is->buflen, 
            is->cur, msg);
  }
}

static int not_much_slack_space(instream *is) {
  int slack_space = is->buflen - is->buffered;
  return slack_space < MIN_READ_SIZE || slack_space < is->buflen/4;
}

static void maybe_compact(instream *is) {
  /* We're about to read more, and if it's going to save us work, we'd
   *     like to move the live data to the beginning of the buffer.
   *     I'm not sure how to tell if it's going to save us work,
   *     though.  The work of compacting is proportional to the live
   *     data; but what work does it save us?  */
  if (not_much_slack_space(is) && (is->buflen - is->cur) * 4 < is->buflen)
    compact(is, is->buf);
}

static int amount_to_read(instream *is) {
  int available_space = is->buflen - is->buffered;
  if (available_space > MAX_READ_SIZE) return MAX_READ_SIZE;
  return available_space;
}  

/* read_more is supposed to return false only if there's
 * some kind of error (memory allocation or file I/O); it's supposed
 * to set eof if it hits eof, but not return false. */

static int read_more(instream *is) {
  dump_instream(is, stderr, "want to read more");
  maybe_compact(is);
  if (not_much_slack_space(is)) {
    if (!expand_space(is)) return 0;
  }
  dump_instream(is, stderr, "prepared");

  if (DEBUG) fprintf(stderr, "READING\n");
  int read_bytes = read(is->fd, 
                        is->buf + is->buffered, 
                        amount_to_read(is));
  if (DEBUG) fprintf(stderr, "READ\n");
  if (read_bytes < 0) return 0;
  is->buffered += read_bytes;
  if (!read_bytes) is->eof = 1;
  return 1;
}

static int read_until_delimiter(char *delim, instream *is, char **rv, int 
*rvlen) {
  char *d;
  /* XXX it's goofy (in an O(N^2) way) that this may end up searching
   * the beginning of a long message several times */
  while (!(d=buf_find_delim(is, delim))) {
    if (!read_more(is)) return 0;
  }
  char *cp = current_pointer(is);
  *rv = cp;
  *rvlen = d - cp;
  is->cur += d - cp;
  return 1;
}

static void discard_char(instream *is) {
  assert(is->cur < is->buffered);
  is->cur++;
}



static int starts_with(char *haystack, int len, char *needle, int needlelen) {
  if (needlelen > len) return 0;
  return (0 == memcmp(haystack, needle, needlelen));
}

static int starts_with_str(char *haystack, int len, char *needle) {
  return starts_with(haystack, len, needle, strlen(needle));
}


int do_messages(instream *is, void (*process_msg)(char *s, int len)) {
  char *s;
  int len;
  for (;;) {
    if (!read_until_delimiter("\nFrom ", is, &s, &len)) {
      perror("read_until_delimiter");
      return 0;
    }
    if (starts_with_str(s, len, "From "))  /* skip possible first partial msg */
      process_msg(s, len);
    if (is_eof(is)) break;
    discard_char(is);  // for the \n
  }
  return 1;
}

void instream_init(instream *is, int infd) {
  is->fd = infd;
  is->eof = 0;
  is->buf = NULL;
  is->buffered = 0;
  is->buflen = 0;
  is->cur = 0;
}


END_OF_BUF_DOT_C = here


Here's buf.h:


/* buf.h: copyright relinquished by Kragen Sitaker, sole author */

#ifndef __INSTREAM_BUF_H
#define __INSTREAM_BUF_H

typedef struct instream {
  int fd;                       /* file descriptor */
  int eof;                      /* true if it's hit EOF */
  char *buf;                    /* currently buffered data */
  int buffered;                 /* number of buffered bytes */
  int buflen;                   /* allocated size of buffer in bytes */
  int cur;                      /* pointer to beginning of live data in buf */
} instream;

int do_messages(instream *is, void (*process_msg)(char *s, int len));
void instream_init(instream *is, int infd);

#endif


END_OF_BUF_DOT_H = here


Here's the example program that prints out the "From " lines:

/* from_buf.c: copyright relinquished by Kragen Sitaker, sole author */

#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include "buf.h"

int open_or_die(char *fname) {
  int rv = open(fname, O_RDONLY);
  if (rv < 0) {
    perror(fname);
    exit(1);
  }
  return rv;
}

void print_from_line(char *msg, int msg_len) {
  char *eol = memchr(msg, '\n', msg_len);
  if (!eol) eol = msg + msg_len;
  fwrite(msg, eol - msg, 1, stdout);
  printf("\n");
}

int main(int argc, char **argv) {
  instream is;
  int infile = open_or_die(argv[1]);
  instream_init(&is, infile);
  do_messages(&is, print_from_line);
}

END_OF_FROM_BUF_DOT_C = here

Reply via email to