/* buf.c: copyright relinquished by Kragen Sitaker, sole author */
/*
* My first mail-message-buffering strategy turned out fiendishly
* complex. So I thought maybe I'd make a better design with a couple
* of layers: one that doesn't know anything about messages and From_
* lines, and just gives you a sort of sliding window on a file, and a
* message layer that controls the lower layer to frame successive
* messages.
*/
/*
* My next thought was:
* So you can construct a "window" on a file descriptor. Then you
* can advance the leading and trailing edges of the window, or ask
* for an address relative to the leading edge... or something. I
* dunno.
* I couldn't figure out how to make that simple, either.
*/
/* So instead I defined an abstract input stream type that lets me
* "read up to" some delimiter. My new do_messages function ended up
* keeping nearly the same data as before, but the operations on it
* seem a little cleaner. It relies (directly) only on
* read_until_delimiter, discard_char, is_eof, and instream_init.
*/
/*
* I'm still not happy with the cleanliness of this code. I feel
* really stupid that it is so large and complicated, but it isn't
* obvious to me how to avoid this level of complication. The
* following 100+ lines of C are what mmap was saving me before.
*/
/*
* So here I have a mailbox buffering system which seems to work
* reasonably well. My sample program still uses several times as
* much CPU as "grep" to print out the From_ lines in an mbox file,
* and it only uses 20% of the CPU on my PIII/600 --- it's I/O-bound
* at a CPU speed of 32 megabytes per second, compiled with gcc 3.3.3
* prerelease with -ftracer -O3. Unlike the previous mmap-based
* mailbox indexers, this one doesn't cause my system to page out
* everything essential so it can paw through a 600MB mailbox, and it
* should be able to paw through mailboxes of any arbitrary size.
*
* Still haven't grafted it together with the mailbox indexing code to
* get a fast, sequential-I/O, inverted indexer for arbitrarily large
* mailboxes.
*/
#define _GNU_SOURCE
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <stdio.h>
#include "buf.h"
#define MIN_READ_SIZE 262144 /* 1/4MiB */
#define MAX_READ_SIZE 2097152 /* 2MiB */
#define DEBUG 0
static char *current_pointer(instream *is) { return is->buf + is->cur; }
static int is_eof(instream *is) { return is->eof; }
static int data_length(instream *is) { return is->buffered - is->cur; }
/* buf_find_delim's definition is a little tricky: it returns NULL if
* we need to read more, but points at the end of the buffer in case
* of EOF, or the found delimiter otherwise. */
static char *buf_find_delim(instream *is, char *delim) {
char *rv = memmem(current_pointer(is), data_length(is),
delim, strlen(delim));
if (rv) return rv;
if (is_eof(is)) return is->buf + is->buffered;
return NULL;
}
/* Move live space to the beginning of a buffer. Usually newspace is
* is->buf. */
static void compact(instream *is, char *newspace) {
is->buffered -= is->cur;
if (DEBUG) fprintf(stderr, "COMPACTING\n");
memmove(newspace, is->buf + is->cur, is->buffered);
if (DEBUG) fprintf(stderr, "DONE\n");
is->cur = 0;
}
static int expand_space(instream *is) {
int new_size = is->buflen * 2 + 1;
char *new_space = malloc(new_size);
if (!new_space) return 0;
compact(is, new_space);
memset(is->buf, '%', is->buflen);
free(is->buf);
is->buf = new_space;
is->buflen = new_size;
return 1;
}
static void dump_instream(instream *is, FILE *fh, char *msg) {
char *buf = "???";
if (DEBUG) {
if (is->buf) buf = is->buf;
fprintf(fh, "(#%d%s (%c%c%c...) %d/%d (@%d)) %s\n", is->fd,
(is->eof ? " (closed)" : ""),
buf[0], buf[1], buf[2],
is->buffered, is->buflen,
is->cur, msg);
}
}
static int not_much_slack_space(instream *is) {
int slack_space = is->buflen - is->buffered;
return slack_space < MIN_READ_SIZE || slack_space < is->buflen/4;
}
static void maybe_compact(instream *is) {
/* We're about to read more, and if it's going to save us work, we'd
* like to move the live data to the beginning of the buffer.
* I'm not sure how to tell if it's going to save us work,
* though. The work of compacting is proportional to the live
* data; but what work does it save us? */
if (not_much_slack_space(is) && (is->buflen - is->cur) * 4 < is->buflen)
compact(is, is->buf);
}
static int amount_to_read(instream *is) {
int available_space = is->buflen - is->buffered;
if (available_space > MAX_READ_SIZE) return MAX_READ_SIZE;
return available_space;
}
/* read_more is supposed to return false only if there's
* some kind of error (memory allocation or file I/O); it's supposed
* to set eof if it hits eof, but not return false. */
static int read_more(instream *is) {
dump_instream(is, stderr, "want to read more");
maybe_compact(is);
if (not_much_slack_space(is)) {
if (!expand_space(is)) return 0;
}
dump_instream(is, stderr, "prepared");
if (DEBUG) fprintf(stderr, "READING\n");
int read_bytes = read(is->fd,
is->buf + is->buffered,
amount_to_read(is));
if (DEBUG) fprintf(stderr, "READ\n");
if (read_bytes < 0) return 0;
is->buffered += read_bytes;
if (!read_bytes) is->eof = 1;
return 1;
}
static int read_until_delimiter(char *delim, instream *is, char **rv, int
*rvlen) {
char *d;
/* XXX it's goofy (in an O(N^2) way) that this may end up searching
* the beginning of a long message several times */
while (!(d=buf_find_delim(is, delim))) {
if (!read_more(is)) return 0;
}
char *cp = current_pointer(is);
*rv = cp;
*rvlen = d - cp;
is->cur += d - cp;
return 1;
}
static void discard_char(instream *is) {
assert(is->cur < is->buffered);
is->cur++;
}
static int starts_with(char *haystack, int len, char *needle, int needlelen) {
if (needlelen > len) return 0;
return (0 == memcmp(haystack, needle, needlelen));
}
static int starts_with_str(char *haystack, int len, char *needle) {
return starts_with(haystack, len, needle, strlen(needle));
}
int do_messages(instream *is, void (*process_msg)(char *s, int len)) {
char *s;
int len;
for (;;) {
if (!read_until_delimiter("\nFrom ", is, &s, &len)) {
perror("read_until_delimiter");
return 0;
}
if (starts_with_str(s, len, "From ")) /* skip possible first partial msg */
process_msg(s, len);
if (is_eof(is)) break;
discard_char(is); // for the \n
}
return 1;
}
void instream_init(instream *is, int infd) {
is->fd = infd;
is->eof = 0;
is->buf = NULL;
is->buffered = 0;
is->buflen = 0;
is->cur = 0;
}
END_OF_BUF_DOT_C = here
Here's buf.h:
/* buf.h: copyright relinquished by Kragen Sitaker, sole author */
#ifndef __INSTREAM_BUF_H
#define __INSTREAM_BUF_H
typedef struct instream {
int fd; /* file descriptor */
int eof; /* true if it's hit EOF */
char *buf; /* currently buffered data */
int buffered; /* number of buffered bytes */
int buflen; /* allocated size of buffer in bytes */
int cur; /* pointer to beginning of live data in buf */
} instream;
int do_messages(instream *is, void (*process_msg)(char *s, int len));
void instream_init(instream *is, int infd);
#endif
END_OF_BUF_DOT_H = here
Here's the example program that prints out the "From " lines:
/* from_buf.c: copyright relinquished by Kragen Sitaker, sole author */
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include "buf.h"
int open_or_die(char *fname) {
int rv = open(fname, O_RDONLY);
if (rv < 0) {
perror(fname);
exit(1);
}
return rv;
}
void print_from_line(char *msg, int msg_len) {
char *eol = memchr(msg, '\n', msg_len);
if (!eol) eol = msg + msg_len;
fwrite(msg, eol - msg, 1, stdout);
printf("\n");
}
int main(int argc, char **argv) {
instream is;
int infile = open_or_die(argv[1]);
instream_init(&is, infile);
do_messages(&is, print_from_line);
}
END_OF_FROM_BUF_DOT_C = here