/* buf.c: copyright relinquished by Kragen Sitaker, sole author */ /* * My first mail-message-buffering strategy turned out fiendishly * complex. So I thought maybe I'd make a better design with a couple * of layers: one that doesn't know anything about messages and From_ * lines, and just gives you a sort of sliding window on a file, and a * message layer that controls the lower layer to frame successive * messages. */
/* * My next thought was: * So you can construct a "window" on a file descriptor. Then you * can advance the leading and trailing edges of the window, or ask * for an address relative to the leading edge... or something. I * dunno. * I couldn't figure out how to make that simple, either. */ /* So instead I defined an abstract input stream type that lets me * "read up to" some delimiter. My new do_messages function ended up * keeping nearly the same data as before, but the operations on it * seem a little cleaner. It relies (directly) only on * read_until_delimiter, discard_char, is_eof, and instream_init. */ /* * I'm still not happy with the cleanliness of this code. I feel * really stupid that it is so large and complicated, but it isn't * obvious to me how to avoid this level of complication. The * following 100+ lines of C are what mmap was saving me before. */ /* * So here I have a mailbox buffering system which seems to work * reasonably well. My sample program still uses several times as * much CPU as "grep" to print out the From_ lines in an mbox file, * and it only uses 20% of the CPU on my PIII/600 --- it's I/O-bound * at a CPU speed of 32 megabytes per second, compiled with gcc 3.3.3 * prerelease with -ftracer -O3. Unlike the previous mmap-based * mailbox indexers, this one doesn't cause my system to page out * everything essential so it can paw through a 600MB mailbox, and it * should be able to paw through mailboxes of any arbitrary size. * * Still haven't grafted it together with the mailbox indexing code to * get a fast, sequential-I/O, inverted indexer for arbitrarily large * mailboxes. */ #define _GNU_SOURCE #include <string.h> #include <stdlib.h> #include <unistd.h> #include <assert.h> #include <stdio.h> #include "buf.h" #define MIN_READ_SIZE 262144 /* 1/4MiB */ #define MAX_READ_SIZE 2097152 /* 2MiB */ #define DEBUG 0 static char *current_pointer(instream *is) { return is->buf + is->cur; } static int is_eof(instream *is) { return is->eof; } static int data_length(instream *is) { return is->buffered - is->cur; } /* buf_find_delim's definition is a little tricky: it returns NULL if * we need to read more, but points at the end of the buffer in case * of EOF, or the found delimiter otherwise. */ static char *buf_find_delim(instream *is, char *delim) { char *rv = memmem(current_pointer(is), data_length(is), delim, strlen(delim)); if (rv) return rv; if (is_eof(is)) return is->buf + is->buffered; return NULL; } /* Move live space to the beginning of a buffer. Usually newspace is * is->buf. */ static void compact(instream *is, char *newspace) { is->buffered -= is->cur; if (DEBUG) fprintf(stderr, "COMPACTING\n"); memmove(newspace, is->buf + is->cur, is->buffered); if (DEBUG) fprintf(stderr, "DONE\n"); is->cur = 0; } static int expand_space(instream *is) { int new_size = is->buflen * 2 + 1; char *new_space = malloc(new_size); if (!new_space) return 0; compact(is, new_space); memset(is->buf, '%', is->buflen); free(is->buf); is->buf = new_space; is->buflen = new_size; return 1; } static void dump_instream(instream *is, FILE *fh, char *msg) { char *buf = "???"; if (DEBUG) { if (is->buf) buf = is->buf; fprintf(fh, "(#%d%s (%c%c%c...) %d/%d (@%d)) %s\n", is->fd, (is->eof ? " (closed)" : ""), buf[0], buf[1], buf[2], is->buffered, is->buflen, is->cur, msg); } } static int not_much_slack_space(instream *is) { int slack_space = is->buflen - is->buffered; return slack_space < MIN_READ_SIZE || slack_space < is->buflen/4; } static void maybe_compact(instream *is) { /* We're about to read more, and if it's going to save us work, we'd * like to move the live data to the beginning of the buffer. * I'm not sure how to tell if it's going to save us work, * though. The work of compacting is proportional to the live * data; but what work does it save us? */ if (not_much_slack_space(is) && (is->buflen - is->cur) * 4 < is->buflen) compact(is, is->buf); } static int amount_to_read(instream *is) { int available_space = is->buflen - is->buffered; if (available_space > MAX_READ_SIZE) return MAX_READ_SIZE; return available_space; } /* read_more is supposed to return false only if there's * some kind of error (memory allocation or file I/O); it's supposed * to set eof if it hits eof, but not return false. */ static int read_more(instream *is) { dump_instream(is, stderr, "want to read more"); maybe_compact(is); if (not_much_slack_space(is)) { if (!expand_space(is)) return 0; } dump_instream(is, stderr, "prepared"); if (DEBUG) fprintf(stderr, "READING\n"); int read_bytes = read(is->fd, is->buf + is->buffered, amount_to_read(is)); if (DEBUG) fprintf(stderr, "READ\n"); if (read_bytes < 0) return 0; is->buffered += read_bytes; if (!read_bytes) is->eof = 1; return 1; } static int read_until_delimiter(char *delim, instream *is, char **rv, int *rvlen) { char *d; /* XXX it's goofy (in an O(N^2) way) that this may end up searching * the beginning of a long message several times */ while (!(d=buf_find_delim(is, delim))) { if (!read_more(is)) return 0; } char *cp = current_pointer(is); *rv = cp; *rvlen = d - cp; is->cur += d - cp; return 1; } static void discard_char(instream *is) { assert(is->cur < is->buffered); is->cur++; } static int starts_with(char *haystack, int len, char *needle, int needlelen) { if (needlelen > len) return 0; return (0 == memcmp(haystack, needle, needlelen)); } static int starts_with_str(char *haystack, int len, char *needle) { return starts_with(haystack, len, needle, strlen(needle)); } int do_messages(instream *is, void (*process_msg)(char *s, int len)) { char *s; int len; for (;;) { if (!read_until_delimiter("\nFrom ", is, &s, &len)) { perror("read_until_delimiter"); return 0; } if (starts_with_str(s, len, "From ")) /* skip possible first partial msg */ process_msg(s, len); if (is_eof(is)) break; discard_char(is); // for the \n } return 1; } void instream_init(instream *is, int infd) { is->fd = infd; is->eof = 0; is->buf = NULL; is->buffered = 0; is->buflen = 0; is->cur = 0; } END_OF_BUF_DOT_C = here Here's buf.h: /* buf.h: copyright relinquished by Kragen Sitaker, sole author */ #ifndef __INSTREAM_BUF_H #define __INSTREAM_BUF_H typedef struct instream { int fd; /* file descriptor */ int eof; /* true if it's hit EOF */ char *buf; /* currently buffered data */ int buffered; /* number of buffered bytes */ int buflen; /* allocated size of buffer in bytes */ int cur; /* pointer to beginning of live data in buf */ } instream; int do_messages(instream *is, void (*process_msg)(char *s, int len)); void instream_init(instream *is, int infd); #endif END_OF_BUF_DOT_H = here Here's the example program that prints out the "From " lines: /* from_buf.c: copyright relinquished by Kragen Sitaker, sole author */ #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include "buf.h" int open_or_die(char *fname) { int rv = open(fname, O_RDONLY); if (rv < 0) { perror(fname); exit(1); } return rv; } void print_from_line(char *msg, int msg_len) { char *eol = memchr(msg, '\n', msg_len); if (!eol) eol = msg + msg_len; fwrite(msg, eol - msg, 1, stdout); printf("\n"); } int main(int argc, char **argv) { instream is; int infile = open_or_die(argv[1]); instream_init(&is, infile); do_messages(&is, print_from_line); } END_OF_FROM_BUF_DOT_C = here