On Wed, Aug 18, 2010 at 10:40 AM, Gabor Kovesdan <ga...@freebsd.org> wrote: > Author: gabor > Date: Wed Aug 18 17:40:10 2010 > New Revision: 211463 > URL: http://svn.freebsd.org/changeset/base/211463 > > Log: > - Refactor file reading code to use pure syscalls and an internal buffer > instead of stdio. This gives BSD grep a very big performance boost, > its speed is now almost comparable to GNU grep.
I didn't read all of the details in the profiling mails in the thread, but does this mean that work on stdio would give a performance boost to many apps? Or is there something specific about how grep(1) is using its input that makes it a horse of a different color? Thanks, matthew > > Submitted by: Dimitry Andric <dimi...@andric.com> > Approved by: delphij (mentor) > > Modified: > head/usr.bin/grep/fastgrep.c > head/usr.bin/grep/file.c > head/usr.bin/grep/grep.h > head/usr.bin/grep/util.c > > Modified: head/usr.bin/grep/fastgrep.c > ============================================================================== > --- head/usr.bin/grep/fastgrep.c Wed Aug 18 17:39:47 2010 > (r211462) > +++ head/usr.bin/grep/fastgrep.c Wed Aug 18 17:40:10 2010 > (r211463) > @@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat > } > > int > -grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t > *pmatch) > +grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, > regmatch_t *pmatch) > { > unsigned int j; > int ret = REG_NOMATCH; > > Modified: head/usr.bin/grep/file.c > ============================================================================== > --- head/usr.bin/grep/file.c Wed Aug 18 17:39:47 2010 (r211462) > +++ head/usr.bin/grep/file.c Wed Aug 18 17:40:10 2010 (r211463) > @@ -2,7 +2,8 @@ > > /*- > * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav > - * Copyright (C) 2008-2009 Gabor Kovesdan <ga...@freebsd.org> > + * Copyright (C) 2008-2010 Gabor Kovesdan <ga...@freebsd.org> > + * Copyright (C) 2010 Dimitry Andric <dimi...@andric.com> > * All rights reserved. > * > * Redistribution and use in source and binary forms, with or without > @@ -37,7 +38,8 @@ __FBSDID("$FreeBSD$"); > #include <bzlib.h> > #include <err.h> > #include <errno.h> > -#include <stdio.h> > +#include <fcntl.h> > +#include <stddef.h> > #include <stdlib.h> > #include <string.h> > #include <unistd.h> > @@ -47,222 +49,204 @@ __FBSDID("$FreeBSD$"); > > #include "grep.h" > > -static char fname[MAXPATHLEN]; /* file name */ > +#define MAXBUFSIZ (32 * 1024) > +#define LNBUFBUMP 80 > > -#define MAXBUFSIZ (16 * 1024) > -#define PREREAD_M 0.2 > +static gzFile gzbufdesc; > +static BZFILE* bzbufdesc; > > -/* Some global variables for the buffering and reading. */ > -static char *lnbuf; > -static size_t lnbuflen; > -static unsigned char *binbuf; > -static int binbufsiz; > -unsigned char *binbufptr; > -static int bzerr; > +static unsigned char buffer[MAXBUFSIZ]; > +static unsigned char *bufpos; > +static size_t bufrem; > > -#define iswbinary(ch) (!iswspace((ch)) && iswcntrl((ch)) && \ > - (ch != L'\b') && (ch != L'\0')) > +static unsigned char *lnbuf; > +static size_t lnbuflen; > > -/* > - * Returns a single character according to the file type. > - * Returns -1 on failure. > - */ > static inline int > -grep_fgetc(struct file *f) > +grep_refill(struct file *f) > { > - unsigned char c; > + ssize_t nr; > + int bzerr; > > - switch (filebehave) { > - case FILE_STDIO: > - return (getc_unlocked(f->f)); > - case FILE_GZIP: > - return (gzgetc(f->gzf)); > - case FILE_BZIP: > - BZ2_bzRead(&bzerr, f->bzf, &c, 1); > - if (bzerr == BZ_STREAM_END) > - return (-1); > - else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK) > - errx(2, "%s", getstr(2)); > - return (c); > - } > - return (-1); > + bufpos = buffer; > + bufrem = 0; > + > + if (filebehave == FILE_GZIP) > + nr = gzread(gzbufdesc, buffer, MAXBUFSIZ); > + else if (filebehave == FILE_BZIP && bzbufdesc != NULL) { > + nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ); > + switch (bzerr) { > + case BZ_OK: > + case BZ_STREAM_END: > + /* No problem, nr will be okay */ > + break; > + case BZ_DATA_ERROR_MAGIC: > + /* > + * As opposed to gzread(), which simply returns the > + * plain file data, if it is not in the correct > + * compressed format, BZ2_bzRead() instead aborts. > + * > + * So, just restart at the beginning of the file > again, > + * and use plain reads from now on. > + */ > + BZ2_bzReadClose(&bzerr, bzbufdesc); > + bzbufdesc = NULL; > + if (lseek(f->fd, 0, SEEK_SET) == -1) > + return (-1); > + nr = read(f->fd, buffer, MAXBUFSIZ); > + break; > + default: > + /* Make sure we exit with an error */ > + nr = -1; > + } > + } else > + nr = read(f->fd, buffer, MAXBUFSIZ); > + > + if (nr < 0) > + return (-1); > + > + bufrem = nr; > + return (0); > } > > -/* > - * Returns true if the file position is a EOF, returns false > - * otherwise. > - */ > static inline int > -grep_feof(struct file *f) > +grep_lnbufgrow(size_t newlen) > { > > - switch (filebehave) { > - case FILE_STDIO: > - return (feof_unlocked(f->f)); > - case FILE_GZIP: > - return (gzeof(f->gzf)); > - case FILE_BZIP: > - return (bzerr == BZ_STREAM_END); > + if (lnbuflen < newlen) { > + lnbuf = grep_realloc(lnbuf, newlen); > + lnbuflen = newlen; > } > - return (1); > + > + return (0); > } > > -/* > - * At the first call, fills in an internal buffer and checks if the given > - * file is a binary file and sets the binary flag accordingly. Then returns > - * a single line and sets len to the length of the returned line. > - * At any other call returns a single line either from the internal buffer > - * or from the file if the buffer is exhausted and sets len to the length > - * of the line. > - */ > char * > -grep_fgetln(struct file *f, size_t *len) > +grep_fgetln(struct file *f, size_t *lenp) > { > - struct stat st; > - size_t bufsiz, i = 0; > - int ch = 0; > - > - /* Fill in the buffer if it is empty. */ > - if (binbufptr == NULL) { > - > - /* Only pre-read to the buffer if we need the binary check. */ > - if (binbehave != BINFILE_TEXT) { > - if (f->stdin) > - st.st_size = MAXBUFSIZ; > - else if (stat(fname, &st) != 0) > - err(2, NULL); > - /* no need to allocate buffer. */ > - if (st.st_size == 0) > - return (NULL); > - > - bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ? > - (st.st_size / 2) : MAXBUFSIZ; > - > - binbuf = grep_malloc(sizeof(char) * bufsiz); > - > - while (i < bufsiz) { > - ch = grep_fgetc(f); > - if (ch == EOF) > - break; > - binbuf[i++] = ch; > - if ((ch == '\n') && lbflag) > - break; > - } > - > - f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ? > - '\0' : '\200', i - 1) != NULL; > - } > - binbufsiz = i; > - binbufptr = binbuf; > - } > - > - /* Read a line whether from the buffer or from the file itself. */ > - for (i = 0; !(grep_feof(f) && > - (binbufptr == &binbuf[binbufsiz])); i++) { > - if (binbufptr == &binbuf[binbufsiz]) { > - ch = grep_fgetc(f); > - } else { > - ch = binbufptr[0]; > - binbufptr++; > - } > - if (i >= lnbuflen) { > - lnbuflen *= 2; > - lnbuf = grep_realloc(lnbuf, ++lnbuflen); > - } > - if ((ch == '\n') || (ch == EOF)) { > - lnbuf[i] = '\0'; > + unsigned char *p; > + char *ret; > + size_t len; > + size_t off; > + ptrdiff_t diff; > + > + /* Fill the buffer, if necessary */ > + if (bufrem == 0 && grep_refill(f) != 0) > + goto error; > + > + if (bufrem == 0) { > + /* Return zero length to indicate EOF */ > + *lenp = 0; > + return (bufpos); > + } > + > + /* Look for a newline in the remaining part of the buffer */ > + if ((p = memchr(bufpos, '\n', bufrem)) != NULL) { > + ++p; /* advance over newline */ > + ret = bufpos; > + len = p - bufpos; > + bufrem -= len; > + bufpos = p; > + *lenp = len; > + return (ret); > + } > + > + /* We have to copy the current buffered data to the line buffer */ > + for (len = bufrem, off = 0; ; len += bufrem) { > + /* Make sure there is room for more data */ > + if (grep_lnbufgrow(len + LNBUFBUMP)) > + goto error; > + memcpy(lnbuf + off, bufpos, len - off); > + off = len; > + if (grep_refill(f) != 0) > + goto error; > + if (bufrem == 0) > + /* EOF: return partial line */ > break; > - } else > - lnbuf[i] = ch; > + if ((p = memchr(bufpos, '\n', bufrem)) == NULL) > + continue; > + /* got it: finish up the line (like code above) */ > + ++p; > + diff = p - bufpos; > + len += diff; > + if (grep_lnbufgrow(len)) > + goto error; > + memcpy(lnbuf + off, bufpos, diff); > + bufrem -= diff; > + bufpos = p; > + break; > } > - if (grep_feof(f) && (i == 0) && (ch != '\n')) > - return (NULL); > - *len = i; > + *lenp = len; > return (lnbuf); > + > +error: > + *lenp = 0; > + return (NULL); > } > > -/* > - * Opens the standard input for processing. > - */ > -struct file * > -grep_stdin_open(void) > +static inline struct file * > +grep_file_init(struct file *f) > { > - struct file *f; > > - /* Processing stdin implies --line-buffered for tail -f to work. */ > - lbflag = true; > + if (filebehave == FILE_GZIP && > + (gzbufdesc = gzdopen(f->fd, "r")) == NULL) > + goto error; > > - snprintf(fname, sizeof fname, "%s", getstr(1)); > + if (filebehave == FILE_BZIP && > + (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL) > + goto error; > > - f = grep_malloc(sizeof *f); > + /* Fill read buffer, also catches errors early */ > + if (grep_refill(f) != 0) > + goto error; > > - binbuf = NULL; > - if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) { > - flockfile(f->f); > - f->stdin = true; > - return (f); > - } > + /* Check for binary stuff, if necessary */ > + if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL) > + f->binary = true; > > + return (f); > +error: > + close(f->fd); > free(f); > return (NULL); > } > > /* > - * Opens a normal, a gzipped or a bzip2 compressed file for processing. > + * Opens a file for processing. > */ > struct file * > grep_open(const char *path) > { > struct file *f; > > - snprintf(fname, sizeof fname, "%s", path); > - > f = grep_malloc(sizeof *f); > - > - binbuf = NULL; > - f->stdin = false; > - switch (filebehave) { > - case FILE_STDIO: > - if ((f->f = fopen(path, "r")) != NULL) { > - flockfile(f->f); > - return (f); > - } > - break; > - case FILE_GZIP: > - if ((f->gzf = gzopen(fname, "r")) != NULL) > - return (f); > - break; > - case FILE_BZIP: > - if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL) > - return (f); > - break; > + memset(f, 0, sizeof *f); > + if (path == NULL) { > + /* Processing stdin implies --line-buffered. */ > + lbflag = true; > + f->fd = STDIN_FILENO; > + } else if ((f->fd = open(path, O_RDONLY)) == -1) { > + free(f); > + return (NULL); > } > > - free(f); > - return (NULL); > + return (grep_file_init(f)); > } > > /* > - * Closes a normal, a gzipped or a bzip2 compressed file. > + * Closes a file. > */ > void > grep_close(struct file *f) > { > > - switch (filebehave) { > - case FILE_STDIO: > - funlockfile(f->f); > - fclose(f->f); > - break; > - case FILE_GZIP: > - gzclose(f->gzf); > - break; > - case FILE_BZIP: > - BZ2_bzclose(f->bzf); > - break; > - } > + close(f->fd); > > - /* Reset read buffer for the file we are closing */ > - binbufptr = NULL; > - free(binbuf); > + /* Reset read buffer and line buffer */ > + bufpos = buffer; > + bufrem = 0; > + > + free(lnbuf); > + lnbuf = NULL; > + lnbuflen = 0; > } > > Modified: head/usr.bin/grep/grep.h > ============================================================================== > --- head/usr.bin/grep/grep.h Wed Aug 18 17:39:47 2010 (r211462) > +++ head/usr.bin/grep/grep.h Wed Aug 18 17:40:10 2010 (r211463) > @@ -77,12 +77,8 @@ extern const char *errstr[]; > #define MAX_LINE_MATCHES 32 > > struct file { > - struct mmfile *mmf; > - BZFILE *bzf; > - FILE *f; > - gzFile *gzf; > + int fd; > bool binary; > - bool stdin; > }; > > struct str { > @@ -150,11 +146,10 @@ void clearqueue(void); > > /* file.c */ > void grep_close(struct file *f); > -struct file *grep_stdin_open(void); > struct file *grep_open(const char *path); > char *grep_fgetln(struct file *f, size_t *len); > > /* fastgrep.c */ > int fastcomp(fastgrep_t *, const char *); > void fgrepcomp(fastgrep_t *, const char *); > -int grep_search(fastgrep_t *, unsigned char *, size_t, > regmatch_t *); > +int grep_search(fastgrep_t *, const unsigned char *, size_t, > regmatch_t *); > > Modified: head/usr.bin/grep/util.c > ============================================================================== > --- head/usr.bin/grep/util.c Wed Aug 18 17:39:47 2010 (r211462) > +++ head/usr.bin/grep/util.c Wed Aug 18 17:40:10 2010 (r211463) > @@ -184,7 +184,7 @@ procfile(const char *fn) > > if (strcmp(fn, "-") == 0) { > fn = label != NULL ? label : getstr(1); > - f = grep_stdin_open(); > + f = grep_open(NULL); > } else { > if (!stat(fn, &sb)) { > /* Check if we need to process the file */ > @@ -215,7 +215,7 @@ procfile(const char *fn) > > for (c = 0; c == 0 || !(lflag || qflag); ) { > ln.off += ln.len + 1; > - if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) { > + if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL || ln.len == > 0) { > if (ln.line_no == 0 && matchall) > exit(0); > else > _______________________________________________ svn-src-head@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-head To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"