Re: locate(1): ignore paths longer than MAXPATHLEN
Last update, with Tobias's help. The following diff - changes MAXPATHLEN from sys/param.h to PATH_MAX from limits.h - adds a missing prototype for sane_count - locate.bigram and locate.code now abort when reading a pathname exceeding PATH_MAX bytes on stdin Index: src/usr.bin/locate//bigram/locate.bigram.c === RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v retrieving revision 1.12 diff -u -p -u -r1.12 locate.bigram.c --- src/usr.bin/locate//bigram/locate.bigram.c 27 Oct 2009 23:59:39 - 1.12 +++ src/usr.bin/locate//bigram/locate.bigram.c 26 Nov 2014 18:15:13 - @@ -43,30 +43,41 @@ * Use 'code' to encode a file using this output. */ +#include err.h +#include limits.h/* for PATH_MAX */ #include stdio.h #include stdlib.h -#include sys/param.h /* for MAXPATHLEN */ +#include string.h #include locate.h -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf[PATH_MAX] = ; u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1]; int main(void) { u_char *cp; - u_char *oldpath = buf1, *path = buf2; + u_char *oldpath = buf, *path, *mbuf; u_int i, j; + size_t len; - while (fgets(path, sizeof(buf2), stdin) != NULL) { + mbuf = NULL; - /* -* We don't need remove newline character '\n'. -* '\n' is less than ASCII_MIN and will be later -* ignored at output. -*/ + while ((path=(u_char *)fgetln(stdin, len)) != NULL) { + if (path[len-1] == '\n') + path[len-1] = '\0'; + else { + if ((mbuf=malloc(len+1)) == NULL) + err(1, malloc); + memcpy(mbuf, path, len); + mbuf[len] = '\0'; + len++; + path = mbuf; + } + if (len sizeof(buf)) + errx(1, pathname exceeding %zu byte limit: %s, + sizeof(buf), path); /* skip longest common prefix */ for (cp = path; *cp == *oldpath; cp++, oldpath++) @@ -78,14 +89,7 @@ main(void) cp += 2; } - /* swap pointers */ - if (path == buf1) { - path = buf2; - oldpath = buf1; - } else { - path = buf1; - oldpath = buf2; - } + memcpy(buf, path, len); } /* output, boundary check */ Index: src/usr.bin/locate//code/locate.code.c === RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v retrieving revision 1.17 diff -u -p -u -r1.17 locate.code.c --- src/usr.bin/locate//code/locate.code.c 17 Nov 2013 20:19:36 - 1.17 +++ src/usr.bin/locate//code/locate.code.c 26 Nov 2014 18:15:13 - @@ -78,10 +78,10 @@ * Wolfram Schneider, Berlin September 1996 */ -#include sys/param.h #include err.h #include errno.h +#include limits.h #include stdio.h #include stdlib.h #include string.h @@ -91,8 +91,7 @@ #defineBGBUFSIZE (NBG * 2) /* size of bigram buffer */ -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf[PATH_MAX] = ; u_char bigrams[BGBUFSIZE + 1] = { 0 }; #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ @@ -115,7 +114,8 @@ extern int optopt; int main(int argc, char *argv[]) { - u_char *cp, *oldpath, *path; + u_char *cp, *oldpath, *path, *mbuf; + size_t len; int ch, code, count, diffcount, oldcount; FILE *fp; int i, j; @@ -156,23 +156,31 @@ main(int argc, char *argv[]) #endif /* LOOKUP */ - oldpath = buf1; - path = buf2; + oldpath = buf; oldcount = 0; + mbuf = NULL; - while (fgets(path, sizeof(buf2), stdin) != NULL) { - + while ((path=(u_char *)fgetln(stdin, len)) != NULL) { /* skip empty lines */ if (*path == '\n') continue; - /* remove newline */ - for (cp = path; *cp != '\0'; cp++) { - /* chop newline */ - if (*cp == '\n') - *cp = '\0'; + if (path[len-1] == '\n') { + /* remove newline */ + path[len-1] = '\0'; + } else { + if ((mbuf = malloc(len+1)) == NULL) + err(1, malloc); + memcpy(mbuf, path, len); + mbuf[len] = '\0'; + len++; + path = mbuf; } + if (len
Re: locate(1): ignore paths longer than MAXPATHLEN
On Wed, Nov 26, 2014 at 09:48:15PM +0100, Nicolas Bedos wrote: Last update, with Tobias's help. The following diff - changes MAXPATHLEN from sys/param.h to PATH_MAX from limits.h - adds a missing prototype for sane_count - locate.bigram and locate.code now abort when reading a pathname exceeding PATH_MAX bytes on stdin If somebody else agrees with this diff, I'll adjust the last style-deviations and commit it. Tobias
Re: locate(1): ignore paths longer than MAXPATHLEN
Tobias Stoeckmann wrote: I would free() it nontheless outside the while loop. For the sake of faster review. But that's just my opinion. Also, it would be nice if there is only one len/sizeof() check after fgetln. Which means that the check should be done after the if/else-block. Could happen that we malloc memory for nothing, but hey: It happens only once and just if there is a missing newline. And while at it, silently ignoring lines doesn't sound like a good idea. I would even go for an err() call; or at least warn(). Again, thank you for your suggestions. I didn't thought about factoring out the len/sizeof comparisons. I have also added the free() call, and an errx() call when a pathname is too long. Nicolas Bedos Index: src/usr.bin/locate//bigram/locate.bigram.c === RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v retrieving revision 1.12 diff -u -p -u -r1.12 locate.bigram.c --- src/usr.bin/locate//bigram/locate.bigram.c 27 Oct 2009 23:59:39 - 1.12 +++ src/usr.bin/locate//bigram/locate.bigram.c 25 Nov 2014 18:46:34 - @@ -43,13 +43,13 @@ * Use 'code' to encode a file using this output. */ +#include limits.h/* for PATH_MAX */ #include stdio.h #include stdlib.h -#include sys/param.h /* for MAXPATHLEN */ #include locate.h -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf1[PATH_MAX] = ; +u_char buf2[PATH_MAX]; u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1]; int Index: src/usr.bin/locate//code/locate.code.c === RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v retrieving revision 1.17 diff -u -p -u -r1.17 locate.code.c --- src/usr.bin/locate//code/locate.code.c 17 Nov 2013 20:19:36 - 1.17 +++ src/usr.bin/locate//code/locate.code.c 25 Nov 2014 18:46:34 - @@ -78,10 +78,10 @@ * Wolfram Schneider, Berlin September 1996 */ -#include sys/param.h #include err.h #include errno.h +#include limits.h #include stdio.h #include stdlib.h #include string.h @@ -91,8 +91,7 @@ #defineBGBUFSIZE (NBG * 2) /* size of bigram buffer */ -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf[PATH_MAX] = ; u_char bigrams[BGBUFSIZE + 1] = { 0 }; #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ @@ -115,7 +114,8 @@ extern int optopt; int main(int argc, char *argv[]) { - u_char *cp, *oldpath, *path; + u_char *cp, *oldpath, *path, *mbuf; + size_t len; int ch, code, count, diffcount, oldcount; FILE *fp; int i, j; @@ -156,23 +156,31 @@ main(int argc, char *argv[]) #endif /* LOOKUP */ - oldpath = buf1; - path = buf2; + oldpath = buf; oldcount = 0; + mbuf = NULL; - while (fgets(path, sizeof(buf2), stdin) != NULL) { - + while ((path=(u_char *)fgetln(stdin, len)) != NULL) { /* skip empty lines */ if (*path == '\n') continue; - /* remove newline */ - for (cp = path; *cp != '\0'; cp++) { - /* chop newline */ - if (*cp == '\n') - *cp = '\0'; + if (path[len-1] == '\n') { + /* remove newline */ + path[len-1] = '\0'; + } else { + if ((mbuf = malloc(len+1)) == NULL) + err(1, malloc); + memcpy(mbuf, path, len); + mbuf[len] = '\0'; + len++; + path = mbuf; } + if (len sizeof(buf)) + errx(1, pathname exceeding %zu byte limit: %s, + sizeof(buf), path); + /* Skip longest common prefix. */ for (cp = path; *cp == *oldpath; cp++, oldpath++) if (*cp == '\0') @@ -222,14 +230,11 @@ main(int argc, char *argv[]) } } - if (path == buf1) { /* swap pointers */ - path = buf2; - oldpath = buf1; - } else { - path = buf1; - oldpath = buf2; - } + memcpy(buf, path, len); } + + free(mbuf); + /* Non-zero status if there were errors */ if (fflush(stdout) != 0 || ferror(stdout)) exit(1); Index: src/usr.bin/locate//locate/fastfind.c === RCS file: /cvs/src/usr.bin/locate/locate/fastfind.c,v retrieving revision 1.11 diff -u -p -u -r1.11 fastfind.c --- src/usr.bin/locate//locate/fastfind.c 25 Oct 2010 19:16:45 -
Re: locate(1): ignore paths longer than MAXPATHLEN
Tobias Stoeckmann wrote: I would recommend using fgetln for the actual line parsing. Then this kinda fragile code can be avoided (fragile: fgets and its users have a hard time to properly handle '\0' chars inside a file). Thank you for the advice. Here is an updated diff. It does indeed look less 'fragile' ! In locate.code.c 'mbuf' is never free()d: it is only allocated for the last line of input and after processing this line the program ends. I hope it is ok. Nicolas Bedos Index: src/usr.bin/locate//bigram/locate.bigram.c === RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v retrieving revision 1.12 diff -u -p -u -r1.12 locate.bigram.c --- src/usr.bin/locate//bigram/locate.bigram.c 27 Oct 2009 23:59:39 - 1.12 +++ src/usr.bin/locate//bigram/locate.bigram.c 24 Nov 2014 19:09:03 - @@ -43,13 +43,13 @@ * Use 'code' to encode a file using this output. */ +#include limits.h/* for PATH_MAX */ #include stdio.h #include stdlib.h -#include sys/param.h /* for MAXPATHLEN */ #include locate.h -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf1[PATH_MAX] = ; +u_char buf2[PATH_MAX]; u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1]; int Index: src/usr.bin/locate//code/locate.code.c === RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v retrieving revision 1.17 diff -u -p -u -r1.17 locate.code.c --- src/usr.bin/locate//code/locate.code.c 17 Nov 2013 20:19:36 - 1.17 +++ src/usr.bin/locate//code/locate.code.c 24 Nov 2014 19:09:03 - @@ -78,10 +78,10 @@ * Wolfram Schneider, Berlin September 1996 */ -#include sys/param.h #include err.h #include errno.h +#include limits.h #include stdio.h #include stdlib.h #include string.h @@ -91,8 +91,7 @@ #defineBGBUFSIZE (NBG * 2) /* size of bigram buffer */ -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf[PATH_MAX] = ; u_char bigrams[BGBUFSIZE + 1] = { 0 }; #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ @@ -115,7 +114,8 @@ extern int optopt; int main(int argc, char *argv[]) { - u_char *cp, *oldpath, *path; + u_char *cp, *oldpath, *path, *mbuf; + size_t len; int ch, code, count, diffcount, oldcount; FILE *fp; int i, j; @@ -156,21 +156,31 @@ main(int argc, char *argv[]) #endif /* LOOKUP */ - oldpath = buf1; - path = buf2; + oldpath = buf; oldcount = 0; - while (fgets(path, sizeof(buf2), stdin) != NULL) { - + while ((path=(u_char *)fgetln(stdin, len)) != NULL) { /* skip empty lines */ if (*path == '\n') continue; - /* remove newline */ - for (cp = path; *cp != '\0'; cp++) { - /* chop newline */ - if (*cp == '\n') - *cp = '\0'; + if (path[len-1] == '\n') { + /* skip lines exceeding PATH_MAX */ + if (len sizeof(buf)) + continue; + /* remove newline */ + path[len-1] = '\0'; + } else { + /* skip lines exceeding PATH_MAX-1 (save one byte +* to add the terminating '\0') */ + if (len sizeof(buf)-1) + continue; + if ((mbuf = malloc(len+1)) == NULL) + err(1, malloc); + memcpy(mbuf, path, len); + mbuf[len] = '\0'; + len++; + path = mbuf; } /* Skip longest common prefix. */ @@ -222,13 +232,7 @@ main(int argc, char *argv[]) } } - if (path == buf1) { /* swap pointers */ - path = buf2; - oldpath = buf1; - } else { - path = buf1; - oldpath = buf2; - } + memcpy(buf, path, len); } /* Non-zero status if there were errors */ if (fflush(stdout) != 0 || ferror(stdout)) Index: src/usr.bin/locate//locate/fastfind.c === RCS file: /cvs/src/usr.bin/locate/locate/fastfind.c,v retrieving revision 1.11 diff -u -p -u -r1.11 fastfind.c --- src/usr.bin/locate//locate/fastfind.c 25 Oct 2010 19:16:45 - 1.11 +++ src/usr.bin/locate//locate/fastfind.c 24 Nov 2014 19:09:04 - @@ -47,7 +47,7 @@ statistic (fp, path_fcodes) u_char *p, *s; int c; int count, umlaut; - u_char bigram1[NBG],
Re: locate(1): ignore paths longer than MAXPATHLEN
On Mon, Nov 24, 2014 at 08:37:47PM +0100, Nicolas Bedos wrote: In locate.code.c 'mbuf' is never free()d: it is only allocated for the last line of input and after processing this line the program ends. I hope it is ok. I would free() it nontheless outside the while loop. For the sake of faster review. But that's just my opinion. Also, it would be nice if there is only one len/sizeof() check after fgetln. Which means that the check should be done after the if/else-block. Could happen that we malloc memory for nothing, but hey: It happens only once and just if there is a missing newline. And while at it, silently ignoring lines doesn't sound like a good idea. I would even go for an err() call; or at least warn(). Tobias
locate(1): ignore paths longer than MAXPATHLEN
Hello, locate(1) doesn't handle paths longer than MAXPATHLEN well: they are silently split into MAXPATHLEN-long strings and each string is included in the database. Here is an example : $ cat loc_test.sh testdir='/tmp/locate_maxpathlen' db=$testdir/db d='64_characters_long_directory_name___' long_path=$testdir/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d mkdir -p $long_path || exit 1 cd $long_path touch short_filename touch long_filename_which_gets_truncated /usr/libexec/locate.updatedb --fcodes=$db \ --searchpaths=$testdir /usr/src \ || exit 1 locate -d $db filename truncated | grep -v ^/usr/src $ sh loc_test.sh /tmp/locate_maxpathlen/64_char_[... SNIP ...]_/long_filename_which_gets_ /tmp/locate_maxpathlen/64_char_[... SNIP ...]_/short_filename truncated We have just created the path truncated in the locate database. The diff below includes the following changes : - locate(1) now silently ignores paths longer than MAXPATHLEN - MAXPATHLEN from sys/param.h is replaced by PATH_MAX from limits.h - add a missing prototype for sane_count() Another solution would be to modify mklocatedb.sh or updatedb.sh to filter out those lng paths before they are fed to locate.code, but I can't see a way to achieve this that doesn't require hardcoding PATH_MAX? I ran /etc/weekly with the new binaries and didn't notice any unexpected changes in the locate database. Nicolas Bedos Index: src/usr.bin/locate//bigram/locate.bigram.c === RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v retrieving revision 1.12 diff -u -p -u -r1.12 locate.bigram.c --- src/usr.bin/locate//bigram/locate.bigram.c 27 Oct 2009 23:59:39 - 1.12 +++ src/usr.bin/locate//bigram/locate.bigram.c 23 Nov 2014 16:49:47 - @@ -43,13 +43,13 @@ * Use 'code' to encode a file using this output. */ +#include limits.h/* for PATH_MAX */ #include stdio.h #include stdlib.h -#include sys/param.h /* for MAXPATHLEN */ #include locate.h -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf1[PATH_MAX] = ; +u_char buf2[PATH_MAX]; u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1]; int Index: src/usr.bin/locate//code/locate.code.c === RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v retrieving revision 1.17 diff -u -p -u -r1.17 locate.code.c --- src/usr.bin/locate//code/locate.code.c 17 Nov 2013 20:19:36 - 1.17 +++ src/usr.bin/locate//code/locate.code.c 23 Nov 2014 16:49:47 - @@ -78,10 +78,10 @@ * Wolfram Schneider, Berlin September 1996 */ -#include sys/param.h #include err.h #include errno.h +#include limits.h #include stdio.h #include stdlib.h #include string.h @@ -91,8 +91,8 @@ #defineBGBUFSIZE (NBG * 2) /* size of bigram buffer */ -u_char buf1[MAXPATHLEN] = ; -u_char buf2[MAXPATHLEN]; +u_char buf1[PATH_MAX] = ; +u_char buf2[PATH_MAX]; u_char bigrams[BGBUFSIZE + 1] = { 0 }; #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ @@ -171,6 +171,14 @@ main(int argc, char *argv[]) /* chop newline */ if (*cp == '\n') *cp = '\0'; + } + + /* skip truncated lines */ + if (cp == path + sizeof(buf2) - 1 *(cp-1) != '\0') { + while (fgets(path, sizeof(buf2), stdin) != NULL) + if (strchr(path, '\n') != NULL) + break; + continue; } /* Skip longest common prefix. */ Index: src/usr.bin/locate//locate/fastfind.c === RCS file: /cvs/src/usr.bin/locate/locate/fastfind.c,v retrieving revision 1.11 diff -u -p -u -r1.11 fastfind.c --- src/usr.bin/locate//locate/fastfind.c 25 Oct 2010 19:16:45 - 1.11 +++ src/usr.bin/locate//locate/fastfind.c 23 Nov 2014 16:49:47 - @@ -47,7 +47,7 @@ statistic (fp, path_fcodes) u_char *p, *s; int c; int count, umlaut; - u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN]; + u_char bigram1[NBG], bigram2[NBG], path[PATH_MAX]; for (c = 0, p = bigram1, s = bigram2; c NBG; c++) { p[c] = check_bigram_char(getc(fp)); @@ -140,7 +140,7 @@ fastfind int c, cc; int count, found, globflag; u_char *cutoff; - u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN]; + u_char bigram1[NBG], bigram2[NBG], path[PATH_MAX]; #ifdef FF_ICASE /* use a lookup table for case insensitive search */ Index: src/usr.bin/locate//locate/locate.c
Re: locate(1): ignore paths longer than MAXPATHLEN
On Sun, Nov 23, 2014 at 06:59:59PM +0100, Nicolas Bedos wrote: Index: src/usr.bin/locate//code/locate.code.c === RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v retrieving revision 1.17 diff -u -p -u -r1.17 locate.code.c --- src/usr.bin/locate//code/locate.code.c17 Nov 2013 20:19:36 - 1.17 +++ src/usr.bin/locate//code/locate.code.c23 Nov 2014 16:49:47 - @@ -171,6 +171,14 @@ main(int argc, char *argv[]) /* chop newline */ if (*cp == '\n') *cp = '\0'; + } + + /* skip truncated lines */ + if (cp == path + sizeof(buf2) - 1 *(cp-1) != '\0') { + while (fgets(path, sizeof(buf2), stdin) != NULL) + if (strchr(path, '\n') != NULL) + break; + continue; } /* Skip longest common prefix. */ I would recommend using fgetln for the actual line parsing. Then this kinda fragile code can be avoided (fragile: fgets and its users have a hard time to properly handle '\0' chars inside a file). See also: - fgetln(3) - http://undeadly.org/cgi?action=articlesid=20061027031811 Tobias