Re: locate(1): ignore paths longer than MAXPATHLEN

2014-11-26 Thread Nicolas Bedos
Last update, with Tobias's help.

The following diff 
- changes MAXPATHLEN from sys/param.h to PATH_MAX from limits.h
- adds a missing prototype for sane_count
- locate.bigram and locate.code now abort when reading a pathname
  exceeding PATH_MAX bytes on stdin
  

Index: src/usr.bin/locate//bigram/locate.bigram.c
===
RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v
retrieving revision 1.12
diff -u -p -u -r1.12 locate.bigram.c
--- src/usr.bin/locate//bigram/locate.bigram.c  27 Oct 2009 23:59:39 -  
1.12
+++ src/usr.bin/locate//bigram/locate.bigram.c  26 Nov 2014 18:15:13 -
@@ -43,30 +43,41 @@
  * Use 'code' to encode a file using this output.
  */
 
+#include err.h
+#include limits.h/* for PATH_MAX */
 #include stdio.h
 #include stdlib.h
-#include sys/param.h /* for MAXPATHLEN */
+#include string.h
 #include locate.h
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf[PATH_MAX] =  ;
 u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1];
 
 int
 main(void)
 {
u_char *cp;
-   u_char *oldpath = buf1, *path = buf2;
+   u_char *oldpath = buf, *path, *mbuf;
u_int i, j;
+   size_t len;
 
-   while (fgets(path, sizeof(buf2), stdin) != NULL) {
+   mbuf = NULL;
 
-   /*
-* We don't need remove newline character '\n'.
-* '\n' is less than ASCII_MIN and will be later
-* ignored at output.
-*/
+   while ((path=(u_char *)fgetln(stdin, len)) != NULL) {
+   if (path[len-1] == '\n')
+   path[len-1] = '\0';
+   else {
+   if ((mbuf=malloc(len+1)) == NULL)
+   err(1, malloc);
+   memcpy(mbuf, path, len);
+   mbuf[len] = '\0';
+   len++;
+   path = mbuf;
+   }
 
+   if (len  sizeof(buf))
+   errx(1, pathname exceeding %zu byte limit: %s,
+   sizeof(buf), path);
 
/* skip longest common prefix */
for (cp = path; *cp == *oldpath; cp++, oldpath++)
@@ -78,14 +89,7 @@ main(void)
cp += 2;
}
 
-   /* swap pointers */
-   if (path == buf1) {
-   path = buf2;
-   oldpath = buf1;
-   } else {
-   path = buf1;
-   oldpath = buf2;
-   }
+   memcpy(buf, path, len);
}
 
/* output, boundary check */
Index: src/usr.bin/locate//code/locate.code.c
===
RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v
retrieving revision 1.17
diff -u -p -u -r1.17 locate.code.c
--- src/usr.bin/locate//code/locate.code.c  17 Nov 2013 20:19:36 -  
1.17
+++ src/usr.bin/locate//code/locate.code.c  26 Nov 2014 18:15:13 -
@@ -78,10 +78,10 @@
  * Wolfram Schneider, Berlin September 1996
  */
 
-#include sys/param.h
 
 #include err.h
 #include errno.h
+#include limits.h
 #include stdio.h
 #include stdlib.h
 #include string.h
@@ -91,8 +91,7 @@
 
 #defineBGBUFSIZE   (NBG * 2)   /* size of bigram buffer */
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf[PATH_MAX] =  ;
 u_char bigrams[BGBUFSIZE + 1] = { 0 };
 
 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */
@@ -115,7 +114,8 @@ extern int optopt;
 int
 main(int argc, char *argv[])
 {
-   u_char *cp, *oldpath, *path;
+   u_char *cp, *oldpath, *path, *mbuf;
+   size_t len;
int ch, code, count, diffcount, oldcount;
FILE *fp;
int i, j;
@@ -156,23 +156,31 @@ main(int argc, char *argv[])
 
 #endif /* LOOKUP */
 
-   oldpath = buf1;
-   path = buf2;
+   oldpath = buf;
oldcount = 0;
+   mbuf = NULL;
 
-   while (fgets(path, sizeof(buf2), stdin) != NULL) {
-
+   while ((path=(u_char *)fgetln(stdin, len)) != NULL) {
/* skip empty lines */
if (*path == '\n')
continue;
 
-   /* remove newline */
-   for (cp = path; *cp != '\0'; cp++) {
-   /* chop newline */
-   if (*cp == '\n')
-   *cp = '\0';
+   if (path[len-1] == '\n') {
+   /* remove newline */
+   path[len-1] = '\0';
+   } else {
+   if ((mbuf = malloc(len+1)) == NULL)
+   err(1, malloc);
+   memcpy(mbuf, path, len);
+   mbuf[len] = '\0';
+   len++;
+   path = mbuf;
}
 
+   if (len  

Re: locate(1): ignore paths longer than MAXPATHLEN

2014-11-26 Thread Tobias Stoeckmann
On Wed, Nov 26, 2014 at 09:48:15PM +0100, Nicolas Bedos wrote:
 Last update, with Tobias's help.
 
 The following diff 
 - changes MAXPATHLEN from sys/param.h to PATH_MAX from limits.h
 - adds a missing prototype for sane_count
 - locate.bigram and locate.code now abort when reading a pathname
   exceeding PATH_MAX bytes on stdin

If somebody else agrees with this diff, I'll adjust the last
style-deviations and commit it.


Tobias



Re: locate(1): ignore paths longer than MAXPATHLEN

2014-11-25 Thread Nicolas Bedos
Tobias Stoeckmann wrote:
 I would free() it nontheless outside the while loop.  For the sake of
 faster review.  But that's just my opinion.
 
 Also, it would be nice if there is only one len/sizeof() check after
 fgetln.  Which means that the check should be done after the
 if/else-block.  Could happen that we malloc memory for nothing, but hey:
 It happens only once and just if there is a missing newline.
 
 And while at it, silently ignoring lines doesn't sound like a good idea.
 I would even go for an err() call; or at least warn().

Again, thank you for your suggestions. I didn't thought about factoring
out the len/sizeof comparisons. I have also added the free() call, and
an errx() call when a pathname is too long.


Nicolas Bedos


Index: src/usr.bin/locate//bigram/locate.bigram.c
===
RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v
retrieving revision 1.12
diff -u -p -u -r1.12 locate.bigram.c
--- src/usr.bin/locate//bigram/locate.bigram.c  27 Oct 2009 23:59:39 -  
1.12
+++ src/usr.bin/locate//bigram/locate.bigram.c  25 Nov 2014 18:46:34 -
@@ -43,13 +43,13 @@
  * Use 'code' to encode a file using this output.
  */
 
+#include limits.h/* for PATH_MAX */
 #include stdio.h
 #include stdlib.h
-#include sys/param.h /* for MAXPATHLEN */
 #include locate.h
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf1[PATH_MAX] =  ;
+u_char buf2[PATH_MAX];
 u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1];
 
 int
Index: src/usr.bin/locate//code/locate.code.c
===
RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v
retrieving revision 1.17
diff -u -p -u -r1.17 locate.code.c
--- src/usr.bin/locate//code/locate.code.c  17 Nov 2013 20:19:36 -  
1.17
+++ src/usr.bin/locate//code/locate.code.c  25 Nov 2014 18:46:34 -
@@ -78,10 +78,10 @@
  * Wolfram Schneider, Berlin September 1996
  */
 
-#include sys/param.h
 
 #include err.h
 #include errno.h
+#include limits.h
 #include stdio.h
 #include stdlib.h
 #include string.h
@@ -91,8 +91,7 @@
 
 #defineBGBUFSIZE   (NBG * 2)   /* size of bigram buffer */
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf[PATH_MAX] =  ;
 u_char bigrams[BGBUFSIZE + 1] = { 0 };
 
 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */
@@ -115,7 +114,8 @@ extern int optopt;
 int
 main(int argc, char *argv[])
 {
-   u_char *cp, *oldpath, *path;
+   u_char *cp, *oldpath, *path, *mbuf;
+   size_t len;
int ch, code, count, diffcount, oldcount;
FILE *fp;
int i, j;
@@ -156,23 +156,31 @@ main(int argc, char *argv[])
 
 #endif /* LOOKUP */
 
-   oldpath = buf1;
-   path = buf2;
+   oldpath = buf;
oldcount = 0;
+   mbuf = NULL;
 
-   while (fgets(path, sizeof(buf2), stdin) != NULL) {
-
+   while ((path=(u_char *)fgetln(stdin, len)) != NULL) {
/* skip empty lines */
if (*path == '\n')
continue;
 
-   /* remove newline */
-   for (cp = path; *cp != '\0'; cp++) {
-   /* chop newline */
-   if (*cp == '\n')
-   *cp = '\0';
+   if (path[len-1] == '\n') {
+   /* remove newline */
+   path[len-1] = '\0';
+   } else {
+   if ((mbuf = malloc(len+1)) == NULL)
+   err(1, malloc);
+   memcpy(mbuf, path, len);
+   mbuf[len] = '\0';
+   len++;
+   path = mbuf;
}
 
+   if (len  sizeof(buf))
+   errx(1, pathname exceeding %zu byte limit: %s,
+   sizeof(buf), path);
+
/* Skip longest common prefix. */
for (cp = path; *cp == *oldpath; cp++, oldpath++)
if (*cp == '\0')
@@ -222,14 +230,11 @@ main(int argc, char *argv[])
}
}
 
-   if (path == buf1) { /* swap pointers */
-   path = buf2;
-   oldpath = buf1;
-   } else {
-   path = buf1;
-   oldpath = buf2;
-   }
+   memcpy(buf, path, len);
}
+
+   free(mbuf);
+
/* Non-zero status if there were errors */
if (fflush(stdout) != 0 || ferror(stdout))
exit(1);
Index: src/usr.bin/locate//locate/fastfind.c
===
RCS file: /cvs/src/usr.bin/locate/locate/fastfind.c,v
retrieving revision 1.11
diff -u -p -u -r1.11 fastfind.c
--- src/usr.bin/locate//locate/fastfind.c   25 Oct 2010 19:16:45 -  

Re: locate(1): ignore paths longer than MAXPATHLEN

2014-11-24 Thread Nicolas Bedos
Tobias Stoeckmann wrote:
 I would recommend using fgetln for the actual line parsing.  Then this
 kinda fragile code can be avoided (fragile: fgets and its users have a
 hard time to properly handle '\0' chars inside a file).

Thank you for the advice. Here is an updated diff. It does indeed look
less 'fragile' !

In locate.code.c 'mbuf' is never free()d: it is only allocated for the
last line of input and after processing this line the program ends. I
hope it is ok.

Nicolas Bedos


Index: src/usr.bin/locate//bigram/locate.bigram.c
===
RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v
retrieving revision 1.12
diff -u -p -u -r1.12 locate.bigram.c
--- src/usr.bin/locate//bigram/locate.bigram.c  27 Oct 2009 23:59:39 -  
1.12
+++ src/usr.bin/locate//bigram/locate.bigram.c  24 Nov 2014 19:09:03 -
@@ -43,13 +43,13 @@
  * Use 'code' to encode a file using this output.
  */
 
+#include limits.h/* for PATH_MAX */
 #include stdio.h
 #include stdlib.h
-#include sys/param.h /* for MAXPATHLEN */
 #include locate.h
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf1[PATH_MAX] =  ;
+u_char buf2[PATH_MAX];
 u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1];
 
 int
Index: src/usr.bin/locate//code/locate.code.c
===
RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v
retrieving revision 1.17
diff -u -p -u -r1.17 locate.code.c
--- src/usr.bin/locate//code/locate.code.c  17 Nov 2013 20:19:36 -  
1.17
+++ src/usr.bin/locate//code/locate.code.c  24 Nov 2014 19:09:03 -
@@ -78,10 +78,10 @@
  * Wolfram Schneider, Berlin September 1996
  */
 
-#include sys/param.h
 
 #include err.h
 #include errno.h
+#include limits.h
 #include stdio.h
 #include stdlib.h
 #include string.h
@@ -91,8 +91,7 @@
 
 #defineBGBUFSIZE   (NBG * 2)   /* size of bigram buffer */
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf[PATH_MAX] =  ;
 u_char bigrams[BGBUFSIZE + 1] = { 0 };
 
 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */
@@ -115,7 +114,8 @@ extern int optopt;
 int
 main(int argc, char *argv[])
 {
-   u_char *cp, *oldpath, *path;
+   u_char *cp, *oldpath, *path, *mbuf;
+   size_t len;
int ch, code, count, diffcount, oldcount;
FILE *fp;
int i, j;
@@ -156,21 +156,31 @@ main(int argc, char *argv[])
 
 #endif /* LOOKUP */
 
-   oldpath = buf1;
-   path = buf2;
+   oldpath = buf;
oldcount = 0;
 
-   while (fgets(path, sizeof(buf2), stdin) != NULL) {
-
+   while ((path=(u_char *)fgetln(stdin, len)) != NULL) {
/* skip empty lines */
if (*path == '\n')
continue;
 
-   /* remove newline */
-   for (cp = path; *cp != '\0'; cp++) {
-   /* chop newline */
-   if (*cp == '\n')
-   *cp = '\0';
+   if (path[len-1] == '\n') {
+   /* skip lines exceeding PATH_MAX */
+   if (len  sizeof(buf))
+   continue;
+   /* remove newline */
+   path[len-1] = '\0';
+   } else {
+   /* skip lines exceeding PATH_MAX-1 (save one byte
+* to add the terminating '\0') */
+   if (len  sizeof(buf)-1)
+   continue;
+   if ((mbuf = malloc(len+1)) == NULL)
+   err(1, malloc);
+   memcpy(mbuf, path, len);
+   mbuf[len] = '\0';
+   len++;
+   path = mbuf;
}
 
/* Skip longest common prefix. */
@@ -222,13 +232,7 @@ main(int argc, char *argv[])
}
}
 
-   if (path == buf1) { /* swap pointers */
-   path = buf2;
-   oldpath = buf1;
-   } else {
-   path = buf1;
-   oldpath = buf2;
-   }
+   memcpy(buf, path, len);
}
/* Non-zero status if there were errors */
if (fflush(stdout) != 0 || ferror(stdout))
Index: src/usr.bin/locate//locate/fastfind.c
===
RCS file: /cvs/src/usr.bin/locate/locate/fastfind.c,v
retrieving revision 1.11
diff -u -p -u -r1.11 fastfind.c
--- src/usr.bin/locate//locate/fastfind.c   25 Oct 2010 19:16:45 -  
1.11
+++ src/usr.bin/locate//locate/fastfind.c   24 Nov 2014 19:09:04 -
@@ -47,7 +47,7 @@ statistic (fp, path_fcodes)
u_char *p, *s;
int c;
int count, umlaut;
-   u_char bigram1[NBG], 

Re: locate(1): ignore paths longer than MAXPATHLEN

2014-11-24 Thread Tobias Stoeckmann
On Mon, Nov 24, 2014 at 08:37:47PM +0100, Nicolas Bedos wrote:
 In locate.code.c 'mbuf' is never free()d: it is only allocated for the
 last line of input and after processing this line the program ends. I
 hope it is ok.

I would free() it nontheless outside the while loop.  For the sake of
faster review.  But that's just my opinion.

Also, it would be nice if there is only one len/sizeof() check after
fgetln.  Which means that the check should be done after the
if/else-block.  Could happen that we malloc memory for nothing, but hey:
It happens only once and just if there is a missing newline.

And while at it, silently ignoring lines doesn't sound like a good idea.
I would even go for an err() call; or at least warn().


Tobias



locate(1): ignore paths longer than MAXPATHLEN

2014-11-23 Thread Nicolas Bedos
Hello,

locate(1) doesn't handle paths longer than MAXPATHLEN well: they are
silently split into MAXPATHLEN-long strings and each string is included
in the database. Here is an example :

 $ cat loc_test.sh
testdir='/tmp/locate_maxpathlen'
db=$testdir/db

d='64_characters_long_directory_name___'
long_path=$testdir/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d/$d

mkdir -p $long_path || exit 1

cd $long_path
touch short_filename
touch long_filename_which_gets_truncated

/usr/libexec/locate.updatedb --fcodes=$db \
 --searchpaths=$testdir /usr/src \
|| exit 1

locate -d $db filename truncated | grep  -v ^/usr/src

 $ sh loc_test.sh
/tmp/locate_maxpathlen/64_char_[... SNIP ...]_/long_filename_which_gets_
/tmp/locate_maxpathlen/64_char_[... SNIP ...]_/short_filename
truncated

We have just created the path truncated in the locate database.


The diff below includes the following changes :
- locate(1) now silently ignores paths longer than MAXPATHLEN
- MAXPATHLEN from sys/param.h is replaced by PATH_MAX from
limits.h
- add a missing prototype for sane_count()

Another solution would be to modify mklocatedb.sh or updatedb.sh to
filter out those lng paths before they are fed to locate.code, but I
can't see a way to achieve this that doesn't require hardcoding PATH_MAX?

I ran /etc/weekly with the new binaries and didn't notice any
unexpected changes in the locate database.


Nicolas Bedos


Index: src/usr.bin/locate//bigram/locate.bigram.c
===
RCS file: /cvs/src/usr.bin/locate/bigram/locate.bigram.c,v
retrieving revision 1.12
diff -u -p -u -r1.12 locate.bigram.c
--- src/usr.bin/locate//bigram/locate.bigram.c  27 Oct 2009 23:59:39 -  
1.12
+++ src/usr.bin/locate//bigram/locate.bigram.c  23 Nov 2014 16:49:47 -
@@ -43,13 +43,13 @@
  * Use 'code' to encode a file using this output.
  */
 
+#include limits.h/* for PATH_MAX */
 #include stdio.h
 #include stdlib.h
-#include sys/param.h /* for MAXPATHLEN */
 #include locate.h
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf1[PATH_MAX] =  ;
+u_char buf2[PATH_MAX];
 u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1];
 
 int
Index: src/usr.bin/locate//code/locate.code.c
===
RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v
retrieving revision 1.17
diff -u -p -u -r1.17 locate.code.c
--- src/usr.bin/locate//code/locate.code.c  17 Nov 2013 20:19:36 -  
1.17
+++ src/usr.bin/locate//code/locate.code.c  23 Nov 2014 16:49:47 -
@@ -78,10 +78,10 @@
  * Wolfram Schneider, Berlin September 1996
  */
 
-#include sys/param.h
 
 #include err.h
 #include errno.h
+#include limits.h
 #include stdio.h
 #include stdlib.h
 #include string.h
@@ -91,8 +91,8 @@
 
 #defineBGBUFSIZE   (NBG * 2)   /* size of bigram buffer */
 
-u_char buf1[MAXPATHLEN] =  ;
-u_char buf2[MAXPATHLEN];
+u_char buf1[PATH_MAX] =  ;
+u_char buf2[PATH_MAX];
 u_char bigrams[BGBUFSIZE + 1] = { 0 };
 
 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */
@@ -171,6 +171,14 @@ main(int argc, char *argv[])
/* chop newline */
if (*cp == '\n')
*cp = '\0';
+   }
+
+   /* skip truncated lines */
+   if (cp == path + sizeof(buf2) - 1  *(cp-1) != '\0') {
+   while (fgets(path, sizeof(buf2), stdin) != NULL)
+   if (strchr(path, '\n') != NULL)
+   break;
+   continue;
}
 
/* Skip longest common prefix. */
Index: src/usr.bin/locate//locate/fastfind.c
===
RCS file: /cvs/src/usr.bin/locate/locate/fastfind.c,v
retrieving revision 1.11
diff -u -p -u -r1.11 fastfind.c
--- src/usr.bin/locate//locate/fastfind.c   25 Oct 2010 19:16:45 -  
1.11
+++ src/usr.bin/locate//locate/fastfind.c   23 Nov 2014 16:49:47 -
@@ -47,7 +47,7 @@ statistic (fp, path_fcodes)
u_char *p, *s;
int c;
int count, umlaut;
-   u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN];
+   u_char bigram1[NBG], bigram2[NBG], path[PATH_MAX];
 
for (c = 0, p = bigram1, s = bigram2; c  NBG; c++) {
p[c] = check_bigram_char(getc(fp));
@@ -140,7 +140,7 @@ fastfind
int c, cc;
int count, found, globflag;
u_char *cutoff;
-   u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN];
+   u_char bigram1[NBG], bigram2[NBG], path[PATH_MAX];
 
 #ifdef FF_ICASE
/* use a lookup table for case insensitive search */
Index: src/usr.bin/locate//locate/locate.c

Re: locate(1): ignore paths longer than MAXPATHLEN

2014-11-23 Thread Tobias Stoeckmann
On Sun, Nov 23, 2014 at 06:59:59PM +0100, Nicolas Bedos wrote:
 Index: src/usr.bin/locate//code/locate.code.c
 ===
 RCS file: /cvs/src/usr.bin/locate/code/locate.code.c,v
 retrieving revision 1.17
 diff -u -p -u -r1.17 locate.code.c
 --- src/usr.bin/locate//code/locate.code.c17 Nov 2013 20:19:36 -  
 1.17
 +++ src/usr.bin/locate//code/locate.code.c23 Nov 2014 16:49:47 -
 @@ -171,6 +171,14 @@ main(int argc, char *argv[])
   /* chop newline */
   if (*cp == '\n')
   *cp = '\0';
 + }
 +
 + /* skip truncated lines */
 + if (cp == path + sizeof(buf2) - 1  *(cp-1) != '\0') {
 + while (fgets(path, sizeof(buf2), stdin) != NULL)
 + if (strchr(path, '\n') != NULL)
 + break;
 + continue;
   }
  
   /* Skip longest common prefix. */

I would recommend using fgetln for the actual line parsing.  Then this
kinda fragile code can be avoided (fragile: fgets and its users have a
hard time to properly handle '\0' chars inside a file).

See also:
- fgetln(3)
- http://undeadly.org/cgi?action=articlesid=20061027031811


Tobias