Bug#440951: isutf8 accepts overlong UTF-8 sequences

2007-11-12 Thread Lars Wirzenius
package moreutils
tag 440951 patch
thanks

Attached is a patch to fix bug #440951 (isutf8 accepts overlong UTF-8
sequences) in isutf8, part of the moreutils package. I ended up
re-implementing the core logic of the program, but on the other hand it
is now clearer than before (read: I couldn't understand the old code
without some effort).

Joey, should I commit directly to svn or can you apply and commit this
patch? (I can provide the debian/changelog change as well, if you like.)

=== modified file 'check-isutf8'
--- check-isutf8	2006-05-27 23:18:29 +
+++ check-isutf8	2007-10-28 13:59:54 +
@@ -27,5 +27,6 @@
 check 1 '\xc2'
 check 1 '\xc2\x20'
 check 1 '\x20\xc2'
+check 1 '\300\200'
 
 exit $failed

=== modified file 'isutf8.c'
--- isutf8.c	2006-03-08 03:19:40 +
+++ isutf8.c	2007-11-10 23:21:59 +
@@ -18,72 +18,199 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  */
 
+#include assert.h
 #include stdio.h
 #include stdlib.h
 #include errno.h
 #include string.h
 #include getopt.h
 
-#define VERSION 1.0
+
+#define VERSION 1.1
+
+
+/*
+ * Code to indicate an invalid UTF8 character.
+ */
+enum { INVALID_CHAR = 0x };
+
+
+/*
+ * Produce shortest UTF8 encoding of a 31-bit value in 'u', returning it
+ * in the array 'buf'. Return the number of bytes in the encoded value.
+ * If the value is too large (more than 32 bits or would take more than
+ * 'maxbytes' bytes), return -1.
+ */
+static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
+{
+static const struct {
+int nbytes;
+unsigned long max;
+} tab[] = {
+{ 1, 0x007F },
+{ 2, 0x07FF },
+{ 3, 0x },
+{ 4, 0x001F },
+{ 5, 0x03FF },
+{ 6, 0x7FFF },
+};
+static const int ntab = sizeof(tab) / sizeof(tab[0]);
+int i, j;
+
+if (u  tab[ntab-1].max)
+return -1;
+
+for (i = 0; i  ntab; ++i) {
+if (u = tab[i].max)
+break;
+}
+assert(i  ntab);
+
+if (tab[i].nbytes  maxbytes)
+return -1;
+
+if (tab[i].nbytes == 1) { /* Special case */
+buf[0] = u;
+} else {
+for (j = tab[i].nbytes-1; j  0; --j) {
+buf[j] = 0x80 | (u  0x3f);
+u = 6;
+}
+
+unsigned char mask = ~(0xFF  tab[i].nbytes);
+buf[0] = mask | u;
+}
+
+return tab[i].nbytes;
+}
+
 
 /* 
+ * Return number of ones at the top of a byte.
+ *
  * I'm pretty sure there is a fancy trick to do this without a loop,
  * but I'm too tired to figure it out now. --liw
  */
 static int high_ones(int c) {
-	int n;
-
-	for (n = 0; (c  0x80) == 0x80; c = 1)
-		++n;	
-	return n;
-}
-
+int n;
+
+for (n = 0; (c  0x80) == 0x80; c = 1)
+++n;
+return n;
+}
+
+
+/*
+ * Decode a UTF8 character from an array of bytes. Return character code.
+ * Upon error, return INVALID_CHAR.
+ */
+static unsigned long decodeutf8(unsigned char *buf, int nbytes)
+{
+unsigned long u;
+int i, j;
+
+if (nbytes = 0)
+return INVALID_CHAR;
+
+if (nbytes == 1) {
+if (buf[0] = 0x80)
+return INVALID_CHAR;
+return buf[0];
+}
+
+i = high_ones(buf[0]);
+if (i != nbytes)
+return INVALID_CHAR;
+u = buf[0]  (0xff  i);
+for (j = 1; j  nbytes; ++j) {
+if ((buf[j]  0xC0) != 0x80)
+return INVALID_CHAR;
+u = (u  6) | (buf[j]  0x3f);
+}
+return u;
+}
+
+
+/*
+ * Determine if the contents of an open file form a valid UTF8 byte stream.
+ * Do this by collecting bytes for a character into a buffer and then
+ * decode the bytes and re-encode them and compare that they are identical
+ * to the original bytes. If any step fails, return 0 for error. If EOF
+ * is reached, return 1 for OK.
+ */
 static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
-	int c, n, remaining_bytes;
-	unsigned long line, col;
-	
-	remaining_bytes = 0;
-	line = 1;
-	col = 1;
-	while ((c = getc(file)) != EOF) {
-		n = high_ones(c);
-		if (remaining_bytes  0) {
-			if (n == 1) {
---remaining_bytes;
-if (remaining_bytes == 0)
-	++col;
-			} else
-goto error;
-		} else if (n == 0) {
-			/* 7-bit character, skip, but adjust position */
-			if (c == '\n') {
-++line;
-col = 1;
-			} else
-++col;
-		} else if (n == 1)
-			goto error; /* wrong place for continuation byte */
-		else
-			remaining_bytes = n - 1; /* start of multi-byte sequence */
-	}
-	if (remaining_bytes  0)
-		goto error;
+enum { MAX_UTF8_BYTES = 6 };
+unsigned char buf

Bug#440951: isutf8 accepts overlong UTF-8 sequences

2007-09-05 Thread Jakub Wilk

Package: moreutils
Version: 0.20
Severity: normal

$ man utf-8 | grep 'Security' -A 9 | sed -e '1d; s/^ *//'
The Unicode and UCS standards require that producers of UTF-8 shall use
the shortest form possible, for example, producing a two-byte  sequence
with  first  byte  0xc0  is  non-conforming.  Unicode 3.1 has added the
requirement that conforming programs must not accept non-shortest forms
in their input.  This is for security reasons: if user input is checked
for possible security violations, a program might check  only  for  the
ASCII  version of "/../" or ";" or NUL and overlook that there are many
non-ASCII ways to represent these things in a non-shortest UTF-8 encod-
ing.

$ printf '\300\200' | iconv -f UTF-8 -t UTF-8
iconv: illegal input sequence at position 0

$ printf '\300\200' | isutf8 && echo valid
valid

-- System Information:
Debian Release: lenny/sid
  APT prefers testing
  APT policy: (900, 'testing'), (600, 'unstable'), (500, 'experimental')
Architecture: i386 (i686)

Kernel: Linux 2.6.21-2-686 (SMP w/1 CPU core)
Locale: LANG=C, LC_CTYPE=pl_PL (charmap=ISO-8859-2)
Shell: /bin/sh linked to /bin/dash

Versions of packages moreutils depends on:
ii  libc6 2.6.1-1+b1 GNU C Library: Shared libraries
ii  perl  5.8.8-7    Larry Wall's Practical Extraction


moreutils recommends no packages.

-- no debconf information

--
Jakub Wilk


--
To UNSUBSCRIBE, email to [EMAIL PROTECTED]
with a subject of unsubscribe. Trouble? Contact [EMAIL PROTECTED]