While reading preconv.c two peculiarities catched my eye:

1. The preconv_encode() function does manual byteswapping where none is
necessary (and harmful). While the bit-shifting used to construct the
value in 'accum' might seem endian specific on first sight, it is not.
The bug manifests itself like this (\xc3\xbc being the utf8 sequence for
umlaut u):

$ printf '.TH UMLAUT 7 2014-11-05\n.SH SYNOPSIS\n\xc3\xbc\n'|mandoc -Tutf8
mandoc: <stdin>:3:1: WARNING: invalid escape sequence: \[uFC00000
...

The correct unicode codepoint would be U+00FC, not U+FC000000. 'accum'
already contained the correct value _before_ byteswapping.

2. The code in preconv_encode() assumes 'accum' to be a 32 bit integer,
but the type declared is 'unsigned int'. While int is 32 bit on all
platforms that OpenBSD supports, it is not guaranteed by the standard.
I think the correct type to use would therefore be uint_least32_t.

Below is a diff to fix both issues (tested on macppc).

cheers,
natano


Index: preconv.c
===================================================================
RCS file: /cvs/src/usr.bin/mandoc/preconv.c,v
retrieving revision 1.2
diff -u -r1.2 preconv.c
--- preconv.c   1 Nov 2014 04:07:25 -0000       1.2
+++ preconv.c   13 Nov 2014 09:25:05 -0000
@@ -20,6 +20,7 @@
 
 #include <stdio.h>
 #include <string.h>
+#include <stdint.h>
 #include "mandoc.h"
 #include "libmandoc.h"
 
@@ -28,9 +29,8 @@
     int *filenc)
 {
        size_t           i;
-       const long       one = 1L;
-       int              state, be;
-       unsigned int     accum;
+       int              state;
+       uint_least32_t   accum;
        unsigned char    cu;
 
        if ( ! (*filenc & MPARSE_UTF8))
@@ -38,12 +38,6 @@
 
        state = 0;
        accum = 0U;
-       be = 0;
-
-       /* Quick test for big-endian value. */
-
-       if ( ! (*((const char *)(&one))))
-               be = 1;
 
        for (i = *ii; i < ib->sz; i++) {
                cu = ib->buf[i];
@@ -65,24 +59,11 @@
                        if (state)
                                continue;
 
-                       /*
-                        * Accum is held in little-endian order as
-                        * stipulated by the UTF-8 sequence coding.  We
-                        * need to convert to a native big-endian if our
-                        * architecture requires it.
-                        */
-
-                       if (be)
-                               accum = (accum >> 24) |
-                                       ((accum << 8) & 0x00FF0000) |
-                                       ((accum >> 8) & 0x0000FF00) |
-                                       (accum << 24);
-
                        if (accum < 0x80)
                                ob->buf[(*oi)++] = accum;
                        else
                                *oi += snprintf(ob->buf + *oi,
-                                   11, "\\[u%.4X]", accum);
+                                   11, "\\[u%.4jX]", (uintmax_t)accum);
                        *ii = i + 1;
                        *filenc &= ~MPARSE_LATIN1;
                        return(1);

Reply via email to