This adds a deliberately limited understanding of UTF-8 to hexdump. I've found
it helpful when trying to see exactly what's coming out of some utilities,
instead of trying to decode UTF-8 by hand.

Index: display.c
===================================================================
RCS file: /cvs/src/usr.bin/hexdump/display.c,v
retrieving revision 1.21
diff -u -p -r1.21 display.c
--- display.c   16 Jan 2015 06:40:08 -0000      1.21
+++ display.c   27 Oct 2015 10:50:09 -0000
@@ -106,6 +106,17 @@ display(void)
        }
 }
 
+static int     /* nonzero iff c has the UTF-8 continuation-byte bit pattern */
+isu8cont(unsigned char c)
+{
+       return (c & 0xc0) == 0x80;      /* top two bits are 10, i.e. 0x80-0xbf */
+}
+static int     /* nonzero iff the top two bits of c are both set (0xc0-0xff) */
+isu8start(unsigned char c)
+{      /* NOTE(review): also accepts 0xc0, 0xc1 and 0xf5-0xff, which never occur in valid UTF-8 */
+       return (c & 0xc0) == 0xc0;      /* bit pattern 11xxxxxx */
+}
+
 static __inline void
 print(PR *pr, u_char *bp)
 {
@@ -163,7 +174,16 @@ print(PR *pr, u_char *bp)
                }
                break;
        case F_P:
-               (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
+               if (isu8start(*bp)) {
+                       unsigned char *pp = bp + 1;
+                       (void)printf(pr->fmt, *bp);
+                       while (isu8cont(*pp))
+                               (void)printf(pr->fmt, *pp++);
+               } else if (isu8cont(*bp)) {
+                       (void)printf(pr->fmt, ' ');
+               } else {
+                       (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
+               }
                break;
        case F_STR:
                (void)printf(pr->fmt, (char *)bp);

Reply via email to