2015-10-27 11:51 GMT+01:00 Ted Unangst <t...@tedunangst.com>: > This adds a quite limited understanding of utf-8 to hexdump. I've found it > helpful trying to see exactly what's coming out of some utilities instead of > trying to decode utf-8 by hand. > > Index: display.c > =================================================================== > RCS file: /cvs/src/usr.bin/hexdump/display.c,v > retrieving revision 1.21 > diff -u -p -r1.21 display.c > --- display.c 16 Jan 2015 06:40:08 -0000 1.21 > +++ display.c 27 Oct 2015 10:50:09 -0000 > @@ -106,6 +106,17 @@ display(void) > } > } > > +static int > +isu8cont(unsigned char c) > +{ > + return (c & 0xc0) == 0x80; > +} > +static int > +isu8start(unsigned char c) > +{ > + return (c & 0xc0) == 0xc0; > +} > + > static __inline void > print(PR *pr, u_char *bp) > { > @@ -163,7 +174,16 @@ print(PR *pr, u_char *bp) > } > break; > case F_P: > - (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); > + if (isu8start(*bp)) { > + unsigned char *pp = bp + 1; > + (void)printf(pr->fmt, *bp); > + while (isu8cont(*pp)) > + (void)printf(pr->fmt, *pp++); > + } else if (isu8cont(*bp)) { > + (void)printf(pr->fmt, ' '); > + } else { > + (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); > + } > break; > case F_STR: > (void)printf(pr->fmt, (char *)bp);
So in the case of a mangled UTF-8 sequence like the following, you'll print more bytes than you should (two at most), no?

    110xxxxx 10xxxxxx 10xxxxxx

--
WBR,
Vadim Zhukov