Re: mini utf-8 hexdump
2015-10-27 11:51 GMT+01:00 Ted Unangst: > This adds a quite limited understanding of utf-8 to hexdump. I've found it > helpful trying to see exactly what's coming out of some utilities instead of > trying to decode utf-8 by hand. > > Index: display.c > === > RCS file: /cvs/src/usr.bin/hexdump/display.c,v > retrieving revision 1.21 > diff -u -p -r1.21 display.c > --- display.c 16 Jan 2015 06:40:08 - 1.21 > +++ display.c 27 Oct 2015 10:50:09 - > @@ -106,6 +106,17 @@ display(void) > } > } > > +static int > +isu8cont(unsigned char c) > +{ > + return (c & 0xc0) == 0x80; > +} > +static int > +isu8start(unsigned char c) > +{ > + return (c & 0xc0) == 0xc0; > +} > + > static __inline void > print(PR *pr, u_char *bp) > { > @@ -163,7 +174,16 @@ print(PR *pr, u_char *bp) > } > break; > case F_P: > - (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); > + if (isu8start(*bp)) { > + unsigned char *pp = bp + 1; > + (void)printf(pr->fmt, *bp); > + while (isu8cont(*pp)) > + (void)printf(pr->fmt, *pp++); > + } else if (isu8cont(*bp)) { > + (void)printf(pr->fmt, ' '); > + } else { > + (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); > + } > break; > case F_STR: > (void)printf(pr->fmt, (char *)bp); So in the case of a mangled UTF-8 codepoint like the following, you'll print more bytes than you should (two at most), no? 110x 10xx 10xx -- WBR, Vadim Zhukov
Re: mini utf-8 hexdump
Stuart Henderson wrote: > On 2015/10/27 06:51, Ted Unangst wrote: > > This adds a quite limited understanding of utf-8 to hexdump. I've found it > > helpful trying to see exactly what's coming out of some utilities instead of > > trying to decode utf-8 by hand. > > Should it only do this for a utf-8 terminal? It should probably only do this, if at all, when the imaginary -u option is specified, but it's good enough for noodling around.
Re: mini utf-8 hexdump
On 2015/10/27 06:51, Ted Unangst wrote: > This adds a quite limited understanding of utf-8 to hexdump. I've found it > helpful trying to see exactly what's coming out of some utilities instead of > trying to decode utf-8 by hand. Should it only do this for a utf-8 terminal? > Index: display.c > === > RCS file: /cvs/src/usr.bin/hexdump/display.c,v > retrieving revision 1.21 > diff -u -p -r1.21 display.c > --- display.c 16 Jan 2015 06:40:08 - 1.21 > +++ display.c 27 Oct 2015 10:50:09 - > @@ -106,6 +106,17 @@ display(void) > } > } > > +static int > +isu8cont(unsigned char c) > +{ > + return (c & 0xc0) == 0x80; > +} > +static int > +isu8start(unsigned char c) > +{ > + return (c & 0xc0) == 0xc0; > +} > + > static __inline void > print(PR *pr, u_char *bp) > { > @@ -163,7 +174,16 @@ print(PR *pr, u_char *bp) > } > break; > case F_P: > - (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); > + if (isu8start(*bp)) { > + unsigned char *pp = bp + 1; > + (void)printf(pr->fmt, *bp); > + while (isu8cont(*pp)) > + (void)printf(pr->fmt, *pp++); > + } else if (isu8cont(*bp)) { > + (void)printf(pr->fmt, ' '); > + } else { > + (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); > + } > break; > case F_STR: > (void)printf(pr->fmt, (char *)bp); >
mini utf-8 hexdump
This adds a quite limited understanding of utf-8 to hexdump. I've found it helpful trying to see exactly what's coming out of some utilities instead of trying to decode utf-8 by hand. Index: display.c === RCS file: /cvs/src/usr.bin/hexdump/display.c,v retrieving revision 1.21 diff -u -p -r1.21 display.c --- display.c 16 Jan 2015 06:40:08 - 1.21 +++ display.c 27 Oct 2015 10:50:09 - @@ -106,6 +106,17 @@ display(void) } } +static int +isu8cont(unsigned char c) +{ + return (c & 0xc0) == 0x80; +} +static int +isu8start(unsigned char c) +{ + return (c & 0xc0) == 0xc0; +} + static __inline void print(PR *pr, u_char *bp) { @@ -163,7 +174,16 @@ print(PR *pr, u_char *bp) } break; case F_P: - (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); + if (isu8start(*bp)) { + unsigned char *pp = bp + 1; + (void)printf(pr->fmt, *bp); + while (isu8cont(*pp)) + (void)printf(pr->fmt, *pp++); + } else if (isu8cont(*bp)) { + (void)printf(pr->fmt, ' '); + } else { + (void)printf(pr->fmt, isprint(*bp) ? *bp : '.'); + } break; case F_STR: (void)printf(pr->fmt, (char *)bp);