Re: mini utf-8 hexdump

2015-10-27 Thread Vadim Zhukov
2015-10-27 11:51 GMT+01:00 Ted Unangst :
> This adds a quite limited understanding of utf-8 to hexdump. I've found it
> helpful trying to see exactly what's coming out of some utilities instead of
> trying to decode utf-8 by hand.
>
> Index: display.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/hexdump/display.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 display.c
> --- display.c   16 Jan 2015 06:40:08 -0000  1.21
> +++ display.c   27 Oct 2015 10:50:09 -0000
> @@ -106,6 +106,17 @@ display(void)
> }
>  }
>
> +static int
> +isu8cont(unsigned char c)
> +{
> +   return (c & 0xc0) == 0x80;
> +}
> +static int
> +isu8start(unsigned char c)
> +{
> +   return (c & 0xc0) == 0xc0;
> +}
> +
>  static __inline void
>  print(PR *pr, u_char *bp)
>  {
> @@ -163,7 +174,16 @@ print(PR *pr, u_char *bp)
> }
> break;
> case F_P:
> -   (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
> +   if (isu8start(*bp)) {
> +   unsigned char *pp = bp + 1;
> +   (void)printf(pr->fmt, *bp);
> +   while (isu8cont(*pp))
> +   (void)printf(pr->fmt, *pp++);
> +   } else if (isu8cont(*bp)) {
> +   (void)printf(pr->fmt, ' ');
> +   } else {
> +   (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
> +   }
> break;
> case F_STR:
> (void)printf(pr->fmt, (char *)bp);

So in the case of a mangled UTF-8 sequence like the following, you'll
print more bytes than you should (two at most), no?

110xxxxx
10xxxxxx
10xxxxxx


--
  WBR,
  Vadim Zhukov



Re: mini utf-8 hexdump

2015-10-27 Thread Ted Unangst
Stuart Henderson wrote:
> On 2015/10/27 06:51, Ted Unangst wrote:
> > This adds a quite limited understanding of utf-8 to hexdump. I've found it
> > helpful trying to see exactly what's coming out of some utilities instead of
> > trying to decode utf-8 by hand.
> 
> Should it only do this for a utf-8 terminal?

It should probably only do this, if at all, when the imaginary -u option is
specified, but it's good enough for noodling around.



Re: mini utf-8 hexdump

2015-10-27 Thread Stuart Henderson
On 2015/10/27 06:51, Ted Unangst wrote:
> This adds a quite limited understanding of utf-8 to hexdump. I've found it
> helpful trying to see exactly what's coming out of some utilities instead of
> trying to decode utf-8 by hand.

Should it only do this for a utf-8 terminal?

> Index: display.c
> ===================================================================
> RCS file: /cvs/src/usr.bin/hexdump/display.c,v
> retrieving revision 1.21
> diff -u -p -r1.21 display.c
> --- display.c 16 Jan 2015 06:40:08 -0000  1.21
> +++ display.c 27 Oct 2015 10:50:09 -0000
> @@ -106,6 +106,17 @@ display(void)
>   }
>  }
>  
> +static int
> +isu8cont(unsigned char c)
> +{
> + return (c & 0xc0) == 0x80;
> +}
> +static int
> +isu8start(unsigned char c)
> +{
> + return (c & 0xc0) == 0xc0;
> +}
> +
>  static __inline void
>  print(PR *pr, u_char *bp)
>  {
> @@ -163,7 +174,16 @@ print(PR *pr, u_char *bp)
>   }
>   break;
>   case F_P:
> - (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
> + if (isu8start(*bp)) {
> + unsigned char *pp = bp + 1;
> + (void)printf(pr->fmt, *bp);
> + while (isu8cont(*pp))
> + (void)printf(pr->fmt, *pp++);
> + } else if (isu8cont(*bp)) {
> + (void)printf(pr->fmt, ' ');
> + } else {
> + (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
> + }
>   break;
>   case F_STR:
>   (void)printf(pr->fmt, (char *)bp);
> 



mini utf-8 hexdump

2015-10-27 Thread Ted Unangst
This adds a quite limited understanding of utf-8 to hexdump. I've found it
helpful trying to see exactly what's coming out of some utilities instead of
trying to decode utf-8 by hand.

Index: display.c
===================================================================
RCS file: /cvs/src/usr.bin/hexdump/display.c,v
retrieving revision 1.21
diff -u -p -r1.21 display.c
--- display.c   16 Jan 2015 06:40:08 -0000  1.21
+++ display.c   27 Oct 2015 10:50:09 -0000
@@ -106,6 +106,17 @@ display(void)
}
 }
 
+static int
+isu8cont(unsigned char c)
+{
+   return (c & 0xc0) == 0x80;
+}
+static int
+isu8start(unsigned char c)
+{
+   return (c & 0xc0) == 0xc0;
+}
+
 static __inline void
 print(PR *pr, u_char *bp)
 {
@@ -163,7 +174,16 @@ print(PR *pr, u_char *bp)
}
break;
case F_P:
-   (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
+   if (isu8start(*bp)) {
+   unsigned char *pp = bp + 1;
+   (void)printf(pr->fmt, *bp);
+   while (isu8cont(*pp))
+   (void)printf(pr->fmt, *pp++);
+   } else if (isu8cont(*bp)) {
+   (void)printf(pr->fmt, ' ');
+   } else {
+   (void)printf(pr->fmt, isprint(*bp) ? *bp : '.');
+   }
break;
case F_STR:
(void)printf(pr->fmt, (char *)bp);