On Sun, Jul 15, 2012 at 4:07 AM, Roland Mainz <[email protected]> wrote:
> On Fri, Jul 13, 2012 at 3:58 PM, David Korn <[email protected]> wrote:
>> cc:  [email protected]
>> Subject: Re: [ast-developers] RFE: New "wc" option "-X" which counts number 
>> of  bytes which do not constitute valid multibyte characters...
>>
>> Would -X automatically enable -c?
>
> Erm... -c is for plain bytes while -X should just count the number of
> bytes not covered by -m/-C.
>
> AFAIK all "wc" counting options count independently (e.g. -c/-m/-w/-l
> can all be used in one command line (at least with GNU "wc"... AST
> "wc" doesn't like having both -c and -m at the same command line)) ...
> -X would be an exception because it basically "feeds on the remainder"
> of -m/-C ...
>
>> Would the output contain the other count or just the invalid character count?
>
> It is the count of _bytes_ which do not make a valid multibyte
> character (technically it can happen in the "C"/"POSIX" locales,
> too... since both only cover bytes 0-127... making 128-255 invalid
> character values).

Attached (as "wc_count_invalidchars001.diff.txt") is a prototype patch
which implements wc -X to count invalid (multibyte) characters.
A possible testcase would look like this (erm... is the "7" correct ?):
-- snip --
$ LC_ALL=en_US.UTF-8 ~/bin/ksh -c 'builtin wc ; printf
"a\xe1kkkk\xe2xLl\n" | wc -m -X -q'
       7       2
-- snip --

Notes:
- -X currently only works with -m/-C, i.e. when characters (not bytes)
are being counted. -X could work with -c (=print byte count), too, if
-m/-C can be enabled internally. This may be useful even in
single-byte locales since functions like |mbtowc()| should AFAIK
complain in cases when a byte does not represent a valid character
value (I'll test this later today).
- It would be nice (assuming the POSIX/SUS standards allow it) if
both -c and -m/-C could be enabled at the same time. Is there anything
which disallows this from the standard's side?

----

Bye,
Roland

-- 
  __ .  . __
 (o.\ \/ /.o) [email protected]
  \__\/\/__/  MPEG specialist, C&&JAVA&&Sun&&Unix programmer
  /O /==\ O\  TEL +49 641 3992797
 (;O/ \/ \O;)
diff -r -u original/src/lib/libcmd/wc.c 
build_i386_64bit_debug/src/lib/libcmd/wc.c
--- src/lib/libcmd/wc.c 2012-01-10 19:57:19.000000000 +0100
+++ src/lib/libcmd/wc.c 2012-11-21 14:36:32.648577898 +0100
@@ -27,7 +27,7 @@
  */
 
 static const char usage[] =
-"[-?\n@(#)$Id: wc (AT&T Research) 2009-11-28 $\n]"
+"[-?\n@(#)$Id: wc (AT&T Research) 2012-11-19 $\n]"
 USAGE_LICENSE
 "[+NAME?wc - print the number of bytes, words, and lines in files]"
 "[+DESCRIPTION?\bwc\b reads one or more input files and, by default, "
@@ -49,6 +49,7 @@
 "[w:words?List the word counts.]"
 "[c:bytes|chars:chars?List the byte counts.]"
 "[m|C:multibyte-chars?List the character counts.]"
+"[X:invalid-chars?List the counts of bytes not constituting a valid 
character.]"
 "[q:quiet?Suppress invalid multibyte character warnings.]"
 "[L:longest-line|max-line-length?List the longest line length; the newline,"
     "if any, is not counted in the length.]"
@@ -79,6 +80,8 @@
                sfprintf(sfstdout," %7I*d",sizeof(wp->words),wp->words);
        if (mode&WC_CHARS)
                sfprintf(sfstdout," %7I*d",sizeof(wp->chars),wp->chars);
+       if (mode&WC_INVALIDMBYTE)
+               sfprintf(sfstdout," 
%7I*d",sizeof(wp->invalidchars),wp->invalidchars);
        if (mode&WC_LONGEST)
                sfprintf(sfstdout," %7I*d",sizeof(wp->chars),wp->longest);
        if (name)
@@ -93,7 +96,7 @@
        register int    mode=0, n;
        register Wc_t   *wp;
        Sfio_t          *fp;
-       Sfoff_t         tlines=0, twords=0, tchars=0;
+       Sfoff_t         tlines=0, twords=0, tchars=0, tichars=0;
        struct stat     statb;
 
        cmdinit(argc, argv, context, ERROR_CATALOG, 0);
@@ -118,6 +121,9 @@
                case 'C':
                        mode |= WC_MBYTE;
                        continue;
+               case 'X':
+                       mode |= WC_INVALIDMBYTE;
+                       continue;
                case 'q':
                        mode |= WC_QUIET;
                        continue;
@@ -144,7 +150,7 @@
                        mode &= ~WC_MBYTE;
                mode |= WC_CHARS;
        }
-       if (!(mode&(WC_WORDS|WC_CHARS|WC_LINES|WC_MBYTE|WC_LONGEST)))
+       if 
(!(mode&(WC_WORDS|WC_CHARS|WC_LINES|WC_MBYTE|WC_INVALIDMBYTE|WC_LONGEST)))
                mode |= (WC_WORDS|WC_CHARS|WC_LINES);
        if (!(wp = wc_init(mode)))
                error(3,"internal error");
@@ -162,7 +168,7 @@
                }
                if (cp)
                        n++;
-               if (!(mode&(WC_WORDS|WC_LINES|WC_MBYTE|WC_LONGEST)) && 
fstat(sffileno(fp),&statb)>=0
+               if 
(!(mode&(WC_WORDS|WC_LINES|WC_MBYTE|WC_INVALIDMBYTE|WC_LONGEST)) && 
fstat(sffileno(fp),&statb)>=0
                         && S_ISREG(statb.st_mode))
                {
                        wp->chars = statb.st_size - lseek(sffileno(fp),0L,1);
@@ -172,16 +178,18 @@
                        wc_count(wp, fp, cp);
                if (fp!=sfstdin)
                        sfclose(fp);
-               tchars += wp->chars;
-               twords += wp->words;
-               tlines += wp->lines;
+               tchars  += wp->chars;
+               tichars += wp->invalidchars;
+               twords  += wp->words;
+               tlines  += wp->lines;
                printout(wp,cp,mode);
        } while (cp= *argv++);
        if (n > 1)
        {
-               wp->lines = tlines;
-               wp->chars = tchars;
-               wp->words = twords;
+               wp->lines               = tlines;
+               wp->chars               = tchars;
+               wp->invalidchars        = tichars;
+               wp->words               = twords;
                printout(wp,"total",mode);
        }
        return error_info.errors<ERRORMAX?error_info.errors:ERRORMAX;
diff -r -u original/src/lib/libcmd/wc.h 
build_i386_64bit_debug/src/lib/libcmd/wc.h
--- src/lib/libcmd/wc.h 2009-08-14 06:51:45.000000000 +0200
+++ src/lib/libcmd/wc.h 2012-11-20 01:02:13.841400758 +0100
@@ -35,9 +35,10 @@
 #define WC_WORDS       0x02
 #define WC_CHARS       0x04
 #define WC_MBYTE       0x08
-#define WC_LONGEST     0x10
-#define WC_QUIET       0x20
-#define WC_NOUTF8      0x40
+#define WC_INVALIDMBYTE        0x10
+#define WC_LONGEST     0x20
+#define WC_QUIET       0x40
+#define WC_NOUTF8      0x80
 
 typedef struct
 {
@@ -45,6 +46,7 @@
        Sfoff_t words;
        Sfoff_t lines;
        Sfoff_t chars;
+       Sfoff_t invalidchars;
        Sfoff_t longest;
        int     mode;
        int     mb;
diff -r -u original/src/lib/libcmd/wclib.c 
build_i386_64bit_debug/src/lib/libcmd/wclib.c
--- src/lib/libcmd/wclib.c      2009-12-01 18:10:57.000000000 +0100
+++ src/lib/libcmd/wclib.c      2012-11-20 02:04:13.574080479 +0100
@@ -99,14 +99,13 @@
        return wp;
 }
 
-static int invalid(const char *file, int nlines)
+static void invalid(const char *file, Sfoff_t nlines)
 {
        error_info.file = (char*)file;
        error_info.line = nlines;
        error(ERROR_SYSTEM|1, "invalid multibyte character");
        error_info.file = 0;
        error_info.line = 0;
-       return nlines;
 }
 
 /*
@@ -165,6 +164,7 @@
        register unsigned char* cp;
        register Sfoff_t        nbytes;
        register Sfoff_t        nchars;
+       register Sfoff_t        ninvalidchars;
        register Sfoff_t        nwords;
        register Sfoff_t        nlines;
        register Sfoff_t        eline = -1;
@@ -180,7 +180,7 @@
        unsigned char           side[32];
 
        sfset(fd,SF_WRITE,1);
-       nlines = nwords = nchars = nbytes = 0;
+       nlines = nwords = nchars = ninvalidchars = nbytes = 0;
        wp->longest = 0;
        if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
        {
@@ -227,8 +227,14 @@
                                        cp++;
                                        x = -1;
                                }
-                               if (x == -1 && eline != nlines && !(wp->mode & 
WC_QUIET))
-                                       eline = invalid(file, nlines);
+                               if (x == -1 && eline != nlines)
+                               {
+                                       ninvalidchars++;
+                                       if(!(wp->mode & 
WC_INVALIDMBYTE|WC_QUIET))
+                                               eline = nlines;
+                                       if(!(wp->mode & WC_QUIET))
+                                               invalid(file, nlines);
+                               }
                        }
                        else
                                cp += n ? n : 1;
@@ -448,8 +454,14 @@
                                err:
                                                skip = 0;
                                                state = 0;
-                                               if(eline!=nlines && !(wp->mode 
& WC_QUIET))
-                                                       eline = invalid(file, 
nlines);
+                                               if(eline!=nlines)
+                                               {
+                                                       ninvalidchars++;
+                                                       if(!(wp->mode & 
WC_INVALIDMBYTE|WC_QUIET))
+                                                               eline = nlines;
+                                                       if(!(wp->mode & 
WC_QUIET))
+                                                               invalid(file, 
nlines);
+                                               }
                                                while(mbc(c) && ((c|WC_ERR) || 
(c&7)==0)) 
                                                        c=type[*cp++];
                                                if(eol(c) && (cp > endbuff))
@@ -497,9 +509,10 @@
                else
                        nchars = nbytes;
        }
-       wp->chars = nchars;
-       wp->words = nwords;
-       wp->lines = nlines;
+       wp->chars               = nchars;
+       wp->invalidchars        = ninvalidchars;
+       wp->words               = nwords;
+       wp->lines               = nlines;
        return 0;
 }
 
_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers

Reply via email to