On Sun, Jul 15, 2012 at 4:07 AM, Roland Mainz <[email protected]> wrote:
> On Fri, Jul 13, 2012 at 3:58 PM, David Korn <[email protected]> wrote:
>> cc: [email protected]
>> Subject: Re: [ast-developers] RFE: New "wc" option "-X" which counts number
>> of bytes which do not constitute valid multibyte characters...
>>
>> Would -X automatically enable -c?
>
> Erm... -c is for plain bytes while -X should just count the number of
> bytes not covered by -m/-C.
>
> AFAIK all "wc" counting options count independently (e.g. -c/-m/-w/-l
> can all be used in one command line (at least with GNU "wc"... AST
> "wc" doesn't like having both -c and -m on the same command line)) ...
> -X would be an exception because it basically "feeds on the remainder"
> of -m/-C ...
>
>> Would the output contain the other count or just the invalid character count?
>
> It is the count of _bytes_ which do not make a valid multibyte
> character (technically it can happen in the "C"/"POSIX" locales,
> too... since both only cover bytes 0-127... making 128-255 invalid
> character values).
Attached (as "wc_count_invalidchars001.diff.txt") is a prototype patch
which implements wc -X to count invalid (multibyte) characters.
A possible testcase would look like this (erm... is the "7" correct ?):
-- snip --
$ LC_ALL=en_US.UTF-8 ~/bin/ksh -c 'builtin wc ; printf
"a\xe1kkkk\xe2xLl\n" | wc -m -X -q'
7 2
-- snip --
Notes:
- -X currently only works with -m/-C, i.e. when characters (not bytes)
are being counted. -X could work with -c (=print byte count) if
-m/-C can be enabled internally, too. This may be useful even in
single-byte locales since functions like |mbtowc()| should AFAIK
complain in cases when a byte does not represent a valid character
value (I'll test this later today).
- It would be nice (assuming the POSIX/SUS standards allow it) if
both -c and -m/-C could be enabled at the same time. Is there anything
on the standard's side which disallows this ?
----
Bye,
Roland
--
__ . . __
(o.\ \/ /.o) [email protected]
\__\/\/__/ MPEG specialist, C&&JAVA&&Sun&&Unix programmer
/O /==\ O\ TEL +49 641 3992797
(;O/ \/ \O;)
diff -r -u original/src/lib/libcmd/wc.c
build_i386_64bit_debug/src/lib/libcmd/wc.c
--- src/lib/libcmd/wc.c 2012-01-10 19:57:19.000000000 +0100
+++ src/lib/libcmd/wc.c 2012-11-21 14:36:32.648577898 +0100
@@ -27,7 +27,7 @@
*/
static const char usage[] =
-"[-?\n@(#)$Id: wc (AT&T Research) 2009-11-28 $\n]"
+"[-?\n@(#)$Id: wc (AT&T Research) 2012-11-19 $\n]"
USAGE_LICENSE
"[+NAME?wc - print the number of bytes, words, and lines in files]"
"[+DESCRIPTION?\bwc\b reads one or more input files and, by default, "
@@ -49,6 +49,7 @@
"[w:words?List the word counts.]"
"[c:bytes|chars:chars?List the byte counts.]"
"[m|C:multibyte-chars?List the character counts.]"
+"[X:invalid-chars?List the counts of bytes not constituting a valid
character.]"
"[q:quiet?Suppress invalid multibyte character warnings.]"
"[L:longest-line|max-line-length?List the longest line length; the newline,"
"if any, is not counted in the length.]"
@@ -79,6 +80,8 @@
sfprintf(sfstdout," %7I*d",sizeof(wp->words),wp->words);
if (mode&WC_CHARS)
sfprintf(sfstdout," %7I*d",sizeof(wp->chars),wp->chars);
+ if (mode&WC_INVALIDMBYTE)
+ sfprintf(sfstdout,"
%7I*d",sizeof(wp->invalidchars),wp->invalidchars);
if (mode&WC_LONGEST)
sfprintf(sfstdout," %7I*d",sizeof(wp->chars),wp->longest);
if (name)
@@ -93,7 +96,7 @@
register int mode=0, n;
register Wc_t *wp;
Sfio_t *fp;
- Sfoff_t tlines=0, twords=0, tchars=0;
+ Sfoff_t tlines=0, twords=0, tchars=0, tichars=0;
struct stat statb;
cmdinit(argc, argv, context, ERROR_CATALOG, 0);
@@ -118,6 +121,9 @@
case 'C':
mode |= WC_MBYTE;
continue;
+ case 'X':
+ mode |= WC_INVALIDMBYTE;
+ continue;
case 'q':
mode |= WC_QUIET;
continue;
@@ -144,7 +150,7 @@
mode &= ~WC_MBYTE;
mode |= WC_CHARS;
}
- if (!(mode&(WC_WORDS|WC_CHARS|WC_LINES|WC_MBYTE|WC_LONGEST)))
+ if
(!(mode&(WC_WORDS|WC_CHARS|WC_LINES|WC_MBYTE|WC_INVALIDMBYTE|WC_LONGEST)))
mode |= (WC_WORDS|WC_CHARS|WC_LINES);
if (!(wp = wc_init(mode)))
error(3,"internal error");
@@ -162,7 +168,7 @@
}
if (cp)
n++;
- if (!(mode&(WC_WORDS|WC_LINES|WC_MBYTE|WC_LONGEST)) &&
fstat(sffileno(fp),&statb)>=0
+ if
(!(mode&(WC_WORDS|WC_LINES|WC_MBYTE|WC_INVALIDMBYTE|WC_LONGEST)) &&
fstat(sffileno(fp),&statb)>=0
&& S_ISREG(statb.st_mode))
{
wp->chars = statb.st_size - lseek(sffileno(fp),0L,1);
@@ -172,16 +178,18 @@
wc_count(wp, fp, cp);
if (fp!=sfstdin)
sfclose(fp);
- tchars += wp->chars;
- twords += wp->words;
- tlines += wp->lines;
+ tchars += wp->chars;
+ tichars += wp->invalidchars;
+ twords += wp->words;
+ tlines += wp->lines;
printout(wp,cp,mode);
} while (cp= *argv++);
if (n > 1)
{
- wp->lines = tlines;
- wp->chars = tchars;
- wp->words = twords;
+ wp->lines = tlines;
+ wp->chars = tchars;
+ wp->invalidchars = tichars;
+ wp->words = twords;
printout(wp,"total",mode);
}
return error_info.errors<ERRORMAX?error_info.errors:ERRORMAX;
diff -r -u original/src/lib/libcmd/wc.h
build_i386_64bit_debug/src/lib/libcmd/wc.h
--- src/lib/libcmd/wc.h 2009-08-14 06:51:45.000000000 +0200
+++ src/lib/libcmd/wc.h 2012-11-20 01:02:13.841400758 +0100
@@ -35,9 +35,10 @@
#define WC_WORDS 0x02
#define WC_CHARS 0x04
#define WC_MBYTE 0x08
-#define WC_LONGEST 0x10
-#define WC_QUIET 0x20
-#define WC_NOUTF8 0x40
+#define WC_INVALIDMBYTE 0x10
+#define WC_LONGEST 0x20
+#define WC_QUIET 0x40
+#define WC_NOUTF8 0x80
typedef struct
{
@@ -45,6 +46,7 @@
Sfoff_t words;
Sfoff_t lines;
Sfoff_t chars;
+ Sfoff_t invalidchars;
Sfoff_t longest;
int mode;
int mb;
diff -r -u original/src/lib/libcmd/wclib.c
build_i386_64bit_debug/src/lib/libcmd/wclib.c
--- src/lib/libcmd/wclib.c 2009-12-01 18:10:57.000000000 +0100
+++ src/lib/libcmd/wclib.c 2012-11-20 02:04:13.574080479 +0100
@@ -99,14 +99,13 @@
return wp;
}
-static int invalid(const char *file, int nlines)
+static void invalid(const char *file, Sfoff_t nlines)
{
error_info.file = (char*)file;
error_info.line = nlines;
error(ERROR_SYSTEM|1, "invalid multibyte character");
error_info.file = 0;
error_info.line = 0;
- return nlines;
}
/*
@@ -165,6 +164,7 @@
register unsigned char* cp;
register Sfoff_t nbytes;
register Sfoff_t nchars;
+ register Sfoff_t ninvalidchars;
register Sfoff_t nwords;
register Sfoff_t nlines;
register Sfoff_t eline = -1;
@@ -180,7 +180,7 @@
unsigned char side[32];
sfset(fd,SF_WRITE,1);
- nlines = nwords = nchars = nbytes = 0;
+ nlines = nwords = nchars = ninvalidchars = nbytes = 0;
wp->longest = 0;
if (wp->mb < 0 && (wp->mode & (WC_MBYTE|WC_WORDS)))
{
@@ -227,8 +227,14 @@
cp++;
x = -1;
}
- if (x == -1 && eline != nlines && !(wp->mode &
WC_QUIET))
- eline = invalid(file, nlines);
+ if (x == -1 && eline != nlines)
+ {
+ ninvalidchars++;
+ if(!(wp->mode &
WC_INVALIDMBYTE|WC_QUIET))
+ eline = nlines;
+ if(!(wp->mode & WC_QUIET))
+ invalid(file, nlines);
+ }
}
else
cp += n ? n : 1;
@@ -448,8 +454,14 @@
err:
skip = 0;
state = 0;
- if(eline!=nlines && !(wp->mode
& WC_QUIET))
- eline = invalid(file,
nlines);
+ if(eline!=nlines)
+ {
+ ninvalidchars++;
+ if(!(wp->mode &
WC_INVALIDMBYTE|WC_QUIET))
+ eline = nlines;
+ if(!(wp->mode &
WC_QUIET))
+ invalid(file,
nlines);
+ }
while(mbc(c) && ((c|WC_ERR) ||
(c&7)==0))
c=type[*cp++];
if(eol(c) && (cp > endbuff))
@@ -497,9 +509,10 @@
else
nchars = nbytes;
}
- wp->chars = nchars;
- wp->words = nwords;
- wp->lines = nlines;
+ wp->chars = nchars;
+ wp->invalidchars = ninvalidchars;
+ wp->words = nwords;
+ wp->lines = nlines;
return 0;
}
_______________________________________________
ast-developers mailing list
[email protected]
http://lists.research.att.com/mailman/listinfo/ast-developers