i just had a similar problem a day or two ago.

i needed to change some capitalization and the 
tr 'A-Z' 'a-z' idiom doesn't work on random utf.

i solved it a bit differently -- lifting the fullrune()
check into the main loop. so i don't have a readu() 
function. also (unlike tcs) at the cost of 1 extra check 
at the end-of-input, the output buffer is dumped only 
when full. on japanese, greek or other text with 
>1 byte/char, this will save calls to OUT() --
or in my case print().

okay, total overkill. i know. but it was more interesting
to do that way. 

here's upper.c. convert to upper/lower/title case:



#include <u.h>
#include <libc.h>

/*
 * BLOCK is the maximum number of bytes requested from each read() in
 * casify(), and also the rune count past which the rune buffer is flushed.
 */
enum { BLOCK = 1024*4 };

/* A rune-to-rune mapping; main() selects toupperrune, tolowerrune or totitlerune. */
typedef Rune (*Rconv)(Rune);

/*
 * Apply the conversion R to each of the first nrunes runes of r, in place,
 * then print those nrunes runes with a single print() call.
 */
void output(Rune* r, int nrunes, Rconv R){
        Rune *p;
        Rune *end;

        end = r + nrunes;
        for(p = r; p < end; p++)
                *p = R(*p);

        print("%.*S", nrunes, r);
}

/*
 * Read UTF-8 text from fd, convert every rune with R and print the result.
 *
 * Input is read in chunks of at most BLOCK bytes.  If a chunk ends in the
 * middle of a multi-byte sequence, the incomplete tail (always fewer than
 * UTFmax bytes) is moved to the front of the buffer and completed by the
 * next read.  Decoded runes accumulate in r and are handed to output()
 * once more than BLOCK of them are pending, and once more at end of input.
 *
 * Returns 0 on clean end of file, or the string "read" if read() failed.
 * An incomplete sequence left over at end of input is reported on fd 2.
 */
const char* casify(int fd, Rconv R){
        char in[BLOCK + UTFmax];
        Rune r[BLOCK + UTFmax];
        long rem_len;
        long blen;
        long j;
        long i;

        rem_len = 0;
        j = 0;
        while (0 < (blen = read(fd, in + rem_len, BLOCK))){
                blen += rem_len;
                /* the carried-over bytes are now part of this chunk */
                rem_len = 0;

                for(i=0; i<blen; ){
                        if (!fullrune(in + i, blen - i)){
                                /*
                                 * Partial rune at the end of the chunk: keep it
                                 * for the next read.  Source and destination may
                                 * overlap, so memmove rather than memcpy.
                                 */
                                rem_len = blen - i;
                                memmove(in, in + i, rem_len);
                                break;
                        }
                        i += chartorune(r + j++, in + i);
                        if (j > BLOCK){
                                output(r, j, R);
                                j = 0;
                        }
                }
        }

        if (rem_len){
                // non unicode garbage.
                fprint(2, "non-utf8 garbage %.*s at eof\n", (int)rem_len, in);
        }

        if (j){
                output(r, j, R);
        }

        /* the loop ends with blen == 0 at end of file, blen < 0 on error */
        if (blen < 0){
                return "read";
        }
        return 0;
}

void main(int argc, /* pfft const */ char** argv){
        Rconv R;
        const char* v;
        const char* status;
        const char* s;
        int fd;

        v = strrchr(argv[0], '/');
        if (v){
                v++;
        } else {
                v = argv[0];
        }
        
        if (0 == strcmp(v, "tolower")){
                R = tolowerrune;
        } else if (0 == strcmp(v, "totitle")){
                R = totitlerune;
        } else {
                R = toupperrune;
        }

        ARGBEGIN{
        case 'u':
                R = toupperrune;
                break;
        case 'l':
                R = tolowerrune;
                break;
        case 't':
                R = totitlerune;
                break;
        default:
                fprint(2, "%s: bad option %c\n", argv0, ARGC());
                fprint(2, "usage: %s -[ult]\n", argv0);
                exits("usage");
        } ARGEND

        if (!*argv){
                s = casify(0, R);
        } else {
                for(status = 0; *argv; argv++){
                        fd = open(*argv, OREAD);
                        if (-1 == fd){
                                if (s && !status){
                                        status = "open";
                                }
                                continue;
                        }
                        s = casify(fd, R);
                        if (s && !status){
                                status = s;
                        }
                        close(fd);
                }
        }

        exits(status ? status : "");
}



Reply via email to