On Fri, 31 Dec 2004 11:50:27 -0500 (EST)
Henry Spencer <[EMAIL PROTECTED]> wrote:

> On Fri, 31 Dec 2004, Michael B Allen wrote:
> > I'm looking for a C function to convert the case of a UTF-8 string.
> 
> Bear in mind that doing this right is not a simple exercise, and the
> mbtowc/towupper approach isn't really sufficient -- for example, a case
> change can alter the length of the string.

Dear god, please tell me you're mistaken. Could you provide an example?

>  It would help if you could
> supply more context:  why do you want to do this, as part of what? 

I just want to upcase or downcase a string. The following seems to work ok.

Mike

/**
 * $ gcc -Wall -W -o toupper toupper.c
 * $ LANG=en_US.UTF-8 ./toupper utf8.txt
 */

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>

/* Size in bytes of the buffer main() uses to read the input file in chunks. */
#define BSIZE 0xFFFF

/**
 * Upper-case a multibyte string in place.
 *
 * Characters are decoded with mbrtowc() and mapped with towupper(), so the
 * encoding and the case tables are those of the current LC_CTYPE locale
 * (the caller is expected to have selected a UTF-8 locale).
 *
 * Because the conversion is done in place, a character is only replaced
 * when its upper-case form encodes to exactly the same number of bytes;
 * otherwise the original bytes are left untouched so neighbouring
 * characters are never overwritten and nothing is written past slim.
 *
 * @param str   first byte of the text to convert
 * @param slim  one past the last byte that may be read or written
 * @return number of bytes processed from str, or -1 if an invalid
 *         multibyte sequence is found or a character cannot be encoded.
 *         Processing stops early (and the count returned is smaller than
 *         slim - str) at a NUL byte or at a multibyte sequence that is
 *         cut off by slim; the unprocessed tail is left unmodified.
 */
int
utf8toupper(unsigned char *str, unsigned char *slim)
{
    unsigned char *start = str;
    mbstate_t psw;

    memset(&psw, 0, sizeof(psw));

    while (str < slim && *str) {
        char tmp[MB_LEN_MAX];
        wchar_t wc, wcu;
        size_t n;

        n = mbrtowc(&wc, (const char *)str, (size_t)(slim - str), &psw);
        if (n == (size_t)-1) {
            return -1;
        }
        if (n == (size_t)-2 || n == 0) {
            /* Sequence truncated by slim (or a decoded NUL): stop here and
             * let the caller decide what to do with the remaining bytes. */
            break;
        }

        wcu = (wchar_t)towupper((wint_t)wc);
        if (wcu != wc) {
            mbstate_t psm;
            size_t m;

            /* Encode into scratch space first so the length can be checked
             * before anything in the caller's buffer is touched. */
            memset(&psm, 0, sizeof(psm));
            m = wcrtomb(tmp, wcu, &psm);
            if (m == (size_t)-1) {
                return -1;
            }
            if (m == n) {
                memcpy(str, tmp, n);
            }
        }
        str += n;
    }

    return (int)(str - start);
}

/**
 * Read the file named by argv[1] in BSIZE-byte chunks, upper-case each
 * chunk with utf8toupper() and write the result to stdout.
 *
 * Bytes at the end of a chunk that utf8toupper() did not process (for
 * example a multibyte character split by the chunk boundary) are moved to
 * the front of the buffer and retried together with the next read.
 *
 * @return EXIT_SUCCESS when the whole file was converted and written,
 *         EXIT_FAILURE on usage, locale, I/O or conversion errors, or when
 *         unprocessed bytes remain at end of input.
 */
int
main(int argc, char *argv[])
{
    FILE *fp;
    size_t got;
    size_t pending = 0; /* unprocessed bytes carried over from the last chunk */
    unsigned char buf[BSIZE];
    int status = EXIT_FAILURE;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <utf8file>\n", argv[0]);
        return EXIT_FAILURE;
    }

    /* Select the locale before opening the file so that a failure here
     * cannot leak the stream. */
    if (!setlocale(LC_CTYPE, "")) {
        fprintf(stderr, "%s: cannot set locale\n", argv[0]);
        return EXIT_FAILURE;
    }

    if ((fp = fopen(argv[1], "r")) == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }

    while ((got = fread(buf + pending, 1, BSIZE - pending, fp)) > 0) {
        size_t total = pending + got;
        int done = utf8toupper(buf, buf + total);

        if (done < 0) {
            goto out;
        }
        if (done > 0 && fwrite(buf, 1, (size_t)done, stdout) != (size_t)done) {
            goto out;
        }

        pending = total - (size_t)done;
        if (pending >= MB_CUR_MAX) {
            /* More left over than one split character can account for
             * (e.g. an embedded NUL byte): give up rather than loop. */
            goto out;
        }
        memmove(buf, buf + done, pending);
    }

    /* A read error, or a dangling partial character at end of input. */
    if (ferror(fp) || pending != 0) {
        goto out;
    }
    if (fflush(stdout) == EOF) {
        goto out;
    }
    status = EXIT_SUCCESS;

out:
    fclose(fp);
    return status;
}

-- 
Greedo shoots first? Not in my Star Wars.

--
Linux-UTF8:   i18n of Linux on all levels
Archive:      http://mail.nl.linux.org/linux-utf8/

Reply via email to