I'm making a case-insensitive, app-specific VFS for an MS-compatible
application, and I need UTF-8 samples that I can extract words from to
make tests with. Can someone think of a source of real UTF-8 input words
like this:

  http://www.columbia.edu/kermit/utf8.html

but with a *lot* more words and less punctuation?

Thanks,
Mike

PS: Does this look like a sane UTF-8 caseless string comparison (haven't
tried to compile it yet):

/*
 * utf8casecmp - case-insensitive comparison of two length-bounded
 * multibyte strings.
 *
 * str1, sn1: first string and the maximum number of bytes to read from it.
 * str2, sn2: second string and the maximum number of bytes to read from it.
 *
 * Comparison stops at the first difference, at a '\0' byte present in both
 * strings, or when either byte budget is exhausted.  A string that runs out
 * while the other still has non-'\0' data sorts first.
 *
 * Returns 0 if the strings are equal ignoring case, -1 if str1 sorts before
 * str2, and 1 if it sorts after.  If mbrtowc() rejects a sequence as invalid
 * or truncated, "mbrtowc" is reported via perror() and -1 is returned (this
 * is indistinguishable from "less than", as in the original interface).
 *
 * NOTE: mbrtowc()/towupper() follow the current LC_CTYPE locale, so a UTF-8
 * locale must be in effect for UTF-8 input to be decoded as such.
 */
int
utf8casecmp(const char *str1, size_t sn1, const char *str2, size_t sn2)
{
        size_t n1, n2;
        wchar_t ucs1, ucs2;
        mbstate_t ps1, ps2;
        unsigned char uc1, uc2;

        memset(&ps1, 0, sizeof(ps1));
        memset(&ps2, 0, sizeof(ps2));
        while (sn1 > 0 && sn2 > 0) {
                if ((*str1 & 0x80) && (*str2 & 0x80)) {   /* both multibyte */
                        /*
                         * mbrtowc() returns size_t; failures are the special
                         * values (size_t)-1 (invalid) and (size_t)-2
                         * (incomplete), so "< 0" can never detect them.
                         */
                        n1 = mbrtowc(&ucs1, str1, sn1, &ps1);
                        if (n1 == (size_t)-1 || n1 == (size_t)-2) {
                                perror("mbrtowc");
                                return -1;
                        }
                        n2 = mbrtowc(&ucs2, str2, sn2, &ps2);
                        if (n2 == (size_t)-1 || n2 == (size_t)-2) {
                                perror("mbrtowc");
                                return -1;
                        }
                        /* A 0 return means L'\0' was decoded; still consume a byte. */
                        if (n1 == 0) {
                                n1 = 1;
                        }
                        if (n2 == 0) {
                                n2 = 1;
                        }
                        if (ucs1 != ucs2) {
                                ucs1 = towupper(ucs1);
                                ucs2 = towupper(ucs2);
                                if (ucs1 != ucs2) {
                                        return ucs1 < ucs2 ? -1 : 1;
                                }
                        }
                        sn1 -= n1; str1 += n1;
                        sn2 -= n2; str2 += n2;
                } else {                          /* neither or one multibyte */
                        /*
                         * Read the bytes as unsigned char: passing a negative
                         * plain char to toupper() is undefined.  Only ASCII
                         * bytes are case-folded; a lead/continuation byte is
                         * compared as-is so it always sorts after ASCII.
                         */
                        uc1 = (unsigned char)*str1;
                        uc2 = (unsigned char)*str2;
                        if (uc1 < 0x80) {
                                uc1 = (unsigned char)toupper(uc1);
                        }
                        if (uc2 < 0x80) {
                                uc2 = (unsigned char)toupper(uc2);
                        }
                        if (uc1 != uc2) {
                                return uc1 < uc2 ? -1 : 1;
                        } else if (uc1 == '\0') {
                                return 0;
                        }
                        sn1--; str1++;
                        sn2--; str2++;
                }
        }

        /*
         * At least one budget is exhausted.  If the other string still has
         * real (non-'\0') data it is the longer one, so the strings differ.
         */
        if (sn1 > 0 && *str1 != '\0') {
                return 1;
        }
        if (sn2 > 0 && *str2 != '\0') {
                return -1;
        }
        return 0;
}


-- 
A  program should be written to model the concepts of the task it
performs rather than the physical world or a process because this
maximizes  the  potential  for it to be applied to tasks that are
conceptually  similar and, more important, to tasks that have not
yet been conceived. 
--
Linux-UTF8:   i18n of Linux on all levels
Archive:      http://mail.nl.linux.org/linux-utf8/

Reply via email to