A Unicode String ADT

Michael B. Allen Sun, 03 Jun 2001 00:39:37 -0700
Hi,

I have written a string ADT that normalizes ASCII, ISO-8859-1, UCS-2,
UCS-2LE, and UTF-8 to uint16_t UCS codes (UCS-2 or UCS-2LE, depending on
the byte order of the machine), mainly using iconv. This is supposed to
be an easy (and fast?) way to manipulate strings of different character
encodings just as one might use 'char *'.

I would appreciate any feedback on correctness, portability, style,
design, proper C usage, ... anything. The plan is to use this in a fairly
advanced project, so if it's a brick please do me a favor and hit me with
a rubber bat before I continue to flesh this out.

The 272 lines of C below the --8<-- should compile and run (see main)
as is with gcc -Wall -o str.o str.c.

Thanks,
Mike

--8<--

/* str.h - a unicode string adt
 */

#ifndef STR_H
#define STR_H

#include <stdlib.h>
#include <stdint.h>

/* Character encodings understood by the str_* functions.
 * The constants are consecutive small integers starting at zero, and
 * STR_NUM_ENC is the count of entries that precede it.
 */

enum encodings {
    STR_RESV = 0,   /* reserved; never a valid encoding */
    STR_STR,        /* the encoding str_t itself is stored in */
    STR_ASCII,      /* 7-bit ASCII */
    STR_8859_1,     /* ISO-8859-1 (Latin-1) */
    STR_UCS_2,      /* UCS-2 */
    STR_UCS_2LE,    /* UCS-2, little-endian */
    STR_UTF_8,      /* UTF-8 */
    STR_NUM_ENC     /* number of entries above; not an encoding */
};

/* The str_t type
 *
 * One 16-bit UCS code unit. A string is an array of str_t; the length
 * and size functions below scan for a 0x0000 element as terminator.
 */

typedef uint16_t str_t;

/* Convert n bytes of enc encoded data at src into a newly allocated
 * str_t array.
 *
 * src - input bytes in the encoding named by enc
 * n   - number of input bytes; include the source's terminator if the
 *       result is going to be passed to str_len() or str_size()
 * enc - one of the enum encodings values (not STR_RESV)
 *
 * Returns the new array, or NULL if enc is not usable, memory could
 * not be allocated, or the conversion failed. The caller releases the
 * result with free().
 */
str_t *str_new(void *src, size_t n, int enc);

/* Returns the number of bytes the terminated, enc encoded string at
 * src occupies in memory, terminator included. For an enc it does not
 * handle it sets errno to EINVAL and returns (size_t)-1.
 */
size_t str_enc_size(void *src, int enc);

/* Returns the number of bytes the str_t string s occupies in memory,
 * including its 0x0000 terminator.
 */
size_t str_size(const str_t *s);

/* Returns the length, in characters and without the terminator, of the
 * terminated enc encoded string at src. For an enc it does not handle
 * it sets errno to EINVAL and returns (unsigned int)-1.
 */
unsigned int str_enc_len(void *src, int enc);

/* Returns the number of str_t elements in s before the terminator.
 */
unsigned int str_len(const str_t *s);

/* Encode len elements of the str_t string src into dst as enc, writing
 * at most n bytes.
 *
 * len counts str_t elements; pass str_len(src) + 1 to have the
 * terminator encoded as well.
 *
 * Returns the number of bytes written to dst, or (size_t)-1 if enc is
 * not usable or the conversion failed.
 */
size_t str_enc(const str_t *src, unsigned int len, void *dst, size_t n, int enc);

#endif /* STR_H */

/* str.c - a unicode string adt
 */

#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <errno.h>
#include "str.h"

/* HOST_BO_UCS_2 is the iconv name of the 16-bit encoding whose byte
 * order matches this machine, i.e. the layout of a str_t in memory.
 *
 * BYTE_ORDER / LITTLE_ENDIAN / BIG_ENDIAN come from <endian.h> or
 * <sys/param.h> and are not guaranteed to be visible here. An
 * undefined identifier evaluates to 0 inside #if, so an unguarded
 * "BYTE_ORDER == LITTLE_ENDIAN" is true (0 == 0) whenever the macros
 * are missing and would pick little-endian on every host. Each test is
 * therefore guarded with defined(), with the compiler-provided
 * __BYTE_ORDER__ as a fallback.
 */
#if defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && \
    BYTE_ORDER == LITTLE_ENDIAN
#define HOST_BO_UCS_2 "UNICODELITTLE"
#elif defined(BYTE_ORDER) && defined(BIG_ENDIAN) && \
    BYTE_ORDER == BIG_ENDIAN
#define HOST_BO_UCS_2 "UCS-2"
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define HOST_BO_UCS_2 "UNICODELITTLE"
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && \
    __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define HOST_BO_UCS_2 "UCS-2"
#else
#error Cannot determine host byte order.
#endif

/* Table of iconv encoding names and output width multipliers, indexed
 * by the enum encodings value.
 *
 * e - encoding name handed to iconv_open()
 * w - bytes of UCS-2 output needed per byte of input; 0 means the
 *     ratio is not fixed and the caller has to work the size out
 *     itself (UTF-8)
 *
 * The table is read-only and private to this file, hence static const;
 * the names are string literals, hence const char *.
 */

static const struct {
    const char *e;
    int w;
} enc_tbl[STR_NUM_ENC] = {
    {NULL, 0},           /* 0 is reserved */
    {HOST_BO_UCS_2, 1},  /* encoding used by str_t itself */
    {"ASCII", 2},
    {"ISO-8859-1", 2},
    {"UCS-2", 1},
    {"UNICODELITTLE", 1},
    {"UTF-8", 0}
};

str_t *
str_new(void *src, size_t n, int enc)
{
    /* Convert the character data src, which occupies n bytes in
     * memory and is encoded in enc, and return a newly allocated
     * str_t array encoded in UCS-2 or UCS-2LE depending on the byte
     * order of the host.
     *
     * A 0x0000 element is always appended after the converted data,
     * so the result is safe to hand to str_len()/str_size() even when
     * the n source bytes did not cover a terminator.
     *
     * Returns NULL on failure: errno is EINVAL for an enc outside the
     * table or a NULL src with n != 0, ENOMEM if the output size would
     * overflow, and otherwise whatever iconv_open(), malloc() or
     * iconv() reported. The caller releases the result with free().
     */

    iconv_t cd;
    unsigned char *dst;
    char *in, *out;
    size_t in_left, out_left, cap, i;

    /* enc indexes enc_tbl, so STR_NUM_ENC itself is already out of
     * range; entry 0 is the reserved slot.
     */
    if (enc <= 0 || enc >= STR_NUM_ENC || (src == NULL && n != 0)) {
        errno = EINVAL;
        return NULL;
    }
    if ((cd = iconv_open(HOST_BO_UCS_2, enc_tbl[enc].e)) == (iconv_t)-1) {
        return NULL;
    }

    /* Work out how many output bytes the conversion can produce. */
    if (enc == STR_UTF_8) {
        /* Every byte that is not a continuation byte (0x80..0xBF)
         * starts a character, and each character becomes one 2-byte
         * unit.
         */
        const unsigned char *b = src;

        cap = 0;
        for (i = 0; i < n; i++) {
            if (b[i] < 0x80 || b[i] > 0xBF) {
                cap += 2;
            }
        }
    } else {
        size_t w = (size_t)enc_tbl[enc].w;

        if (w != 0 && n > (SIZE_MAX - sizeof(str_t)) / w) {
            iconv_close(cd);
            errno = ENOMEM;
            return NULL;
        }
        cap = n * w;
    }

    /* One extra element for the terminator; this also keeps the
     * request non-zero, so an empty input cannot fail via malloc(0).
     */
    if ((dst = malloc(cap + sizeof(str_t))) == NULL) {
        iconv_close(cd);
        return NULL;
    }

    in = src;
    in_left = n;
    out = (char *)dst;
    out_left = cap;

    /* iconv() takes char ** for both buffers. Skip the call for empty
     * input, where there is nothing to convert.
     */
    if (n != 0 && iconv(cd, &in, &in_left, &out, &out_left) == (size_t)-1) {
        int saved = errno;  /* iconv_close()/free() may overwrite it */

        free(dst);
        iconv_close(cd);
        errno = saved;
        return NULL;
    }

    /* out now points just past the converted data */
    memset(out, 0, sizeof(str_t));

    iconv_close(cd);
    return (str_t *)dst;
}

size_t
str_enc_size(void *src, int enc)
{
    str_t *s;

    switch(enc) {
        case STR_ASCII:
        case STR_8859_1:
        case STR_UTF_8:
            return strlen((const char *)src) + 1;
        case STR_STR:
        case STR_UCS_2:
        case STR_UCS_2LE:
            for (s = src; *s != 0x0000; s++) {
                ;
            }
            return 2 * (s - (str_t *)src) + 2;
    }
    errno = EINVAL;
    return -1;
}
unsigned int
str_enc_len(void *src, int enc)
{
    str_t *s;
    const unsigned char *b;
    size_t r;

    switch(enc) {
        case STR_ASCII:
        case STR_8859_1:
            return strlen((const char *)src);
        case STR_UTF_8:
            for (r = 0, b = src; *b != '\0'; b++) {
                if (*b < 0x80 || *b > 0xBF) {
                    r++;
                }
            }
            return r;
        case STR_STR:
        case STR_UCS_2:
        case STR_UCS_2LE:
            for (s = src; *s != 0x0000; s++) {
                ;
            }
            return s - (str_t *)src;
    }
    errno = EINVAL;
    return -1;
}
size_t
str_size(const str_t *s)
{
    /* Bytes occupied by s: two per element up to the 0x0000
     * terminator, plus two for the terminator itself.
     */

    size_t count = 0;

    while (s[count] != 0x0000) {
        count++;
    }
    return 2 * count + 2;
}
unsigned int
str_len(const str_t *s)
{
    /* Number of elements in s that precede the 0x0000 terminator. */

    size_t count = 0;

    while (s[count] != 0x0000) {
        count++;
    }
    return count;
}

size_t
str_enc(const str_t *src, unsigned int len, void *dst, size_t n, int enc)
{
    /* Encode len elements of the host-order str_t string src into dst
     * using the encoding enc, writing at most n bytes.
     *
     * Returns the number of bytes stored in dst. On failure returns
     * (size_t)-1: errno is EINVAL for an enc outside the table, and
     * otherwise whatever iconv_open() or iconv() reported.
     */

    iconv_t cd;
    char *in, *out;
    size_t in_left, out_left, written;

    /* enc indexes enc_tbl, so STR_NUM_ENC itself is already out of
     * range; entry 0 is the reserved slot.
     */
    if (enc <= 0 || enc >= STR_NUM_ENC) {
        errno = EINVAL;
        return (size_t)-1;
    }
    if ((cd = iconv_open(enc_tbl[enc].e, HOST_BO_UCS_2)) == (iconv_t)-1) {
        return (size_t)-1;
    }

    /* iconv() is declared with char ** for its input although it only
     * reads through it, so the const has to be cast away here.
     */
    in = (char *)src;
    /* widen before multiplying so a large len cannot wrap in
     * unsigned int arithmetic
     */
    in_left = (size_t)len * sizeof *src;
    out = dst;
    out_left = n;

    if (iconv(cd, &in, &in_left, &out, &out_left) == (size_t)-1) {
        int saved = errno;  /* iconv_close() may overwrite it */

        iconv_close(cd);
        errno = saved;
        return (size_t)-1;
    }

    written = (size_t)(out - (char *)dst);
    iconv_close(cd);
    return written;
}

#include <stdlib.h>
#include <stdio.h>
#include <locale.h>
#include "str.h"

/* Test input: "Schöne Grüße" as ISO-8859-1 bytes. The three non-ASCII
 * characters are written as octal escapes (o-umlaut 0xF6, u-umlaut
 * 0xFC, sharp s 0xDF) so the bytes stay the same whatever encoding
 * this source file is saved or transmitted in.
 */
char in[] = "Sch\366ne Gr\374\337e";

int
main(int argc, char *argv[])
{
    size_t l;
    str_t *s;
    char buf[1024];

    (void)argc;
    (void)argv;

    if (!setlocale(LC_CTYPE, "")) {
        fprintf(stderr, "Failed to set locale\n");
        return EXIT_FAILURE;
    }

    /* Convert the ISO-8859-1 string 'Schöne Grüße' to UTF-8 using
     * this str_t along the way, then back to ISO-8859-1 and print
     * what should be the original input.
     *
     * Every step is checked: str_new() returns NULL and str_enc()
     * returns (size_t)-1 on failure, and neither value may be passed
     * on to the next call.
     */

    l = str_enc_size(in, STR_8859_1);
    s = str_new(in, l, STR_8859_1);
    if (s == NULL) {
        fprintf(stderr, "Failed to convert input from ISO-8859-1\n");
        return EXIT_FAILURE;
    }

    /* str_len(s) + 1 so the terminator is encoded too */
    l = str_enc(s, str_len(s) + 1, buf, sizeof buf, STR_UTF_8);
    free(s);
    if (l == (size_t)-1) {
        fprintf(stderr, "Failed to encode as UTF-8\n");
        return EXIT_FAILURE;
    }

    s = str_new(buf, l, STR_UTF_8);
    if (s == NULL) {
        fprintf(stderr, "Failed to convert from UTF-8\n");
        return EXIT_FAILURE;
    }

    l = str_enc(s, str_len(s) + 1, buf, sizeof buf, STR_8859_1);
    if (l == (size_t)-1) {
        fprintf(stderr, "Failed to encode as ISO-8859-1\n");
        free(s);
        return EXIT_FAILURE;
    }

    printf("out=%s\ns[3]=%c\n", buf, s[3]);
    free(s);

    return EXIT_SUCCESS;
}
-
Linux-UTF8:   i18n of Linux on all levels
Archive:      http://mail.nl.linux.org/linux-utf8/
A Unicode String ADT

Reply via email to