Some time ago I wrote a little C program that generates such a list for all of 
Unicode, not just the UCS-2 subset.

It uses macros from a few ICU header files, but does not need the compiled ICU 
library. On Unix systems, you may need to run ICU's runConfigure script first, but on 
Windows it will work out of the box.

For supplementary code points, the non-UTF-8 side is written either as 6-digit code 
points (the default) or as two 4-digit surrogates (if you pass any command-line argument).

For ICU see http://oss.software.ibm.com/icu/

markus


----------------------- utf8map.c ------------
#include <stdio.h>
#include "unicode/utypes.h"

/*
 * Print a table that maps every code point from 0 through 0x10ffff to the
 * bytes of its UTF-8 encoding, one code point per line, in hexadecimal.
 *
 * Left column:
 *   - 0..0xffff: the code point as 4 hex digits.
 *   - 0x10000..0x10ffff with no command-line argument: the code point as
 *     5 or 6 hex digits (padding is adjusted so the UTF-8 column stays aligned).
 *   - 0x10000..0x10ffff with any command-line argument: the two UTF-16
 *     code units written back to back.
 * Right column: the 1 to 4 UTF-8 bytes, two hex digits each.
 *
 * Every value in 0..0xffff is listed, including the surrogate range
 * 0xd800..0xdfff; nothing is skipped.
 *
 * argc selects the output mode (argc == 1 means "no argument");
 * the contents of argv are never examined.
 * Always returns 0.
 */
extern int
main(int argc, const char *argv[]) {
    uint16_t utf16[2];
    uint8_t utf8[4];
    uint32_t c, i, j;

    (void)argv; /* only the presence of an argument matters */

    /*
     * c is a uint32_t, but the conversions below are %lx and %x, so each
     * argument is cast to exactly the type its conversion expects. Passing
     * a 32-bit value for %lx is undefined where long is 64 bits wide.
     */

    /* one-byte range: the UTF-8 byte equals the code point */
    for(c=0; c<=0x7f; ++c) {
        printf("%04lx     %02x\n", (unsigned long)c, (unsigned int)c);
    }
    /* two-byte range */
    for(; c<=0x7ff; ++c) {
        j=0; /* the ICU append macro writes at utf8[j] and advances j */
        UTF8_APPEND_CHAR_UNSAFE(utf8, j, c);
        printf("%04lx     %02x%02x\n", (unsigned long)c, utf8[0], utf8[1]);
    }
    /* three-byte range */
    for(; c<=0xffff; ++c) {
        j=0;
        UTF8_APPEND_CHAR_UNSAFE(utf8, j, c);
        printf("%04lx     %02x%02x%02x\n", (unsigned long)c, utf8[0], utf8[1], utf8[2]);
    }

    if(argc==1) {
        /* no argument: write code points */
        /* 5 digits for the code points, one extra space keeps the columns aligned */
        for(; c<=0xfffff; ++c) {
            j=0;
            UTF8_APPEND_CHAR_UNSAFE(utf8, j, c);
            printf("%lx    %02x%02x%02x%02x\n", (unsigned long)c, utf8[0], utf8[1], utf8[2], utf8[3]);
        }
        /* 6 digits for the code points, align nicely */
        for(; c<=0x10ffff; ++c) {
            j=0;
            UTF8_APPEND_CHAR_UNSAFE(utf8, j, c);
            printf("%lx   %02x%02x%02x%02x\n", (unsigned long)c, utf8[0], utf8[1], utf8[2], utf8[3]);
        }
    } else {
        /* there is an argument: write surrogate pairs */
        for(; c<=0x10ffff; ++c) {
            i=j=0;
            UTF16_APPEND_CHAR_UNSAFE(utf16, i, c);
            UTF8_APPEND_CHAR_UNSAFE(utf8, j, c);
            printf("%x%x %02x%02x%02x%02x\n",
                   (unsigned int)utf16[0], (unsigned int)utf16[1],
                   utf8[0], utf8[1], utf8[2], utf8[3]);
        }
    }

    return 0;
}

  • ... Damien Donlon - Sun Microsystems Ireland - Solaris Software - Software Engineer
    • ... David Starner
    • ... Markus Scherer
    • ... Hietaniemi Jarkko (NRC/Boston)

Reply via email to