Re: ANSI to UTF8

teo Thu, 03 Feb 2011 06:25:55 -0800

On Mon, 31 Jan 2011 17:08:33 +0000, Janusch wrote:

> Hello!
> 
> I'm trying to convert ANSI characters to UTF8, but it doesn't work
> correctly.
> 
> I used the following:
> 
> void main() {
>       writeln(convertToUTF8("ä"));
> }
> 
> string convertToUTF8(string text) {
> 
>       string result;
> 
>       for (uint i=0; i<text.length; i++) {
>               char ch = text[i];
>               if (ch < 0x80) {
>                       result ~= ch;
>               } else {
>                       result ~= 0xC0 | (ch >> 6);
>                       result ~= 0x80 | (ch & 0x3F);
>               }
>       }
>       return result;
> 
> }
> 
> But writeln doesn't print my character, only a blank line. The same
> problem exists for similar characters like ü or ö.
> 
> Is there anything I'm doing wrong?



You could try the following code. It is based on the code in PHP 5.2.9.
---
module ISO88591;

/++
Convert a Latin-1 (ISO-8859-1) byte sequence to a UTF-8 string.

Every Latin-1 byte value equals its Unicode code point (U+0000..U+00FF),
so each input byte becomes either a single UTF-8 byte (values below 0x80)
or a two-byte UTF-8 sequence (values 0x80..0xFF).

Params:
    content = raw Latin-1 bytes; may be empty

Returns: the UTF-8 encoded text (empty for empty input).
+/
string decode(byte[] content)
{
        // Worst case is two output bytes for every input byte.
        auto result = new ubyte[content.length * 2];
        // size_t, not uint: array lengths are 64 bits wide on 64-bit targets.
        size_t n = 0;

        // Reinterpret as unsigned so that values >= 0x80 are not negative.
        foreach (b; cast(const(ubyte)[]) content)
        {
                if (b < 0x80)
                {
                        // ASCII range: identical in Latin-1 and UTF-8.
                        result[n++] = b;
                }
                else
                {
                        // U+0080..U+00FF: 110xxxxx 10xxxxxx
                        result[n++] = cast(ubyte)(0xC0 | (b >> 6));
                        result[n++] = cast(ubyte)(0x80 | (b & 0x3F));
                }
        }
        return cast(string) result[0 .. n];
}

/++
Convert a UTF-8 string to Latin-1 (ISO-8859-1) bytes.

Each UTF-8 sequence produces exactly one output byte: the code point itself
when it is at most U+00FF, otherwise '?' (0x3F). Sequences of three or four
bytes always yield '?', because they encode code points from U+0800 upwards.
A multi-byte sequence that is cut off by the end of the input yields a single
'?' and ends the conversion.

Decoding is lenient: the second byte of a two-byte sequence is not checked
for the 10xxxxxx form, and bytes 0x80..0xBF found where a lead byte is
expected are copied through unchanged.

Params:
    content = UTF-8 text; may be empty

Returns: the Latin-1 bytes, one per decoded sequence.
+/
byte[] encode(string content)
{
        auto src = cast(const(ubyte)[]) content;
        // A sequence never produces more than one output byte.
        auto result = new byte[src.length];
        // size_t, not uint: array lengths are 64 bits wide on 64-bit targets.
        size_t n = 0;
        size_t i = 0;

        while (i < src.length)
        {
                immutable size_t left = src.length - i;
                immutable ubyte lead = src[i];

                // Sequence length announced by the lead byte.
                size_t len;
                if (lead >= 0xF0)
                        len = 4;
                else if (lead >= 0xE0)
                        len = 3;
                else if (lead >= 0xC0)
                        len = 2;
                else
                        len = 1;

                uint c;
                if (len > left)
                {
                        // Truncated sequence: consume only what is really
                        // there so the index can never run past the input.
                        c = 0x3F;
                        len = left;
                }
                else if (len >= 3)
                {
                        // Code points >= U+0800 have no Latin-1 mapping.
                        c = 0x3F;
                }
                else if (len == 2)
                {
                        // 110xxxxx 10xxxxxx: five payload bits in the lead byte.
                        c = ((lead & 0x1F) << 6) | (src[i + 1] & 0x3F);
                }
                else
                {
                        c = lead;
                }

                // use '?' (0x3F) if no mapping is possible
                result[n++] = cast(byte)((c > 0xFF) ? 0x3F : c);
                i += len;
        }
        result.length = n;
        return result;
}
---

I wrote it for D1 and have now run some quick tests with D2. It should work.
Please give feedback.

And here is my test program:

import std.stdio;
import ISO88591;

void main()
{
        // Round trip: UTF-8 text -> Latin-1 bytes -> UTF-8 text,
        // printing the raw bytes in hex at each stage.
        string sample = "äöüß";

        auto latin1 = encode(sample);
        writefln("latin-1:%x", cast(ubyte[])latin1);

        auto utf8 = decode(latin1);
        writefln("utf-8:%x:%s", cast(ubyte[])utf8, utf8);
}

Re: ANSI to UTF8

Reply via email to