Why is there no full support for Unicode?

The encoding should be determined from the BOM.

A file should be classified as binary only after checking it for 0x00 bytes.

BOM is part of the Unicode standard. http://www.unicode.org/faq/utf_bom.html#bom4

Files in an encoding wider than 8 bits that have no BOM at the beginning can be identified as binary immediately.

My function in C#:

/// <summary>
/// Detects the text encoding of a stream from its byte order mark (BOM).
/// When no BOM is present, up to the first 100000 bytes are scanned for a
/// 0x00 byte; finding one classifies the content as binary.
/// </summary>
/// <param name="stream">
/// The stream to inspect. It is sought to the beginning before reading and
/// again before returning, so it must support seeking.
/// </param>
/// <returns>
/// The encoding indicated by the BOM, or <see cref="Encoding.Default"/> when
/// there is no BOM and no 0x00 byte in the sampled prefix; null - binary
/// </returns>
public static Encoding GetEncodingStream(Stream stream)
{
    if (stream == null) {
        // Same exception type the previous BinaryReader-based code produced for null.
        throw new System.ArgumentNullException(nameof(stream));
    }

    // Upper bound on how many leading bytes are scanned for 0x00 when there is no BOM.
    const int MaxSampleBytes = 100000;

    byte[] bom = new byte[4];
    stream.Seek(0, SeekOrigin.Begin);
    // Track how many bytes were really read: the buffer is zero-initialised, so
    // for files shorter than 4 bytes the unread tail must not take part in the match.
    int bomLength = ReadUpTo(stream, bom, bom.Length);
    stream.Seek(0, SeekOrigin.Begin);

    if (bomLength >= 4 && bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == 0xFE && bom[3] == 0xFF) {
        return new UTF32Encoding(true, true); // UTF-32, big-endian
    }
    if (bomLength >= 2 && bom[0] == 0xFE && bom[1] == 0xFF) {
        return new UnicodeEncoding(true, true); // UTF-16, big-endian
    }
    if (bomLength >= 2 && bom[0] == 0xFF && bom[1] == 0xFE) {
        // FF FE 00 00 is the UTF-32 LE BOM; it has to be tested before falling
        // back to UTF-16 LE because both start with FF FE.
        if (bomLength >= 4 && bom[2] == 0x00 && bom[3] == 0x00) {
            return new UTF32Encoding(false, true); // UTF-32, little-endian
        }
        return new UnicodeEncoding(false, true); // UTF-16, little-endian
    }
    if (bomLength >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
        return new UTF8Encoding(true);
    }

    // No BOM: sample the beginning of the stream and look for a 0x00 byte.
    long length = stream.Length;
    int sampleSize = length > MaxSampleBytes ? MaxSampleBytes : (int)length;
    byte[] sample = new byte[sampleSize];
    stream.Seek(0, SeekOrigin.Begin);
    int sampleLength = ReadUpTo(stream, sample, sampleSize);
    stream.Seek(0, SeekOrigin.Begin);

    // Only inspect bytes that were actually read; the rest of the buffer is
    // zero-filled and would otherwise be mistaken for binary content.
    for (int i = 0; i < sampleLength; i++) {
        if (sample[i] == 0) {
            return null;
        }
    }

    return Encoding.Default;
}

// Reads from the stream until count bytes are stored in buffer or the stream
// ends, since a single Stream.Read call may return fewer bytes than requested.
// Returns the number of bytes actually read.
private static int ReadUpTo(Stream stream, byte[] buffer, int count)
{
    int total = 0;
    while (total < count) {
        int read = stream.Read(buffer, total, count - total);
        if (read <= 0) {
            break;
        }
        total += read;
    }
    return total;
}

Reply via email to