On Thu, Dec 28, 2017 at 05:56:32PM +0000, Andrei via Digitalmars-d-learn wrote: > There is one everlasting problem writing Cyrillic programs in Windows: > Microsoft consequently invented two much different code pages for > Russia and other Cyrillic-alphabet countries: first was MSDOS-866 (and > alike), second Windows-1251. Nowadays MS Windows uses first code page > for console programs, second for GUI applications, and there always > are many workarounds to get proper translation between them. Mostly a > programmer should write program sources either in one code page for > console and other for GUI, or use .NET, which basically uses UTF8 in > sources and makes seamless translation depending on back end. > > In D language which uses only UTF8 for string encoding I cannot write > neither MS866 code page program texts, nor Windows-1251 - both cases > end in a compiler error like "Invalid trailing code unit" or "Outside > Unicode code space". And writing Cyrillic strings in UTF8 format is > fatal for both console and GUI Windows targets. > > My question is: is there any standard means to translate Cyrillic or > any other localized UTF8 strings for console and GUI output in D > libraries. If so - where I can get more information and good example. > Google would not help. [...]
The string / wstring / dstring types in D are intended to be Unicode
strings. If you need to use other encodings, you really should be using
ubyte[] or const(ubyte)[] or immutable(ubyte)[], instead of string.

One approach is to use UTF-8 in your code, and only translate to one of
the code pages when you need to produce output. I wrote a small module
for translating to/from KOI8-R when dealing with Russian text; you might
find it helpful:

-------------------------------------------------------------------------------
/**
 * Module to convert between UTF and KOI8-R
 */
module koi8r;

import std.string;
import std.range;

/**
 * Encoding table for the primary Cyrillic block U+0410 .. U+044F.
 * Index with (code point - 0x410) to obtain the KOI8-R byte.
 */
static immutable ubyte[0x450 - 0x410] utf2koi8r = [
    225, 226, 247, 231, 228, 229, 246, 250, // АБВГДЕЖЗ
    233, 234, 235, 236, 237, 238, 239, 240, // ИЙКЛМНОП
    242, 243, 244, 245, 230, 232, 227, 254, // РСТУФХЦЧ
    251, 253, 255, 249, 248, 252, 224, 241, // ШЩЪЫЬЭЮЯ
    193, 194, 215, 199, 196, 197, 214, 218, // абвгдежз
    201, 202, 203, 204, 205, 206, 207, 208, // ийклмноп
    210, 211, 212, 213, 198, 200, 195, 222, // рстуфхцч
    219, 221, 223, 217, 216, 220, 192, 209  // шщъыьэюя
];

/**
 * Translates a range of UTF characters into KOI8-R characters.
 *
 * The translation is lazy: each element is converted when `front` is read.
 *
 * Params:
 *  range = Input range whose elements convert to dchar.
 *
 * Returns: Range of KOI8-R characters (as ubyte). The result is a forward
 * range if R is one.
 *
 * Throws: Exception (from `front`) when a character has no KOI8-R mapping
 * known to this module.
 */
auto toKOI8r(R)(R range)
    if (isInputRange!R && is(ElementType!R : dchar))
{
    static struct Result
    {
        R _range;

        @property bool empty() { return _range.empty; }

        @property ubyte front()
        {
            dchar ch = _range.front;

            // ASCII
            if (ch < 128)
                return cast(ubyte)ch;

            // Primary alphabetic range
            if (ch >= 0x410 && ch < 0x450)
                return utf2koi8r[ch - 0x410];

            // Characters outside the primary range. This list mirrors the
            // special cases decoded by fromKOI8r, so that everything
            // fromKOI8r can produce can also be encoded back.
            switch (ch)
            {
                case 0x401:  return 179; // Ё
                case 0x451:  return 163; // ё
                case 0x2500: return 128; // ─
                case 0x2264: return 152; // ≤
                case 0x2265: return 153; // ≥
                default:
                    throw new Exception(
                        "Encoding error: unable to convert '%c' to KOI8-R".format(ch));
            }
        }

        void popFront() { _range.popFront(); }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                Result copy;
                copy._range = _range.save;
                return copy;
            }
        }
    }
    return Result(range);
}

unittest
{
    import std.algorithm : equal;

    assert("юабцдефгхийклмнопярстужвьызшэщчъ".toKOI8r.equal(iota(192, 224)));
    assert("ЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ".toKOI8r.equal(iota(224, 256)));
}

unittest
{
    auto r = "abc абв".toKOI8r;
    static assert(isForwardRange!(typeof(r)));

    import std.algorithm.comparison : equal;
    assert(r.equal(['a', 'b', 'c', ' ', 193, 194, 215]));
}

/**
 * Decoding table for KOI8-R bytes 192 .. 255.
 * Index with (byte - 192) to obtain the Cyrillic letter.
 */
static immutable dchar[0x100 - 0xC0] koi8r2utf = [
    'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г', // 192-199
    'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о', // 200-207
    'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в', // 208-215
    'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', // 216-223
    'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 224-231
    'Х', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', // 232-239
    'П', 'Я', 'Р', 'С', 'Т', 'У', 'Ж', 'В', // 240-247
    'Ь', 'Ы', 'З', 'Ш', 'Э', 'Щ', 'Ч', 'Ъ'  // 248-255
];

/**
 * Translates a range of KOI8-R characters to UTF.
 *
 * The translation is lazy: each element is converted when `front` is read.
 *
 * Params:
 *  range = Input range whose elements convert to ubyte.
 *
 * Returns: Range of UTF characters (as dchar). The result is a forward
 * range if R is one.
 *
 * Throws: Exception (from `front`) for bytes in 128 .. 191 other than
 * 128, 152, 153, 163 and 179, which are not implemented yet.
 */
auto fromKOI8r(R)(R range)
    if (isInputRange!R && is(ElementType!R : ubyte))
{
    static struct Result
    {
        R _range;

        @property bool empty() { return _range.empty; }

        @property dchar front()
        {
            ubyte b = _range.front;

            // ASCII
            if (b < 128)
                return b;

            // Cyrillic letters
            if (b >= 192)
                return koi8r2utf[b - 192];

            // Remaining bytes 128 .. 191: only a few are supported so far.
            switch (b)
            {
                case 128: return '─';
                case 152: return '≤';
                case 153: return '≥';
                case 163: return 'ё';
                case 179: return 'Ё';
                default:
                    import std.string : format;
                    throw new Exception(
                        "KOI8-R character %d not implemented yet".format(b));
            }
        }

        void popFront() { _range.popFront(); }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                Result copy;
                copy._range = _range.save;
                return copy;
            }
        }
    }
    return Result(range);
}

unittest
{
    import std.algorithm.comparison : equal;

    ubyte[] lower = [
        193, 194, 215, 199, 196, 197, 163, 214, 218, 201, 202,
        203, 204, 205, 206, 207, 208, 210, 211, 212, 213, 198,
        200, 195, 222, 219, 221, 223, 217, 216, 220, 192, 209
    ];
    assert(lower.fromKOI8r.equal("абвгдеёжзийклмнопрстуфхцчшщъыьэюя"));

    ubyte[] upper = [
        225, 226, 247, 231, 228, 229, 179, 246, 250, 233, 234,
        235, 236, 237, 238, 239, 240, 242, 243, 244, 245, 230,
        232, 227, 254, 251, 253, 255, 249, 248, 252, 224, 241
    ];
    assert(upper.fromKOI8r.equal("АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"));
}

unittest
{
    import std.algorithm.comparison : equal;

    // Every special case that fromKOI8r decodes must encode back to the
    // same byte.
    ubyte[] special = [128, 152, 153, 163, 179];
    assert(special.fromKOI8r.toKOI8r.equal(special));
}
-------------------------------------------------------------------------------

As the unittests show, you just call toKOI8r or fromKOI8r to translate
between encodings. All non-Unicode strings are treated as ubyte[], so that
you won't accidentally mix up a Unicode string with a KOI8-R string. And
the code should be straightforward enough to be adapted for other
encodings as well.

Hope this helps.


T

-- 
For every argument for something, there is always an equal and opposite argument against it. Debates don't give answers, only wounded or inflated egos.