On Tue, Sep 29, 2020 at 04:22:18PM +, Dukc via Digitalmars-d-learn wrote:
> On Monday, 28 September 2020 at 18:23:43 UTC, Chloé Kekoa wrote:
> > The documentation of std.uni [1] says that the unicode struct
> > provides sets for several binary properties. I am looking for a way
> > to query non-binary properties of a character. Is that possible with
> > std.uni or do I need to use a third-party library?
> >
> > I am specifically interested in the East_Asian_Width property [2]
> > (which has six allowed values). Trying to access
> > std.uni.unicode.East_Asian_Width results in the error message:
> >
> > > No unicode set by name East_Asian_Width was found.
> >
> > [1]: https://dlang.org/library/std/uni.html
> > [2]: https://www.unicode.org/reports/tr11/tr11-38.html
>
> It seems the East Asian width is Unicode standard 13.0, while Phobos
> implements 6.2. So it seems like a case for a third-party library :(.
[...]
OTOH, the relevant Unicode data file that contains East_Asian_Width data
(EastAsianWidth.txt) is relatively straightforward to parse. In one of
my projects, I wrote a little helper program to parse this file and
generate a function that tells me if a given dchar is wide or narrow.
Here's the generated function (just copy-n-paste this into your code, no
need for yet another external library dependency):
/**
 * Tells whether a code point is classified as wide.
 *
 * The classification is a fixed table of code point ranges (generated by a
 * helper program from Unicode's EastAsianWidth.txt). Anything outside the
 * listed ranges is reported as not wide.
 *
 * Params:
 *   ch = the code point to classify
 *
 * Returns: true if `ch` lies inside one of the tabulated ranges,
 *          false otherwise.
 */
bool isWide(dchar ch) @safe pure nothrow @nogc
{
    // Sorted range boundaries, stored as consecutive pairs:
    //   even index -> first code point of a wide range (inclusive)
    //   odd index  -> first code point after that range (exclusive)
    static immutable uint[] edges = [
          4352,   4448,
          9001,   9003,
         11904,  12351,
         12353,  12872,
         12880,  19904,
         19968,  42183,
         43360,  43389,
         44032,  55204,
         63744,  64256,
         65040,  65050,
         65072,  65132,
         65281,  65377,
         65504,  65511,
        110592, 110594,
        127488, 127570,
        131072, 262142,
    ];

    // Binary search for the number of boundaries that are <= ch.
    size_t lo = 0;
    size_t hi = edges.length;
    while (lo < hi)
    {
        immutable size_t mid = lo + (hi - lo) / 2;
        if (edges[mid] <= ch)
            lo = mid + 1;
        else
            hi = mid;
    }

    // An odd count means a range was entered but not yet left.
    return (lo & 1) != 0;
}
Here's the utility that generated this code:
/**
 * Simple program to parse EastAsianWidth.txt to extract some useful info.
 */
import std.algorithm;
import std.conv;
import std.range;
import std.regex;
import std.stdio;
struct CodeRange
{
dchar start, end;
/**
 * Reports whether this range and `cr` share at least one position,
 * treating both `start` and `end` as inclusive bounds. Ranges that merely
 * touch (one's `end` equals the other's `start`) therefore count as
 * overlapping.
 *
 * The test is symmetric: `a.overlaps(b) == b.overlaps(a)`. It also holds
 * when one range lies entirely inside the other, a case that checking only
 * this range's own endpoints against `cr` would miss.
 *
 * NOTE(review): assumes `start <= end` for both ranges - confirm against
 * callers.
 *
 * Params:
 *   cr = the range to compare against
 *
 * Returns: true if the two ranges intersect or touch, false otherwise.
 */
bool overlaps(CodeRange cr)
{
    // Two closed intervals intersect exactly when each one begins no later
    // than the other one ends.
    return start <= cr.end && cr.start <= end;
}
unittest
{
    // Fixed range that both candidates are compared against.
    auto probe = CodeRange(11, 12);

    // Sharing the boundary value 11 counts as an overlap...
    auto touching = CodeRange(1, 11);
    assert(touching.overlaps(probe));

    // ...whereas stopping at 10 leaves a gap before 11.
    auto disjoint = CodeRange(1, 10);
    assert(!disjoint.overlaps(probe));
}
/**
 * Widens this range in place so that it also covers `cr`.
 *
 * Afterwards `start` is the smaller of the two starts and `end` is the
 * larger of the two ends. No check is made that the ranges overlap or
 * touch beforehand.
 *
 * Params:
 *   cr = the range to absorb into this one
 */
void merge(CodeRange cr)
{
    // Pull the lower bound down if cr begins earlier.
    if (cr.start < start)
        start = cr.start;

    // Push the upper bound up if cr finishes later.
    if (cr.end > end)
        end = cr.end;
}
unittest
{
auto cr = CodeRange(10,20);
cr.merge(CodeRange(20,30));
assert