I like the direction you're taking but have some quibbles about details.
Specifically, I'd go for a more complete separation into random-access
code-unit ranges and bidirectional code-point ranges:
On 01/12/10 02:18, Steven Schveighoffer wrote:
// Written in the D programming language.
/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License
1.0</a>.
Authors: $(WEB erdani.org, Andrei Alexandrescu), Steven Schveighoffer
Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
(See accompanying file LICENSE_1_0.txt or copy at
http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
Is there a reason not to include is(Unqual!T == dchar)?
{
// The wrapped array of code units.
private T[] _data;

/// Builds a string_t around the given array of code units.
this(T[] data)
{
    _data = data;
}
An opAssign from a T[] could facilitate conversion back and forth
between code-point and code-unit ranges.
/**
 * Returns the index of the first code unit of the character that contains
 * the code unit at idx.
 *
 * Note: this assumes that idx is a valid index into _data. The code units
 * are read through the raw pointer, so no bounds check happens here.
 */
private size_t _charStart(size_t idx) const
{
    static if (is(Unqual!T == wchar))
    {
        immutable c = _data.ptr[idx];
        if (c >= 0xDC00 && c <= 0xDFFF)
        {
            // Trail surrogate detected: the character starts one unit
            // earlier, so there must be a preceding unit and it must be a
            // lead surrogate (0xD800 .. 0xDBFF).
            assert(idx > 0, "Invalid UTF character at beginning of string");
            assert(_data.ptr[idx - 1] >= 0xD800 && _data.ptr[idx - 1] <= 0xDBFF,
                "Invalid UTF character in string");
            return idx - 1;
        }
        else
            return idx;
    }
    else
    {
        // A byte of the form 0b10xx_xxxx is a continuation byte. Walk back
        // at most three bytes to find the first byte that is not one.
        const p = _data.ptr + idx;
        if ((p[0] & 0b1100_0000) != 0b1000_0000)
        {
            return idx;
        }
        else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
        {
            return idx - 1;
        }
        else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
        {
            return idx - 2;
        }
        else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
        {
            return idx - 3;
        }
        else
        {
            assert(false, "Invalid UTF character in string");
        }
    }
}
/**
 * Removes the first character (one or more code units) from the range.
 *
 * The range must not be empty, and must begin with a sequence that
 * std.utf.stride recognises.
 */
void popFront()
{
    // Same precondition style as popBack/front/back: fail with a clear
    // message instead of letting stride index into an empty array.
    assert(_data.length, "Attempting to pop front of an empty string");
    immutable nc = std.utf.stride(_data, 0);
    // The message is kept on a single line so that it contains no embedded
    // newline.
    assert(nc <= _data.length && nc != 0xFF,
        "Invalid sequence at beginning of string");
    _data = _data[nc .. $];
}
/**
 * Removes the last character from the range by truncating the data at the
 * start of the character that contains the final code unit.
 */
void popBack()
{
    assert(_data.length, "Attempting to pop back of an empty string");
    immutable lastStart = _charStart(_data.length - 1);
    _data = _data.ptr[0 .. lastStart];
}
/// Decodes and returns the first character; the range must not be empty.
@property dchar front() const
{
    assert(_data.length, "Attempting to fetch the front of an empty string");
    size_t pos; // default-initialised to 0; decode advances it past the character
    return decode(_data, pos);
}
/// Decodes and returns the last character; the range must not be empty.
@property dchar back() const
{
    assert(_data.length, "Attempting to fetch the back of an empty string");
    // Locate where the character containing the final code unit begins.
    size_t start = _charStart(_data.length - 1);
    return std.utf.decode(_data, start);
}
There is the alternative of deferring decoding to the user and returning
T[] slices that each hold exactly one code point instead of dchars. I'm
not sure which is best, but I'd be interested in seeing a case for
choosing one or the other.
/// True when no code units remain.
@property bool empty() const
{
    return _data.length == 0;
}
/// Returns a copy of this range value.
@property typeof(this) save()
{
    auto snapshot = this;
    return snapshot;
}
/**
 * Supports read-only random access via code unit index.
 *
 * idx is a code unit index, not a character index. The character that
 * contains that code unit is decoded and returned. idx must be less than
 * the number of code units.
 */
dchar opIndex(size_t idx)
{
    // _charStart reads through the raw pointer without a bounds check, so
    // the index has to be validated before it gets there.
    assert(idx < _data.length, "Attempting to index past the end of a string");
    size_t start = _charStart(idx);
    return std.utf.decode(_data, start);
}
/// Slicing without bounds yields a copy of the whole range.
string_t opSlice()
{
    auto whole = this;
    return whole;
}
/**
 * Slices by code unit indices.
 *
 * Each bound that is not equal to the data length is moved back to the
 * start of the character containing it, so the result never begins or ends
 * in the middle of a character.
 *
 * Requires start <= end <= number of code units.
 */
string_t opSlice(size_t start, size_t end)
{
    // Validate before calling _charStart, which reads through the raw
    // pointer without a bounds check.
    assert(start <= end, "Slice start is past slice end");
    assert(end <= _data.length, "Slice end is past the end of the string");

    if (start != _data.length)
        start = _charStart(start);
    if (end != _data.length)
        end = _charStart(end);
    return string_t(_data[start .. end]);
}
// Deliberately not named "length": length can be assumed to be the number
// of elements, and this is the number of code units instead.
@property size_t codeUnits() const
{
    immutable count = _data.length;
    return count;
}
I don't see a need for _charStart, opIndex, opSlice and codeUnits. If
the underlying T[] can be returned by a property, then these can be done
through the code-unit array, which is random-access.
// Append and concatenation support.
// TODO: need to support appending various types of strings to each other
// (right now only same-type strings can be appended, or raw arrays)

/// In-place append (~=) of another string_t of the same type.
ref string_t opOpAssign(string op, U)(U data)
    if (op == "~" && is(U == string_t))
{
    this._data ~= data._data;
    return this;
}
/// In-place append (~=) of any non-string_t value that the underlying array
/// itself accepts with ~=.
ref string_t opOpAssign(string op, U)(U data)
    if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
{
    this._data ~= data;
    return this;
}
/// Concatenation (~) with another string_t of the same type; returns a new
/// string_t and leaves both operands unchanged.
string_t opBinary(string op, U)(U data)
    if (op == "~" && is(U == string_t))
{
    auto joined = _data ~ data._data;
    return string_t(joined);
}
/// Concatenation (~) with any non-string_t value that the underlying array
/// itself accepts with ~; returns a new string_t.
string_t opBinary(string op, U)(U data)
    if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
{
    auto joined = _data ~ data;
    return string_t(joined);
}
}
// When T is dchar (ignoring qualifiers), string_t!T is not the wrapper
// struct: it is just an alias for the plain array type T[].
template string_t(T) if (is(Unqual!T == dchar))
{
alias T[] string_t;
}
/** begin test code **/
import std.stdio;
// Convenience names for the three instantiations used by the test code.
alias string_t!(immutable(char)) mystring;
alias string_t!(immutable(wchar)) mywstring;
alias string_t!(immutable(dchar)) mydstring;
/// Test driver: builds a mystring, appends a raw literal and another
/// mystring to it, then prints the underlying code units.
void main()
{
    auto greeting = mystring("hello");
    greeting ~= " world";         // raw array append
    greeting ~= mystring("!!!");  // same-type string_t append
    writeln(greeting._data);
}