In a prior thread, I promised to create a narrow string type that would enforce the requirements of narrow strings. That is, the new string type should respect the encoding of narrow strings.

Here is a rough draft, tested minimally, but it does compile and pass simple tests. It's pretty simple, which is what I would expect. I copied a lot of stuff from std.array to get this to work.

The point of this type is -- if we replace what the compiler considers "strings" with this type, then we get both the compiler *and* phobos agreeing as to what this type is: A bi-directional range of dchar.

As a bonus, char[] and wchar[] would now become plain arrays and be manipulated consistently with other arrays. If this is not done correctly it could cause problems, but it may also provide more flexibility and more opportunity for performance, instead of the library fighting you on it.

Anyways, here it is, released under the boost license, commence attack ;)


// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
Authors:   $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;

/**
A string type over narrow code units (`char` or `wchar`) that is exposed as a
bidirectional range of `dchar`.

The wrapped array is private. `front`, `back`, `popFront` and `popBack` work
on whole encoded characters, while `opIndex`, `opSlice` and `codeUnits` are
expressed in code units. An index that lands inside an encoded character is
moved back to the first code unit of that character.

Params:
    T = the (possibly qualified) code unit type; must be `char` or `wchar`.
*/
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
    private T[] _data;

    /// Wraps `data` without copying it.
    this(T[] data)
    {
        this._data = data;
    }

    // Returns the index of the first code unit of the character that
    // contains code unit `idx`.
    //
    // Note: this assumes that idx is a valid index (idx < _data.length).
    // The access goes through `.ptr` and is not bounds checked, so every
    // caller must validate idx first.
    private size_t _charStart(size_t idx) const
    {
        static if (is(Unqual!T == wchar))
        {
            immutable c = _data.ptr[idx];
            if (c >= 0xDC00 && c <= 0xDFFF)
            {
                // Trailing half of a surrogate pair: the character starts one
                // code unit earlier. Only the position is checked here; the
                // pair itself is validated by decode when the caller decodes.
                assert(idx > 0, "Invalid UTF character at beginning of string");
                return idx - 1;
            }
            else
                return idx;
        }
        else
        {
            // UTF-8 continuation bytes have the form 0b10xx_xxxx. Walk back
            // over at most three of them to find the lead byte.
            const p = _data.ptr + idx;
            if ((p[0] & 0b1100_0000) != 0b1000_0000)
            {
                return idx;
            }
            else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 1;
            }
            else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 2;
            }
            else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 3;
            }
            else
            {
                assert(false, "Invalid UTF character in string");
            }
        }
    }

    /// Removes the first character (all of its code units) from the string.
    void popFront()
    {
        // Same precondition style as popBack/front/back.
        assert(_data.length, "Attempting to pop front of an empty string");
        auto nc = std.utf.stride(_data, 0);
        assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning of string");
        _data = _data[nc .. $];
    }

    /// Removes the last character (all of its code units) from the string.
    void popBack()
    {
        immutable n = _data.length;
        assert(n, "Attempting to pop back of an empty string");
        _data = _data.ptr[0 .. _charStart(n - 1)];
    }

    /// Returns: the first character, decoded.
    @property dchar front() const
    {
        assert(_data.length, "Attempting to fetch the front of an empty string");
        size_t i = 0;
        return std.utf.decode(_data, i);
    }

    /// Returns: the last character, decoded.
    @property dchar back() const
    {
        immutable n = _data.length;
        assert(n, "Attempting to fetch the back of an empty string");
        auto idx = _charStart(n - 1);
        return std.utf.decode(_data, idx);
    }

    /// Returns: `true` if the string holds no code units.
    @property bool empty() const
    {
        return !_data.length;
    }

    /// Returns: a copy of this range (the underlying array is shared).
    @property typeof(this) save()
    {
        return this;
    }

    /**
    Read-only random access via code unit index.

    Params:
        idx = a code unit index, `idx < codeUnits`. It may point at any code
              unit of a character.
    Returns: the decoded character that contains code unit `idx`.
    */
    dchar opIndex(size_t idx) const
    {
        // _charStart reads through .ptr, so validate before calling it.
        assert(idx < _data.length, "Index out of bounds");
        idx = _charStart(idx);
        return std.utf.decode(_data, idx);
    }

    /// Returns: the whole string.
    string_t opSlice()
    {
        return this;
    }

    /**
    Slices by code unit index. A bound that falls inside a character is moved
    back to the start of that character; a bound equal to `codeUnits` is used
    as is.

    Params:
        start = code unit index of the first unit to include.
        end   = code unit index one past the last unit to include.
    Returns: a `string_t` over the selected code units (no copy is made).
    */
    string_t opSlice(size_t start, size_t end)
    {
        // Only adjust indices that are valid for _charStart. Anything at or
        // past the end is left alone and checked by the slice below.
        if (start < _data.length)
            start = _charStart(start);
        if (end < _data.length)
            end = _charStart(end);
        return string_t(_data[start .. end]);
    }

    // note we don't call this length because length can be assumed to be the
    // number of elements, which this isn't.
    /// Returns: the number of code units (not characters) in the string.
    @property size_t codeUnits() const
    {
        return _data.length;
    }

    // support append and concat
    // TODO: need to support appending various types of strings to each other
    // (right now only same-type strings can be appended, or raw arrays)

    /// Appends another `string_t` of the same type in place.
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        _data ~= data._data;
        return this;
    }

    /// Appends anything the underlying array accepts with `~=`.
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
    {
        _data ~= data;
        return this;
    }

    /// Returns: a new `string_t` holding this string followed by `data`.
    string_t opBinary(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        return string_t(_data ~ data._data);
    }

    /// Returns: a new `string_t` holding this string followed by `data`,
    /// where `data` is anything the underlying array accepts with `~`.
    string_t opBinary(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
    {
        return string_t(_data ~ data);
    }
}

/**
Companion to the `char`/`wchar` struct of the same name: for `dchar` no
wrapper type is used, and `string_t!T` is simply the array type `T[]`.

Params:
    T = the (possibly qualified) code unit type; must be `dchar`.
*/
template string_t(T) if (is(Unqual!T == dchar))
{
    alias string_t = T[];
}

/** begin test code **/
import std.stdio;

// Short names for the three immutable instantiations used by the test code.
alias mystring = string_t!(immutable char);
alias mywstring = string_t!(immutable wchar);
alias mydstring = string_t!(immutable dchar);

/// Smoke test: builds a string with both append overloads and prints the
/// underlying code unit array.
void main()
{
    auto greeting = mystring("hello");

    // Append a raw array of code units.
    greeting ~= " world";

    // Append another string of the same type.
    greeting ~= mystring("!!!");

    writeln(greeting._data);
}

Reply via email to