Based on the suggestions of others, I have updated my string type.

changes:

* opApply for iterating with foreach w/ index
* indexing the string at an invalid location (i.e. not the start of a code point) throws a RangeError (does that make sense, or should it be an exception?).
* charStart is now public so you can use it to ensure you are accessing the start of a code point
* validIdx new function that tells you if your index is at the start of a code point.
* data property which gets the underlying T[]
* Added free functions for string_t!dchar to make it have the same properties as the other string types.
* Added an ability to assign to a T[] array for ease of use.
* Added a ptr property so it works seamlessly with code that currently uses strings (but we still need $, however it appears this isn't implemented yet in dmd).
* fully documented

Here it is:


// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010
License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
Authors:   $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;
import core.exception;

// BUG
// Workaround switch: setting this version identifier compiles in the
// single-argument opApply overload of string_t.  That overload should not be
// necessary (foreach should use the range interface), but the compiler does
// not allow both to coexist for foreach.
version = norangeopapplyoverload;

/**
 * Narrow string type. This string implements utf-8 or utf-16 depending on the
 * character type.
 *
 * The string type is a bi-directional range of dchars, with a monotonic
 * indexing scheme.  Essentially, because dchars are encoded at variable
 * widths, indexing and slicing based on dchars would be an O(n) operation.
 * Therefore, we allow indexing and slicing, but based on the 'code-unit' or
 * unit of encoding (wchar or char).  This means some indexes are valid and
 * some are not (those which do not point to the start of an encoded dchar are
 * not).
 *
 * While this might seem confusing, it is very rare that one needs arbitrary
 * index access. In order to achieve this you can either ensure to yourself
 * that the string's code-unit to dchar ratio is 1 to 1 (never more than one
 * code-unit to encode a dchar), or you can use the charStart function to
 * ensure a valid index.
 */
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
    // The underlying code units: UTF-8 when T is char, UTF-16 when T is wchar.
    private T[] _data;

    /**
     * Constructor, builds a string based on given array data.
     *
     * The array is stored as given; it is neither copied nor validated.
     */
    this(T[] data)
    {
        this._data = data;
    }

    /**
     * Provide access to underlying array data.
     */
    @property T[] data()
    {
        return _data;
    }

    /**
     * forward access to the ptr part of the array.  This allows the most
     * efficient operations when one knows what he is doing.
     */
    @property T* ptr()
    {
        return _data.ptr;
    }


    /**
     * Finds the largest valid index in the string that is <= idx.
     * Essentially, this can be used to convert arbitrary indexes into valid
     * indexes.
     *
     * Params:
     *     idx = a code-unit index, at most codeUnits.
     *
     * Returns: idx itself when it already starts a code point (or is the
     * one-past-the-end index, which validIdx also accepts), otherwise the
     * index of the first code unit of the code point that contains idx.
     *
     * Malformed encodings and indexes beyond the end of the string are
     * reported through assert.
     */
    size_t charStart(size_t idx) const
    {
        // There is no code unit at or beyond the end, so answer before
        // touching memory through the unchecked pointer below.
        if(idx >= _data.length)
        {
            assert(idx == _data.length, "Index is beyond the end of the string");
            return idx;
        }

        static if(is(Unqual!T == wchar))
        {
            immutable c = _data.ptr[idx];
            if(c >= 0xDC00 && c <= 0xDFFF)
            {
                // Second half of a surrogate pair detected: verify we have
                // at least 2 wchars, and that the preceding wchar really is
                // the first half of the pair.
                assert(idx > 0, "Invalid UTF character at beginning of string");
                assert(_data.ptr[idx - 1] >= 0xD800 && _data.ptr[idx - 1] <= 0xDBFF,
                        "Invalid UTF character in string");
                return idx-1;
            }
            else
                return idx;
        }
        else
        {
            // UTF-8 continuation bytes look like 0b10xx_xxxx.  A code point
            // is at most 4 code units, so its start is at most 3 units back.
            const p = _data.ptr + idx;
            if ((p[0] & 0b1100_0000) != 0b1000_0000)
            {
                return idx;
            }
            else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 1;
            }
            else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 2;
            }
            else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 3;
            }
            else
            {
                assert(false, "Invalid UTF character in string");
            }
        }
    }

    /**
     * Returns true if the given index starts an encoded dchar.
     *
     * The index one beyond the last code unit is also reported as valid (so
     * that it can be used as the end of a slice); anything larger is not.
     */
    bool validIdx(size_t idx) const
    {
        if(idx >= _data.length)
        {
            if(idx is _data.length)
                return true; // index one beyond the string is valid.
            return false;
        }
        immutable c = _data[idx];
        static if(is(Unqual!T == wchar))
        {
            // make sure this isn't the second character of a surrogate pair
            return (c < 0xDC00 || c > 0xDFFF);
        }
        else // char
        {
            // anything but a continuation byte (0b10xx_xxxx) starts a code point
            return ((c & 0b1100_0000) != 0b1000_0000);
        }
    }

    /**
     * remove the first code-point from the string.
     */
    void popFront()
    {
        assert(_data.length, "Attempting to pop front of an empty string");
        auto nc = std.utf.stride(_data, 0);
        assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning of string");
        _data = _data[nc .. $];
    }

    /**
     * Remove the last code-point from the string.
     */
    void popBack()
    {
        immutable n = _data.length;
        assert(n, "Attempting to pop back of an empty string");
        // charStart(n-1) is where the last code point begins; keep
        // everything before it.
        _data = _data.ptr[0..charStart(n-1)];
    }

    /**
     * Get the first code-point in the string
     */
    @property dchar front() const
    {
        assert(_data.length, "Attempting to fetch the front of an empty string");
        size_t i = 0;
        return decode(_data, i);
    }

    /**
     * Get the last code-point in the string
     */
    @property dchar back() const
    {
        immutable n = _data.length;
        assert(n, "Attempting to fetch the back of an empty string");
        auto idx = charStart(n-1);
        return std.utf.decode(_data, idx);
    }

    /**
     * Does the string contain any data?
     */
    @property bool empty() const
    {
        return !_data.length;
    }

    /**
     * Copy the string (trivial function, needed for range definitions)
     */
    @property typeof(this) save()
    {
        return this;
    }

    /**
     * support read-only random access via code unit index.
     *
     * Returns: the code point whose encoding starts at idx.
     *
     * Throws: RangeError if idx does not start a code point or is at or
     * beyond the end of the string.
     */
    dchar opIndex(size_t idx) const
    {
        // validIdx accepts the one-past-the-end index, but there is nothing
        // to decode there, so reject it explicitly.
        if(idx is _data.length || !validIdx(idx))
            throw new RangeError(__FILE__, __LINE__);

        return std.utf.decode(_data, idx);
    }

    /**
     * slice the whole string
     */
    string_t opSlice()
    {
        return this;
    }

    /**
     * Slice based on valid start and end indexes.
     *
     * Throws RangeError if start or end are not valid indexes
     */
    string_t opSlice(size_t start, size_t end)
    {
        if(end < start || !validIdx(start) || !validIdx(end))
            throw new RangeError(__FILE__, __LINE__);
        return string_t(_data[start..end]);
    }

    /**
     * Get the number of code units in the string.
     *
     * Note that this is specifically not called length because length
     * generally implies the number of elements in a range. Since dchar is our
     * element type, and the number of dchars cannot be determined in O(1)
     * time, using the name length would be incorrect.
     */
    @property size_t codeUnits() const
    {
        return _data.length;
    }

    /**
     * Append a string to this string.
     *
     * TODO: support appending any string type to this string type.
     */
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        _data ~= data._data;
        return this;
    }

    /**
     * Support appending any types that the underlying array can support.
     */
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
    {
        _data ~= data;
        return this;
    }

    /**
     * Concatenation between two strings
     */
    string_t opBinary(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        return string_t(_data ~ data._data);
    }

    /**
     * Support any concatenation that is supported by the underlying data
     * array.
     */
    string_t opBinary(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
    {
        return string_t(_data ~ data);
    }

    // note, this should not be required, it should use the range interface but
    // the compiler doesn't allow both to coexist for foreach.
    version(norangeopapplyoverload)
    {
        /**
         * Foreach with just dchars.  Note, you cannot actually change the
         * data, despite the argument being ref.  opApply requires ref.
         */
        int opApply(scope int delegate(ref dchar d) dg)
        {
            dchar d;
            size_t idx = 0;
            immutable len = _data.length;
            int result = 0;
            while(result == 0 && idx < len)
            {
                // decode advances idx past the code point it returns
                d = std.utf.decode(_data, idx);
                result = dg(d);
            }
            return result;
        }
    }

    /**
     * Foreach over the string with accompanied index.
     *
     * Note, the refs are required for foreach, you cannot change the index or
     * the data in the string.
     */
    int opApply(scope int delegate(ref size_t idx, ref dchar d) dg)
    {
        dchar d;
        size_t idx = 0;
        immutable len = _data.length;
        int result = 0;
        while(result == 0 && idx < len)
        {
            // Hand the delegate a copy of the starting index so that writes
            // through its ref parameter cannot disturb the iteration.
            size_t tmpidx = idx;
            d = std.utf.decode(_data, idx);
            result = dg(tmpidx, d);
        }
        return result;
    }

    /**
     * Assign a string
     */
    string_t opAssign(U)(U u) if(is(U == string_t))
    {
        this._data = u._data;
        return this;
    }

    /**
     * Assign a string from another type
     */
    string_t opAssign(U)(U u) if (!is(U == string_t) && is(typeof(_data = u)))
    {
        _data = u;
        return this;
    }
}

/**
 * String type for dchar
 *
 * When T is dchar (with any qualifiers), string_t!T is an alias for the
 * plain array type T[] rather than a wrapper struct.
 */
template string_t(T) if (is(Unqual!T == dchar))
{
    alias T[] string_t;
}

// support string functions for dchar that aren't already defined.

// TODO: do we need this one?
// TODO: should be inout instead of a template
/**
 * Counterpart of the string_t struct's data property for dchar arrays:
 * returns the array it is given, unchanged.
 */
@property T[] data(T)(T[] t) if (is(Unqual!T == dchar))
{
    return t;
}

/**
 * Finds the largest valid index in the string that is <= idx.
 * Essentially, this can be used to convert arbitrary indexes into valid
 * indexes.
 *
 * This overload for dchar arrays returns idx unchanged.  The array t is not
 * consulted, so idx is not checked against t.length.
 */
size_t charStart(const(dchar)[] t, size_t idx)
{
    return idx;
}

/**
 * Returns true if the given index starts an encoded dchar.
 *
 * For a dchar array this is true for every idx up to and including
 * t.length (the one-past-the-end index), and false for anything larger.
 */
bool validIdx(const(dchar)[] t, size_t idx)
{
    return idx <= t.length;
}

// TODO: do we need this one?
/**
 * Counterpart of the string_t struct's codeUnits property for dchar arrays:
 * returns t.length.
 */
@property size_t codeUnits(const(dchar)[] t)
{
    return t.length;
}

/** begin test code **/
import std.stdio;

// Convenience names for the three instantiations over immutable code units.
// mystring and mywstring instantiate the string_t struct; mydstring resolves
// to the dchar template, i.e. a plain immutable(dchar)[].
alias string_t!(immutable char) mystring;
alias string_t!(immutable wchar) mywstring;
alias string_t!(immutable dchar) mydstring;

void main()
{
    // Build a narrow string and walk it one code point at a time.
    auto greeting = mystring("hello");
    foreach(dchar ch; greeting) { }

    // Append a raw character array first, then another string_t.
    greeting ~= " world";
    greeting ~= mystring("!!!");
    writeln(greeting.data);

    // Initialize from a literal, then assign from a string_t and from a
    // raw character array.
    mystring other = "blah blah";
    other = greeting;
    other = "blah blah";
}

Reply via email to