[review] new string type (take 2)

Steven Schveighoffer Thu, 13 Jan 2011 08:25:29 -0800


Based on the suggestions of others, I have updated my string type.


changes:

* opApply for iterating with foreach w/ index

* indexing the string at an invalid location (i.e. not the start of a code point) throws a RangeError (does that make sense, or should it be an exception?).

* charStart is now public so you can use it to ensure you are accessing the start of a code point.

* validIdx: new function that tells you if your index is at the start of a code point.

* data property which gets the underlying T[]

* Added free functions for string_t!dchar to make it have the same properties as the other string types.

* Added an ability to assign to a T[] array for ease of use.

* Added a ptr property so it works seamlessly with code that currently uses strings (but we still need $, however it appears this isn't implemented yet in dmd).

* fully documented

Here it is:


// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010

License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.

Authors:   $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;
import core.exception;

// BUG workaround: the compiler does not allow the range interface and opApply
// to coexist for foreach, so this version identifier compiles in the
// dchar-only opApply overload guarded by version(norangeopapplyoverload)
// inside string_t.
version = norangeopapplyoverload;

/**
 * Narrow string type. This string implements utf-8 or utf-16 depending on the
 * character type.
 *
 * The string type is a bi-directional range of dchars, with a monotonic
 * indexing scheme.  Essentially, because dchars are encoded at variable
 * widths, indexing and slicing based on dchars would be an O(n) operation.
 * Therefore, we allow indexing and slicing, but based on the 'code-unit' or
 * unit of encoding (wchar or char).  This means some indexes are valid and
 * some are not (those which do not point to the start of an encoded dchar are
 * not).
 *
 * While this might seem confusing, it is very rare that one needs arbitrary
 * index access. In order to achieve this you can either ensure to yourself
 * that the string's code-unit to dchar ratio is 1 to 1 (never more than one
 * code-unit to encode a dchar), or you can use the charStart function to
 * ensure a valid index.
 */
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
    // The code units (char or wchar) backing this string.
    private T[] _data;

    /**
     * Constructor, builds a string based on given array data.
     *
     * The array is referenced, not copied.
     */
    this(T[] data)
    {
        this._data = data;
    }

    /**
     * Provide access to underlying array data.
     *
     * Returns: the same slice that is held internally (no copy is made).
     */
    @property T[] data()
    {
        return _data;
    }

    /**
     * Forward access to the ptr part of the array.  This allows the most
     * efficient operations when one knows what he is doing.
     */
    @property T* ptr()
    {
        return _data.ptr;
    }


    /**
     * Finds the largest valid index in the string that is <= idx.
     * Essentially, this can be used to convert arbitrary indexes into valid
     * indexes.
     *
     * Params:
     *     idx = a code-unit index; any value is accepted.
     *
     * Returns: idx itself when it starts a code point, the index of the
     * start of the code point containing idx otherwise, and the code-unit
     * count when idx is at or beyond the end of the string.
     */
    size_t charStart(size_t idx) const
    {
        // An index at or past the end has no code unit to inspect.  The
        // largest valid index not exceeding it is the one-past-the-end
        // index.  Without this guard the unchecked pointer reads below would
        // run off the end of the array.
        if(idx >= _data.length)
            return _data.length;

        static if(is(Unqual!T == wchar))
        {
            immutable c = _data.ptr[idx];
            if(c >= 0xDC00 && c <= 0xDFFF)
            {
                // Trailing (low) surrogate: the code point starts one unit
                // earlier.  Only the existence of that earlier unit is
                // checked here, not that it is a leading surrogate.
                assert(idx > 0, "Invalid UTF character at beginning of string");
                return idx-1;
            }
            else
                return idx;
        }
        else
        {
            // UTF-8 continuation bytes have the form 10xx_xxxx.  Step back
            // over at most three of them to reach the lead byte.
            const p = _data.ptr + idx;
            if ((p[0] & 0b1100_0000) != 0b1000_0000)
            {
                return idx;
            }
            else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 1;
            }
            else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 2;
            }
            else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 3;
            }
            else
            {
                assert(false, "Invalid UTF character in string");
            }
        }
    }

    /**
     * Returns true if the given index starts an encoded dchar.
     *
     * The index one past the last code unit is considered valid; anything
     * larger is not.
     */
    bool validIdx(size_t idx) const
    {
        if(idx >= _data.length)
        {
            // index one beyond the string is valid, anything further is not.
            return idx == _data.length;
        }
        immutable c = _data[idx];
        static if(is(Unqual!T == wchar))
        {
            // make sure this isn't the second character of a surrogate pair
            return (c < 0xDC00 || c > 0xDFFF);
        }
        else // char
        {
            // a continuation byte (10xx_xxxx) never starts a code point
            return ((c & 0b1100_0000) != 0b1000_0000);
        }
    }

    /**
     * Remove the first code-point from the string.
     */
    void popFront()
    {
        assert(_data.length, "Attempting to pop front of an empty string");
        auto nc = std.utf.stride(_data, 0);
        assert(nc <= _data.length && nc != 0xFF, "Invalid sequence at beginning of string");
        _data = _data[nc .. $];
    }

    /**
     * Remove the last code-point from the string.
     */
    void popBack()
    {
        immutable n = _data.length;
        assert(n, "Attempting to pop back of an empty string");
        // charStart(n-1) is the index where the last code point begins.
        _data = _data.ptr[0..charStart(n-1)];
    }

    /**
     * Get the first code-point in the string
     */
    @property dchar front() const
    {
        assert(_data.length, "Attempting to fetch the front of an empty string");
        size_t i = 0;
        return decode(_data, i);
    }

    /**
     * Get the last code-point in the string
     */
    @property dchar back() const
    {
        immutable n = _data.length;
        assert(n, "Attempting to fetch the back of an empty string");
        auto idx = charStart(n-1);
        return std.utf.decode(_data, idx);
    }

    /**
     * Does the string contain any data?
     */
    @property bool empty() const
    {
        return !_data.length;
    }

    /**
     * Copy the string (trivial function, needed for range definitions)
     */
    @property typeof(this) save()
    {
        return this;
    }

    /**
     * Support read-only random access via code unit index.
     *
     * Throws: RangeError if idx is the one-past-the-end index, lies beyond
     * the string, or does not start a code point.
     */
    dchar opIndex(size_t idx)
    {
        // validIdx accepts the one-past-the-end index, which cannot be
        // decoded, so it is rejected explicitly.
        if(idx == _data.length || !validIdx(idx))
            throw new RangeError(__FILE__, __LINE__);

        return std.utf.decode(_data, idx);
    }

    /**
     * Slice the whole string
     */
    string_t opSlice()
    {
        return this;
    }

    /**
     * Slice based on valid start and end indexes.
     *
     * Throws: RangeError if start or end are not valid indexes, or if end
     * is less than start.
     */
    string_t opSlice(size_t start, size_t end)
    {
        if(end < start || !validIdx(start) || !validIdx(end))
            throw new RangeError(__FILE__, __LINE__);
        return string_t(_data[start..end]);
    }

    /**
     * Get the number of code units in the string.
     *
     * Note that this is specifically not called length because length
     * generally implies the number of elements in a range. Since dchar is our
     * element type, and the number of dchars cannot be determined in O(1)
     * time, using the name length would be incorrect.
     */
    @property size_t codeUnits() const
    {
        return _data.length;
    }

    /**
     * Append a string to this string.
     *
     * TODO: support appending any string type to this string type.
     */
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        _data ~= data._data;
        return this;
    }

    /**
     * Support appending any types that the underlying array can support.
     */
    ref string_t opOpAssign(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
    {
        _data ~= data;
        return this;
    }

    /**
     * Concatenation between two strings
     */
    string_t opBinary(string op, U)(U data) if (op == "~" && is(U == string_t))
    {
        return string_t(_data ~ data._data);
    }

    /**
     * Support any concatenation that is supported by the underlying data
     * array.
     */
    string_t opBinary(string op, U)(U data) if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
    {
        return string_t(_data ~ data);
    }

    // note, this should not be required, it should use the range interface but
    // the compiler doesn't allow both to coexist for foreach.
    version(norangeopapplyoverload)
    {
        /**
         * Foreach with just dchars.  Note, you cannot actually change the
         * data, despite the argument being ref.  opApply requires ref.
         */
        int opApply(scope int delegate(ref dchar d) dg)
        {
            dchar d;
            size_t idx = 0;
            immutable len = _data.length;
            int result = 0;
            while(result == 0 && idx < len)
            {
                // decode advances idx past the code point it returns
                d = std.utf.decode(_data, idx);
                result = dg(d);
            }
            return result;
        }
    }

    /**
     * Foreach over the string with accompanied index.
     *
     * Note, the refs are required for foreach, you cannot change the index or
     * the data in the string.
     */
    int opApply(scope int delegate(ref size_t idx, ref dchar d) dg)
    {
        dchar d;
        size_t idx = 0;
        immutable len = _data.length;
        int result = 0;
        while(result == 0 && idx < len)
        {
            // Hand the delegate a copy of the starting index so that writes
            // to it cannot disturb the iteration.
            size_t tmpidx = idx;
            d = std.utf.decode(_data, idx);
            result = dg(tmpidx, d);
        }
        return result;
    }

    /**
     * Assign a string
     */
    string_t opAssign(U)(U u) if(is(U == string_t))
    {
        this._data = u._data;
        return this;
    }

    /**
     * Assign a string from another type
     */
    string_t opAssign(U)(U u) if (!is(U == string_t) && is(typeof(_data = u)))
    {
        _data = u;
        return this;
    }
}

/**
 * String type for dchar
 */
template string_t(T) if (is(Unqual!T == dchar))
{
    // One dchar is one code point, so a plain array already serves as the
    // string type; no wrapper struct is involved.
    alias string_t = T[];
}

// support string functions for dchar that aren't already defined.

// TODO: do we need this one?
// TODO: should be inout instead of a template
/**
 * Counterpart of the data property of the narrow string types: a dchar
 * array is its own underlying data, so the argument is handed straight back.
 */
@property T[] data(T)(T[] t)
    if (is(Unqual!T == dchar))
{
    // A full slice of an array is the very same array (same pointer and length).
    return t[];
}

/**
 * Finds the largest valid index in the string that is <= idx.
 * Essentially, this can be used to convert arbitrary indexes into valid
 * indexes.
 */
size_t charStart(const(dchar)[] t, size_t idx)
{
    // Every in-range index of a dchar array starts a code point, so idx is
    // already the answer unless it lies beyond the one-past-the-end index.
    // In that case the largest valid index that is <= idx is t.length.
    return idx < t.length ? idx : t.length;
}

/**
 * Returns true if the given index starts an encoded dchar.
 */
bool validIdx(const(dchar)[] t, size_t idx)
{
    // Every position up to and including one-past-the-end is acceptable.
    immutable limit = t.length;
    return !(idx > limit);
}

// TODO: do we need this one?
/**
 * Number of code units in a dchar string, which is simply the length of
 * the array.
 */
@property size_t codeUnits(const(dchar)[] t)
{
    immutable count = t.length;
    return count;
}

/** begin test code **/
import std.stdio;

// Shorthand names for the three immutable instantiations used by the test code.
alias mystring = string_t!(immutable char);
alias mywstring = string_t!(immutable wchar);
alias mydstring = string_t!(immutable dchar);

void main()
{
    // Build a UTF-8 string and walk it once, code point by code point.
    auto greeting = mystring("hello");
    foreach(dchar ch; greeting) { }

    // Append a plain literal first, then another string_t value.
    greeting ~= " world";
    greeting ~= mystring("!!!");
    writeln(greeting.data);

    // Initialise from a literal, then assign from a string_t and from a literal.
    mystring other = "blah blah";
    other = greeting;
    other = "blah blah";
}

[review] new string type (take 2)

Reply via email to