[review] new string type

Steven Schveighoffer Tue, 30 Nov 2010 07:50:24 -0800

In a prior thread, I promised to create a narrow string type that would enforce the requirements of narrow strings. That is, the new string type should respect the encoding of narrow strings.

Here is a rough draft, tested minimally, but it does compile and pass simple tests. It's pretty simple, which is what I would expect. I copied a lot of stuff from std.array to get this to work.

The point of this type is -- if we replace what the compiler considers "strings" with this type, then we get both the compiler *and* phobos agreeing as to what this type is: A bi-directional range of dchar.

As a bonus, char[] and wchar[] now would become arrays and be manipulated consistently with other arrays, which if not done correctly could cause problems, but may provide more flexibility and opportunity for performance, instead of the library fighting you on it.


Anyways, here it is, released under the boost license, commence attack ;)


// Written in the D programming language.

/**
Copyright: Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010

License: <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.

Authors:   $(WEB erdani.org, Andrei Alexandrescu, Steven Schveighoffer)

Copyright Andrei Alexandrescu and Steven Schveighoffer 2008-2010.
Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
import std.utf;
import std.traits;

/**
A narrow (UTF-8 or UTF-16) string wrapper that exposes its contents as a
bidirectional range of $(D dchar) instead of as an array of code units.

Every index accepted by this type is a code-unit index into the underlying
array. An index that lands in the middle of an encoded character is snapped
back to the first code unit of that character.

Contract violations (empty range, out-of-range index, malformed encoding) are
reported with $(D assert), matching the rest of this type.

Params:
    T = code unit type: $(D char) or $(D wchar), with any qualifier
*/
struct string_t(T) if (is(Unqual!T == char) || is(Unqual!T == wchar))
{
    private T[] _data;

    /// Wraps $(D data); the array is stored as-is, without copying.
    this(T[] data)
    {
        this._data = data;
    }

    // Returns the index of the first code unit of the character that owns
    // the code unit at idx.
    //
    // The lookups below go through .ptr and are therefore NOT bounds checked
    // by the language, so idx must be a valid index; the assert guards that.
    private size_t _charStart(size_t idx) const
    {
        assert(idx < _data.length, "Code unit index out of range");

        static if (is(Unqual!T == wchar))
        {
            immutable c = _data.ptr[idx];
            if (c >= 0xDC00 && c <= 0xDFFF)
            {
                // Low (trailing) surrogate: the character starts one unit
                // earlier, and that unit must be a high (leading) surrogate.
                assert(idx > 0, "Invalid UTF character at beginning of string");
                immutable lead = _data.ptr[idx - 1];
                assert(lead >= 0xD800 && lead <= 0xDBFF,
                        "Invalid UTF character in string");
                return idx - 1;
            }
            else
                return idx;
        }
        else
        {
            // UTF-8: continuation bytes look like 0b10xx_xxxx. Walk back over
            // at most three of them to find the lead byte.
            const p = _data.ptr + idx;
            if ((p[0] & 0b1100_0000) != 0b1000_0000)
            {
                return idx;
            }
            else if (idx >= 1 && (p[-1] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 1;
            }
            else if (idx >= 2 && (p[-2] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 2;
            }
            else if (idx >= 3 && (p[-3] & 0b1100_0000) != 0b1000_0000)
            {
                return idx - 3;
            }
            else
            {
                assert(false, "Invalid UTF character in string");
            }
        }
    }

    /// Removes the first character (all of its code units) from the range.
    void popFront()
    {
        assert(_data.length, "Attempting to pop front of an empty string");
        auto nc = std.utf.stride(_data, 0);
        assert(nc <= _data.length && nc != 0xFF,
                "Invalid sequence at beginning of string");
        _data = _data[nc .. $];
    }

    /// Removes the last character (all of its code units) from the range.
    void popBack()
    {
        immutable n = _data.length;
        assert(n, "Attempting to pop back of an empty string");
        _data = _data.ptr[0 .. _charStart(n - 1)];
    }

    /// Returns: the first character, decoded.
    @property dchar front() const
    {
        assert(_data.length, "Attempting to fetch the front of an empty string");
        size_t i = 0;
        return std.utf.decode(_data, i);
    }

    /// Returns: the last character, decoded.
    @property dchar back() const
    {
        immutable n = _data.length;
        assert(n, "Attempting to fetch the back of an empty string");
        auto idx = _charStart(n - 1);
        return std.utf.decode(_data, idx);
    }

    /// Returns: $(D true) when no code units remain.
    @property bool empty() const
    {
        return !_data.length;
    }

    /// Returns: a copy of this range (forward range primitive).
    @property typeof(this) save()
    {
        return this;
    }

    /**
    Read-only random access via code unit index.

    Params:
        idx = code unit index; must be less than $(D codeUnits)

    Returns: the decoded character that contains code unit $(D idx).
    */
    dchar opIndex(size_t idx) const
    {
        // Check before _charStart, which reads through an unchecked pointer.
        assert(idx < _data.length, "Code unit index out of range");
        idx = _charStart(idx);
        return std.utf.decode(_data, idx);
    }

    /// Returns: this string unchanged.
    string_t opSlice()
    {
        return this;
    }

    /**
    Slices by code unit index. Either bound that falls inside a character is
    moved back to that character's first code unit.

    Params:
        start = first code unit of the slice
        end   = one past the last code unit; $(D start <= end <= codeUnits)
    */
    string_t opSlice(size_t start, size_t end)
    {
        // Validate first: _charStart reads through an unchecked pointer, so
        // the language's slice bounds check below would come too late.
        assert(start <= end && end <= _data.length, "Slice bounds out of range");

        // An index equal to the length is a valid bound but not a valid
        // element index, so it is passed through as-is.
        if (start != _data.length)
            start = _charStart(start);
        if (end != _data.length)
            end = _charStart(end);
        return string_t(_data[start .. end]);
    }

    // note we don't call this length because length can be assumed to be the
    // number of elements, which this isn't.
    /// Returns: the number of code units (not characters) in the string.
    @property size_t codeUnits() const
    {
        return _data.length;
    }

    // support append and concat
    // TODO: need to support appending various types of strings to each other
    // (right now only same-type strings can be appended, or raw arrays)

    /// Appends another string of the same type in place.
    ref string_t opOpAssign(string op, U)(U data)
        if (op == "~" && is(U == string_t))
    {
        _data ~= data._data;
        return this;
    }

    /// Appends anything the underlying array accepts with $(D ~=) in place.
    ref string_t opOpAssign(string op, U)(U data)
        if (op == "~" && !is(U == string_t) && is(typeof(_data ~= U.init)))
    {
        _data ~= data;
        return this;
    }

    /// Returns: a new string holding this one followed by $(D data).
    string_t opBinary(string op, U)(U data)
        if (op == "~" && is(U == string_t))
    {
        return string_t(_data ~ data._data);
    }

    /// Returns: a new string holding this one followed by the raw $(D data).
    string_t opBinary(string op, U)(U data)
        if (op == "~" && !is(U == string_t) && is(typeof(_data ~ U.init)))
    {
        return string_t(_data ~ data);
    }
}

// Overload of string_t for dchar code units: no wrapper struct is used here,
// string_t!T is simply an alias for the plain array type T[].
template string_t(T) if (is(Unqual!T == dchar))
{
    alias T[] string_t;
}

/** begin test code **/
import std.stdio;

// Convenience names for the three code unit widths used by the test below.
alias string_t!(immutable char) mystring;   // struct wrapper over immutable(char)[]
alias string_t!(immutable wchar) mywstring; // struct wrapper over immutable(wchar)[]
alias string_t!(immutable dchar) mydstring; // resolves to the plain array immutable(dchar)[]

// Smoke test: build a string, append a raw array and then another mystring,
// and print the underlying code units.
void main()
{
    mystring greeting = mystring("hello");

    // raw array append
    greeting ~= " world";

    // same-type string append
    auto suffix = mystring("!!!");
    greeting ~= suffix;

    writeln(greeting._data);
}

[review] new string type

Reply via email to