On Fri, Apr 17, 2015 at 09:59:40AM -0700, H. S. Teoh via Digitalmars-d wrote:
[...]
> So either you have to throw out all pretenses of Unicode-correctness
> and just stick with ASCII-style per-character line-wrapping, or you
> have to live with byGrapheme with all the complexity that it entails.
> The former is quite easy to write -- I could throw it together in a
> couple o' hours max, but the latter is a pretty big project (cf.
> Unicode line-breaking algorithm, which is one of the TR's).
[...]

Well, talk is cheap, so here's a working implementation of the
non-Unicode-correct line wrapper that uses ranges and does not allocate:

        import std.range.primitives;
        
        /**
         * Range version of $(D std.string.wrap).
         *
         * Lazily word-wraps `range`: whenever the next whitespace run plus the
         * word following it would push the current line past `columns`, the
         * whitespace run is replaced by a single `'\n'` followed by `indent`.
         * Whitespace that does not trigger a break is passed through unchanged.
         * A word that is by itself longer than `columns` is never split.
         *
         * Params:
         *   range       = the text to wrap.
         *   columns     = maximum number of columns per output line.
         *   firstindent = prefix emitted before the first line.
         *   indent      = prefix emitted after every inserted line break.
         *   tabsize     = number of columns a tab character is counted as.
         *
         * Returns: A forward range with the same element type as `R` that
         * yields the wrapped text. Whitespace at the very end of the input that
         * would overflow the line is dropped rather than turned into a
         * dangling line break.
         *
         * Bugs:
         * This function does not conform to the Unicode line-breaking
         * algorithm. It does not take into account zero-width characters,
         * combining diacritics, double-width characters, non-breaking spaces,
         * and bidi markings. Strings containing these characters therefore may
         * not be wrapped correctly. Tabs are counted as a fixed `tabsize`
         * columns (no tab stops) and are never used as a break opportunity.
         */
        auto wrapped(R)(R range, in size_t columns = 80, R firstindent = R.init,
                        R indent = R.init, in size_t tabsize = 8)
            if (isForwardRange!R && is(ElementType!R : dchar))
        {
            import std.uni : isWhite;

            alias CharType = ElementType!R;

            // Returns: Wrapped lines.
            struct Result
            {
                private R range, indent;
                private size_t maxCols, tabSize;

                // Number of columns already emitted on the current line.
                private size_t currentCol = 0;
                // Remaining part of the indent currently being emitted.
                private R curIndent;
                bool empty = true;
                // True while the current front is an inserted line break.
                bool atBreak = false;

                this(R _range, R _firstindent, R _indent, size_t columns,
                     size_t tabsize)
                {
                    this.range = _range;
                    this.curIndent = _firstindent.save;
                    this.indent = _indent;
                    this.maxCols = columns;
                    this.tabSize = tabsize;

                    empty = _range.empty;
                }

                // Columns occupied by a single character.
                private size_t charWidth(dchar c) const
                {
                    return c == '\t' ? tabSize : 1;
                }

                @property CharType front()
                {
                    if (atBreak)
                        return '\n';    // implicitly converts to wider character types
                    else if (!curIndent.empty)
                        return curIndent.front;
                    else
                        return range.front;
                }

                void popFront()
                {
                    if (atBreak)
                    {
                        // Leaving an inserted line break: start a new line
                        // with the continuation indent.
                        atBreak = false;
                        currentCol = 0;
                        curIndent = indent.save;
                        return;
                    }
                    if (!curIndent.empty)
                    {
                        // Still emitting an indent.
                        currentCol += charWidth(curIndent.front);
                        curIndent.popFront();
                        return;
                    }

                    // Consuming a character of the main range. Count it
                    // *before* looking ahead, so that the break decision below
                    // sees the true width of the line emitted so far.
                    currentCol += charWidth(range.front);
                    range.popFront();
                    if (range.empty)
                    {
                        empty = true;
                        return;
                    }

                    // Only non-tab whitespace is a potential break point.
                    if (range.front == '\t' || !isWhite(range.front))
                        return;

                    // Scan ahead to the end of the next word to decide whether
                    // or not to break here.
                    R tmp = range.save;
                    size_t col = currentCol;

                    // Find start of next word.
                    while (!tmp.empty && isWhite(tmp.front))
                    {
                        col += charWidth(tmp.front);
                        tmp.popFront();
                    }

                    // Remember start of next word so that if we need to break,
                    // we won't introduce extraneous spaces at the start of the
                    // new line.
                    R nextWord = tmp.save;

                    while (!tmp.empty && !isWhite(tmp.front))
                    {
                        col++;
                        tmp.popFront();
                    }
                    assert(tmp.empty || isWhite(tmp.front));

                    if (col <= maxCols)
                        return;     // the next word still fits on this line

                    // Word wrap needed. Move current range position to the
                    // start of the next word.
                    range = nextWord;
                    if (range.empty)
                    {
                        // Only trailing whitespace was left: finish here
                        // instead of emitting a line break with nothing after
                        // it (front would otherwise read an empty range).
                        empty = true;
                    }
                    else
                        atBreak = true;
                }

                @property Result save()
                {
                    Result copy = this;
                    copy.range = this.range.save;
                    // `indent` is only ever read through .save and never
                    // advanced, so the copy can share it.
                    copy.curIndent = this.curIndent.save;
                    return copy;
                }
            }
            static assert(isForwardRange!Result);

            return Result(range, firstindent, indent, columns, tabsize);
        }

        unittest
        {
            import std.algorithm.comparison : equal;

            auto s = ("This is a very long, artificially long, and gratuitously long "~
                      "single-line sentence to serve as a test case for byParagraph.")
                     .wrapped(30, ">>>>", ">>");
            // No output line may be wider than 30 columns.
            assert(s.equal(
                ">>>>This is a very long,\n"~
                ">>artificially long, and\n"~
                ">>gratuitously long\n"~
                ">>single-line sentence to\n"~
                ">>serve as a test case for\n"~
                ">>byParagraph."
            ));

            // A line may be exactly `columns` wide, but never wider.
            assert("a short string".wrapped(7).equal("a short\nstring"));
            assert("a shorts string".wrapped(7).equal("a\nshorts\nstring"));

            // Overflowing trailing whitespace must not read past the end.
            assert("hello   ".wrapped(6).equal("hello"));

            // Empty input yields an empty range, even with an indent.
            assert("".wrapped(10, ">>").empty);
        }


I didn't bother with avoiding autodecoding -- that should be relatively
easy to add, but I think it's stupid that we have to continually write
workarounds in our code to get around auto-decoding. If it's so
important that we don't autodecode, can we pretty please make the stupid
decision already and kill it off for good?!


T

-- 
To err is human; to forgive is not our policy. -- Samuel Adler

Reply via email to