On 06/01/10 23:12, Nick Wellnhofer wrote:
Here is a preliminary patch.

Here is a bigger patch that makes the following changes:

- Move the function pointers from string_iterator_t to encoding_t
- Remove now unneeded iter_init from encoding_t
- Introduce new STRING_ITER_ macros
- Add iter_regress_and_decode function to encoding_t
- Change the string iterator PMC to actually use the string iterator API
- Change Parrot_str_split to use iterators
- Optimize utf8_set_position to also search backward

Nick
Index: src/pmc/stringiterator.pmc
===================================================================
--- src/pmc/stringiterator.pmc  (revision 43406)
+++ src/pmc/stringiterator.pmc  (working copy)
@@ -23,11 +23,10 @@
 
 
 pmclass StringIterator auto_attrs extends Iterator {
-    ATTR PMC    *string;    /* String to iterate over */
-    ATTR INTVAL  pos;       /* Current position of iterator for forward 
iterator */
-                            /* Previous position of iterator for reverse 
iterator */
-    ATTR INTVAL  length;    /* Length of C<string> */
-    ATTR INTVAL  reverse;   /* Direction of iteration. 1 - for reverse 
iteration */
+    ATTR PMC         *string;    /* String to iterate over */
+    ATTR String_iter  iter;      /* String iterator */
+    ATTR UINTVAL      length;    /* Length of C<string> */
+    ATTR INTVAL       reverse;   /* Direction of iteration. 1 - for reverse 
iteration */
 
 /*
 
@@ -39,7 +38,12 @@
 
 */
     VTABLE void init_pmc(PMC *string) {
+        Parrot_StringIterator_attributes * const attrs =
+                PARROT_STRINGITERATOR(SELF);
+        STRING * const str_val = VTABLE_get_string(INTERP, string);
+
         SET_ATTR_string(INTERP, SELF, string);
+        STRING_ITER_INIT(INTERP, str_val, &attrs->iter);
 
         /* by default, iterate from start */
         SELF.set_integer_native(ITERATE_FROM_START);
@@ -77,7 +81,7 @@
         Parrot_StringIterator_attributes * const clone_attrs =
                 PARROT_STRINGITERATOR(clone);
 
-        clone_attrs->pos     = attrs->pos;
+        clone_attrs->iter    = attrs->iter;
         clone_attrs->reverse = attrs->reverse;
         return clone;
     }
@@ -110,9 +114,9 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
         if (attrs->reverse)
-            return attrs->pos;
+            return attrs->iter.charpos;
         else
-            return attrs->length - attrs->pos;
+            return attrs->length - attrs->iter.charpos;
     }
 
     VTABLE INTVAL get_integer() {
@@ -137,13 +141,13 @@
                 PARROT_STRINGITERATOR(SELF);
         if (value == ITERATE_FROM_START) {
             attrs->reverse   = 0;
-            attrs->pos       = 0;
             attrs->length    = VTABLE_elements(INTERP, attrs->string);
+            STRING_ITER_SET_POSITION(INTERP, &attrs->iter, 0);
         }
         else if (value == ITERATE_FROM_END) {
             attrs->reverse   = 1;
-            attrs->pos       = attrs->length
-                             = VTABLE_elements(INTERP, attrs->string);
+            attrs->length    = VTABLE_elements(INTERP, attrs->string);
+            STRING_ITER_SET_POSITION(INTERP, &attrs->iter, attrs->length);
         }
         else
             Parrot_ex_throw_from_c_args(INTERP, NULL, 
EXCEPTION_INVALID_OPERATION,
@@ -179,14 +183,16 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
         PMC *ret;
+        STRING *str;
 
-        if (attrs->pos >= attrs->length)
+        if (attrs->iter.charpos >= attrs->length)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
                 "StopIteration");
 
         ret = pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, 
enum_class_String));
-        VTABLE_set_string_native(INTERP, ret,
-                VTABLE_get_string_keyed_int(INTERP, attrs->string, 
attrs->pos++));
+        str = Parrot_str_iter_get_and_advance(interp,
+                VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
+        VTABLE_set_string_native(INTERP, ret, str);
         return ret;
     }
 
@@ -203,11 +209,12 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
 
-        if (attrs->pos >= attrs->length)
+        if (attrs->iter.charpos >= attrs->length)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
                 "StopIteration");
 
-        return VTABLE_get_string_keyed_int(INTERP, attrs->string, 
attrs->pos++);
+        return Parrot_str_iter_get_and_advance(interp,
+                VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
     }
 
 /*
@@ -223,11 +230,11 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
 
-        if (attrs->pos >= attrs->length)
+        if (attrs->iter.charpos >= attrs->length)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
                 "StopIteration");
 
-        return VTABLE_get_integer_keyed_int(INTERP, attrs->string, 
attrs->pos++);
+        return STRING_ITER_GET_AND_ADVANCE(INTERP, &attrs->iter);
     }
 
 /*
@@ -243,14 +250,16 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
         PMC *ret;
+        STRING * str;
 
-        if (!STATICSELF.get_bool())
+        if (attrs->iter.charpos <= 0)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
                 "StopIteration");
 
         ret = pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp, 
enum_class_String));
-        VTABLE_set_string_native(INTERP, ret,
-                VTABLE_get_string_keyed_int(INTERP, attrs->string, 
--attrs->pos));
+        str = Parrot_str_iter_regress_and_get(interp,
+                VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
+        VTABLE_set_string_native(INTERP, ret, str);
         return ret;
     }
 
@@ -267,11 +276,12 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
 
-        if (!STATICSELF.get_bool())
+        if (attrs->iter.charpos <= 0)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
                 "StopIteration");
 
-        return VTABLE_get_string_keyed_int(INTERP, attrs->string, 
--attrs->pos);
+        return Parrot_str_iter_regress_and_get(interp,
+                VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
     }
 
 /*
@@ -287,11 +297,11 @@
         Parrot_StringIterator_attributes * const attrs =
                 PARROT_STRINGITERATOR(SELF);
 
-        if (!STATICSELF.get_bool())
+        if (attrs->iter.charpos <= 0)
             Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
                 "StopIteration");
 
-        return VTABLE_get_integer_keyed_int(INTERP, attrs->string, 
--attrs->pos);
+        return STRING_ITER_REGRESS_AND_GET(INTERP, &attrs->iter);
     }
 
 /*
@@ -306,7 +316,7 @@
 
     VTABLE INTVAL get_integer_keyed_int(INTVAL idx) {
         return VTABLE_get_integer_keyed_int(INTERP, STATICSELF.get_pmc(),
-                PARROT_STRINGITERATOR(SELF)->pos + idx);
+                PARROT_STRINGITERATOR(SELF)->iter.charpos + idx);
     }
 
 /*
@@ -321,7 +331,7 @@
 
     VTABLE STRING *get_string_keyed_int(INTVAL idx) {
         return VTABLE_get_string_keyed_int(INTERP, STATICSELF.get_pmc(),
-                PARROT_STRINGITERATOR(SELF)->pos + idx);
+                PARROT_STRINGITERATOR(SELF)->iter.charpos + idx);
     }
 }
 
Index: src/string/encoding/utf16.c
===================================================================
--- src/string/encoding/utf16.c (revision 43406)
+++ src/string/encoding/utf16.c (working copy)
@@ -101,14 +101,6 @@
         __attribute__nonnull__(5)
         FUNC_MODIFIES(*return_string);
 
-static void iter_init(PARROT_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
-
 static void set_byte(PARROT_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset,
@@ -147,13 +139,19 @@
         __attribute__nonnull__(1)
         __attribute__nonnull__(2);
 
-PARROT_WARN_UNUSED_RESULT
 static UINTVAL utf16_decode_and_advance(PARROT_INTERP,
     ARGMOD(String_iter *i))
         __attribute__nonnull__(1)
         __attribute__nonnull__(2)
         FUNC_MODIFIES(*i);
 
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL utf16_regress_and_decode(PARROT_INTERP,
+    ARGMOD(String_iter *i))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        FUNC_MODIFIES(*i);
+
 static void utf16_encode_and_advance(PARROT_INTERP,
     ARGMOD(String_iter *i),
     UINTVAL c)
@@ -198,10 +196,6 @@
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src) \
     , PARROT_ASSERT_ARG(return_string))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
@@ -220,6 +214,9 @@
 #define ASSERT_ARGS_utf16_decode_and_advance __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_utf16_regress_and_decode __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_utf16_encode_and_advance __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(i))
@@ -498,11 +495,11 @@
     UINTVAL start;
     STRING * const return_string = Parrot_str_new_COW(interp, src);
 
-    iter_init(interp, src, &iter);
-    iter.set_position(interp, &iter, offset);
+    STRING_ITER_INIT(interp, src, &iter);
+    utf16_set_position(interp, &iter, offset);
     start = iter.bytepos;
     return_string->strstart = (char *)return_string->strstart + start ;
-    iter.set_position(interp, &iter, offset + count);
+    utf16_set_position(interp, &iter, offset + count);
     return_string->bufused = iter.bytepos - start;
     return_string->strlen = count;
     return_string->hashval = 0;
@@ -532,11 +529,11 @@
     String_iter iter;
     UINTVAL start;
     Parrot_str_reuse_COW(interp, src, return_string);
-    iter_init(interp, src, &iter);
-    iter.set_position(interp, &iter, offset);
+    STRING_ITER_INIT(interp, src, &iter);
+    utf16_set_position(interp, &iter, offset);
     start = iter.bytepos;
     return_string->strstart = (char *)return_string->strstart + start ;
-    iter.set_position(interp, &iter, offset + count);
+    utf16_set_position(interp, &iter, offset + count);
     return_string->bufused = iter.bytepos - start;
     return_string->strlen = count;
     return_string->hashval = 0;
@@ -680,9 +677,9 @@
      * this is used to initially calculate src->strlen,
      * therefore we must scan the whole string
      */
-    iter_init(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
     while (iter.bytepos < src->bufused)
-        iter.get_and_advance(interp, &iter);
+        utf16_decode_and_advance(interp, &iter);
     return iter.charpos;
 }
 
@@ -715,7 +712,6 @@
 
 */
 
-PARROT_WARN_UNUSED_RESULT
 static UINTVAL
 utf16_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
 {
@@ -734,6 +730,33 @@
 
 /*
 
+=item C<static UINTVAL utf16_regress_and_decode(PARROT_INTERP, String_iter *i)>
+
+Moves the string iterator C<i> to the previous UTF-16 codepoint.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+utf16_regress_and_decode(PARROT_INTERP, ARGMOD(String_iter *i))
+{
+    ASSERT_ARGS(utf16_regress_and_decode)
+    UChar *s = (UChar*) i->str->strstart;
+    UINTVAL c, pos;
+    pos = i->bytepos / sizeof (UChar);  /* byte offset -> UChar index */
+    /* Step back one code point and return it.
+     * TODO either make sure that we don't go before the start of the
+     *      string or use SAFE iter versions */
+    U16_PREV_UNSAFE(s, pos, c);         /* ICU macro: moves pos back, sets c */
+    i->charpos--;
+    i->bytepos = pos * sizeof (UChar);  /* UChar index -> byte offset */
+    return c;
+}
+
+/*
+
 =item C<static void utf16_encode_and_advance(PARROT_INTERP, String_iter *i,
 UINTVAL c)>
 
@@ -783,33 +806,6 @@
 
 /*
 
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
-    ASSERT_ARGS(iter_init)
-    iter->str = src;
-    iter->bytepos = iter->charpos = 0;
-#if PARROT_HAS_ICU
-    iter->get_and_advance = utf16_decode_and_advance;
-    iter->set_and_advance = utf16_encode_and_advance;
-    iter->set_position =    utf16_set_position;
-#else
-    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
-        "no ICU lib loaded");
-#endif
-}
-
-/*
-
 =item C<ENCODING * Parrot_encoding_utf16_init(PARROT_INTERP)>
 
 Initializes the UTF-16 encoding.
@@ -842,8 +838,11 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init,
-        find_cclass
+        find_cclass,
+        utf16_decode_and_advance,
+        utf16_encode_and_advance,
+        utf16_regress_and_decode,
+        utf16_set_position
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "utf16", return_encoding);
Index: src/string/encoding/fixed_8.c
===================================================================
--- src/string/encoding/fixed_8.c       (revision 43406)
+++ src/string/encoding/fixed_8.c       (working copy)
@@ -50,6 +50,11 @@
         __attribute__nonnull__(2)
         FUNC_MODIFIES(*iter);
 
+static UINTVAL fixed8_get_prev(PARROT_INTERP, ARGMOD(String_iter *iter))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        FUNC_MODIFIES(*iter);
+
 static void fixed8_set_next(PARROT_INTERP,
     ARGMOD(String_iter *iter),
     UINTVAL c)
@@ -119,13 +124,6 @@
         __attribute__nonnull__(5)
         FUNC_MODIFIES(*dest_string);
 
-static void iter_init(SHIM_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
-
 static void set_byte(PARROT_INTERP,
     ARGIN(const STRING *source_string),
     UINTVAL offset,
@@ -181,6 +179,9 @@
 #define ASSERT_ARGS_fixed8_get_next __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_get_prev __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_fixed8_set_next __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(iter))
@@ -206,9 +207,6 @@
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(source_string) \
     , PARROT_ASSERT_ARG(dest_string))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(source_string))
@@ -600,6 +598,24 @@
 
 /*
 
+=item C<static UINTVAL fixed8_get_prev(PARROT_INTERP, String_iter *iter)>
+
+Moves the string iterator C<iter> to the previous codepoint and returns it.
+
+=cut
+
+*/
+
+static UINTVAL
+fixed8_get_prev(PARROT_INTERP, ARGMOD(String_iter *iter))
+{
+    ASSERT_ARGS(fixed8_get_prev)
+    iter->bytepos--;    /* byte and char positions are stepped back together */
+    return get_byte(interp, iter->str, --iter->charpos); /* read at new pos */
+}
+
+/*
+
 =item C<static void fixed8_set_next(PARROT_INTERP, String_iter *iter, UINTVAL
 c)>
 
@@ -640,28 +656,6 @@
 
 /*
 
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
-    ASSERT_ARGS(iter_init)
-    iter->str             = src;
-    iter->bytepos         = iter->charpos        = 0;
-    iter->get_and_advance = fixed8_get_next;
-    iter->set_and_advance = fixed8_set_next;
-    iter->set_position    = fixed8_set_position;
-}
-
-/*
-
 =item C<ENCODING * Parrot_encoding_fixed_8_init(PARROT_INTERP)>
 
 Initializes the fixed-8 encoding.
@@ -694,8 +688,11 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init,
-        find_cclass
+        find_cclass,
+        fixed8_get_next,
+        fixed8_set_next,
+        fixed8_get_prev,
+        fixed8_set_position
 
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
Index: src/string/encoding/utf8.c
===================================================================
--- src/string/encoding/utf8.c  (revision 43406)
+++ src/string/encoding/utf8.c  (working copy)
@@ -98,13 +98,6 @@
         FUNC_MODIFIES(*src)
         FUNC_MODIFIES(*return_string);
 
-static void iter_init(SHIM_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
-
 static void set_byte(PARROT_INTERP,
     ARGIN(const STRING *src),
     UINTVAL offset,
@@ -158,6 +151,12 @@
         __attribute__nonnull__(2)
         FUNC_MODIFIES(*i);
 
+static UINTVAL utf8_regress_and_decode(PARROT_INTERP,
+    ARGMOD(String_iter *i))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        FUNC_MODIFIES(*i);
+
 PARROT_CANNOT_RETURN_NULL
 static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
         __attribute__nonnull__(1)
@@ -213,9 +212,6 @@
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src) \
     , PARROT_ASSERT_ARG(return_string))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(src))
@@ -238,6 +234,9 @@
 #define ASSERT_ARGS_utf8_decode_and_advance __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_utf8_regress_and_decode __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_utf8_encode __attribute__unused__ int _ASSERT_ARGS_CHECK = 
(\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(ptr))
@@ -505,6 +504,30 @@
 
 /*
 
+=item C<static UINTVAL utf8_regress_and_decode(PARROT_INTERP, String_iter *i)>
+
+The UTF-8 implementation of the string iterator's C<regress_and_get>
+function.
+
+=cut
+
+*/
+
+static UINTVAL
+utf8_regress_and_decode(PARROT_INTERP, ARGMOD(String_iter *i))
+{
+    ASSERT_ARGS(utf8_regress_and_decode)
+    const utf8_t *u8ptr = (utf8_t *)((char *)i->str->strstart + i->bytepos);
+    u8ptr--;                /* skip back over continuation bytes to the lead byte */
+    while (UTF8_IS_CONTINUATION(*u8ptr))
+        u8ptr--;
+    i->charpos--;           /* commit the move, or the iterator never regresses */
+    i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
+    return utf8_decode(interp, u8ptr);
+}
+
+/*
+
 =item C<static void utf8_encode_and_advance(PARROT_INTERP, String_iter *i,
 UINTVAL c)>
 
@@ -547,19 +570,39 @@
     ASSERT_ARGS(utf8_set_position)
     const utf8_t *u8ptr = (const utf8_t *)i->str->strstart;
 
-    /* start from last known charpos, if we can */
-    if (i->charpos <= pos) {
-        const UINTVAL old_pos = pos;
-        pos       -= i->charpos;
-        u8ptr     += i->bytepos;
-        i->charpos = old_pos;
+    if (pos == 0) {
+        i->charpos = 0;
+        i->bytepos = 0;
+        return;
     }
-    else
-        i->charpos = pos;
 
-    while (pos-- > 0)
-        u8ptr += UTF8SKIP(u8ptr);
+    /*
+     * we know the byte offsets of three positions: start, current and end
+     * now find the shortest way to reach pos
+     */
+    if (pos < i->charpos) {
+        if (pos <= (i->charpos >> 1)) {
+            /* go forward from start */
+            u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, pos);
+        }
+        else {
+            /* go backward from current */
+            u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + i->bytepos, 
i->charpos - pos);
+        }
+    }
+    else {
+        const UINTVAL  len = i->str->strlen;
+        if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
+            /* go forward from current */
+            u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr + i->bytepos, pos 
- i->charpos);
+        }
+        else {
+            /* go backward from end */
+            u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + 
i->str->bufused, len - pos);
+        }
+    }
 
+    i->charpos = pos;
     i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
 }
 
@@ -582,8 +625,8 @@
 {
     ASSERT_ARGS(to_encoding)
     STRING *result;
-    String_iter src_iter;
-    UINTVAL offs, dest_len, dest_pos, src_len;
+    const ENCODING *src_encoding;
+    UINTVAL dest_len, dest_pos, src_len;
     const int in_place = (dest == NULL);
     unsigned char *new_pos, *pos, *p;
 
@@ -597,8 +640,8 @@
         result = dest;
     }
 
-    /* init iter before possilby changing encoding */
-    ENCODING_ITER_INIT(interp, src, &src_iter);
+    /* save source encoding before possibly changing it */
+    src_encoding = src->encoding;
     result->charset  = Parrot_unicode_charset_ptr;
     result->encoding = Parrot_utf8_encoding_ptr;
     result->strlen   = src_len;
@@ -621,12 +664,14 @@
         result->bufused = dest_len;
     }
     else {
+        String_iter src_iter;
+        STRING_ITER_INIT(interp, src, &src_iter);
         dest_len = src_len;
         dest_pos = 0;
-        for (offs = 0; offs < src_len; ++offs) {
-            const UINTVAL c = src_iter.get_and_advance(interp, &src_iter);
+        while (src_iter.charpos < src_len) {
+            const UINTVAL c = src_encoding->iter_get_and_advance(interp, 
&src_iter);
             if (dest_len - dest_pos < 6) {
-                UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
+                UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1) 
* 1.5);
                 if (need < 16)
                     need = 16;
                 dest_len += need;
@@ -790,16 +835,16 @@
     String_iter    iter;
     UINTVAL        start;
 
-    iter_init(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
 
     if (offset)
-        iter.set_position(interp, &iter, offset);
+        utf8_set_position(interp, &iter, offset);
 
     start                   = iter.bytepos;
     return_string->strstart = (char *)return_string->strstart + start;
 
     if (count)
-        iter.set_position(interp, &iter, offset + count);
+        utf8_set_position(interp, &iter, offset + count);
 
     return_string->bufused  = iter.bytepos - start;
     return_string->strlen   = count;
@@ -860,13 +905,13 @@
     UINTVAL start;
 
     Parrot_str_reuse_COW(interp, src, return_string);
-    iter_init(interp, src, &iter);
-    iter.set_position(interp, &iter, offset);
+    STRING_ITER_INIT(interp, src, &iter);
+    utf8_set_position(interp, &iter, offset);
 
     start = iter.bytepos;
 
     return_string->strstart = (char *)return_string->strstart + start;
-    iter.set_position(interp, &iter, offset + count);
+    utf8_set_position(interp, &iter, offset + count);
 
     return_string->bufused = iter.bytepos - start;
     return_string->strlen  = count;
@@ -973,9 +1018,9 @@
      * this is used to initially calculate src->strlen,
      * therefore we must scan the whole string
      */
-    iter_init(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
     while (iter.bytepos < src->bufused)
-        iter.get_and_advance(interp, &iter);
+        utf8_decode_and_advance(interp, &iter);
     return iter.charpos;
 }
 
@@ -999,29 +1044,6 @@
 
 /*
 
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
-    ASSERT_ARGS(iter_init)
-    iter->str             = src;
-    iter->bytepos         = 0;
-    iter->charpos         = 0;
-    iter->get_and_advance = utf8_decode_and_advance;
-    iter->set_and_advance = utf8_encode_and_advance;
-    iter->set_position    = utf8_set_position;
-}
-
-/*
-
 =item C<ENCODING * Parrot_encoding_utf8_init(PARROT_INTERP)>
 
 Initializes the UTF-8 encoding.
@@ -1054,8 +1076,11 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init,
-        find_cclass
+        find_cclass,
+        utf8_decode_and_advance,
+        utf8_encode_and_advance,
+        utf8_regress_and_decode,
+        utf8_set_position
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "utf8", return_encoding);
Index: src/string/encoding/ucs2.c
===================================================================
--- src/string/encoding/ucs2.c  (revision 43406)
+++ src/string/encoding/ucs2.c  (working copy)
@@ -106,14 +106,6 @@
     SHIM(STRING *dest_string))
         __attribute__nonnull__(1);
 
-static void iter_init(PARROT_INTERP,
-    ARGIN(const STRING *src),
-    ARGOUT(String_iter *iter))
-        __attribute__nonnull__(1)
-        __attribute__nonnull__(2)
-        __attribute__nonnull__(3)
-        FUNC_MODIFIES(*iter);
-
 static void set_byte(PARROT_INTERP,
     SHIM(const STRING *src),
     SHIM(UINTVAL offset),
@@ -157,6 +149,12 @@
         __attribute__nonnull__(2)
         FUNC_MODIFIES(*i);
 
+static UINTVAL ucs2_regress_and_decode(PARROT_INTERP,
+    ARGMOD(String_iter *i))
+        __attribute__nonnull__(1)
+        __attribute__nonnull__(2)
+        FUNC_MODIFIES(*i);
+
 static void ucs2_encode_and_advance(PARROT_INTERP,
     ARGMOD(String_iter *i),
     UINTVAL c)
@@ -196,10 +194,6 @@
     , PARROT_ASSERT_ARG(src))
 #define ASSERT_ARGS_get_codepoints_inplace __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
-       PARROT_ASSERT_ARG(interp) \
-    , PARROT_ASSERT_ARG(src) \
-    , PARROT_ASSERT_ARG(iter))
 #define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
 #define ASSERT_ARGS_set_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -216,6 +210,9 @@
 #define ASSERT_ARGS_ucs2_decode_and_advance __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_ucs2_regress_and_decode __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp) \
+    , PARROT_ASSERT_ARG(i))
 #define ASSERT_ARGS_ucs2_encode_and_advance __attribute__unused__ int 
_ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp) \
     , PARROT_ASSERT_ARG(i))
@@ -397,11 +394,11 @@
         String_iter iter;
         UINTVAL start;
 
-        iter_init(interp, src, &iter);
-        iter.set_position(interp, &iter, offset);
+        STRING_ITER_INIT(interp, src, &iter);
+        ucs2_set_position(interp, &iter, offset);
         start = iter.bytepos;
         return_string->strstart = (char *)return_string->strstart + start;
-        iter.set_position(interp, &iter, offset + count);
+        ucs2_set_position(interp, &iter, offset + count);
         return_string->bufused = iter.bytepos - start;
     }
 #endif
@@ -611,6 +608,41 @@
 
 /*
 
+=item C<static UINTVAL ucs2_regress_and_decode(PARROT_INTERP, String_iter *i)>
+
+Moves the string iterator C<i> to the previous UCS-2 codepoint.
+
+=cut
+
+*/
+
+static UINTVAL
+ucs2_regress_and_decode(PARROT_INTERP, ARGMOD(String_iter *i))
+{
+    ASSERT_ARGS(ucs2_regress_and_decode)
+
+#if PARROT_HAS_ICU
+    UChar * const s = (UChar*) i->str->strstart;
+    size_t pos = i->bytepos / sizeof (UChar);   /* byte offset -> UChar index */
+
+    /* Step back one UChar and return it.
+     * TODO guard against stepping back before the start of the string
+     *      (pos is unsigned, so --pos at 0 would wrap) */
+    const UChar c = s[--pos];
+    i->charpos--;
+    i->bytepos = pos * sizeof (UChar);          /* UChar index -> byte offset */
+    return c;
+#else
+    /* This function must never be called if compiled without ICU.
+     * See TT #557
+     */
+    PARROT_ASSERT(0);
+    return (UINTVAL)0; /* Stop the static analyzers from panicking */
+#endif
+}
+
+/*
+
 =item C<static void ucs2_encode_and_advance(PARROT_INTERP, String_iter *i,
 UINTVAL c)>
 
@@ -669,33 +701,6 @@
 
 /*
 
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
-    ASSERT_ARGS(iter_init)
-#if PARROT_HAS_ICU
-    iter->str             = src;
-    iter->bytepos         = 0;
-    iter->charpos         = 0;
-    iter->get_and_advance = ucs2_decode_and_advance;
-    iter->set_and_advance = ucs2_encode_and_advance;
-    iter->set_position    = ucs2_set_position;
-#else
-    no_ICU_lib(interp);
-#endif
-}
-
-/*
-
 =item C<ENCODING * Parrot_encoding_ucs2_init(PARROT_INTERP)>
 
 Initializes the UCS-2 encoding.
@@ -728,8 +733,11 @@
         become_encoding,
         codepoints,
         bytes,
-        iter_init,
-        find_cclass
+        find_cclass,
+        ucs2_decode_and_advance,
+        ucs2_encode_and_advance,
+        ucs2_regress_and_decode,
+        ucs2_set_position
     };
     STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
     Parrot_register_encoding(interp, "ucs2", return_encoding);
Index: src/string/charset/iso-8859-1.c
===================================================================
--- src/string/charset/iso-8859-1.c     (revision 43406)
+++ src/string/charset/iso-8859-1.c     (working copy)
@@ -215,10 +215,10 @@
 to_iso_8859_1(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest))
 {
     ASSERT_ARGS(to_iso_8859_1)
-    UINTVAL offs, src_len;
+    UINTVAL src_len;
     String_iter iter;
 
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
     src_len = src->strlen;
     if (dest) {
         Parrot_gc_reallocate_string_storage(interp, dest, src_len);
@@ -229,16 +229,16 @@
         dest = src;
     }
     dest->bufused = src_len;
-    dest->charset = Parrot_iso_8859_1_charset_ptr;
-    dest->encoding = Parrot_fixed_8_encoding_ptr;
-    for (offs = 0; offs < src_len; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < src_len) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         if (c >= 0x100)
             Parrot_ex_throw_from_c_args(interp, NULL, 
EXCEPTION_LOSSY_CONVERSION,
                 "lossy conversion to iso-8559-1");
 
-        ENCODING_SET_BYTE(interp, dest, offs, c);
+        Parrot_fixed_8_encoding_ptr->set_byte(interp, dest, iter.charpos - 1, 
c);
     }
+    dest->charset = Parrot_iso_8859_1_charset_ptr;
+    dest->encoding = Parrot_fixed_8_encoding_ptr;
     return dest;
 }
 
@@ -258,24 +258,23 @@
 {
     ASSERT_ARGS(to_unicode)
     if (dest) {
-        UINTVAL offs;
         String_iter iter;
 
         dest->charset = Parrot_unicode_charset_ptr;
         dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest);
         Parrot_gc_reallocate_string_storage(interp, dest, src->strlen);
-        ENCODING_ITER_INIT(interp, dest, &iter);
-        for (offs = 0; offs < src->strlen; ++offs) {
-            const UINTVAL c = ENCODING_GET_BYTE(interp, src, offs);
+        STRING_ITER_INIT(interp, dest, &iter);
+        while (iter.charpos < src->strlen) {
+            const UINTVAL c = ENCODING_GET_BYTE(interp, src, iter.charpos);
 
             if (iter.bytepos >= Buffer_buflen(dest) - 4) {
-                UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
+                UINTVAL need = (UINTVAL)((src->strlen - iter.charpos) * 1.5);
                 if (need < 16)
                     need = 16;
                 Parrot_gc_reallocate_string_storage(interp, dest,
                         Buffer_buflen(dest) + need);
             }
-            iter.set_and_advance(interp, &iter, c);
+            STRING_ITER_SET_AND_ADVANCE(interp, &iter, c);
         }
         dest->bufused = iter.bytepos;
         dest->strlen  = iter.charpos;
Index: src/string/charset/unicode.c
===================================================================
--- src/string/charset/unicode.c        (revision 43406)
+++ src/string/charset/unicode.c        (working copy)
@@ -704,20 +704,20 @@
 {
     ASSERT_ARGS(compare)
     String_iter l_iter, r_iter;
-    UINTVAL offs, cl, cr, min_len, l_len, r_len;
+    UINTVAL min_len, l_len, r_len;
 
     /* TODO make optimized equal - strings are equal length then already */
-    ENCODING_ITER_INIT(interp, lhs, &l_iter);
-    ENCODING_ITER_INIT(interp, rhs, &r_iter);
+    STRING_ITER_INIT(interp, lhs, &l_iter);
+    STRING_ITER_INIT(interp, rhs, &r_iter);
 
     l_len = lhs->strlen;
     r_len = rhs->strlen;
 
     min_len = l_len > r_len ? r_len : l_len;
 
-    for (offs = 0; offs < min_len; ++offs) {
-        cl = l_iter.get_and_advance(interp, &l_iter);
-        cr = r_iter.get_and_advance(interp, &r_iter);
+    while (l_iter.charpos < min_len) {
+        UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, &l_iter);
+        UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, &r_iter);
 
         if (cl != cr)
             return cl < cr ? -1 : 1;
@@ -769,12 +769,12 @@
 validate(PARROT_INTERP, ARGIN(STRING *src))
 {
     ASSERT_ARGS(validate)
-    UINTVAL     offset;
+    UINTVAL len = Parrot_str_byte_length(interp, src);
     String_iter iter;
 
-    ENCODING_ITER_INIT(interp, src, &iter);
-    for (offset = 0; offset < Parrot_str_byte_length(interp, src); ++offset) {
-        const UINTVAL codepoint = iter.get_and_advance(interp, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
+    while (iter.charpos < len) {
+        const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         /* Check for Unicode non-characters */
         if (codepoint >= 0xfdd0
         && (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
@@ -924,24 +924,22 @@
     ASSERT_ARGS(find_cclass)
     String_iter iter;
     UINTVAL     codepoint;
-    UINTVAL     pos = offset;
     UINTVAL     end = offset + count;
 
-    ENCODING_ITER_INIT(interp, source_string, &iter);
+    STRING_ITER_INIT(interp, source_string, &iter);
+    STRING_ITER_SET_POSITION(interp, &iter, offset);
 
-    iter.set_position(interp, &iter, pos);
-
     end = source_string->strlen < end ? source_string->strlen : end;
 
-    for (; pos < end; ++pos) {
-        codepoint = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < end) {
+        codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         if (codepoint >= 256) {
             if (u_iscclass(interp, codepoint, flags))
-                    return pos;
+                    return iter.charpos - 1;
         }
         else {
             if (Parrot_iso_8859_1_typetable[codepoint] & flags)
-                return pos;
+                return iter.charpos - 1;
         }
     }
 
@@ -965,37 +963,36 @@
     ASSERT_ARGS(find_not_cclass)
     String_iter iter;
     UINTVAL     codepoint;
-    UINTVAL     pos = offset;
     UINTVAL     end = offset + count;
     int         bit;
 
-    if (pos > source_string->strlen) {
+    if (offset > source_string->strlen) {
         /* XXX: Throw in this case? */
         return offset + count;
     }
 
-    ENCODING_ITER_INIT(interp, source_string, &iter);
+    STRING_ITER_INIT(interp, source_string, &iter);
 
-    if (pos)
-        iter.set_position(interp, &iter, pos);
+    if (offset)
+        STRING_ITER_SET_POSITION(interp, &iter, offset);
 
     end = source_string->strlen < end ? source_string->strlen : end;
 
     if (flags == enum_cclass_any)
         return end;
 
-    for (; pos < end; ++pos) {
-        codepoint = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < end) {
+        codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         if (codepoint >= 256) {
             for (bit = enum_cclass_uppercase;
                     bit <= enum_cclass_word ; bit <<= 1) {
                 if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
-                    return pos;
+                    return iter.charpos - 1;
             }
         }
         else {
             if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
-                return pos;
+                return iter.charpos - 1;
         }
     }
 
@@ -1023,8 +1020,8 @@
 
     dest->strlen = 1;
 
-    ENCODING_ITER_INIT(interp, dest, &iter);
-    iter.set_and_advance(interp, &iter, codepoint);
+    STRING_ITER_INIT(interp, dest, &iter);
+    STRING_ITER_SET_AND_ADVANCE(interp, &iter, codepoint);
     dest->bufused = iter.bytepos;
 
     return dest;
@@ -1047,13 +1044,12 @@
 {
     ASSERT_ARGS(compute_hash)
     String_iter iter;
-    UINTVAL     offs;
     size_t      hashval = seed;
 
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
 
-    for (offs = 0; offs < src->strlen; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < src->strlen) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         hashval += hashval << 5;
         hashval += c;
     }
Index: src/string/charset/ascii.c
===================================================================
--- src/string/charset/ascii.c  (revision 43406)
+++ src/string/charset/ascii.c  (working copy)
@@ -263,7 +263,6 @@
 {
     ASSERT_ARGS(to_ascii)
     String_iter iter;
-    UINTVAL offs;
     unsigned char *p;
     const UINTVAL len = src->strlen;
 
@@ -275,9 +274,9 @@
         dest = src;
     }
     p = (unsigned char *)dest->strstart;
-    ENCODING_ITER_INIT(interp, src, &iter);
-    for (offs = 0; offs < len; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
+    while (iter.charpos < len) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         if (c >= 128)
             Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
                     "can't convert unicode string to ascii");
@@ -557,11 +556,10 @@
             return ret_val < 0 ? -1 : 1;
     }
     else {
-        UINTVAL offs;
-        ENCODING_ITER_INIT(interp, rhs, &iter);
-        for (offs = 0; offs < min_len; ++offs) {
-            const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, offs);
-            const UINTVAL cr = iter.get_and_advance(interp, &iter);
+        STRING_ITER_INIT(interp, rhs, &iter);
+        while (iter.charpos < min_len) {
+            const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, iter.charpos);
+            const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
             if (cl != cr)
                 return cl < cr ? -1 : 1;
         }
@@ -596,30 +594,23 @@
 {
     ASSERT_ARGS(mixed_cs_index)
     String_iter src_iter, search_iter;
-    UINTVAL len;
-    INTVAL start;
+    const UINTVAL len = search->strlen;
+    UINTVAL start;
 
-    ENCODING_ITER_INIT(interp, src, &src_iter);
-    src_iter.set_position(interp, &src_iter, offs);
-    ENCODING_ITER_INIT(interp, search, &search_iter);
-    len = search->strlen;
+    STRING_ITER_INIT(interp, src, &src_iter);
+    STRING_ITER_SET_POSITION(interp, &src_iter, offs);
+    STRING_ITER_INIT(interp, search, &search_iter);
 
-    start = -1;
-    for (; len && offs < src->strlen; ++offs) {
-        const UINTVAL c1 = src_iter.get_and_advance(interp, &src_iter);
-        const UINTVAL c2 = search_iter.get_and_advance(interp, &search_iter);
-        if (c1 == c2) {
-            --len;
-            if (start == -1)
-                start = offs;
+    start = src_iter.charpos;
+    while (search_iter.charpos < len && src_iter.charpos < src->strlen) {
+        const UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, &src_iter);
+        const UINTVAL c2 = STRING_ITER_GET_AND_ADVANCE(interp, &search_iter);
+        if (c1 != c2) { /* retry one char past the failed attempt's start */
+            STRING_ITER_SET_POSITION(interp, &src_iter, ++start);
+            STRING_ITER_SET_POSITION(interp, &search_iter, 0);
         }
-        else {
-            len = search->strlen;
-            start = -1;
-            search_iter.set_position(interp, &search_iter, 0);
-        }
     }
-    if (len == 0)
+    if (search_iter.charpos >= len)
         return start;
     return -1;
 }
@@ -700,12 +691,12 @@
 validate(PARROT_INTERP, ARGIN(STRING *src))
 {
     ASSERT_ARGS(validate)
-    UINTVAL offset;
+    const UINTVAL len = Parrot_str_byte_length(interp, src);
     String_iter iter;
 
-    ENCODING_ITER_INIT(interp, src, &iter);
-    for (offset = 0; offset < Parrot_str_byte_length(interp, src); ++offset) {
-        const UINTVAL codepoint = iter.get_and_advance(interp, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
+    while (iter.charpos < len) {
+        const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         if (codepoint >= 0x80)
             return 0;
     }
Index: src/string/api.c
===================================================================
--- src/string/api.c    (revision 43406)
+++ src/string/api.c    (working copy)
@@ -1254,9 +1254,74 @@
     }
 }
 
+/*
 
+=item C<STRING * Parrot_str_iter_get_and_advance(PARROT_INTERP, STRING *str,
+String_iter *iter)>
+
+Returns the character in C<str> that C<iter> points to and advances C<iter>.
+
+=cut
+
+*/
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+Parrot_str_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(STRING *str), ARGOUT(String_iter *iter))
+{
+    ASSERT_ARGS(Parrot_str_iter_get_and_advance)
+    STRING *dest  = Parrot_str_new_COW(interp, str);
+    UINTVAL start = iter->bytepos;
+
+    STRING_ITER_GET_AND_ADVANCE(interp, iter);
+
+    dest->strstart = (char *)dest->strstart + start;
+    dest->bufused  = iter->bytepos - start;
+    dest->strlen   = 1;
+    dest->hashval  = 0;
+
+    return dest;
+}
+
 /*
 
+=item C<STRING * Parrot_str_iter_regress_and_get(PARROT_INTERP, STRING *str,
+String_iter *iter)>
+
+Moves C<iter> backwards and returns the character in C<str> that C<iter>
+points to.
+
+=cut
+
+*/
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+Parrot_str_iter_regress_and_get(PARROT_INTERP,
+    ARGIN(STRING *str), ARGOUT(String_iter *iter))
+{
+    ASSERT_ARGS(Parrot_str_iter_regress_and_get)
+    STRING *dest  = Parrot_str_new_COW(interp, str);
+    UINTVAL end = iter->bytepos;
+
+    STRING_ITER_REGRESS_AND_GET(interp, iter);
+
+    dest->strstart = (char *)dest->strstart + iter->bytepos;
+    dest->bufused  = end - iter->bytepos;
+    dest->strlen   = 1;
+    dest->hashval  = 0;
+
+    return dest;
+}
+
+
+/*
+
 =item C<STRING * Parrot_str_replace(PARROT_INTERP, STRING *src, INTVAL offset,
 INTVAL length, STRING *rep, STRING **d)>
 
@@ -1348,12 +1413,12 @@
     }
 
     /* get byte position of the part that will be replaced */
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
 
-    iter.set_position(interp, &iter, true_offset);
+    STRING_ITER_SET_POSITION(interp, &iter, true_offset);
     start_byte = iter.bytepos;
 
-    iter.set_position(interp, &iter, true_offset + true_length);
+    STRING_ITER_SET_POSITION(interp, &iter, true_offset + true_length);
     end_byte   = iter.bytepos;
 
     /* not possible.... */
@@ -1451,7 +1516,7 @@
 Parrot_str_chopn_inplace(PARROT_INTERP, ARGMOD(STRING *s), INTVAL n)
 {
     ASSERT_ARGS(Parrot_str_chopn_inplace)
-    UINTVAL new_length, uchar_size;
+    UINTVAL new_length;
 
     if (n < 0) {
         new_length = -n;
@@ -1472,23 +1537,23 @@
         return;
     }
 
-    uchar_size = s->bufused / s->strlen;
-    s->strlen  = new_length;
-
     if (s->encoding == Parrot_fixed_8_encoding_ptr) {
         s->bufused = new_length;
     }
     else if (s->encoding == Parrot_ucs2_encoding_ptr) {
+        const UINTVAL uchar_size = s->bufused / s->strlen;
         s->bufused = new_length * uchar_size;
     }
     else {
         String_iter iter;
 
-        ENCODING_ITER_INIT(interp, s, &iter);
-        iter.set_position(interp, &iter, new_length);
+        STRING_ITER_INIT(interp, s, &iter);
+        STRING_ITER_SET_POSITION(interp, &iter, new_length);
         s->bufused = iter.bytepos;
     }
 
+    s->strlen  = new_length;
+
     return;
 }
 
@@ -2124,13 +2189,12 @@
         int                 sign      = 1;
         INTVAL              i         = 0;
         String_iter         iter;
-        UINTVAL             offs;
         number_parse_state  state = parse_start;
 
-        ENCODING_ITER_INIT(interp, s, &iter);
+        STRING_ITER_INIT(interp, s, &iter);
 
-        for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
-            const UINTVAL c = iter.get_and_advance(interp, &iter);
+        while (state != parse_end && iter.charpos < s->strlen) {
+            const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
             /* Check for overflow */
             if (c > 255)
                 break;
@@ -2215,17 +2279,16 @@
     int           d_length  = 0;
     int           check_nan = 0;    /* Check for NaN and Inf after main loop */
     String_iter iter;
-    UINTVAL     offs;
     number_parse_state state = parse_start;
 
     if (!s)
         return 0.0;
 
-    ENCODING_ITER_INIT(interp, s, &iter);
+    STRING_ITER_INIT(interp, s, &iter);
 
     /* Handcrafter FSM to read float value */
-    for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    while (state != parse_end && iter.charpos < s->strlen) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         /* Check for overflow */
         if (c > 255)
             break;
@@ -2600,7 +2663,6 @@
 {
     ASSERT_ARGS(Parrot_str_to_hashval)
     String_iter iter;
-    UINTVAL     offs;
     size_t      hashval = interp->hash_seed;
 
     if (!s)
@@ -2609,10 +2671,10 @@
     /* ZZZZZ workaround for something not setting up encodings right */
     saneify_string(s);
 
-    ENCODING_ITER_INIT(interp, s, &iter);
+    STRING_ITER_INIT(interp, s, &iter);
 
-    for (offs = 0; offs < s->strlen; ++offs) {
-        const UINTVAL c = iter.get_and_advance(interp, &iter);
+    while (iter.charpos < s->strlen) {
+        const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         hashval += hashval << 5;
         hashval += c;
     }
@@ -2690,11 +2752,11 @@
             Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
 
     /* more work TODO */
-    ENCODING_ITER_INIT(interp, src, &iter);
+    STRING_ITER_INIT(interp, src, &iter);
     dp = (unsigned char *)result->strstart;
 
     for (i = 0; len > 0; --len) {
-        UINTVAL c = iter.get_and_advance(interp, &iter);
+        UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
         if (c < 0x7f) {
             /* process ASCII chars */
             if (i >= charlen - 2) {
@@ -2851,7 +2913,7 @@
         encoding = result->encoding;
     }
 
-    encoding->iter_init(interp, result, &iter);
+    STRING_ITER_INIT(interp, result, &iter);
 
     for (offs = d = 0; offs < clength; ++offs) {
         r = (Parrot_UInt4)((unsigned char *)result->strstart)[offs];
@@ -2874,7 +2936,7 @@
         }
 
         PARROT_ASSERT(d < offs);
-        iter.set_and_advance(interp, &iter, r);
+        encoding->iter_set_and_advance(interp, &iter, r);
         ++d;
     }
 
@@ -3409,8 +3471,10 @@
     ARGIN_NULLOK(STRING *delim), ARGIN_NULLOK(STRING *str))
 {
     ASSERT_ARGS(Parrot_str_split)
-    PMC    *res;
-    INTVAL  slen, dlen, ps, pe;
+    PMC     *res;
+    STRING  *tstr;
+    UINTVAL  slen, dlen, start, len;
+    String_iter iter;
 
     if (STRING_IS_NULL(delim) || STRING_IS_NULL(str))
         return PMCNULL;
@@ -3421,45 +3485,71 @@
     if (!slen)
         return res;
 
+    STRING_ITER_INIT(interp, str, &iter);
     dlen = Parrot_str_byte_length(interp, delim);
 
     if (dlen == 0) {
-        int i;
         VTABLE_set_integer_native(interp, res, slen);
 
-        for (i = 0; i < slen; ++i) {
-            STRING * const p = Parrot_str_substr(interp, str, i, 1, NULL, 0);
-            VTABLE_set_string_keyed_int(interp, res, i, p);
-        }
+        do {
+            tstr = Parrot_str_iter_get_and_advance(interp, str, &iter);
+            VTABLE_set_string_keyed_int(interp, res, iter.charpos - 1, tstr);
+        } while (iter.charpos < slen);
 
         return res;
     }
 
-    pe = Parrot_str_find_index(interp, str, delim, 0);
+    start = iter.bytepos;
+    len   = 0;
 
-    if (pe < 0) {
-        VTABLE_push_string(interp, res, str);
-        return res;
-    }
+    do {
+        UINTVAL end = start;
+        String_iter delim_iter;
+        String_iter resume = iter;
 
-    ps = 0;
+        STRING_ITER_INIT(interp, delim, &delim_iter);
 
-    while (ps <= slen) {
-        const int      pl   = pe - ps;
-        STRING * const tstr = Parrot_str_substr(interp, str, ps, pl, NULL, 0);
+        while (delim_iter.charpos < dlen && iter.charpos < slen) {
+            const UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
+            const UINTVAL c2 = STRING_ITER_GET_AND_ADVANCE(interp, &delim_iter);
+
+            /* Remember the position just past the first char of this attempt */
+            if (delim_iter.charpos == 1)
+                resume = iter;
+
+            if (c1 != c2) {
+                /* A partial match failed: retry from the character after
+                 * the one this attempt started at, so overlaps are found. */
+                iter = resume;
+                end  = iter.bytepos;
+                ++len;
+                STRING_ITER_SET_POSITION(interp, &delim_iter, 0);
+            }
+        }
 
-        VTABLE_push_string(interp, res, tstr);
-        ps = pe + Parrot_str_byte_length(interp, delim);
+        if (delim_iter.charpos >= dlen) {
+            tstr = Parrot_str_new_COW(interp, str);
+            tstr->strstart = (char *)tstr->strstart + start;
+            tstr->bufused  = end - start;
+            tstr->strlen   = len;
+            tstr->hashval  = 0;
+            VTABLE_push_string(interp, res, tstr);
 
-        if (ps > slen)
-            break;
+            start = iter.bytepos;
+            len   = 0;
+        }
+        else {
+            len += delim_iter.charpos;
+        }
+    } while (iter.charpos < slen);
 
-        pe = Parrot_str_find_index(interp, str, delim, ps);
+    tstr = Parrot_str_new_COW(interp, str);
+    tstr->strstart = (char *)tstr->strstart + start;
+    tstr->bufused  = iter.bytepos - start;
+    tstr->strlen   = len;
+    tstr->hashval  = 0;
+    VTABLE_push_string(interp, res, tstr);
 
-        if (pe < 0)
-            pe = slen;
-    }
-
     return res;
 }
 
Index: src/io/utf8.c
===================================================================
--- src/io/utf8.c       (revision 43406)
+++ src/io/utf8.c       (working copy)
@@ -57,7 +57,7 @@
     s->encoding = Parrot_utf8_encoding_ptr;
 
     /* count chars, verify utf8 */
-    Parrot_utf8_encoding_ptr->iter_init(interp, s, &iter);
+    STRING_ITER_INIT(interp, s, &iter);
 
     while (iter.bytepos < s->bufused) {
         if (iter.bytepos + 4 > s->bufused) {
@@ -92,7 +92,7 @@
             }
         }
 ok:
-        iter.get_and_advance(interp, &iter);
+        Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, &iter);
     }
     s->strlen = iter.charpos;
     return len;
Index: include/parrot/encoding.h
===================================================================
--- include/parrot/encoding.h   (revision 43406)
+++ include/parrot/encoding.h   (working copy)
@@ -35,8 +35,10 @@
 
 struct string_iterator_t;       /* s. parrot/string.h */
 
-typedef void (*encoding_iter_init_t)(PARROT_INTERP, const STRING *src,
-        struct string_iterator_t *);
+typedef UINTVAL (*encoding_iter_get_and_advance_t)(PARROT_INTERP, struct string_iterator_t *);
+typedef void (*encoding_iter_set_and_advance_t)(PARROT_INTERP, struct string_iterator_t *, UINTVAL);
+typedef UINTVAL (*encoding_iter_regress_and_get_t)(PARROT_INTERP, struct string_iterator_t *);
+typedef void (*encoding_iter_set_position_t)(PARROT_INTERP, struct string_iterator_t *, UINTVAL);
 
 struct _encoding {
     ARGIN(const char *name);
@@ -55,8 +57,11 @@
     encoding_become_encoding_t          become_encoding;
     encoding_codepoints_t               codepoints;
     encoding_bytes_t                    bytes;
-    encoding_iter_init_t                iter_init;
     encoding_find_cclass_t              find_cclass;
+    encoding_iter_get_and_advance_t     iter_get_and_advance;
+    encoding_iter_set_and_advance_t     iter_set_and_advance;
+    encoding_iter_regress_and_get_t     iter_regress_and_get;
+    encoding_iter_set_position_t        iter_set_position;
 };
 
 typedef struct _encoding ENCODING;
@@ -218,8 +223,6 @@
     ((src)->encoding)->codepoints((i), (src))
 #define ENCODING_BYTES(i, src) \
     ((src)->encoding)->bytes((i), (src))
-#define ENCODING_ITER_INIT(i, src, iter) \
-    ((src)->encoding)->iter_init((i), (src), (iter))
 #define ENCODING_FIND_CCLASS(i, src, typetable, flags, pos, end) \
     ((src)->encoding)->find_cclass((i), (src), (typetable), (flags), (pos), (end))
 
Index: include/parrot/string_funcs.h
===================================================================
--- include/parrot/string_funcs.h       (revision 43406)
+++ include/parrot/string_funcs.h       (working copy)
@@ -391,6 +391,24 @@
 
 PARROT_EXPORT
 PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * Parrot_str_iter_get_and_advance(PARROT_INTERP,
+    ARGIN(STRING *str),
+    ARGOUT(String_iter *iter))
+        __attribute__nonnull__(1)
+        FUNC_MODIFIES(*iter);
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * Parrot_str_iter_regress_and_get(PARROT_INTERP,
+    ARGIN(STRING *str),
+    ARGOUT(String_iter *iter))
+        __attribute__nonnull__(1)
+        FUNC_MODIFIES(*iter);
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
 PARROT_MALLOC
 STRING * Parrot_str_titlecase(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
         __attribute__nonnull__(1);
@@ -660,6 +678,10 @@
        PARROT_ASSERT_ARG(interp))
 #define ASSERT_ARGS_Parrot_str_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_Parrot_str_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_Parrot_str_iter_regress_and_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+       PARROT_ASSERT_ARG(interp))
 #define ASSERT_ARGS_Parrot_str_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
        PARROT_ASSERT_ARG(interp))
 #define ASSERT_ARGS_Parrot_str_titlecase_inplace __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
Index: include/parrot/string.h
===================================================================
--- include/parrot/string.h     (revision 43406)
+++ include/parrot/string.h     (working copy)
@@ -32,11 +32,22 @@
     const STRING *str;
     UINTVAL bytepos;
     UINTVAL charpos;
-    UINTVAL (*get_and_advance)(PARROT_INTERP, struct string_iterator_t *i);
-    void (*set_and_advance)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL c);
-    void (*set_position)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL pos);
 } String_iter;
 
+#define STRING_ITER_INIT(i, src, iter) do { \
+    (iter)->str     = (src); \
+    (iter)->charpos = 0; \
+    (iter)->bytepos = 0; \
+} while (0)
+#define STRING_ITER_GET_AND_ADVANCE(i, iter) \
+    ((iter)->str->encoding)->iter_get_and_advance((i), (iter))
+#define STRING_ITER_SET_AND_ADVANCE(i, iter, c) \
+    ((iter)->str->encoding)->iter_set_and_advance((i), (iter), (c))
+#define STRING_ITER_REGRESS_AND_GET(i, iter) \
+    ((iter)->str->encoding)->iter_regress_and_get((i), (iter))
+#define STRING_ITER_SET_POSITION(i, iter, pos) \
+    ((iter)->str->encoding)->iter_set_position((i), (iter), (pos))
+
 #define STREQ(x, y)  (strcmp((x), (y))==0)
 #define STRNEQ(x, y) (strcmp((x), (y))!=0)
 
_______________________________________________
http://lists.parrot.org/mailman/listinfo/parrot-dev

Reply via email to