On 06/01/10 23:12, Nick Wellnhofer wrote:
Here is a preliminary patch.
Here is a bigger patch that makes the following changes:
- Move the function pointers from string_iterator_t to encoding_t
- Remove now unneeded iter_init from encoding_t
- Introduce new STRING_ITER_ macros
- Add iter_regress_and_decode function to encoding_t
- Change the string iterator PMC to actually use the string iterator API
- Change Parrot_str_split to use iterators
- Optimize utf8_set_position to also search backward
Nick
Index: src/pmc/stringiterator.pmc
===================================================================
--- src/pmc/stringiterator.pmc (revision 43406)
+++ src/pmc/stringiterator.pmc (working copy)
@@ -23,11 +23,10 @@
pmclass StringIterator auto_attrs extends Iterator {
- ATTR PMC *string; /* String to iterate over */
- ATTR INTVAL pos; /* Current position of iterator for forward
iterator */
- /* Previous position of iterator for reverse
iterator */
- ATTR INTVAL length; /* Length of C<string> */
- ATTR INTVAL reverse; /* Direction of iteration. 1 - for reverse
iteration */
+ ATTR PMC *string; /* String to iterate over */
+ ATTR String_iter iter; /* String iterator */
+ ATTR UINTVAL length; /* Length of C<string> */
+ ATTR INTVAL reverse; /* Direction of iteration. 1 - for reverse
iteration */
/*
@@ -39,7 +38,12 @@
*/
VTABLE void init_pmc(PMC *string) {
+ Parrot_StringIterator_attributes * const attrs =
+ PARROT_STRINGITERATOR(SELF);
+ STRING * const str_val = VTABLE_get_string(INTERP, string);
+
SET_ATTR_string(INTERP, SELF, string);
+ STRING_ITER_INIT(INTERP, str_val, &attrs->iter);
/* by default, iterate from start */
SELF.set_integer_native(ITERATE_FROM_START);
@@ -77,7 +81,7 @@
Parrot_StringIterator_attributes * const clone_attrs =
PARROT_STRINGITERATOR(clone);
- clone_attrs->pos = attrs->pos;
+ clone_attrs->iter = attrs->iter;
clone_attrs->reverse = attrs->reverse;
return clone;
}
@@ -110,9 +114,9 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
if (attrs->reverse)
- return attrs->pos;
+ return attrs->iter.charpos;
else
- return attrs->length - attrs->pos;
+ return attrs->length - attrs->iter.charpos;
}
VTABLE INTVAL get_integer() {
@@ -137,13 +141,13 @@
PARROT_STRINGITERATOR(SELF);
if (value == ITERATE_FROM_START) {
attrs->reverse = 0;
- attrs->pos = 0;
attrs->length = VTABLE_elements(INTERP, attrs->string);
+ STRING_ITER_SET_POSITION(INTERP, &attrs->iter, 0);
}
else if (value == ITERATE_FROM_END) {
attrs->reverse = 1;
- attrs->pos = attrs->length
- = VTABLE_elements(INTERP, attrs->string);
+ attrs->length = VTABLE_elements(INTERP, attrs->string);
+ STRING_ITER_SET_POSITION(INTERP, &attrs->iter, attrs->length);
}
else
Parrot_ex_throw_from_c_args(INTERP, NULL,
EXCEPTION_INVALID_OPERATION,
@@ -179,14 +183,16 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
PMC *ret;
+ STRING *str;
- if (attrs->pos >= attrs->length)
+ if (attrs->iter.charpos >= attrs->length)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
ret = pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp,
enum_class_String));
- VTABLE_set_string_native(INTERP, ret,
- VTABLE_get_string_keyed_int(INTERP, attrs->string,
attrs->pos++));
+ str = Parrot_str_iter_get_and_advance(interp,
+ VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
+ VTABLE_set_string_native(INTERP, ret, str);
return ret;
}
@@ -203,11 +209,12 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
- if (attrs->pos >= attrs->length)
+ if (attrs->iter.charpos >= attrs->length)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- return VTABLE_get_string_keyed_int(INTERP, attrs->string,
attrs->pos++);
+ return Parrot_str_iter_get_and_advance(interp,
+ VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
}
/*
@@ -223,11 +230,11 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
- if (attrs->pos >= attrs->length)
+ if (attrs->iter.charpos >= attrs->length)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- return VTABLE_get_integer_keyed_int(INTERP, attrs->string,
attrs->pos++);
+ return STRING_ITER_GET_AND_ADVANCE(INTERP, &attrs->iter);
}
/*
@@ -243,14 +250,16 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
PMC *ret;
+ STRING * str;
- if (!STATICSELF.get_bool())
+ if (attrs->iter.charpos <= 0)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
ret = pmc_new(INTERP, Parrot_get_ctx_HLL_type(interp,
enum_class_String));
- VTABLE_set_string_native(INTERP, ret,
- VTABLE_get_string_keyed_int(INTERP, attrs->string,
--attrs->pos));
+ str = Parrot_str_iter_regress_and_get(interp,
+ VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
+ VTABLE_set_string_native(INTERP, ret, str);
return ret;
}
@@ -267,11 +276,12 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
- if (!STATICSELF.get_bool())
+ if (attrs->iter.charpos <= 0)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- return VTABLE_get_string_keyed_int(INTERP, attrs->string,
--attrs->pos);
+ return Parrot_str_iter_regress_and_get(interp,
+ VTABLE_get_string(INTERP, attrs->string), &attrs->iter);
}
/*
@@ -287,11 +297,11 @@
Parrot_StringIterator_attributes * const attrs =
PARROT_STRINGITERATOR(SELF);
- if (!STATICSELF.get_bool())
+ if (attrs->iter.charpos <= 0)
Parrot_ex_throw_from_c_args(INTERP, NULL, EXCEPTION_OUT_OF_BOUNDS,
"StopIteration");
- return VTABLE_get_integer_keyed_int(INTERP, attrs->string,
--attrs->pos);
+ return STRING_ITER_REGRESS_AND_GET(INTERP, &attrs->iter);
}
/*
@@ -306,7 +316,7 @@
VTABLE INTVAL get_integer_keyed_int(INTVAL idx) {
return VTABLE_get_integer_keyed_int(INTERP, STATICSELF.get_pmc(),
- PARROT_STRINGITERATOR(SELF)->pos + idx);
+ PARROT_STRINGITERATOR(SELF)->iter.charpos + idx);
}
/*
@@ -321,7 +331,7 @@
VTABLE STRING *get_string_keyed_int(INTVAL idx) {
return VTABLE_get_string_keyed_int(INTERP, STATICSELF.get_pmc(),
- PARROT_STRINGITERATOR(SELF)->pos + idx);
+ PARROT_STRINGITERATOR(SELF)->iter.charpos + idx);
}
}
Index: src/string/encoding/utf16.c
===================================================================
--- src/string/encoding/utf16.c (revision 43406)
+++ src/string/encoding/utf16.c (working copy)
@@ -101,14 +101,6 @@
__attribute__nonnull__(5)
FUNC_MODIFIES(*return_string);
-static void iter_init(PARROT_INTERP,
- ARGIN(const STRING *src),
- ARGOUT(String_iter *iter))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
static void set_byte(PARROT_INTERP,
ARGIN(const STRING *src),
UINTVAL offset,
@@ -147,13 +139,19 @@
__attribute__nonnull__(1)
__attribute__nonnull__(2);
-PARROT_WARN_UNUSED_RESULT
static UINTVAL utf16_decode_and_advance(PARROT_INTERP,
ARGMOD(String_iter *i))
__attribute__nonnull__(1)
__attribute__nonnull__(2)
FUNC_MODIFIES(*i);
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL utf16_regress_and_decode(PARROT_INTERP,
+ ARGMOD(String_iter *i))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ FUNC_MODIFIES(*i);
+
static void utf16_encode_and_advance(PARROT_INTERP,
ARGMOD(String_iter *i),
UINTVAL c)
@@ -198,10 +196,6 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src) \
, PARROT_ASSERT_ARG(return_string))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(iter))
#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
@@ -220,6 +214,9 @@
#define ASSERT_ARGS_utf16_decode_and_advance __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_utf16_regress_and_decode __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_utf16_encode_and_advance __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(i))
@@ -498,11 +495,11 @@
UINTVAL start;
STRING * const return_string = Parrot_str_new_COW(interp, src);
- iter_init(interp, src, &iter);
- iter.set_position(interp, &iter, offset);
+ STRING_ITER_INIT(interp, src, &iter);
+ utf16_set_position(interp, &iter, offset);
start = iter.bytepos;
return_string->strstart = (char *)return_string->strstart + start ;
- iter.set_position(interp, &iter, offset + count);
+ utf16_set_position(interp, &iter, offset + count);
return_string->bufused = iter.bytepos - start;
return_string->strlen = count;
return_string->hashval = 0;
@@ -532,11 +529,11 @@
String_iter iter;
UINTVAL start;
Parrot_str_reuse_COW(interp, src, return_string);
- iter_init(interp, src, &iter);
- iter.set_position(interp, &iter, offset);
+ STRING_ITER_INIT(interp, src, &iter);
+ utf16_set_position(interp, &iter, offset);
start = iter.bytepos;
return_string->strstart = (char *)return_string->strstart + start ;
- iter.set_position(interp, &iter, offset + count);
+ utf16_set_position(interp, &iter, offset + count);
return_string->bufused = iter.bytepos - start;
return_string->strlen = count;
return_string->hashval = 0;
@@ -680,9 +677,9 @@
* this is used to initially calculate src->strlen,
* therefore we must scan the whole string
*/
- iter_init(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
while (iter.bytepos < src->bufused)
- iter.get_and_advance(interp, &iter);
+ utf16_decode_and_advance(interp, &iter);
return iter.charpos;
}
@@ -715,7 +712,6 @@
*/
-PARROT_WARN_UNUSED_RESULT
static UINTVAL
utf16_decode_and_advance(PARROT_INTERP, ARGMOD(String_iter *i))
{
@@ -734,6 +730,33 @@
/*
+=item C<static UINTVAL utf16_regress_and_decode(PARROT_INTERP, String_iter *i)>
+
+Moves the string iterator C<i> to the previous UTF-16 codepoint.
+
+=cut
+
+*/
+
+PARROT_WARN_UNUSED_RESULT
+static UINTVAL
+utf16_regress_and_decode(PARROT_INTERP, ARGMOD(String_iter *i))
+{
+ ASSERT_ARGS(utf16_regress_and_decode)
+ UChar *s = (UChar*) i->str->strstart;
+ UINTVAL c, pos;
+ pos = i->bytepos / sizeof (UChar);
+ /* TODO either make sure that we don't go past end or use SAFE
+ * iter versions
+ */
+ U16_PREV_UNSAFE(s, pos, c);
+ i->charpos--;
+ i->bytepos = pos * sizeof (UChar);
+ return c;
+}
+
+/*
+
=item C<static void utf16_encode_and_advance(PARROT_INTERP, String_iter *i,
UINTVAL c)>
@@ -783,33 +806,6 @@
/*
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
- ASSERT_ARGS(iter_init)
- iter->str = src;
- iter->bytepos = iter->charpos = 0;
-#if PARROT_HAS_ICU
- iter->get_and_advance = utf16_decode_and_advance;
- iter->set_and_advance = utf16_encode_and_advance;
- iter->set_position = utf16_set_position;
-#else
- Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LIBRARY_ERROR,
- "no ICU lib loaded");
-#endif
-}
-
-/*
-
=item C<ENCODING * Parrot_encoding_utf16_init(PARROT_INTERP)>
Initializes the UTF-16 encoding.
@@ -842,8 +838,11 @@
become_encoding,
codepoints,
bytes,
- iter_init,
- find_cclass
+ find_cclass,
+ utf16_decode_and_advance,
+ utf16_encode_and_advance,
+ utf16_regress_and_decode,
+ utf16_set_position
};
STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
Parrot_register_encoding(interp, "utf16", return_encoding);
Index: src/string/encoding/fixed_8.c
===================================================================
--- src/string/encoding/fixed_8.c (revision 43406)
+++ src/string/encoding/fixed_8.c (working copy)
@@ -50,6 +50,11 @@
__attribute__nonnull__(2)
FUNC_MODIFIES(*iter);
+static UINTVAL fixed8_get_prev(PARROT_INTERP, ARGMOD(String_iter *iter))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ FUNC_MODIFIES(*iter);
+
static void fixed8_set_next(PARROT_INTERP,
ARGMOD(String_iter *iter),
UINTVAL c)
@@ -119,13 +124,6 @@
__attribute__nonnull__(5)
FUNC_MODIFIES(*dest_string);
-static void iter_init(SHIM_INTERP,
- ARGIN(const STRING *src),
- ARGOUT(String_iter *iter))
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
static void set_byte(PARROT_INTERP,
ARGIN(const STRING *source_string),
UINTVAL offset,
@@ -181,6 +179,9 @@
#define ASSERT_ARGS_fixed8_get_next __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(iter))
+#define ASSERT_ARGS_fixed8_get_prev __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(iter))
#define ASSERT_ARGS_fixed8_set_next __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(iter))
@@ -206,9 +207,6 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(source_string) \
, PARROT_ASSERT_ARG(dest_string))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(iter))
#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(source_string))
@@ -600,6 +598,24 @@
/*
+=item C<static UINTVAL fixed8_get_prev(PARROT_INTERP, String_iter *iter)>
+
+Moves the string iterator C<iter> to the previous codepoint.
+
+=cut
+
+*/
+
+static UINTVAL
+fixed8_get_prev(PARROT_INTERP, ARGMOD(String_iter *iter))
+{
+ ASSERT_ARGS(fixed8_get_prev)
+ iter->bytepos--;
+ return get_byte(interp, iter->str, --iter->charpos);
+}
+
+/*
+
=item C<static void fixed8_set_next(PARROT_INTERP, String_iter *iter, UINTVAL
c)>
@@ -640,28 +656,6 @@
/*
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
- ASSERT_ARGS(iter_init)
- iter->str = src;
- iter->bytepos = iter->charpos = 0;
- iter->get_and_advance = fixed8_get_next;
- iter->set_and_advance = fixed8_set_next;
- iter->set_position = fixed8_set_position;
-}
-
-/*
-
=item C<ENCODING * Parrot_encoding_fixed_8_init(PARROT_INTERP)>
Initializes the fixed-8 encoding.
@@ -694,8 +688,11 @@
become_encoding,
codepoints,
bytes,
- iter_init,
- find_cclass
+ find_cclass,
+ fixed8_get_next,
+ fixed8_set_next,
+ fixed8_get_prev,
+ fixed8_set_position
};
STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
Index: src/string/encoding/utf8.c
===================================================================
--- src/string/encoding/utf8.c (revision 43406)
+++ src/string/encoding/utf8.c (working copy)
@@ -98,13 +98,6 @@
FUNC_MODIFIES(*src)
FUNC_MODIFIES(*return_string);
-static void iter_init(SHIM_INTERP,
- ARGIN(const STRING *src),
- ARGOUT(String_iter *iter))
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
static void set_byte(PARROT_INTERP,
ARGIN(const STRING *src),
UINTVAL offset,
@@ -158,6 +151,12 @@
__attribute__nonnull__(2)
FUNC_MODIFIES(*i);
+static UINTVAL utf8_regress_and_decode(PARROT_INTERP,
+ ARGMOD(String_iter *i))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ FUNC_MODIFIES(*i);
+
PARROT_CANNOT_RETURN_NULL
static void * utf8_encode(PARROT_INTERP, ARGIN(void *ptr), UINTVAL c)
__attribute__nonnull__(1)
@@ -213,9 +212,6 @@
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src) \
, PARROT_ASSERT_ARG(return_string))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(iter))
#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(src))
@@ -238,6 +234,9 @@
#define ASSERT_ARGS_utf8_decode_and_advance __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_utf8_regress_and_decode __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_utf8_encode __attribute__unused__ int _ASSERT_ARGS_CHECK =
(\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(ptr))
@@ -505,6 +504,30 @@
/*
+=item C<static UINTVAL utf8_regress_and_decode(PARROT_INTERP, String_iter *i)>
+
+The UTF-8 implementation of the string iterator's C<regress_and_get>
+function.
+
+=cut
+
+*/
+
+static UINTVAL
+utf8_regress_and_decode(PARROT_INTERP, ARGMOD(String_iter *i))
+{
+ ASSERT_ARGS(utf8_regress_and_decode)
+ const utf8_t *u8ptr = (utf8_t *)((char *)i->str->strstart + i->bytepos);
+
+ u8ptr--;
+ while (UTF8_IS_CONTINUATION(*u8ptr))
+ u8ptr--;
+
+ return utf8_decode(interp, u8ptr);
+}
+
+/*
+
=item C<static void utf8_encode_and_advance(PARROT_INTERP, String_iter *i,
UINTVAL c)>
@@ -547,19 +570,39 @@
ASSERT_ARGS(utf8_set_position)
const utf8_t *u8ptr = (const utf8_t *)i->str->strstart;
- /* start from last known charpos, if we can */
- if (i->charpos <= pos) {
- const UINTVAL old_pos = pos;
- pos -= i->charpos;
- u8ptr += i->bytepos;
- i->charpos = old_pos;
+ if (pos == 0) {
+ i->charpos = 0;
+ i->bytepos = 0;
+ return;
}
- else
- i->charpos = pos;
- while (pos-- > 0)
- u8ptr += UTF8SKIP(u8ptr);
+ /*
+ * we know the byte offsets of three positions: start, current and end
+ * now find the shortest way to reach pos
+ */
+ if (pos < i->charpos) {
+ if (pos <= (i->charpos >> 1)) {
+ /* go forward from start */
+ u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr, pos);
+ }
+ else {
+ /* go backward from current */
+ u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr + i->bytepos,
i->charpos - pos);
+ }
+ }
+ else {
+ const UINTVAL len = i->str->strlen;
+ if (pos <= i->charpos + ((len - i->charpos) >> 1)) {
+ /* go forward from current */
+ u8ptr = (const utf8_t *)utf8_skip_forward(u8ptr + i->bytepos, pos
- i->charpos);
+ }
+ else {
+ /* go backward from end */
+ u8ptr = (const utf8_t *)utf8_skip_backward(u8ptr +
i->str->bufused, len - pos);
+ }
+ }
+ i->charpos = pos;
i->bytepos = (const char *)u8ptr - (const char *)i->str->strstart;
}
@@ -582,8 +625,8 @@
{
ASSERT_ARGS(to_encoding)
STRING *result;
- String_iter src_iter;
- UINTVAL offs, dest_len, dest_pos, src_len;
+ const ENCODING *src_encoding;
+ UINTVAL dest_len, dest_pos, src_len;
const int in_place = (dest == NULL);
unsigned char *new_pos, *pos, *p;
@@ -597,8 +640,8 @@
result = dest;
}
- /* init iter before possilby changing encoding */
- ENCODING_ITER_INIT(interp, src, &src_iter);
+ /* save source encoding before possibly changing it */
+ src_encoding = src->encoding;
result->charset = Parrot_unicode_charset_ptr;
result->encoding = Parrot_utf8_encoding_ptr;
result->strlen = src_len;
@@ -621,12 +664,14 @@
result->bufused = dest_len;
}
else {
+ String_iter src_iter;
+ STRING_ITER_INIT(interp, src, &src_iter);
dest_len = src_len;
dest_pos = 0;
- for (offs = 0; offs < src_len; ++offs) {
- const UINTVAL c = src_iter.get_and_advance(interp, &src_iter);
+ while (src_iter.charpos < src_len) {
+ const UINTVAL c = src_encoding->iter_get_and_advance(interp,
&src_iter);
if (dest_len - dest_pos < 6) {
- UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
+ UINTVAL need = (UINTVAL)((src->strlen - src_iter.charpos + 1)
* 1.5);
if (need < 16)
need = 16;
dest_len += need;
@@ -790,16 +835,16 @@
String_iter iter;
UINTVAL start;
- iter_init(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
if (offset)
- iter.set_position(interp, &iter, offset);
+ utf8_set_position(interp, &iter, offset);
start = iter.bytepos;
return_string->strstart = (char *)return_string->strstart + start;
if (count)
- iter.set_position(interp, &iter, offset + count);
+ utf8_set_position(interp, &iter, offset + count);
return_string->bufused = iter.bytepos - start;
return_string->strlen = count;
@@ -860,13 +905,13 @@
UINTVAL start;
Parrot_str_reuse_COW(interp, src, return_string);
- iter_init(interp, src, &iter);
- iter.set_position(interp, &iter, offset);
+ STRING_ITER_INIT(interp, src, &iter);
+ utf8_set_position(interp, &iter, offset);
start = iter.bytepos;
return_string->strstart = (char *)return_string->strstart + start;
- iter.set_position(interp, &iter, offset + count);
+ utf8_set_position(interp, &iter, offset + count);
return_string->bufused = iter.bytepos - start;
return_string->strlen = count;
@@ -973,9 +1018,9 @@
* this is used to initially calculate src->strlen,
* therefore we must scan the whole string
*/
- iter_init(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
while (iter.bytepos < src->bufused)
- iter.get_and_advance(interp, &iter);
+ utf8_decode_and_advance(interp, &iter);
return iter.charpos;
}
@@ -999,29 +1044,6 @@
/*
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(SHIM_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
- ASSERT_ARGS(iter_init)
- iter->str = src;
- iter->bytepos = 0;
- iter->charpos = 0;
- iter->get_and_advance = utf8_decode_and_advance;
- iter->set_and_advance = utf8_encode_and_advance;
- iter->set_position = utf8_set_position;
-}
-
-/*
-
=item C<ENCODING * Parrot_encoding_utf8_init(PARROT_INTERP)>
Initializes the UTF-8 encoding.
@@ -1054,8 +1076,11 @@
become_encoding,
codepoints,
bytes,
- iter_init,
- find_cclass
+ find_cclass,
+ utf8_decode_and_advance,
+ utf8_encode_and_advance,
+ utf8_regress_and_decode,
+ utf8_set_position
};
STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
Parrot_register_encoding(interp, "utf8", return_encoding);
Index: src/string/encoding/ucs2.c
===================================================================
--- src/string/encoding/ucs2.c (revision 43406)
+++ src/string/encoding/ucs2.c (working copy)
@@ -106,14 +106,6 @@
SHIM(STRING *dest_string))
__attribute__nonnull__(1);
-static void iter_init(PARROT_INTERP,
- ARGIN(const STRING *src),
- ARGOUT(String_iter *iter))
- __attribute__nonnull__(1)
- __attribute__nonnull__(2)
- __attribute__nonnull__(3)
- FUNC_MODIFIES(*iter);
-
static void set_byte(PARROT_INTERP,
SHIM(const STRING *src),
SHIM(UINTVAL offset),
@@ -157,6 +149,12 @@
__attribute__nonnull__(2)
FUNC_MODIFIES(*i);
+static UINTVAL ucs2_regress_and_decode(PARROT_INTERP,
+ ARGMOD(String_iter *i))
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2)
+ FUNC_MODIFIES(*i);
+
static void ucs2_encode_and_advance(PARROT_INTERP,
ARGMOD(String_iter *i),
UINTVAL c)
@@ -196,10 +194,6 @@
, PARROT_ASSERT_ARG(src))
#define ASSERT_ARGS_get_codepoints_inplace __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
-#define ASSERT_ARGS_iter_init __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
- PARROT_ASSERT_ARG(interp) \
- , PARROT_ASSERT_ARG(src) \
- , PARROT_ASSERT_ARG(iter))
#define ASSERT_ARGS_set_byte __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_set_bytes __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
@@ -216,6 +210,9 @@
#define ASSERT_ARGS_ucs2_decode_and_advance __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(i))
+#define ASSERT_ARGS_ucs2_regress_and_decode __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp) \
+ , PARROT_ASSERT_ARG(i))
#define ASSERT_ARGS_ucs2_encode_and_advance __attribute__unused__ int
_ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp) \
, PARROT_ASSERT_ARG(i))
@@ -397,11 +394,11 @@
String_iter iter;
UINTVAL start;
- iter_init(interp, src, &iter);
- iter.set_position(interp, &iter, offset);
+ STRING_ITER_INIT(interp, src, &iter);
+ ucs2_set_position(interp, &iter, offset);
start = iter.bytepos;
return_string->strstart = (char *)return_string->strstart + start;
- iter.set_position(interp, &iter, offset + count);
+ ucs2_set_position(interp, &iter, offset + count);
return_string->bufused = iter.bytepos - start;
}
#endif
@@ -611,6 +608,41 @@
/*
+=item C<static UINTVAL ucs2_regress_and_decode(PARROT_INTERP, String_iter *i)>
+
+Moves the string iterator C<i> to the previous UCS-2 codepoint.
+
+=cut
+
+*/
+
+static UINTVAL
+ucs2_regress_and_decode(PARROT_INTERP, ARGMOD(String_iter *i))
+{
+ ASSERT_ARGS(ucs2_regress_and_decode)
+
+#if PARROT_HAS_ICU
+ UChar * const s = (UChar*) i->str->strstart;
+ size_t pos = i->bytepos / sizeof (UChar);
+
+ /* TODO either make sure that we don't go past end or use SAFE
+ * iter versions
+ */
+ const UChar c = s[--pos];
+ i->charpos--;
+ i->bytepos = pos * sizeof (UChar);
+ return c;
+#else
+ /* This function must never be called if compiled without ICU.
+ * See TT #557
+ */
+ PARROT_ASSERT(0);
+ return (UINTVAL)0; /* Stop the static analyzers from panicing */
+#endif
+}
+
+/*
+
=item C<static void ucs2_encode_and_advance(PARROT_INTERP, String_iter *i,
UINTVAL c)>
@@ -669,33 +701,6 @@
/*
-=item C<static void iter_init(PARROT_INTERP, const STRING *src, String_iter
-*iter)>
-
-Initializes for string C<src> the string iterator C<iter>.
-
-=cut
-
-*/
-
-static void
-iter_init(PARROT_INTERP, ARGIN(const STRING *src), ARGOUT(String_iter *iter))
-{
- ASSERT_ARGS(iter_init)
-#if PARROT_HAS_ICU
- iter->str = src;
- iter->bytepos = 0;
- iter->charpos = 0;
- iter->get_and_advance = ucs2_decode_and_advance;
- iter->set_and_advance = ucs2_encode_and_advance;
- iter->set_position = ucs2_set_position;
-#else
- no_ICU_lib(interp);
-#endif
-}
-
-/*
-
=item C<ENCODING * Parrot_encoding_ucs2_init(PARROT_INTERP)>
Initializes the UCS-2 encoding.
@@ -728,8 +733,11 @@
become_encoding,
codepoints,
bytes,
- iter_init,
- find_cclass
+ find_cclass,
+ ucs2_decode_and_advance,
+ ucs2_encode_and_advance,
+ ucs2_regress_and_decode,
+ ucs2_set_position
};
STRUCT_COPY_FROM_STRUCT(return_encoding, base_encoding);
Parrot_register_encoding(interp, "ucs2", return_encoding);
Index: src/string/charset/iso-8859-1.c
===================================================================
--- src/string/charset/iso-8859-1.c (revision 43406)
+++ src/string/charset/iso-8859-1.c (working copy)
@@ -215,10 +215,10 @@
to_iso_8859_1(PARROT_INTERP, ARGIN(STRING *src), ARGMOD_NULLOK(STRING *dest))
{
ASSERT_ARGS(to_iso_8859_1)
- UINTVAL offs, src_len;
+ UINTVAL src_len;
String_iter iter;
- ENCODING_ITER_INIT(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
src_len = src->strlen;
if (dest) {
Parrot_gc_reallocate_string_storage(interp, dest, src_len);
@@ -229,16 +229,16 @@
dest = src;
}
dest->bufused = src_len;
- dest->charset = Parrot_iso_8859_1_charset_ptr;
- dest->encoding = Parrot_fixed_8_encoding_ptr;
- for (offs = 0; offs < src_len; ++offs) {
- const UINTVAL c = iter.get_and_advance(interp, &iter);
+ while (iter.charpos < src_len) {
+ const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (c >= 0x100)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_LOSSY_CONVERSION,
"lossy conversion to iso-8559-1");
- ENCODING_SET_BYTE(interp, dest, offs, c);
+ Parrot_fixed_8_encoding_ptr->set_byte(interp, dest, iter.charpos - 1,
c);
}
+ dest->charset = Parrot_iso_8859_1_charset_ptr;
+ dest->encoding = Parrot_fixed_8_encoding_ptr;
return dest;
}
@@ -258,24 +258,23 @@
{
ASSERT_ARGS(to_unicode)
if (dest) {
- UINTVAL offs;
String_iter iter;
dest->charset = Parrot_unicode_charset_ptr;
dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interp, dest);
Parrot_gc_reallocate_string_storage(interp, dest, src->strlen);
- ENCODING_ITER_INIT(interp, dest, &iter);
- for (offs = 0; offs < src->strlen; ++offs) {
- const UINTVAL c = ENCODING_GET_BYTE(interp, src, offs);
+ STRING_ITER_INIT(interp, dest, &iter);
+ while (iter.charpos < src->strlen) {
+ const UINTVAL c = ENCODING_GET_BYTE(interp, src, iter.charpos);
if (iter.bytepos >= Buffer_buflen(dest) - 4) {
- UINTVAL need = (UINTVAL)((src->strlen - offs) * 1.5);
+ UINTVAL need = (UINTVAL)((src->strlen - iter.charpos) * 1.5);
if (need < 16)
need = 16;
Parrot_gc_reallocate_string_storage(interp, dest,
Buffer_buflen(dest) + need);
}
- iter.set_and_advance(interp, &iter, c);
+ STRING_ITER_SET_AND_ADVANCE(interp, &iter, c);
}
dest->bufused = iter.bytepos;
dest->strlen = iter.charpos;
Index: src/string/charset/unicode.c
===================================================================
--- src/string/charset/unicode.c (revision 43406)
+++ src/string/charset/unicode.c (working copy)
@@ -704,20 +704,20 @@
{
ASSERT_ARGS(compare)
String_iter l_iter, r_iter;
- UINTVAL offs, cl, cr, min_len, l_len, r_len;
+ UINTVAL min_len, l_len, r_len;
/* TODO make optimized equal - strings are equal length then already */
- ENCODING_ITER_INIT(interp, lhs, &l_iter);
- ENCODING_ITER_INIT(interp, rhs, &r_iter);
+ STRING_ITER_INIT(interp, lhs, &l_iter);
+ STRING_ITER_INIT(interp, rhs, &r_iter);
l_len = lhs->strlen;
r_len = rhs->strlen;
min_len = l_len > r_len ? r_len : l_len;
- for (offs = 0; offs < min_len; ++offs) {
- cl = l_iter.get_and_advance(interp, &l_iter);
- cr = r_iter.get_and_advance(interp, &r_iter);
+ while (l_iter.charpos < min_len) {
+ UINTVAL cl = STRING_ITER_GET_AND_ADVANCE(interp, &l_iter);
+ UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, &r_iter);
if (cl != cr)
return cl < cr ? -1 : 1;
@@ -769,12 +769,12 @@
validate(PARROT_INTERP, ARGIN(STRING *src))
{
ASSERT_ARGS(validate)
- UINTVAL offset;
+ UINTVAL len = Parrot_str_byte_length(interp, src);
String_iter iter;
- ENCODING_ITER_INIT(interp, src, &iter);
- for (offset = 0; offset < Parrot_str_byte_length(interp, src); ++offset) {
- const UINTVAL codepoint = iter.get_and_advance(interp, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
+ while (iter.charpos < len) {
+ const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
/* Check for Unicode non-characters */
if (codepoint >= 0xfdd0
&& (codepoint <= 0xfdef || (codepoint & 0xfffe) == 0xfffe)
@@ -924,24 +924,22 @@
ASSERT_ARGS(find_cclass)
String_iter iter;
UINTVAL codepoint;
- UINTVAL pos = offset;
UINTVAL end = offset + count;
- ENCODING_ITER_INIT(interp, source_string, &iter);
+ STRING_ITER_INIT(interp, source_string, &iter);
+ STRING_ITER_SET_POSITION(interp, &iter, offset);
- iter.set_position(interp, &iter, pos);
-
end = source_string->strlen < end ? source_string->strlen : end;
- for (; pos < end; ++pos) {
- codepoint = iter.get_and_advance(interp, &iter);
+ while (iter.charpos < end) {
+ codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (codepoint >= 256) {
if (u_iscclass(interp, codepoint, flags))
- return pos;
+ return iter.charpos - 1;
}
else {
if (Parrot_iso_8859_1_typetable[codepoint] & flags)
- return pos;
+ return iter.charpos - 1;
}
}
@@ -965,37 +963,36 @@
ASSERT_ARGS(find_not_cclass)
String_iter iter;
UINTVAL codepoint;
- UINTVAL pos = offset;
UINTVAL end = offset + count;
int bit;
- if (pos > source_string->strlen) {
+ if (offset > source_string->strlen) {
/* XXX: Throw in this case? */
return offset + count;
}
- ENCODING_ITER_INIT(interp, source_string, &iter);
+ STRING_ITER_INIT(interp, source_string, &iter);
- if (pos)
- iter.set_position(interp, &iter, pos);
+ if (offset)
+ STRING_ITER_SET_POSITION(interp, &iter, offset);
end = source_string->strlen < end ? source_string->strlen : end;
if (flags == enum_cclass_any)
return end;
- for (; pos < end; ++pos) {
- codepoint = iter.get_and_advance(interp, &iter);
+ while (iter.charpos < end) {
+ codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (codepoint >= 256) {
for (bit = enum_cclass_uppercase;
bit <= enum_cclass_word ; bit <<= 1) {
if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
- return pos;
+ return iter.charpos - 1;
}
}
else {
if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
- return pos;
+ return iter.charpos - 1;
}
}
@@ -1023,8 +1020,8 @@
dest->strlen = 1;
- ENCODING_ITER_INIT(interp, dest, &iter);
- iter.set_and_advance(interp, &iter, codepoint);
+ STRING_ITER_INIT(interp, dest, &iter);
+ STRING_ITER_SET_AND_ADVANCE(interp, &iter, codepoint);
dest->bufused = iter.bytepos;
return dest;
@@ -1047,13 +1044,12 @@
{
ASSERT_ARGS(compute_hash)
String_iter iter;
- UINTVAL offs;
size_t hashval = seed;
- ENCODING_ITER_INIT(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
- for (offs = 0; offs < src->strlen; ++offs) {
- const UINTVAL c = iter.get_and_advance(interp, &iter);
+ while (iter.charpos < src->strlen) {
+ const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
hashval += hashval << 5;
hashval += c;
}
Index: src/string/charset/ascii.c
===================================================================
--- src/string/charset/ascii.c (revision 43406)
+++ src/string/charset/ascii.c (working copy)
@@ -263,7 +263,6 @@
{
ASSERT_ARGS(to_ascii)
String_iter iter;
- UINTVAL offs;
unsigned char *p;
const UINTVAL len = src->strlen;
@@ -275,9 +274,9 @@
dest = src;
}
p = (unsigned char *)dest->strstart;
- ENCODING_ITER_INIT(interp, src, &iter);
- for (offs = 0; offs < len; ++offs) {
- const UINTVAL c = iter.get_and_advance(interp, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
+ while (iter.charpos < len) {
+ const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (c >= 128)
Parrot_ex_throw_from_c_args(interp, NULL,
EXCEPTION_LOSSY_CONVERSION,
"can't convert unicode string to ascii");
@@ -557,11 +556,10 @@
return ret_val < 0 ? -1 : 1;
}
else {
- UINTVAL offs;
- ENCODING_ITER_INIT(interp, rhs, &iter);
- for (offs = 0; offs < min_len; ++offs) {
- const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, offs);
- const UINTVAL cr = iter.get_and_advance(interp, &iter);
+ STRING_ITER_INIT(interp, rhs, &iter);
+ while (iter.charpos < min_len) {
+ const UINTVAL cl = ENCODING_GET_BYTE(interp, lhs, iter.charpos);
+ const UINTVAL cr = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (cl != cr)
return cl < cr ? -1 : 1;
}
@@ -596,30 +594,23 @@
{
ASSERT_ARGS(mixed_cs_index)
String_iter src_iter, search_iter;
- UINTVAL len;
- INTVAL start;
+ const UINTVAL len = search->strlen;
+ UINTVAL start;
- ENCODING_ITER_INIT(interp, src, &src_iter);
- src_iter.set_position(interp, &src_iter, offs);
- ENCODING_ITER_INIT(interp, search, &search_iter);
- len = search->strlen;
+ STRING_ITER_INIT(interp, src, &src_iter);
+ STRING_ITER_SET_POSITION(interp, &src_iter, offs);
+ STRING_ITER_INIT(interp, search, &search_iter);
- start = -1;
- for (; len && offs < src->strlen; ++offs) {
- const UINTVAL c1 = src_iter.get_and_advance(interp, &src_iter);
- const UINTVAL c2 = search_iter.get_and_advance(interp, &search_iter);
- if (c1 == c2) {
- --len;
- if (start == -1)
- start = offs;
+ start = src_iter.charpos;
+ while (search_iter.charpos < len && src_iter.charpos < src->strlen) {
+ const UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, &src_iter);
+ const UINTVAL c2 = STRING_ITER_GET_AND_ADVANCE(interp, &search_iter);
+ if (c1 != c2) {
+ start = src_iter.charpos;
+ STRING_ITER_SET_POSITION(interp, &search_iter, 0);
}
- else {
- len = search->strlen;
- start = -1;
- search_iter.set_position(interp, &search_iter, 0);
- }
}
- if (len == 0)
+ if (search_iter.charpos >= len)
return start;
return -1;
}
@@ -700,12 +691,12 @@
validate(PARROT_INTERP, ARGIN(STRING *src))
{
ASSERT_ARGS(validate)
- UINTVAL offset;
+ const UINTVAL len = Parrot_str_byte_length(interp, src);
String_iter iter;
- ENCODING_ITER_INIT(interp, src, &iter);
- for (offset = 0; offset < Parrot_str_byte_length(interp, src); ++offset) {
- const UINTVAL codepoint = iter.get_and_advance(interp, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
+ while (iter.charpos < len) {
+ const UINTVAL codepoint = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (codepoint >= 0x80)
return 0;
}
Index: src/string/api.c
===================================================================
--- src/string/api.c (revision 43406)
+++ src/string/api.c (working copy)
@@ -1254,9 +1254,74 @@
}
}
+/*
+=item C<STRING * Parrot_str_iter_get_and_advance(PARROT_INTERP, STRING *str,
+String_iter *iter)>
+
+Returns the character in C<str> that C<iter> points to and advances C<iter>.
+
+=cut
+
+*/
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+Parrot_str_iter_get_and_advance(PARROT_INTERP,
+ ARGIN(STRING *str), ARGOUT(String_iter *iter))
+{
+ ASSERT_ARGS(Parrot_str_iter_get_and_advance)
+ STRING *dest = Parrot_str_new_COW(interp, str);
+ UINTVAL start = iter->bytepos;
+
+ STRING_ITER_GET_AND_ADVANCE(interp, iter);
+
+ dest->strstart = (char *)dest->strstart + start;
+ dest->bufused = iter->bytepos - start;
+ dest->strlen = 1;
+ dest->hashval = 0;
+
+ return dest;
+}
+
/*
+=item C<STRING * Parrot_str_iter_regress_and_get(PARROT_INTERP, STRING *str,
+String_iter *iter)>
+
+Moves C<iter> backwards and returns the character in C<str> that C<iter>
+points to.
+
+=cut
+
+*/
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING *
+Parrot_str_iter_regress_and_get(PARROT_INTERP,
+ ARGIN(STRING *str), ARGOUT(String_iter *iter))
+{
+ ASSERT_ARGS(Parrot_str_iter_regress_and_get)
+ STRING *dest = Parrot_str_new_COW(interp, str);
+ UINTVAL end = iter->bytepos;
+
+ STRING_ITER_REGRESS_AND_GET(interp, iter);
+
+ dest->strstart = (char *)dest->strstart + iter->bytepos;
+ dest->bufused = end - iter->bytepos;
+ dest->strlen = 1;
+ dest->hashval = 0;
+
+ return dest;
+}
+
+
+/*
+
=item C<STRING * Parrot_str_replace(PARROT_INTERP, STRING *src, INTVAL offset,
INTVAL length, STRING *rep, STRING **d)>
@@ -1348,12 +1413,12 @@
}
/* get byte position of the part that will be replaced */
- ENCODING_ITER_INIT(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
- iter.set_position(interp, &iter, true_offset);
+ STRING_ITER_SET_POSITION(interp, &iter, true_offset);
start_byte = iter.bytepos;
- iter.set_position(interp, &iter, true_offset + true_length);
+ STRING_ITER_SET_POSITION(interp, &iter, true_offset + true_length);
end_byte = iter.bytepos;
/* not possible.... */
@@ -1451,7 +1516,7 @@
Parrot_str_chopn_inplace(PARROT_INTERP, ARGMOD(STRING *s), INTVAL n)
{
ASSERT_ARGS(Parrot_str_chopn_inplace)
- UINTVAL new_length, uchar_size;
+ UINTVAL new_length;
if (n < 0) {
new_length = -n;
@@ -1472,23 +1537,23 @@
return;
}
- uchar_size = s->bufused / s->strlen;
- s->strlen = new_length;
-
if (s->encoding == Parrot_fixed_8_encoding_ptr) {
s->bufused = new_length;
}
else if (s->encoding == Parrot_ucs2_encoding_ptr) {
+ const UINTVAL uchar_size = s->bufused / s->strlen;
s->bufused = new_length * uchar_size;
}
else {
String_iter iter;
- ENCODING_ITER_INIT(interp, s, &iter);
- iter.set_position(interp, &iter, new_length);
+ STRING_ITER_INIT(interp, s, &iter);
+ STRING_ITER_SET_POSITION(interp, &iter, new_length);
s->bufused = iter.bytepos;
}
+ s->strlen = new_length;
+
return;
}
@@ -2124,13 +2189,12 @@
int sign = 1;
INTVAL i = 0;
String_iter iter;
- UINTVAL offs;
number_parse_state state = parse_start;
- ENCODING_ITER_INIT(interp, s, &iter);
+ STRING_ITER_INIT(interp, s, &iter);
- for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
- const UINTVAL c = iter.get_and_advance(interp, &iter);
+ while (state != parse_end && iter.charpos < s->strlen) {
+ const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
/* Check for overflow */
if (c > 255)
break;
@@ -2215,17 +2279,16 @@
int d_length = 0;
int check_nan = 0; /* Check for NaN and Inf after main loop */
String_iter iter;
- UINTVAL offs;
number_parse_state state = parse_start;
if (!s)
return 0.0;
- ENCODING_ITER_INIT(interp, s, &iter);
+ STRING_ITER_INIT(interp, s, &iter);
/* Handcrafter FSM to read float value */
- for (offs = 0; (state != parse_end) && (offs < s->strlen); ++offs) {
- const UINTVAL c = iter.get_and_advance(interp, &iter);
+ while (state != parse_end && iter.charpos < s->strlen) {
+ const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
/* Check for overflow */
if (c > 255)
break;
@@ -2600,7 +2663,6 @@
{
ASSERT_ARGS(Parrot_str_to_hashval)
String_iter iter;
- UINTVAL offs;
size_t hashval = interp->hash_seed;
if (!s)
@@ -2609,10 +2671,10 @@
/* ZZZZZ workaround for something not setting up encodings right */
saneify_string(s);
- ENCODING_ITER_INIT(interp, s, &iter);
+ STRING_ITER_INIT(interp, s, &iter);
- for (offs = 0; offs < s->strlen; ++offs) {
- const UINTVAL c = iter.get_and_advance(interp, &iter);
+ while (iter.charpos < s->strlen) {
+ const UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
hashval += hashval << 5;
hashval += c;
}
@@ -2690,11 +2752,11 @@
Parrot_fixed_8_encoding_ptr, Parrot_ascii_charset_ptr, 0);
/* more work TODO */
- ENCODING_ITER_INIT(interp, src, &iter);
+ STRING_ITER_INIT(interp, src, &iter);
dp = (unsigned char *)result->strstart;
for (i = 0; len > 0; --len) {
- UINTVAL c = iter.get_and_advance(interp, &iter);
+ UINTVAL c = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
if (c < 0x7f) {
/* process ASCII chars */
if (i >= charlen - 2) {
@@ -2851,7 +2913,7 @@
encoding = result->encoding;
}
- encoding->iter_init(interp, result, &iter);
+ STRING_ITER_INIT(interp, result, &iter);
for (offs = d = 0; offs < clength; ++offs) {
r = (Parrot_UInt4)((unsigned char *)result->strstart)[offs];
@@ -2874,7 +2936,7 @@
}
PARROT_ASSERT(d < offs);
- iter.set_and_advance(interp, &iter, r);
+ encoding->iter_set_and_advance(interp, &iter, r);
++d;
}
@@ -3409,8 +3471,10 @@
ARGIN_NULLOK(STRING *delim), ARGIN_NULLOK(STRING *str))
{
ASSERT_ARGS(Parrot_str_split)
- PMC *res;
- INTVAL slen, dlen, ps, pe;
+ PMC *res;
+ STRING *tstr;
+ UINTVAL slen, dlen, start, len;
+ String_iter iter;
if (STRING_IS_NULL(delim) || STRING_IS_NULL(str))
return PMCNULL;
@@ -3421,45 +3485,62 @@
if (!slen)
return res;
+ STRING_ITER_INIT(interp, str, &iter);
dlen = Parrot_str_byte_length(interp, delim);
if (dlen == 0) {
- int i;
VTABLE_set_integer_native(interp, res, slen);
- for (i = 0; i < slen; ++i) {
- STRING * const p = Parrot_str_substr(interp, str, i, 1, NULL, 0);
- VTABLE_set_string_keyed_int(interp, res, i, p);
- }
+ do {
+ tstr = Parrot_str_iter_get_and_advance(interp, str, &iter);
+ VTABLE_set_string_keyed_int(interp, res, iter.charpos - 1, tstr);
+ } while (iter.charpos < slen);
return res;
}
- pe = Parrot_str_find_index(interp, str, delim, 0);
+ start = iter.bytepos;
+ len = 0;
- if (pe < 0) {
- VTABLE_push_string(interp, res, str);
- return res;
- }
+ do {
+ UINTVAL end = start;
+ String_iter delim_iter;
- ps = 0;
+ STRING_ITER_INIT(interp, delim, &delim_iter);
- while (ps <= slen) {
- const int pl = pe - ps;
- STRING * const tstr = Parrot_str_substr(interp, str, ps, pl, NULL, 0);
+ while (delim_iter.charpos < dlen && iter.charpos < slen) {
+ const UINTVAL c1 = STRING_ITER_GET_AND_ADVANCE(interp, &iter);
+ const UINTVAL c2 = STRING_ITER_GET_AND_ADVANCE(interp, &delim_iter);
+ if (c1 != c2) {
+ len += delim_iter.charpos;
+ end = iter.bytepos;
+ STRING_ITER_SET_POSITION(interp, &delim_iter, 0);
+ }
+ }
- VTABLE_push_string(interp, res, tstr);
- ps = pe + Parrot_str_byte_length(interp, delim);
+ if (delim_iter.charpos >= dlen) {
+ tstr = Parrot_str_new_COW(interp, str);
+ tstr->strstart = (char *)tstr->strstart + start;
+ tstr->bufused = end - start;
+ tstr->strlen = len;
+ tstr->hashval = 0;
+ VTABLE_push_string(interp, res, tstr);
- if (ps > slen)
- break;
+ start = iter.bytepos;
+ len = 0;
+ }
+ else {
+ len += delim_iter.charpos;
+ }
+ } while (iter.charpos < slen);
- pe = Parrot_str_find_index(interp, str, delim, ps);
+ tstr = Parrot_str_new_COW(interp, str);
+ tstr->strstart = (char *)tstr->strstart + start;
+ tstr->bufused = iter.bytepos - start;
+ tstr->strlen = len;
+ tstr->hashval = 0;
+ VTABLE_push_string(interp, res, tstr);
- if (pe < 0)
- pe = slen;
- }
-
return res;
}
Index: src/io/utf8.c
===================================================================
--- src/io/utf8.c (revision 43406)
+++ src/io/utf8.c (working copy)
@@ -57,7 +57,7 @@
s->encoding = Parrot_utf8_encoding_ptr;
/* count chars, verify utf8 */
- Parrot_utf8_encoding_ptr->iter_init(interp, s, &iter);
+ STRING_ITER_INIT(interp, s, &iter);
while (iter.bytepos < s->bufused) {
if (iter.bytepos + 4 > s->bufused) {
@@ -92,7 +92,7 @@
}
}
ok:
- iter.get_and_advance(interp, &iter);
+ Parrot_utf8_encoding_ptr->iter_get_and_advance(interp, &iter);
}
s->strlen = iter.charpos;
return len;
Index: include/parrot/encoding.h
===================================================================
--- include/parrot/encoding.h (revision 43406)
+++ include/parrot/encoding.h (working copy)
@@ -35,8 +35,10 @@
struct string_iterator_t; /* s. parrot/string.h */
-typedef void (*encoding_iter_init_t)(PARROT_INTERP, const STRING *src,
- struct string_iterator_t *);
+typedef UINTVAL (*encoding_iter_get_and_advance_t)(PARROT_INTERP, struct string_iterator_t *);
+typedef void (*encoding_iter_set_and_advance_t)(PARROT_INTERP, struct string_iterator_t *, UINTVAL);
+typedef UINTVAL (*encoding_iter_regress_and_get_t)(PARROT_INTERP, struct string_iterator_t *);
+typedef void (*encoding_iter_set_position_t)(PARROT_INTERP, struct string_iterator_t *, UINTVAL);
struct _encoding {
ARGIN(const char *name);
@@ -55,8 +57,11 @@
encoding_become_encoding_t become_encoding;
encoding_codepoints_t codepoints;
encoding_bytes_t bytes;
- encoding_iter_init_t iter_init;
encoding_find_cclass_t find_cclass;
+ encoding_iter_get_and_advance_t iter_get_and_advance;
+ encoding_iter_set_and_advance_t iter_set_and_advance;
+ encoding_iter_regress_and_get_t iter_regress_and_get;
+ encoding_iter_set_position_t iter_set_position;
};
typedef struct _encoding ENCODING;
@@ -218,8 +223,6 @@
((src)->encoding)->codepoints((i), (src))
#define ENCODING_BYTES(i, src) \
((src)->encoding)->bytes((i), (src))
-#define ENCODING_ITER_INIT(i, src, iter) \
- ((src)->encoding)->iter_init((i), (src), (iter))
#define ENCODING_FIND_CCLASS(i, src, typetable, flags, pos, end) \
((src)->encoding)->find_cclass((i), (src), (typetable), (flags), (pos), (end))
Index: include/parrot/string_funcs.h
===================================================================
--- include/parrot/string_funcs.h (revision 43406)
+++ include/parrot/string_funcs.h (working copy)
@@ -391,6 +391,24 @@
PARROT_EXPORT
PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * Parrot_str_iter_get_and_advance(PARROT_INTERP,
+ ARGIN(STRING *str),
+ ARGOUT(String_iter *iter))
+ __attribute__nonnull__(1)
+ FUNC_MODIFIES(*iter);
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
+PARROT_WARN_UNUSED_RESULT
+STRING * Parrot_str_iter_regress_and_get(PARROT_INTERP,
+ ARGIN(STRING *str),
+ ARGOUT(String_iter *iter))
+ __attribute__nonnull__(1)
+ FUNC_MODIFIES(*iter);
+
+PARROT_EXPORT
+PARROT_CANNOT_RETURN_NULL
PARROT_MALLOC
STRING * Parrot_str_titlecase(PARROT_INTERP, ARGIN_NULLOK(const STRING *s))
__attribute__nonnull__(1);
@@ -660,6 +678,10 @@
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_str_substr __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_Parrot_str_iter_get_and_advance __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
+#define ASSERT_ARGS_Parrot_str_iter_regress_and_get __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
+ PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_str_titlecase __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
PARROT_ASSERT_ARG(interp))
#define ASSERT_ARGS_Parrot_str_titlecase_inplace __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
Index: include/parrot/string.h
===================================================================
--- include/parrot/string.h (revision 43406)
+++ include/parrot/string.h (working copy)
@@ -32,11 +32,22 @@
const STRING *str;
UINTVAL bytepos;
UINTVAL charpos;
- UINTVAL (*get_and_advance)(PARROT_INTERP, struct string_iterator_t *i);
- void (*set_and_advance)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL c);
- void (*set_position)(PARROT_INTERP, struct string_iterator_t *i, UINTVAL pos);
} String_iter;
+#define STRING_ITER_INIT(i, src, iter) do { \
+ (iter)->str = (src); \
+ (iter)->charpos = 0; \
+ (iter)->bytepos = 0; \
+} while (0)
+#define STRING_ITER_GET_AND_ADVANCE(i, iter) \
+ ((iter)->str->encoding)->iter_get_and_advance((i), (iter))
+#define STRING_ITER_SET_AND_ADVANCE(i, iter, c) \
+ ((iter)->str->encoding)->iter_set_and_advance((i), (iter), (c))
+#define STRING_ITER_REGRESS_AND_GET(i, iter) \
+ ((iter)->str->encoding)->iter_regress_and_get((i), (iter))
+#define STRING_ITER_SET_POSITION(i, iter, pos) \
+ ((iter)->str->encoding)->iter_set_position((i), (iter), (pos))
+
#define STREQ(x, y) (strcmp((x), (y))==0)
#define STRNEQ(x, y) (strcmp((x), (y))!=0)
_______________________________________________
http://lists.parrot.org/mailman/listinfo/parrot-dev