It seems that all ways to iterate over the characters in a UTF-8 string have quadratic running time. See the attached test. I would expect that for keyed access and 'substr', but iterator access and 'split' should have better performance. I had a look at the string iterator PMC code, and it doesn't use the iterators that the underlying string API provides.

I can offer to write a patch to fix this if no one else is working on it.

Nick

.sub _ :main
    # Build the test string: one non-ASCII character followed by
    # 30000 ASCII characters, stored with the utf8 encoding.
    .local string text
    .local string filler
    text = utf8:unicode:"ä"
    filler = repeat "x", 30000
    text .= filler

    # Box the native string so it can be iterated and indexed as a PMC.
    .local pmc boxed
    boxed = box text

    # Scratch variables shared by the sections below.
    .local string ch
    .local int pos

    # 1) Walk the characters through an iterator over the boxed string.
    say "iter"
    .local pmc it
    it = iter boxed
  next_from_iter:
    ch = shift it
    if it goto next_from_iter

    # 2) Walk the characters with integer-keyed access on the boxed string;
    #    the loop stops once an empty string comes back.
    say "keyed"
    pos = 0
  next_by_key:
    ch = boxed[pos]
    pos += 1
    if ch != "" goto next_by_key

    # 3) Walk the characters with one-character substr calls on the native
    #    string; the loop stops once an empty string comes back.
    say "substr"
    pos = 0
  next_by_substr:
    ch = substr text, pos, 1
    pos += 1
    if ch != "" goto next_by_substr

    # 4) Split the native string on the empty delimiter.
    say "split"
    .local pmc pieces
    pieces = split "", text
.end

_______________________________________________
http://lists.parrot.org/mailman/listinfo/parrot-dev

Reply via email to