andrei Sat Jun 24 21:57:14 2006 UTC Modified files: /php-src/ext/unicode unicode_iterators.c Log: A lot of work on making TextIterator support proper codepoint-level offsets and making it more robust in general.
http://cvs.php.net/viewvc.cgi/php-src/ext/unicode/unicode_iterators.c?r1=1.25&r2=1.26&diff_format=u Index: php-src/ext/unicode/unicode_iterators.c diff -u php-src/ext/unicode/unicode_iterators.c:1.25 php-src/ext/unicode/unicode_iterators.c:1.26 --- php-src/ext/unicode/unicode_iterators.c:1.25 Sat Jun 24 18:18:38 2006 +++ php-src/ext/unicode/unicode_iterators.c Sat Jun 24 21:57:14 2006 @@ -14,11 +14,12 @@ +----------------------------------------------------------------------+ */ -/* $Id: unicode_iterators.c,v 1.25 2006/06/24 18:18:38 andrei Exp $ */ +/* $Id: unicode_iterators.c,v 1.26 2006/06/24 21:57:14 andrei Exp $ */ /* * TODO * + * - test with empty and 1 character strings * - optimize current() to pass return_value to the handler so that it fills it * in directly instead of creating a new zval * - implement Countable (or count_elements handler) and Seekable interfaces @@ -51,20 +52,23 @@ size_t current_alloc; long flags; union { - int32_t start; struct { - int32_t start; + int32_t offset; + int32_t cp_offset; int32_t index; } cp; struct { int32_t start; int32_t end; int32_t index; + int32_t start_cp_offset; + int32_t end_cp_offset; } cs; struct { int32_t start; int32_t end; int32_t index; + int32_t cp_offset; UBreakIterator *iter; } brk; } u; @@ -77,11 +81,12 @@ } typedef struct { - int (*valid) (text_iter_obj* object TSRMLS_DC); - void (*current)(text_iter_obj* object TSRMLS_DC); - int (*key) (text_iter_obj* object TSRMLS_DC); - void (*next) (text_iter_obj* object TSRMLS_DC); - void (*rewind) (text_iter_obj* object TSRMLS_DC); + int (*valid) (text_iter_obj* object, long flags TSRMLS_DC); + void (*current)(text_iter_obj* object, long flags TSRMLS_DC); + int (*key) (text_iter_obj* object, long flags TSRMLS_DC); + int (*offset) (text_iter_obj* object, long flags TSRMLS_DC); + void (*next) (text_iter_obj* object, long flags TSRMLS_DC); + void (*rewind) (text_iter_obj* object, long flags TSRMLS_DC); } text_iter_ops; enum UBreakIteratorType brk_type_map[] = { @@ 
-97,52 +102,65 @@ /* Code point ops */ -static int text_iter_cp_valid(text_iter_obj* object TSRMLS_DC) +static int text_iter_cp_valid(text_iter_obj* object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { - return (object->u.cp.start > 0); + if (flags & ITER_REVERSE) { + return (object->u.cp.offset > 0); } else { - return (object->u.cp.start < object->text_len); + return (object->u.cp.offset < object->text_len); } } -static void text_iter_cp_current(text_iter_obj* object TSRMLS_DC) +static void text_iter_cp_current(text_iter_obj* object, long flags TSRMLS_DC) { - UChar32 cp; - int32_t tmp, buf_len; + UChar32 cp = 0; + int32_t tmp, buf_len = 0; - tmp = object->u.cp.start; - if (object->flags & ITER_REVERSE) { - U16_PREV(object->text, 0, tmp, cp); - } else { - U16_NEXT(object->text, tmp, object->text_len, cp); + if (text_iter_cp_valid(object, flags TSRMLS_CC)) { + tmp = object->u.cp.offset; + if (flags & ITER_REVERSE) { + U16_PREV(object->text, 0, tmp, cp); + } else { + U16_NEXT(object->text, tmp, object->text_len, cp); + } + buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current)); } - buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current)); Z_USTRVAL_P(object->current)[buf_len] = 0; Z_USTRLEN_P(object->current) = buf_len; } -static int text_iter_cp_key(text_iter_obj* object TSRMLS_DC) +static int text_iter_cp_key(text_iter_obj* object, long flags TSRMLS_DC) { return object->u.cp.index; } -static void text_iter_cp_next(text_iter_obj* object TSRMLS_DC) +static int text_iter_cp_offset(text_iter_obj* object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { - U16_BACK_1(object->text, 0, object->u.cp.start); - } else { - U16_FWD_1(object->text, object->u.cp.start, object->text_len); + return object->u.cp.cp_offset; +} + +static void text_iter_cp_next(text_iter_obj* object, long flags TSRMLS_DC) +{ + if (text_iter_cp_valid(object, flags TSRMLS_CC)) { + if (flags & ITER_REVERSE) { + U16_BACK_1(object->text, 0, 
object->u.cp.offset); + object->u.cp.cp_offset--; + } else { + U16_FWD_1(object->text, object->u.cp.offset, object->text_len); + object->u.cp.cp_offset++; + } + object->u.cp.index++; } - object->u.cp.index++; } -static void text_iter_cp_rewind(text_iter_obj *object TSRMLS_DC) +static void text_iter_cp_rewind(text_iter_obj *object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { - object->u.cp.start = object->text_len; + if (flags & ITER_REVERSE) { + object->u.cp.offset = object->text_len; + object->u.cp.cp_offset = u_countChar32(object->text, object->text_len); } else { - object->u.cp.start = 0; + object->u.cp.offset = 0; + object->u.cp.cp_offset = 0; } object->u.cp.index = 0; } @@ -151,25 +169,26 @@ text_iter_cp_valid, text_iter_cp_current, text_iter_cp_key, + text_iter_cp_offset, text_iter_cp_next, text_iter_cp_rewind, }; /* Combining sequence ops */ -static int text_iter_cs_valid(text_iter_obj* object TSRMLS_DC) +static int text_iter_cs_valid(text_iter_obj* object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { + if (flags & ITER_REVERSE) { return (object->u.cs.end > 0); } else { return (object->u.cs.end <= object->text_len); } } -static void text_iter_cs_current(text_iter_obj* object TSRMLS_DC) +static void text_iter_cs_current(text_iter_obj* object, long flags TSRMLS_DC) { uint32_t length = object->u.cs.end - object->u.cs.start; - if (length > object->current_alloc) { + if (length+1 > object->current_alloc) { object->current_alloc = length+1; Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc); } @@ -178,50 +197,68 @@ Z_USTRLEN_P(object->current) = length; } -static int text_iter_cs_key(text_iter_obj* object TSRMLS_DC) +static int text_iter_cs_key(text_iter_obj* object, long flags TSRMLS_DC) { return object->u.cs.index; } -static void text_iter_cs_next(text_iter_obj* object TSRMLS_DC) +static int text_iter_cs_offset(text_iter_obj* object, long flags TSRMLS_DC) +{ + return 
object->u.cs.start_cp_offset; +} + +static void text_iter_cs_next(text_iter_obj* object, long flags TSRMLS_DC) { UChar32 cp; - uint32_t tmp; + int32_t tmp, tmp2; - if (object->flags & ITER_REVERSE) { - object->u.cs.end = object->u.cs.start; - U16_PREV(object->text, 0, object->u.cs.start, cp); - if (u_getCombiningClass(cp) != 0) { - do { - U16_PREV(object->text, 0, object->u.cs.start, cp); - } while (object->u.cs.start > 0 && u_getCombiningClass(cp) != 0); - } - } else { - object->u.cs.start = object->u.cs.end; - U16_NEXT(object->text, object->u.cs.end, object->text_len, cp); - if (u_getCombiningClass(cp) == 0) { - tmp = object->u.cs.end; - while (tmp < object->text_len) { - U16_NEXT(object->text, tmp, object->text_len, cp); - if (u_getCombiningClass(cp) == 0) { - break; - } else { - object->u.cs.end = tmp; + if (text_iter_cs_valid(object, flags TSRMLS_CC)) { + if (flags & ITER_REVERSE) { + object->u.cs.end = object->u.cs.start; + object->u.cs.end_cp_offset = object->u.cs.start_cp_offset; + U16_PREV(object->text, 0, object->u.cs.start, cp); + object->u.cs.start_cp_offset--; + if (u_getCombiningClass(cp) != 0) { + do { + U16_PREV(object->text, 0, object->u.cs.start, cp); + object->u.cs.start_cp_offset--; + } while (object->u.cs.start > 0 && u_getCombiningClass(cp) != 0); + } + } else { + object->u.cs.start = object->u.cs.end; + object->u.cs.start_cp_offset = object->u.cs.end_cp_offset; + U16_NEXT(object->text, object->u.cs.end, object->text_len, cp); + object->u.cs.end_cp_offset++; + if (u_getCombiningClass(cp) == 0) { + tmp = object->u.cs.end; + tmp2 = object->u.cs.end_cp_offset; + while (tmp < object->text_len) { + U16_NEXT(object->text, tmp, object->text_len, cp); + tmp2++; + if (u_getCombiningClass(cp) == 0) { + break; + } else { + object->u.cs.end = tmp; + object->u.cs.end_cp_offset = tmp2; + } } } } + object->u.cs.index++; } - object->u.cs.index++; } -static void text_iter_cs_rewind(text_iter_obj *object TSRMLS_DC) +static void text_iter_cs_rewind(text_iter_obj 
*object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { + if (flags & ITER_REVERSE) { object->u.cs.start = object->u.cs.end = object->text_len; + object->u.cs.start_cp_offset = object->u.cs.end_cp_offset = + u_countChar32(object->text, object->text_len); } else { object->u.cs.start = object->u.cs.end = 0; + object->u.cs.start_cp_offset = object->u.cs.end_cp_offset = 0; } - text_iter_cs_next(object TSRMLS_CC); /* find first sequence */ + text_iter_cs_next(object, flags TSRMLS_CC); /* find first sequence */ object->u.cs.index = 0; /* because _next increments index */ } @@ -229,6 +266,7 @@ text_iter_cs_valid, text_iter_cs_current, text_iter_cs_key, + text_iter_cs_offset, text_iter_cs_next, text_iter_cs_rewind, }; @@ -236,70 +274,86 @@ /* UBreakIterator Character Ops */ -static int text_iter_brk_char_valid(text_iter_obj* object TSRMLS_DC) +static int text_iter_brk_char_valid(text_iter_obj* object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { + if (flags & ITER_REVERSE) { return (object->u.brk.start != UBRK_DONE); } else { return (object->u.brk.end != UBRK_DONE); } } -static void text_iter_brk_char_current(text_iter_obj* object TSRMLS_DC) +static void text_iter_brk_char_current(text_iter_obj* object, long flags TSRMLS_DC) { uint32_t length; int32_t start = object->u.brk.start; int32_t end = object->u.brk.end; - if (object->flags & ITER_REVERSE) { - if (end == UBRK_DONE) { - end = object->text_len; + if (start != UBRK_DONE && end != UBRK_DONE) { + length = end - start; + if (length+1 > object->current_alloc) { + object->current_alloc = length+1; + Z_USTRVAL_P(object->current) = eurealloc(Z_USTRVAL_P(object->current), object->current_alloc); } + u_memcpy(Z_USTRVAL_P(object->current), object->text + start, length); } else { - if (start == UBRK_DONE) { - start = 0; - } + length = 0; } - length = end - start; - if (length > object->current_alloc-1) { - object->current_alloc = length+1; - Z_USTRVAL_P(object->current) = 
eurealloc(Z_USTRVAL_P(object->current), object->current_alloc); - } - u_memcpy(Z_USTRVAL_P(object->current), object->text + start, length); + Z_USTRVAL_P(object->current)[length] = 0; Z_USTRLEN_P(object->current) = length; } -static int text_iter_brk_char_key(text_iter_obj* object TSRMLS_DC) +static int text_iter_brk_char_key(text_iter_obj* object, long flags TSRMLS_DC) { return object->u.brk.index; } -static void text_iter_brk_char_next(text_iter_obj* object TSRMLS_DC) +static int text_iter_brk_char_offset(text_iter_obj* object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { - if (object->u.brk.start != UBRK_DONE) { + return object->u.brk.cp_offset; +} + +static void text_iter_brk_char_next(text_iter_obj* object, long flags TSRMLS_DC) +{ + if (text_iter_brk_char_valid(object, flags TSRMLS_CC)) { + if (flags & ITER_REVERSE) { object->u.brk.end = object->u.brk.start; object->u.brk.start = ubrk_previous(object->u.brk.iter); - object->u.brk.index++; - } - } else { - if (object->u.brk.end != UBRK_DONE) { + if (object->u.brk.end - object->u.brk.start > 1) { + object->u.brk.cp_offset -= u_countChar32(object->text, object->u.brk.end - object->u.brk.start); + } else { + object->u.brk.cp_offset--; + } + if (object->u.brk.start == UBRK_DONE) { + object->u.brk.end = UBRK_DONE; + } + } else { + if (object->u.brk.end - object->u.brk.start > 1) { + object->u.brk.cp_offset += u_countChar32(object->text, object->u.brk.end - object->u.brk.start); + } else { + object->u.brk.cp_offset++; + } object->u.brk.start = object->u.brk.end; object->u.brk.end = ubrk_next(object->u.brk.iter); - object->u.brk.index++; + if (object->u.brk.end == UBRK_DONE) { + object->u.brk.start = UBRK_DONE; + } } + object->u.brk.index++; } } -static void text_iter_brk_char_rewind(text_iter_obj *object TSRMLS_DC) +static void text_iter_brk_char_rewind(text_iter_obj *object, long flags TSRMLS_DC) { - if (object->flags & ITER_REVERSE) { - object->u.brk.end = ubrk_last(object->u.brk.iter); - 
object->u.brk.start = ubrk_previous(object->u.brk.iter); + if (flags & ITER_REVERSE) { + object->u.brk.end = ubrk_last(object->u.brk.iter); + object->u.brk.start = ubrk_previous(object->u.brk.iter); + object->u.brk.cp_offset = u_countChar32(object->text, object->u.brk.start); } else { - object->u.brk.start = ubrk_first(object->u.brk.iter); - object->u.brk.end = ubrk_next(object->u.brk.iter); + object->u.brk.start = ubrk_first(object->u.brk.iter); + object->u.brk.end = ubrk_next(object->u.brk.iter); + object->u.brk.cp_offset = 0; } object->u.brk.index = 0; } @@ -308,6 +362,7 @@ text_iter_brk_char_valid, text_iter_brk_char_current, text_iter_brk_char_key, + text_iter_brk_char_offset, text_iter_brk_char_next, text_iter_brk_char_rewind, }; @@ -338,7 +393,7 @@ { text_iter_obj* obj = text_iter_to_obj(iter); - if (iter_ops[obj->type]->valid(obj TSRMLS_CC)) { + if (iter_ops[obj->type]->valid(obj, obj->flags TSRMLS_CC)) { return SUCCESS; } else { return FAILURE; @@ -349,7 +404,7 @@ { text_iter_obj* obj = text_iter_to_obj(iter); - iter_ops[obj->type]->current(obj TSRMLS_CC); + iter_ops[obj->type]->current(obj, obj->flags TSRMLS_CC); *data = &obj->current; } @@ -357,7 +412,7 @@ { text_iter_obj* obj = text_iter_to_obj(iter); - *int_key = iter_ops[obj->type]->key(obj TSRMLS_CC); + *int_key = iter_ops[obj->type]->key(obj, obj->flags TSRMLS_CC); return HASH_KEY_IS_LONG; } @@ -365,14 +420,14 @@ { text_iter_obj* obj = text_iter_to_obj(iter); - iter_ops[obj->type]->next(obj TSRMLS_CC); + iter_ops[obj->type]->next(obj, obj->flags TSRMLS_CC); } static void text_iter_rewind(zend_object_iterator* iter TSRMLS_DC) { text_iter_obj* obj = text_iter_to_obj(iter); - iter_ops[obj->type]->rewind(obj TSRMLS_CC); + iter_ops[obj->type]->rewind(obj, obj->flags TSRMLS_CC); } zend_object_iterator_funcs text_iter_funcs = { @@ -488,7 +543,7 @@ } } - iter_ops[intern->type]->rewind(intern TSRMLS_CC); + iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC); } PHP_METHOD(TextIterator, current) @@ 
-496,7 +551,7 @@ zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); - iter_ops[intern->type]->current(intern TSRMLS_CC); + iter_ops[intern->type]->current(intern, intern->flags TSRMLS_CC); RETURN_UNICODEL(Z_USTRVAL_P(intern->current), Z_USTRLEN_P(intern->current), 1); } @@ -505,7 +560,12 @@ zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); - iter_ops[intern->type]->next(intern TSRMLS_CC); + iter_ops[intern->type]->next(intern, intern->flags TSRMLS_CC); + if (iter_ops[intern->type]->valid(intern, intern->flags TSRMLS_CC)) { + RETURN_LONG(iter_ops[intern->type]->offset(intern, intern->flags TSRMLS_CC)); + } else { + RETURN_LONG((long)UBRK_DONE); + } } PHP_METHOD(TextIterator, key) @@ -513,7 +573,7 @@ zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); - RETURN_LONG(iter_ops[intern->type]->key(intern TSRMLS_CC)); + RETURN_LONG(iter_ops[intern->type]->key(intern, intern->flags TSRMLS_CC)); } PHP_METHOD(TextIterator, valid) @@ -521,7 +581,7 @@ zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); - RETURN_BOOL(iter_ops[intern->type]->valid(intern TSRMLS_CC)); + RETURN_BOOL(iter_ops[intern->type]->valid(intern, intern->flags TSRMLS_CC)); } PHP_METHOD(TextIterator, rewind) @@ -529,7 +589,8 @@ zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); - iter_ops[intern->type]->rewind(intern TSRMLS_CC); + iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC); + RETURN_LONG(iter_ops[intern->type]->offset(intern, intern->flags TSRMLS_CC)); } PHP_METHOD(TextIterator, offset) @@ -537,7 +598,22 @@ zval *object = getThis(); text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); - RETURN_LONG(intern->u.start); + 
RETURN_LONG(iter_ops[intern->type]->offset(intern, intern->flags TSRMLS_CC)); +} + +PHP_METHOD(TextIterator, previous) +{ + long flags; + zval *object = getThis(); + text_iter_obj *intern = (text_iter_obj*) zend_object_store_get_object(object TSRMLS_CC); + + flags = intern->flags | ITER_REVERSE; + iter_ops[intern->type]->next(intern, flags TSRMLS_CC); + if (iter_ops[intern->type]->valid(intern, flags TSRMLS_CC)) { + RETURN_LONG(iter_ops[intern->type]->offset(intern, flags TSRMLS_CC)); + } else { + RETURN_LONG((long)UBRK_DONE); + } } static zend_function_entry text_iterator_funcs[] = { @@ -551,6 +627,7 @@ PHP_ME(TextIterator, rewind, NULL, ZEND_ACC_PUBLIC) PHP_ME(TextIterator, offset, NULL, ZEND_ACC_PUBLIC) + PHP_ME(TextIterator, previous, NULL, ZEND_ACC_PUBLIC) {NULL, NULL, NULL} }; @@ -578,6 +655,8 @@ zend_declare_class_constant_long(text_iterator_ce, "WORD", sizeof("WORD")-1, ITER_WORD TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "LINE", sizeof("LINE")-1, ITER_LINE TSRMLS_CC); zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC); + + zend_declare_class_constant_long(text_iterator_ce, "DONE", sizeof("DONE")-1, UBRK_DONE TSRMLS_CC); } /*
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php