andrei          Sat Jun 24 21:57:14 2006 UTC

  Modified files:              
    /php-src/ext/unicode        unicode_iterators.c 
  Log:
  A lot of work on making TextIterator support propert codepoint-level
  offsets and making it more robust in general.
  
  
http://cvs.php.net/viewvc.cgi/php-src/ext/unicode/unicode_iterators.c?r1=1.25&r2=1.26&diff_format=u
Index: php-src/ext/unicode/unicode_iterators.c
diff -u php-src/ext/unicode/unicode_iterators.c:1.25 
php-src/ext/unicode/unicode_iterators.c:1.26
--- php-src/ext/unicode/unicode_iterators.c:1.25        Sat Jun 24 18:18:38 2006
+++ php-src/ext/unicode/unicode_iterators.c     Sat Jun 24 21:57:14 2006
@@ -14,11 +14,12 @@
    +----------------------------------------------------------------------+
 */
 
-/* $Id: unicode_iterators.c,v 1.25 2006/06/24 18:18:38 andrei Exp $ */
+/* $Id: unicode_iterators.c,v 1.26 2006/06/24 21:57:14 andrei Exp $ */
 
 /*
  * TODO
  *
+ * - test with empty and 1 character strings
  * - optimize current() to pass return_value to the handler so that it fills it
  *   in directly instead of creating a new zval
  * - implement Countable (or count_elements handler) and Seekable interfaces
@@ -51,20 +52,23 @@
        size_t                  current_alloc;
        long                    flags;
        union {
-               int32_t         start;
                struct {
-                       int32_t start;
+                       int32_t offset;
+                       int32_t cp_offset;
                        int32_t index;
                } cp;
                struct {
                        int32_t start;
                        int32_t end;
                        int32_t index;
+                       int32_t start_cp_offset;
+                       int32_t end_cp_offset;
                } cs;
                struct {
                        int32_t start;
                        int32_t end;
                        int32_t index;
+                       int32_t cp_offset;
                        UBreakIterator *iter;
                } brk;
        } u;
@@ -77,11 +81,12 @@
 }
 
 typedef struct {
-       int  (*valid)  (text_iter_obj* object TSRMLS_DC);
-       void (*current)(text_iter_obj* object TSRMLS_DC);
-       int  (*key)    (text_iter_obj* object TSRMLS_DC);
-       void (*next)   (text_iter_obj* object TSRMLS_DC);
-       void (*rewind) (text_iter_obj* object TSRMLS_DC);
+       int  (*valid)  (text_iter_obj* object, long flags TSRMLS_DC);
+       void (*current)(text_iter_obj* object, long flags TSRMLS_DC);
+       int  (*key)    (text_iter_obj* object, long flags TSRMLS_DC);
+       int  (*offset) (text_iter_obj* object, long flags TSRMLS_DC);
+       void (*next)   (text_iter_obj* object, long flags TSRMLS_DC);
+       void (*rewind) (text_iter_obj* object, long flags TSRMLS_DC);
 } text_iter_ops;
 
 enum UBreakIteratorType brk_type_map[] = {
@@ -97,52 +102,65 @@
 
 /* Code point ops */
 
-static int text_iter_cp_valid(text_iter_obj* object TSRMLS_DC)
+static int text_iter_cp_valid(text_iter_obj* object, long flags TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
-               return (object->u.cp.start > 0);
+       if (flags & ITER_REVERSE) {
+               return (object->u.cp.offset > 0);
        } else {
-               return (object->u.cp.start < object->text_len);
+               return (object->u.cp.offset < object->text_len);
        }
 }
 
-static void text_iter_cp_current(text_iter_obj* object TSRMLS_DC)
+static void text_iter_cp_current(text_iter_obj* object, long flags TSRMLS_DC)
 {
-       UChar32 cp;
-       int32_t tmp, buf_len;
+       UChar32 cp = 0;
+       int32_t tmp, buf_len = 0;
 
-       tmp = object->u.cp.start;
-       if (object->flags & ITER_REVERSE) {
-               U16_PREV(object->text, 0, tmp, cp);
-       } else {
-               U16_NEXT(object->text, tmp, object->text_len, cp);
+       if (text_iter_cp_valid(object, flags TSRMLS_CC)) {
+               tmp = object->u.cp.offset;
+               if (flags & ITER_REVERSE) {
+                       U16_PREV(object->text, 0, tmp, cp);
+               } else {
+                       U16_NEXT(object->text, tmp, object->text_len, cp);
+               }
+               buf_len = zend_codepoint_to_uchar(cp, 
Z_USTRVAL_P(object->current));
        }
-       buf_len = zend_codepoint_to_uchar(cp, Z_USTRVAL_P(object->current));
        Z_USTRVAL_P(object->current)[buf_len] = 0;
        Z_USTRLEN_P(object->current) = buf_len;
 }
 
-static int text_iter_cp_key(text_iter_obj* object TSRMLS_DC)
+static int text_iter_cp_key(text_iter_obj* object, long flags TSRMLS_DC)
 {
        return object->u.cp.index;
 }
 
-static void text_iter_cp_next(text_iter_obj* object TSRMLS_DC)
+static int text_iter_cp_offset(text_iter_obj* object, long flags TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
-               U16_BACK_1(object->text, 0, object->u.cp.start);
-       } else {
-               U16_FWD_1(object->text, object->u.cp.start, object->text_len);
+       return object->u.cp.cp_offset;
+}
+
+static void text_iter_cp_next(text_iter_obj* object, long flags TSRMLS_DC)
+{
+       if (text_iter_cp_valid(object, flags TSRMLS_CC)) {
+               if (flags & ITER_REVERSE) {
+                       U16_BACK_1(object->text, 0, object->u.cp.offset);
+                       object->u.cp.cp_offset--;
+               } else {
+                       U16_FWD_1(object->text, object->u.cp.offset, 
object->text_len);
+                       object->u.cp.cp_offset++;
+               }
+               object->u.cp.index++;
        }
-       object->u.cp.index++;
 }
 
-static void text_iter_cp_rewind(text_iter_obj *object TSRMLS_DC)
+static void text_iter_cp_rewind(text_iter_obj *object, long flags TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
-               object->u.cp.start = object->text_len;
+       if (flags & ITER_REVERSE) {
+               object->u.cp.offset = object->text_len;
+               object->u.cp.cp_offset = u_countChar32(object->text, 
object->text_len);
        } else {
-               object->u.cp.start = 0;
+               object->u.cp.offset = 0;
+               object->u.cp.cp_offset = 0;
        }
        object->u.cp.index  = 0;
 }
@@ -151,25 +169,26 @@
        text_iter_cp_valid,
        text_iter_cp_current,
        text_iter_cp_key,
+       text_iter_cp_offset,
        text_iter_cp_next,
        text_iter_cp_rewind,
 };
 
 /* Combining sequence ops */
 
-static int text_iter_cs_valid(text_iter_obj* object TSRMLS_DC)
+static int text_iter_cs_valid(text_iter_obj* object, long flags TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
+       if (flags & ITER_REVERSE) {
                return (object->u.cs.end > 0);
        } else {
                return (object->u.cs.end <= object->text_len);
        }
 }
 
-static void text_iter_cs_current(text_iter_obj* object TSRMLS_DC)
+static void text_iter_cs_current(text_iter_obj* object, long flags TSRMLS_DC)
 {
        uint32_t length = object->u.cs.end - object->u.cs.start;
-       if (length > object->current_alloc) {
+       if (length+1 > object->current_alloc) {
                object->current_alloc = length+1;
                Z_USTRVAL_P(object->current) = 
eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
        }
@@ -178,50 +197,68 @@
        Z_USTRLEN_P(object->current) = length;
 }
 
-static int text_iter_cs_key(text_iter_obj* object TSRMLS_DC)
+static int text_iter_cs_key(text_iter_obj* object, long flags TSRMLS_DC)
 {
        return object->u.cs.index;
 }
 
-static void text_iter_cs_next(text_iter_obj* object TSRMLS_DC)
+static int text_iter_cs_offset(text_iter_obj* object, long flags TSRMLS_DC)
+{
+       return object->u.cs.start_cp_offset;
+}
+
+static void text_iter_cs_next(text_iter_obj* object, long flags TSRMLS_DC)
 {
        UChar32 cp;
-       uint32_t tmp;
+       int32_t tmp, tmp2;
 
-       if (object->flags & ITER_REVERSE) {
-               object->u.cs.end = object->u.cs.start;
-               U16_PREV(object->text, 0, object->u.cs.start, cp);
-               if (u_getCombiningClass(cp) != 0) {
-                       do {
-                               U16_PREV(object->text, 0, object->u.cs.start, 
cp);
-                       } while (object->u.cs.start > 0 && 
u_getCombiningClass(cp) != 0);
-               }
-       } else {
-               object->u.cs.start = object->u.cs.end;
-               U16_NEXT(object->text, object->u.cs.end, object->text_len, cp);
-               if (u_getCombiningClass(cp) == 0) {
-                       tmp = object->u.cs.end;
-                       while (tmp < object->text_len) {
-                               U16_NEXT(object->text, tmp, object->text_len, 
cp);
-                               if (u_getCombiningClass(cp) == 0) {
-                                       break;
-                               } else {
-                                       object->u.cs.end = tmp;
+       if (text_iter_cs_valid(object, flags TSRMLS_CC)) {
+               if (flags & ITER_REVERSE) {
+                       object->u.cs.end = object->u.cs.start;
+                       object->u.cs.end_cp_offset = 
object->u.cs.start_cp_offset;
+                       U16_PREV(object->text, 0, object->u.cs.start, cp);
+                       object->u.cs.start_cp_offset--;
+                       if (u_getCombiningClass(cp) != 0) {
+                               do {
+                                       U16_PREV(object->text, 0, 
object->u.cs.start, cp);
+                                       object->u.cs.start_cp_offset--;
+                               } while (object->u.cs.start > 0 && 
u_getCombiningClass(cp) != 0);
+                       }
+               } else {
+                       object->u.cs.start = object->u.cs.end;
+                       object->u.cs.start_cp_offset = 
object->u.cs.end_cp_offset;
+                       U16_NEXT(object->text, object->u.cs.end, 
object->text_len, cp);
+                       object->u.cs.end_cp_offset++;
+                       if (u_getCombiningClass(cp) == 0) {
+                               tmp = object->u.cs.end;
+                               tmp2 = object->u.cs.end_cp_offset;
+                               while (tmp < object->text_len) {
+                                       U16_NEXT(object->text, tmp, 
object->text_len, cp);
+                                       tmp2++;
+                                       if (u_getCombiningClass(cp) == 0) {
+                                               break;
+                                       } else {
+                                               object->u.cs.end = tmp;
+                                               object->u.cs.end_cp_offset = 
tmp2;
+                                       }
                                }
                        }
                }
+               object->u.cs.index++;
        }
-       object->u.cs.index++;
 }
 
-static void text_iter_cs_rewind(text_iter_obj *object TSRMLS_DC)
+static void text_iter_cs_rewind(text_iter_obj *object, long flags TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
+       if (flags & ITER_REVERSE) {
                object->u.cs.start = object->u.cs.end = object->text_len;
+               object->u.cs.start_cp_offset = object->u.cs.end_cp_offset =
+                               u_countChar32(object->text, object->text_len);
        } else {
                object->u.cs.start = object->u.cs.end = 0;
+               object->u.cs.start_cp_offset = object->u.cs.end_cp_offset = 0;
        }
-       text_iter_cs_next(object TSRMLS_CC); /* find first sequence */
+       text_iter_cs_next(object, flags TSRMLS_CC); /* find first sequence */
        object->u.cs.index = 0; /* because _next increments index */
 }
 
@@ -229,6 +266,7 @@
        text_iter_cs_valid,
        text_iter_cs_current,
        text_iter_cs_key,
+       text_iter_cs_offset,
        text_iter_cs_next,
        text_iter_cs_rewind,
 };
@@ -236,70 +274,86 @@
 
 /* UBreakIterator Character Ops */
 
-static int text_iter_brk_char_valid(text_iter_obj* object TSRMLS_DC)
+static int text_iter_brk_char_valid(text_iter_obj* object, long flags 
TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
+       if (flags & ITER_REVERSE) {
                return (object->u.brk.start != UBRK_DONE);
        } else {
                return (object->u.brk.end != UBRK_DONE);
        }
 }
 
-static void text_iter_brk_char_current(text_iter_obj* object TSRMLS_DC)
+static void text_iter_brk_char_current(text_iter_obj* object, long flags 
TSRMLS_DC)
 {
        uint32_t length;
        int32_t start = object->u.brk.start;
        int32_t end = object->u.brk.end;
 
-       if (object->flags & ITER_REVERSE) {
-               if (end == UBRK_DONE) {
-                       end = object->text_len;
+       if (start != UBRK_DONE && end != UBRK_DONE) {
+               length = end - start;
+               if (length+1 > object->current_alloc) {
+                       object->current_alloc = length+1;
+                       Z_USTRVAL_P(object->current) = 
eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
                }
+               u_memcpy(Z_USTRVAL_P(object->current), object->text + start, 
length);
        } else {
-               if (start == UBRK_DONE) {
-                       start = 0;
-               }
+               length = 0;
        }
-       length = end - start;
-       if (length > object->current_alloc-1) {
-               object->current_alloc = length+1;
-               Z_USTRVAL_P(object->current) = 
eurealloc(Z_USTRVAL_P(object->current), object->current_alloc);
-       }
-       u_memcpy(Z_USTRVAL_P(object->current), object->text + start, length);
+
        Z_USTRVAL_P(object->current)[length] = 0;
        Z_USTRLEN_P(object->current) = length;
 }
 
-static int text_iter_brk_char_key(text_iter_obj* object TSRMLS_DC)
+static int text_iter_brk_char_key(text_iter_obj* object, long flags TSRMLS_DC)
 {
        return object->u.brk.index;
 }
 
-static void text_iter_brk_char_next(text_iter_obj* object TSRMLS_DC)
+static int text_iter_brk_char_offset(text_iter_obj* object, long flags 
TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
-               if (object->u.brk.start != UBRK_DONE) {
+       return object->u.brk.cp_offset;
+}
+
+static void text_iter_brk_char_next(text_iter_obj* object, long flags 
TSRMLS_DC)
+{
+       if (text_iter_brk_char_valid(object, flags TSRMLS_CC)) {
+               if (flags & ITER_REVERSE) {
                        object->u.brk.end = object->u.brk.start;
                        object->u.brk.start = ubrk_previous(object->u.brk.iter);
-                       object->u.brk.index++;
-               }
-       } else {
-               if (object->u.brk.end != UBRK_DONE) {
+                       if (object->u.brk.end - object->u.brk.start > 1) {
+                               object->u.brk.cp_offset -= 
u_countChar32(object->text, object->u.brk.end - object->u.brk.start);
+                       } else {
+                               object->u.brk.cp_offset--;
+                       }
+                       if (object->u.brk.start == UBRK_DONE) {
+                               object->u.brk.end = UBRK_DONE;
+                       }
+               } else {
+                       if (object->u.brk.end - object->u.brk.start > 1) {
+                               object->u.brk.cp_offset += 
u_countChar32(object->text, object->u.brk.end - object->u.brk.start);
+                       } else {
+                               object->u.brk.cp_offset++;
+                       }
                        object->u.brk.start = object->u.brk.end;
                        object->u.brk.end = ubrk_next(object->u.brk.iter);
-                       object->u.brk.index++;
+                       if (object->u.brk.end == UBRK_DONE) {
+                               object->u.brk.start = UBRK_DONE;
+                       }
                }
+               object->u.brk.index++;
        }
 }
 
-static void text_iter_brk_char_rewind(text_iter_obj *object TSRMLS_DC)
+static void text_iter_brk_char_rewind(text_iter_obj *object, long flags 
TSRMLS_DC)
 {
-       if (object->flags & ITER_REVERSE) {
-               object->u.brk.end   = ubrk_last(object->u.brk.iter);
-               object->u.brk.start = ubrk_previous(object->u.brk.iter);
+       if (flags & ITER_REVERSE) {
+               object->u.brk.end       = ubrk_last(object->u.brk.iter);
+               object->u.brk.start     = ubrk_previous(object->u.brk.iter);
+               object->u.brk.cp_offset = u_countChar32(object->text, 
object->u.brk.start);
        } else {
-               object->u.brk.start = ubrk_first(object->u.brk.iter);
-               object->u.brk.end   = ubrk_next(object->u.brk.iter);
+               object->u.brk.start     = ubrk_first(object->u.brk.iter);
+               object->u.brk.end       = ubrk_next(object->u.brk.iter);
+               object->u.brk.cp_offset = 0;
        }
        object->u.brk.index = 0;
 }
@@ -308,6 +362,7 @@
        text_iter_brk_char_valid,
        text_iter_brk_char_current,
        text_iter_brk_char_key,
+       text_iter_brk_char_offset,
        text_iter_brk_char_next,
        text_iter_brk_char_rewind,
 };
@@ -338,7 +393,7 @@
 {
        text_iter_obj* obj = text_iter_to_obj(iter);
 
-       if (iter_ops[obj->type]->valid(obj TSRMLS_CC)) {
+       if (iter_ops[obj->type]->valid(obj, obj->flags TSRMLS_CC)) {
                return SUCCESS;
        } else {
                return FAILURE;
@@ -349,7 +404,7 @@
 {
        text_iter_obj* obj = text_iter_to_obj(iter);
 
-       iter_ops[obj->type]->current(obj TSRMLS_CC);
+       iter_ops[obj->type]->current(obj, obj->flags TSRMLS_CC);
        *data = &obj->current;
 }
 
@@ -357,7 +412,7 @@
 {
        text_iter_obj* obj = text_iter_to_obj(iter);
 
-       *int_key = iter_ops[obj->type]->key(obj TSRMLS_CC);
+       *int_key = iter_ops[obj->type]->key(obj, obj->flags TSRMLS_CC);
        return HASH_KEY_IS_LONG;
 }
 
@@ -365,14 +420,14 @@
 {
        text_iter_obj* obj = text_iter_to_obj(iter);
 
-       iter_ops[obj->type]->next(obj TSRMLS_CC);
+       iter_ops[obj->type]->next(obj, obj->flags TSRMLS_CC);
 }
 
 static void text_iter_rewind(zend_object_iterator* iter TSRMLS_DC)
 {
        text_iter_obj* obj = text_iter_to_obj(iter);
 
-       iter_ops[obj->type]->rewind(obj TSRMLS_CC);
+       iter_ops[obj->type]->rewind(obj, obj->flags TSRMLS_CC);
 }
 
 zend_object_iterator_funcs text_iter_funcs = {
@@ -488,7 +543,7 @@
                }
        }
 
-       iter_ops[intern->type]->rewind(intern TSRMLS_CC);
+       iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC);
 }
 
 PHP_METHOD(TextIterator, current)
@@ -496,7 +551,7 @@
        zval *object = getThis();
        text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
 
-       iter_ops[intern->type]->current(intern TSRMLS_CC);
+       iter_ops[intern->type]->current(intern, intern->flags TSRMLS_CC);
        RETURN_UNICODEL(Z_USTRVAL_P(intern->current), 
Z_USTRLEN_P(intern->current), 1);
 }
 
@@ -505,7 +560,12 @@
        zval *object = getThis();
        text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
 
-       iter_ops[intern->type]->next(intern TSRMLS_CC);
+       iter_ops[intern->type]->next(intern, intern->flags TSRMLS_CC);
+       if (iter_ops[intern->type]->valid(intern, intern->flags TSRMLS_CC)) {
+               RETURN_LONG(iter_ops[intern->type]->offset(intern, 
intern->flags TSRMLS_CC));
+       } else {
+               RETURN_LONG((long)UBRK_DONE);
+       }
 }
 
 PHP_METHOD(TextIterator, key)
@@ -513,7 +573,7 @@
        zval *object = getThis();
        text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
 
-       RETURN_LONG(iter_ops[intern->type]->key(intern TSRMLS_CC));
+       RETURN_LONG(iter_ops[intern->type]->key(intern, intern->flags 
TSRMLS_CC));
 }
 
 PHP_METHOD(TextIterator, valid)
@@ -521,7 +581,7 @@
        zval *object = getThis();
        text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
 
-       RETURN_BOOL(iter_ops[intern->type]->valid(intern TSRMLS_CC));
+       RETURN_BOOL(iter_ops[intern->type]->valid(intern, intern->flags 
TSRMLS_CC));
 }
 
 PHP_METHOD(TextIterator, rewind)
@@ -529,7 +589,8 @@
        zval *object = getThis();
        text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
 
-       iter_ops[intern->type]->rewind(intern TSRMLS_CC);
+       iter_ops[intern->type]->rewind(intern, intern->flags TSRMLS_CC);
+       RETURN_LONG(iter_ops[intern->type]->offset(intern, intern->flags 
TSRMLS_CC));
 }
 
 PHP_METHOD(TextIterator, offset)
@@ -537,7 +598,22 @@
        zval *object = getThis();
        text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
 
-       RETURN_LONG(intern->u.start);
+       RETURN_LONG(iter_ops[intern->type]->offset(intern, intern->flags 
TSRMLS_CC));
+}
+
+PHP_METHOD(TextIterator, previous)
+{
+       long flags;
+       zval *object = getThis();
+       text_iter_obj *intern = (text_iter_obj*) 
zend_object_store_get_object(object TSRMLS_CC);
+
+       flags = intern->flags | ITER_REVERSE;
+       iter_ops[intern->type]->next(intern, flags TSRMLS_CC);
+       if (iter_ops[intern->type]->valid(intern, flags TSRMLS_CC)) {
+               RETURN_LONG(iter_ops[intern->type]->offset(intern, flags 
TSRMLS_CC));
+       } else {
+               RETURN_LONG((long)UBRK_DONE);
+       }
 }
 
 static zend_function_entry text_iterator_funcs[] = {
@@ -551,6 +627,7 @@
        PHP_ME(TextIterator, rewind,      NULL, ZEND_ACC_PUBLIC)
 
        PHP_ME(TextIterator, offset,      NULL, ZEND_ACC_PUBLIC)
+       PHP_ME(TextIterator, previous,    NULL, ZEND_ACC_PUBLIC)
        {NULL, NULL, NULL}
 };
 
@@ -578,6 +655,8 @@
        zend_declare_class_constant_long(text_iterator_ce, "WORD", 
sizeof("WORD")-1, ITER_WORD TSRMLS_CC);
        zend_declare_class_constant_long(text_iterator_ce, "LINE", 
sizeof("LINE")-1, ITER_LINE TSRMLS_CC);
        zend_declare_class_constant_long(text_iterator_ce, "SENTENCE", 
sizeof("SENTENCE")-1, ITER_SENTENCE TSRMLS_CC);
+
+       zend_declare_class_constant_long(text_iterator_ce, "DONE", 
sizeof("DONE")-1, UBRK_DONE TSRMLS_CC);
 }
 
 /*

-- 
PHP CVS Mailing List (http://www.php.net/)
To unsubscribe, visit: http://www.php.net/unsub.php

Reply via email to