cvsuser     03/11/14 12:27:02

  Modified:    encodings dbcs.c singlebyte.c utf16.c utf32.c utf8.c
               include/parrot encoding.h string.h string_funcs.h
               src      chartype.c string.c
  Log:
  Implement string iterator and per-encoding decode_and_advance functions
  
  Revision  Changes    Path
  1.2       +20 -3     parrot/encodings/dbcs.c
  
  Index: dbcs.c
  ===================================================================
  RCS file: /cvs/public/parrot/encodings/dbcs.c,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -w -r1.1 -r1.2
  --- dbcs.c    3 Nov 2003 15:04:58 -0000       1.1
  +++ dbcs.c    14 Nov 2003 20:26:38 -0000      1.2
  @@ -1,7 +1,7 @@
   /* dbcs.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: dbcs.c,v 1.1 2003/11/03 15:04:58 petergibbs Exp $
  + *     $Id: dbcs.c,v 1.2 2003/11/14 20:26:38 petergibbs Exp $
    *  Overview:
    *     This defines the DBCS encoding routines.
    *  Data Structure and Algorithms:
  @@ -90,15 +90,32 @@
       return ptr;
   }
   
  +static UINTVAL
  +dbcs_decode_and_advance(struct string_iterator_t *i)
  +{
  +    const byte_t *ptr = (byte_t *)i->str->strstart + i->bytepos;
  +    if (*ptr < 128) {
  +        i->bytepos++;
  +        i->charpos++;
  +        return *ptr;
  +    }
  +    else {
  +        i->bytepos += 2;
  +        i->charpos++;
  +        return (*ptr << 8) | *(ptr+1);
  +    }
  +}
  +
   const ENCODING dbcs_encoding = {
       enum_encoding_dbcs,
       "dbcs",
  -    1,
  +    2,
       dbcs_characters,
       dbcs_decode,
       dbcs_encode,
       dbcs_skip_forward,
  -    dbcs_skip_backward
  +    dbcs_skip_backward,
  +    dbcs_decode_and_advance
   };
   
   /*
  
  
  
  1.17      +12 -2     parrot/encodings/singlebyte.c
  
  Index: singlebyte.c
  ===================================================================
  RCS file: /cvs/public/parrot/encodings/singlebyte.c,v
  retrieving revision 1.16
  retrieving revision 1.17
  diff -u -w -r1.16 -r1.17
  --- singlebyte.c      21 Jul 2003 18:00:37 -0000      1.16
  +++ singlebyte.c      14 Nov 2003 20:26:40 -0000      1.17
  @@ -1,7 +1,7 @@
   /* singlebyte.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: singlebyte.c,v 1.16 2003/07/21 18:00:37 chromatic Exp $
  + *     $Id: singlebyte.c,v 1.17 2003/11/14 20:26:40 petergibbs Exp $
    *  Overview:
    *     This defines the single byte encoding routines.
    *  Data Structure and Algorithms:
  @@ -59,6 +59,15 @@
       return bptr - n;
   }
   
  +static UINTVAL
  +singlebyte_decode_and_advance(struct string_iterator_t *i)
  +{
  +    const byte_t *ptr = (byte_t *)i->str->strstart + i->bytepos;
  +    i->bytepos++;
  +    i->charpos++;
  +    return *ptr;
  +}
  +
   const ENCODING singlebyte_encoding = {
       enum_encoding_singlebyte,
       "singlebyte",
  @@ -67,7 +76,8 @@
       singlebyte_decode,
       singlebyte_encode,
       singlebyte_skip_forward,
  -    singlebyte_skip_backward
  +    singlebyte_skip_backward,
  +    singlebyte_decode_and_advance
   };
   
   /*
  
  
  
  1.14      +31 -2     parrot/encodings/utf16.c
  
  Index: utf16.c
  ===================================================================
  RCS file: /cvs/public/parrot/encodings/utf16.c,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -w -r1.13 -r1.14
  --- utf16.c   21 Jul 2003 18:00:37 -0000      1.13
  +++ utf16.c   14 Nov 2003 20:26:40 -0000      1.14
  @@ -1,7 +1,7 @@
   /* utf16.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: utf16.c,v 1.13 2003/07/21 18:00:37 chromatic Exp $
  + *     $Id: utf16.c,v 1.14 2003/11/14 20:26:40 petergibbs Exp $
    *  Overview:
    *     This defines the UTF-16 encoding routines.
    *  Data Structure and Algorithms:
  @@ -131,6 +131,34 @@
       return u16ptr;
   }
   
  +static UINTVAL
  +utf16_decode_and_advance(struct string_iterator_t *i)
  +{
  +    const utf16_t *u16ptr = (char *)i->str->strstart + i->bytepos;
  +    UINTVAL c = *u16ptr++;
  +
  +    if (UNICODE_IS_HIGH_SURROGATE(c)) {
  +        utf16_t low = *u16ptr++;
  +
  +        if (!UNICODE_IS_LOW_SURROGATE(low)) {
  +            internal_exception(MALFORMED_UTF16,
  +                               "Malformed UTF-16 surrogate\n");
  +        }
  +
  +        c = UNICODE_DECODE_SURROGATE(c, low);
  +        i->bytepos += 4;
  +    }
  +    else if (UNICODE_IS_LOW_SURROGATE(c)) {
  +        internal_exception(MALFORMED_UTF16, "Malformed UTF-16 surrogate\n");
  +    }
  +    else {
  +        i->bytepos += 2;
  +    }
  +
  +    i->charpos++;
  +    return c;
  +}
  +
   const ENCODING utf16_encoding = {
       enum_encoding_utf16,
       "utf16",
  @@ -139,7 +167,8 @@
       utf16_decode,
       utf16_encode,
       utf16_skip_forward,
  -    utf16_skip_backward
  +    utf16_skip_backward,
  +    utf16_decode_and_advance
   };
   
   /*
  
  
  
  1.12      +12 -2     parrot/encodings/utf32.c
  
  Index: utf32.c
  ===================================================================
  RCS file: /cvs/public/parrot/encodings/utf32.c,v
  retrieving revision 1.11
  retrieving revision 1.12
  diff -u -w -r1.11 -r1.12
  --- utf32.c   21 Jul 2003 18:00:37 -0000      1.11
  +++ utf32.c   14 Nov 2003 20:26:40 -0000      1.12
  @@ -1,7 +1,7 @@
   /* utf32.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: utf32.c,v 1.11 2003/07/21 18:00:37 chromatic Exp $
  + *     $Id: utf32.c,v 1.12 2003/11/14 20:26:40 petergibbs Exp $
    *  Overview:
    *     This defines the UTF-32 encoding routines.
    *  Data Structure and Algorithms:
  @@ -64,6 +64,15 @@
       return u32ptr - n;
   }
   
  +static UINTVAL
  +utf32_decode_and_advance(struct string_iterator_t *i)
  +{
  +    const utf32_t *u32ptr = (utf32_t *)((char *)i->str->strstart + i->bytepos);
  +    i->bytepos += 4;
  +    i->charpos++;
  +    return *u32ptr;
  +}
  +
   const ENCODING utf32_encoding = {
       enum_encoding_utf32,
       "utf32",
  @@ -72,7 +81,8 @@
       utf32_decode,
       utf32_encode,
       utf32_skip_forward,
  -    utf32_skip_backward
  +    utf32_skip_backward,
  +    utf32_decode_and_advance
   };
   
   /*
  
  
  
  1.15      +39 -2     parrot/encodings/utf8.c
  
  Index: utf8.c
  ===================================================================
  RCS file: /cvs/public/parrot/encodings/utf8.c,v
  retrieving revision 1.14
  retrieving revision 1.15
  diff -u -w -r1.14 -r1.15
  --- utf8.c    21 Jul 2003 18:00:37 -0000      1.14
  +++ utf8.c    14 Nov 2003 20:26:40 -0000      1.15
  @@ -1,7 +1,7 @@
   /* utf8.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: utf8.c,v 1.14 2003/07/21 18:00:37 chromatic Exp $
  + *     $Id: utf8.c,v 1.15 2003/11/14 20:26:40 petergibbs Exp $
    *  Overview:
    *     This defines the UTF-8 encoding routines.
    *  Data Structure and Algorithms:
  @@ -133,6 +133,42 @@
       return u8ptr;
   }
   
  +static UINTVAL
  +utf8_decode_and_advance(struct string_iterator_t *i)
  +{
  +    const utf8_t *u8ptr = (char *)i->str->strstart + i->bytepos;
  +    UINTVAL c = *u8ptr;
  +
  +    if (UTF8_IS_START(c)) {
  +        UINTVAL len = UTF8SKIP(u8ptr);
  +        UINTVAL count;
  +
  +        c &= UTF8_START_MASK(len);
  +        i->bytepos += len;
  +//      for (count = 1; count < len; count++) {
  +        for (len--; len; len--) {
  +            u8ptr++;
  +            if (!UTF8_IS_CONTINUATION(*u8ptr)) {
  +                internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
  +            }
  +            c = UTF8_ACCUMULATE(c, *u8ptr);
  +        }
  +
  +        if (UNICODE_IS_SURROGATE(c)) {
  +            internal_exception(MALFORMED_UTF8, "Surrogate in UTF-8 string\n");
  +        }
  +    }
  +    else if (!UNICODE_IS_INVARIANT(c)) {
  +        internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
  +    }
  +    else {
  +        i->bytepos++;
  +    }
  +
  +    i->charpos++;
  +    return c;
  +}
  +
   const ENCODING utf8_encoding = {
       enum_encoding_utf8,
       "utf8",
  @@ -141,7 +177,8 @@
       utf8_decode,
       utf8_encode,
       utf8_skip_forward,
  -    utf8_skip_backward
  +    utf8_skip_backward,
  +    utf8_decode_and_advance
   };
   
   /*
  
  
  
  1.23      +6 -3      parrot/include/parrot/encoding.h
  
  Index: encoding.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/encoding.h,v
  retrieving revision 1.22
  retrieving revision 1.23
  diff -u -w -r1.22 -r1.23
  --- encoding.h        3 Nov 2003 15:05:01 -0000       1.22
  +++ encoding.h        14 Nov 2003 20:26:55 -0000      1.23
  @@ -1,7 +1,7 @@
   /* encoding.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: encoding.h,v 1.22 2003/11/03 15:05:01 petergibbs Exp $
  + *     $Id: encoding.h,v 1.23 2003/11/14 20:26:55 petergibbs Exp $
    *  Overview:
    *     This is the api header for the string encoding subsystem
    *  Data Structure and Algorithms:
  @@ -25,6 +25,8 @@
   
   /* &end_gen */
   
  +struct string_iterator_t;
  +
   struct parrot_encoding_t {
       INTVAL index;
       const char *name;
  @@ -34,6 +36,7 @@
       void *(*encode) (void *ptr, Parrot_UInt c);
       const void *(*skip_forward) (const void *ptr, Parrot_UInt n);
       const void *(*skip_backward) (const void *ptr, Parrot_UInt n);
  +    Parrot_UInt(*decode_and_advance) (struct string_iterator_t *i);
   };
   
   typedef struct parrot_encoding_t* Parrot_Encoding;
  
  
  
  1.55      +8 -1      parrot/include/parrot/string.h
  
  Index: string.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/string.h,v
  retrieving revision 1.54
  retrieving revision 1.55
  diff -u -w -r1.54 -r1.55
  --- string.h  21 Jul 2003 18:00:42 -0000      1.54
  +++ string.h  14 Nov 2003 20:26:57 -0000      1.55
  @@ -1,7 +1,7 @@
   /* string.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string.h,v 1.54 2003/07/21 18:00:42 chromatic Exp $
  + *     $Id: string.h,v 1.55 2003/11/14 20:26:57 petergibbs Exp $
    *  Overview:
    *     This is the api header for the string subsystem
    *  Data Structure and Algorithms:
  @@ -35,6 +35,13 @@
       TAIL_moved_FLAG = 1 << 0
   } TAIL_flags;
   
  +/* String iterator */
  +typedef struct string_iterator_t {
  +  String *str;
  +  UINTVAL bytepos;
  +  UINTVAL charpos;
  +  UINTVAL (*decode_and_advance)(struct string_iterator_t *i);
  +} string_iterator;
   
   /* stringinfo parameters */
   
  
  
  
  1.30      +2 -1      parrot/include/parrot/string_funcs.h
  
  Index: string_funcs.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
  retrieving revision 1.29
  retrieving revision 1.30
  diff -u -w -r1.29 -r1.30
  --- string_funcs.h    14 Nov 2003 08:35:47 -0000      1.29
  +++ string_funcs.h    14 Nov 2003 20:26:58 -0000      1.30
  @@ -1,7 +1,7 @@
   /* string_funcs.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string_funcs.h,v 1.29 2003/11/14 08:35:47 leo Exp $
  + *     $Id: string_funcs.h,v 1.30 2003/11/14 20:26:58 petergibbs Exp $
    *  Overview:
    *     This is the api header for the string subsystem
    *  Data Structure and Algorithms:
  @@ -69,6 +69,7 @@
                  STRING *s2, STRING **dest);
   STRING *string_bitwise_xor(struct Parrot_Interp *interpreter, STRING *s1,
                  STRING *s2, STRING **dest);
  +void string_iterator_init(struct string_iterator_t *i, STRING *s);
   
   #endif
   
  
  
  
  1.21      +2 -2      parrot/src/chartype.c
  
  Index: chartype.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/chartype.c,v
  retrieving revision 1.20
  retrieving revision 1.21
  diff -u -w -r1.20 -r1.21
  --- chartype.c        6 Nov 2003 20:56:06 -0000       1.20
  +++ chartype.c        14 Nov 2003 20:27:02 -0000      1.21
  @@ -1,7 +1,7 @@
   /* chartype.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: chartype.c,v 1.20 2003/11/06 20:56:06 petergibbs Exp $
  + *     $Id: chartype.c,v 1.21 2003/11/14 20:27:02 petergibbs Exp $
    *  Overview:
    *     This defines the string character type subsystem
    *  Data Structure and Algorithms:
  @@ -115,7 +115,7 @@
               }
           }
           internal_exception(INVALID_CHARACTER,
  -                           "Invalid character for chartype\n");
  +                           "Invalid character <%X> for chartype\n",c);
           return 0;
       }
   }
  
  
  
  1.158     +18 -13    parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.157
  retrieving revision 1.158
  diff -u -w -r1.157 -r1.158
  --- string.c  14 Nov 2003 08:35:50 -0000      1.157
  +++ string.c  14 Nov 2003 20:27:02 -0000      1.158
  @@ -1,7 +1,7 @@
   /* string.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string.c,v 1.157 2003/11/14 08:35:50 leo Exp $
  + *     $Id: string.c,v 1.158 2003/11/14 20:27:02 petergibbs Exp $
    *  Overview:
    *     This is the api definitions for the string subsystem
    *  Data Structure and Algorithms:
  @@ -1013,9 +1013,9 @@
   INTVAL
   hash_string_equal(struct Parrot_Interp *interpreter, STRING *s1, STRING *s2)
   {
  -    const char *s1start, *s1end;
  -    const char *s2start;
  -    size_t len;
  +    struct string_iterator_t i1;
  +    struct string_iterator_t i2;
  +
       /*
        * both strings aren't null
        */
  @@ -1026,23 +1026,19 @@
       /*
        * both strings have equal amount of chars
        */
  -    s1start = s1->strstart;
  -    s2start = s2->strstart;
  -    len = (size_t) s1->bufused;
   
       /* speed up ascii, slow down general case
        */
       if (s1->encoding->index == enum_encoding_singlebyte &&
           s2->encoding->index == enum_encoding_singlebyte) {
  -        return memcmp(s1start, s2start, s1->bufused);
  +        return memcmp(s1->strstart, s2->strstart, s1->bufused);
       }
   
  -    s1end = s1start + len;
  -    while (s1start < s1end) {
  -        if (s1->encoding->decode(s1start) != s2->encoding->decode(s2start))
  +    string_iterator_init(&i1, s1);
  +    string_iterator_init(&i2, s2);
  +    while (i1.charpos < s1->strlen) {
  +        if (i1.decode_and_advance(&i1) != i2.decode_and_advance(&i2))
               return 1;
  -        s1start = s1->encoding->skip_forward(s1start, 1);
  -        s2start = s2->encoding->skip_forward(s2start, 1);
       }
       return 0;
   }
  @@ -1651,6 +1647,15 @@
       PObj_sysmem_CLEAR(s);
       /* Free up the memory */
       mem_sys_free(memory);
  +}
  +
  +void
  +string_iterator_init(struct string_iterator_t *i, STRING *s)
  +{
  +    i->str = s;
  +    i->bytepos = 0;
  +    i->charpos = 0;
  +    i->decode_and_advance = s->encoding->decode_and_advance;
   }
   
   /*
  
  
  

Reply via email to