op string_cs.t

Leopold Toetsch Mon, 28 Feb 2005 09:18:14 -0800

cvsuser     05/02/28 09:17:57


  Modified:    charset  ascii.c ascii.h iso-8859-1.c iso-8859-1.h
               encodings fixed_8.c
               include/parrot string_funcs.h
               ops      ops.num string.ops
               src      string.c
               t/op     string_cs.t
  Log:
  Strings. Finally. 4 - character classification
  * is_digit, _wordchar, _punctuation, _newline, _whitespace
    opcodes and interface functions
  * make a bitmask from typetables
  
  Revision  Changes    Path
  1.10      +106 -44   parrot/charset/ascii.c
  
  Index: ascii.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/ascii.c,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- ascii.c   28 Feb 2005 15:10:55 -0000      1.9
  +++ ascii.c   28 Feb 2005 17:17:51 -0000      1.10
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: ascii.c,v 1.9 2005/02/28 15:10:55 leo Exp $
  +$Id: ascii.c,v 1.10 2005/02/28 17:17:51 leo Exp $
   
   =head1 NAME
   
  @@ -21,21 +21,26 @@
   /* The encoding we prefer, given a choice */
   static ENCODING *preferred_encoding;
   
  +#define WHITESPACE 1
  +#define WORDCHAR 2
  +#define PUNCTUATION 4
  +#define DIGIT 8
  +
   static const unsigned char typetable[256] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
  -    1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
  -    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, /* 48-63 */
  -    3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
  -    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 80-95 */
  -    3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
  -    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 0, /* 112-127 */
  +    1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32-47 */
  +    0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 4, 4, 4, 4, 4, 4, 
/*48.*/
  +    4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
  +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, /* 80-95 */
  +    4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
  +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 0, /* 112-127 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 160-175 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 176-191 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 192-207 */
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 207-223 */
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 208-223 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 224-239 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 240-255 */
   };
  @@ -44,19 +49,13 @@
   ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
           unsigned char type, const unsigned char *table)
   {
  -    INTVAL retval = -1;
  -    INTVAL found = 0;
   
       for (; start < string->strlen; start++) {
           if (table[ENCODING_GET_CODEPOINT(interpreter, string, start)] == 
type) {
  -            found = 1;
  -            break;
  +            return start;
           }
       }
  -    if (found) {
  -        retval = start;
  -    }
  -    return retval;
  +    return -1;
   }
   
   INTVAL
  @@ -251,112 +250,175 @@
       return -1;
   }
   
  -/* Binary's always valid */
   static UINTVAL
  -validate(Interp *interpreter, STRING *source_string)
  +validate(Interp *interpreter, STRING *src)
   {
  +    UINTVAL codepoint, offset;
  +
  +    for (offset = 0; offset < string_length(interpreter, src); ++offset) {
  +        codepoint = ENCODING_GET_CODEPOINT(interpreter, src, offset);
  +        if (codepoint >= 0x80)
  +            return 0;
  +    }
       return 1;
   }
   
  -/* No word chars in binary data */
  +
   static INTVAL
   is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return 0;
  +    UINTVAL codepoint;
  +    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  +    return (typetable[codepoint] & WORDCHAR) ? 1 : 0;
   }
   
   static INTVAL
   find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return -1;
  +    return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
  +            typetable);
   }
   
   static INTVAL
   find_not_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return offset;
  +    return ascii_find_not_thing(interpreter, source_string, offset, WORDCHAR,
  +            typetable);
   }
   
   static INTVAL
   is_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return 0;
  +    UINTVAL codepoint;
  +    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  +    return (typetable[codepoint] == WHITESPACE);
   }
   
   static INTVAL
   find_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return -1;
  +    return ascii_find_thing(interpreter, source_string, offset, WHITESPACE,
  +            typetable);
   }
   
   static INTVAL
   find_not_whitespace(Interp *interpreter, STRING *source_string,
           UINTVAL offset)
   {
  -    return offset;
  +    return ascii_find_not_thing(interpreter, source_string, offset,
  +            WHITESPACE, typetable);
   }
   
   static INTVAL
   is_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return 0;
  +    UINTVAL codepoint;
  +    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  +    return (typetable[codepoint] & DIGIT) ? 1 : 0;
   }
   
   static INTVAL
   find_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return -1;
  +    return ascii_find_thing(interpreter, source_string, offset, DIGIT,
  +            typetable);
   }
   
   static INTVAL
   find_not_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return offset;
  +    return ascii_find_not_thing(interpreter, source_string, offset, DIGIT,
  +            typetable);
   }
   
   static INTVAL
   is_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return 0;
  +    UINTVAL codepoint;
  +    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  +    return (typetable[codepoint] == PUNCTUATION);
   }
   
   static INTVAL
   find_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return -1;
  +    return ascii_find_thing(interpreter, source_string, offset, PUNCTUATION,
  +            typetable);
   }
   
   static INTVAL
   find_not_punctuation(Interp *interpreter, STRING *source_string,
           UINTVAL offset)
   {
  -    return offset;
  +    return ascii_find_not_thing(interpreter, source_string, offset,
  +            PUNCTUATION, typetable);
   }
   
  -static INTVAL
  -is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
  +INTVAL
  +ascii_is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return 0;
  +    UINTVAL codepoint;
  +    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  +    return codepoint == 10;
   }
   
  -static INTVAL
  -find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
  +
  +INTVAL
  +ascii_find_newline(Interp *interpreter, STRING *string, UINTVAL start)
   {
  +    for (; start < string->strlen; start++) {
  +        if (ENCODING_GET_CODEPOINT(interpreter, string, start) == 10) {
  +            return start;
  +        }
  +    }
       return -1;
   }
   
  -static INTVAL
  -find_not_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
  +INTVAL
  +ascii_find_not_newline(Interp *interpreter, STRING *string, UINTVAL start)
  +{
  +    for (; start < string->strlen; start++) {
  +        if (ENCODING_GET_CODEPOINT(interpreter, string, start) != 10) {
  +            return start;
  +        }
  +    }
  +    return -1;
  +}
  +
  +INTVAL
  +ascii_find_word_boundary(Interp *interpreter, STRING *string,
  +        UINTVAL offset, const unsigned char *table)
   {
  -    return offset;
  +    UINTVAL c, len;
  +    int is_wc1, is_wc2;
  +
  +    len = string->strlen;
  +    if (!len)
  +        return -1;
  +    c = ENCODING_GET_CODEPOINT(interpreter, string, offset);
  +    is_wc1 = (table[c] & WORDCHAR) ? 1 : 0;
  +    /* begin of string */
  +    if (!offset && is_wc1)
  +        return 0;
  +    for (++offset; offset < len; offset++) {
  +        c = ENCODING_GET_CODEPOINT(interpreter, string, offset);
  +        is_wc2 = (table[c] & WORDCHAR) ? 1 : 0;
  +        if (is_wc1 ^ is_wc2)
  +            return offset - 1;
  +        is_wc1 = is_wc2;
  +    }
  +    /* end of string */
  +    if (is_wc1 && offset == len)
  +        return offset - 1;
  +    return -1;
   }
   
   static INTVAL
   find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL 
offset)
   {
  -    return -1;
  +  return ascii_find_word_boundary(interpreter, source_string,
  +          offset, typetable);
   }
  -
   static STRING *
   string_from_codepoint(Interp *interpreter, UINTVAL codepoint)
   {
  @@ -419,9 +481,9 @@
         is_punctuation,
         find_punctuation,
         find_not_punctuation,
  -      is_newline,
  -      find_newline,
  -      find_not_newline,
  +      ascii_is_newline,
  +      ascii_find_newline,
  +      ascii_find_not_newline,
         find_word_boundary,
         string_from_codepoint,
         compute_hash,
  
  
  
  1.8       +6 -5      parrot/charset/ascii.h
  
  Index: ascii.h
  ===================================================================
  RCS file: /cvs/public/parrot/charset/ascii.h,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- ascii.h   28 Feb 2005 15:10:55 -0000      1.7
  +++ ascii.h   28 Feb 2005 17:17:51 -0000      1.8
  @@ -1,7 +1,7 @@
   /* ascii.h
    *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: ascii.h,v 1.7 2005/02/28 15:10:55 leo Exp $
  + *     $Id: ascii.h,v 1.8 2005/02/28 17:17:51 leo Exp $
    *  Overview:
    *     This is the header for the ascii charset functions
    *  Data Structure and Algorithms:
  @@ -27,6 +27,11 @@
           UINTVAL offset, UINTVAL count);
   STRING *ascii_get_graphemes_inplace(Interp *, STRING *source_string,
           STRING *dest_string, UINTVAL offset, UINTVAL count);
  +INTVAL ascii_is_newline(Interp *, STRING *source_string, UINTVAL offset);
  +INTVAL ascii_find_newline(Interp *, STRING *source_string, UINTVAL offset);
  +INTVAL ascii_find_not_newline(Interp *, STRING *source_string, UINTVAL 
offset);
  +INTVAL ascii_find_word_boundary(Interp *, STRING *source_string,
  +        UINTVAL offset, const unsigned char *typetable);
   
   static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, 
UINTVAL replace_count, STRING *insert_string);
   static void to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  @@ -56,10 +61,6 @@
   static INTVAL is_punctuation(Interp *, STRING *source_string, UINTVAL 
offset);
   static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL 
offset);
   static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL 
offset);
  -static INTVAL is_newline(Interp *, STRING *source_string, UINTVAL offset);
  -static INTVAL find_newline(Interp *, STRING *source_string, UINTVAL offset);
  -static INTVAL find_not_newline(Interp *, STRING *source_string, UINTVAL 
offset);
  -static INTVAL find_word_boundary(Interp *, STRING *source_string, UINTVAL 
offset);
   static size_t compute_hash(Interp *, STRING *source_string);
   CHARSET *Parrot_charset_ascii_init(Interp *);
   
  
  
  
  1.8       +29 -35    parrot/charset/iso-8859-1.c
  
  Index: iso-8859-1.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- iso-8859-1.c      28 Feb 2005 15:10:55 -0000      1.7
  +++ iso-8859-1.c      28 Feb 2005 17:17:51 -0000      1.8
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: iso-8859-1.c,v 1.7 2005/02/28 15:10:55 leo Exp $
  +$Id: iso-8859-1.c,v 1.8 2005/02/28 17:17:51 leo Exp $
   
   =head1 NAME
   
  @@ -23,22 +23,22 @@
   
   #define WHITESPACE 1
   #define WORDCHAR 2
  -#define PUNCTUATION 3
  -#define DIGIT 4
  +#define PUNCTUATION 4
  +#define DIGIT 8
   
   static const unsigned char typetable[256] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 0-15 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
  -    1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 32-47 */
  -    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, /* 48-63 */
  -    3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
  -    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, /* 80-95 */
  -    3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
  -    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 0, /* 112-127 */
  +    1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32-47 */
  +    0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 0xa, 4, 4, 4, 4, 4, 4, /* 
48 */
  +    4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
  +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, /* 80-95 */
  +    4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 95-111 */
  +    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 0, /* 112-127 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 128-143 */
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 144-159 */
  -    1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 160-175 */
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, /* 176-191 */
  +    1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 160-175 */
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, /* 176-191 */
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, /* 208-223 */
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
  @@ -274,26 +274,33 @@
       return retval;
   }
   
  -/* Binary's always valid */
  +
   static UINTVAL
  -validate(Interp *interpreter, STRING *source_string)
  +validate(Interp *interpreter, STRING *src)
   {
  +    UINTVAL codepoint, offset;
  +
  +    for (offset = 0; offset < string_length(interpreter, src); ++offset) {
  +        codepoint = ENCODING_GET_CODEPOINT(interpreter, src, offset);
  +        if (codepoint >= 0x100)
  +            return 0;
  +    }
       return 1;
   }
   
  -/* No word chars in binary data */
   static INTVAL
   is_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
       UINTVAL codepoint;
       codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  -    return (typetable[codepoint] == WORDCHAR);
  +    return (typetable[codepoint] & WORDCHAR) ? 1 : 0;
   }
   
   static INTVAL
   find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
  -    return ascii_find_thing(interpreter, source_string, offset, WORDCHAR, 
typetable);
  +    return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
  +            typetable);
   }
   
   static INTVAL
  @@ -331,7 +338,7 @@
   {
       UINTVAL codepoint;
       codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  -    return (typetable[codepoint] == DIGIT);
  +    return (typetable[codepoint] & DIGIT) ? 1 : 0;
   }
   
   static INTVAL
  @@ -372,20 +379,6 @@
   }
   
   static INTVAL
  -is_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
  -{
  -    UINTVAL codepoint;
  -    codepoint = ENCODING_GET_CODEPOINT(interpreter, source_string, offset);
  -    return codepoint == 13;
  -}
  -
  -static INTVAL
  -find_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
  -{
  -  return -1;
  -}
  -
  -static INTVAL
   find_not_newline(Interp *interpreter, STRING *source_string, UINTVAL offset)
   {
     return offset;
  @@ -394,7 +387,8 @@
   static INTVAL
   find_word_boundary(Interp *interpreter, STRING *source_string, UINTVAL 
offset)
   {
  -  return -1;
  +  return ascii_find_word_boundary(interpreter, source_string,
  +          offset, typetable);
   }
   
   static STRING *
  @@ -460,9 +454,9 @@
           is_punctuation,
           find_punctuation,
           find_not_punctuation,
  -        is_newline,
  -        find_newline,
  -        find_not_newline,
  +        ascii_is_newline,
  +        ascii_find_newline,
  +        ascii_find_not_newline,
           find_word_boundary,
           string_from_codepoint,
           compute_hash,
  
  
  
  1.6       +1 -5      parrot/charset/iso-8859-1.h
  
  Index: iso-8859-1.h
  ===================================================================
  RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- iso-8859-1.h      28 Feb 2005 15:10:55 -0000      1.5
  +++ iso-8859-1.h      28 Feb 2005 17:17:51 -0000      1.6
  @@ -1,7 +1,7 @@
   /* iso_8859_1.h
    *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: iso-8859-1.h,v 1.5 2005/02/28 15:10:55 leo Exp $
  + *     $Id: iso-8859-1.h,v 1.6 2005/02/28 17:17:51 leo Exp $
    *  Overview:
    *     This is the header for the iso_8859-1 charset functions
    *  Data Structure and Algorithms:
  @@ -41,10 +41,6 @@
   static INTVAL is_punctuation(Interp *interpreter, STRING *source_string, 
UINTVAL offset);
   static INTVAL find_punctuation(Interp *interpreter, STRING *source_string, 
UINTVAL offset);
   static INTVAL find_not_punctuation(Interp *interpreter, STRING 
*source_string, UINTVAL offset);
  -static INTVAL is_newline(Interp *interpreter, STRING *source_string, UINTVAL 
offset);
  -static INTVAL find_newline(Interp *interpreter, STRING *source_string, 
UINTVAL offset);
  -static INTVAL find_not_newline(Interp *interpreter, STRING *source_string, 
UINTVAL offset);
  -static INTVAL find_word_boundary(Interp *interpreter, STRING *source_string, 
UINTVAL offset);
   static size_t compute_hash(Interp *interpreter, STRING *source_string);
   CHARSET *Parrot_charset_iso_8859_1_init(Interp *interpreter);
   
  
  
  
  1.8       +3 -3      parrot/encodings/fixed_8.c
  
  Index: fixed_8.c
  ===================================================================
  RCS file: /cvs/public/parrot/encodings/fixed_8.c,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- fixed_8.c 27 Feb 2005 12:12:25 -0000      1.7
  +++ fixed_8.c 28 Feb 2005 17:17:53 -0000      1.8
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: fixed_8.c,v 1.7 2005/02/27 12:12:25 leo Exp $
  +$Id: fixed_8.c,v 1.8 2005/02/28 17:17:53 leo Exp $
   
   =head1 NAME
   
  @@ -51,7 +51,7 @@
   static UINTVAL
   get_byte(Interp *interpreter, const STRING *source_string, UINTVAL offset)
   {
  -    char *contents = source_string->strstart;
  +    unsigned char *contents = source_string->strstart;
       if (offset >= source_string->bufused) {
        internal_exception(0,
                "get_byte past the end of the buffer (%i of %i)",
  @@ -64,7 +64,7 @@
   set_byte(Interp *interpreter, const STRING *source_string,
        UINTVAL offset, UINTVAL byte)
   {
  -    char *contents;
  +    unsigned char *contents;
       if (offset >= source_string->bufused) {
        internal_exception(0, "set_byte past the end of the buffer");
       }
  
  
  
  1.48      +7 -1      parrot/include/parrot/string_funcs.h
  
  Index: string_funcs.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
  retrieving revision 1.47
  retrieving revision 1.48
  diff -u -r1.47 -r1.48
  --- string_funcs.h    27 Feb 2005 09:58:43 -0000      1.47
  +++ string_funcs.h    28 Feb 2005 17:17:54 -0000      1.48
  @@ -1,7 +1,7 @@
   /* string_funcs.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string_funcs.h,v 1.47 2005/02/27 09:58:43 leo Exp $
  + *     $Id: string_funcs.h,v 1.48 2005/02/28 17:17:54 leo Exp $
    *  Overview:
    *     This is the api header for the string subsystem
    *  Data Structure and Algorithms:
  @@ -102,6 +102,12 @@
   void string_downcase_inplace(Interp *, STRING *);
   void string_titlecase_inplace(Interp *, STRING *);
   
  +INTVAL Parrot_string_is_whitespace(Interp *, STRING *, INTVAL offset);
  +INTVAL Parrot_string_is_digit(Interp *, STRING *, INTVAL offset);
  +INTVAL Parrot_string_is_wordchar(Interp *, STRING *, INTVAL offset);
  +INTVAL Parrot_string_is_punctuation(Interp *, STRING *, INTVAL offset);
  +INTVAL Parrot_string_is_newline(Interp *, STRING *, INTVAL offset);
  +
   #endif /* PARROT_IN_CORE */
   #endif /* PARROT_STRING_FUNCS_H_GUARD */
   
  
  
  
  1.58      +20 -0     parrot/ops/ops.num
  
  Index: ops.num
  ===================================================================
  RCS file: /cvs/public/parrot/ops/ops.num,v
  retrieving revision 1.57
  retrieving revision 1.58
  diff -u -r1.57 -r1.58
  --- ops.num   28 Feb 2005 13:35:45 -0000      1.57
  +++ ops.num   28 Feb 2005 17:17:55 -0000      1.58
  @@ -1378,3 +1378,23 @@
   charsetname_s_ic               1348
   find_charset_i_s               1349
   find_charset_i_sc              1350
  +is_whitespace_i_s_i            1351
  +is_whitespace_i_s_ic           1352
  +is_whitespace_i_sc_i           1353
  +is_whitespace_i_sc_ic          1354
  +is_digit_i_s_i                 1355
  +is_digit_i_s_ic                1356
  +is_digit_i_sc_i                1357
  +is_digit_i_sc_ic               1358
  +is_wordchar_i_s_i              1359
  +is_wordchar_i_s_ic             1360
  +is_wordchar_i_sc_i             1361
  +is_wordchar_i_sc_ic            1362
  +is_punctuation_i_s_i           1363
  +is_punctuation_i_s_ic          1364
  +is_punctuation_i_sc_i          1365
  +is_punctuation_i_sc_ic         1366
  +is_newline_i_s_i               1367
  +is_newline_i_s_ic              1368
  +is_newline_i_sc_i              1369
  +is_newline_i_sc_ic             1370
  
  
  
  1.33      +48 -0     parrot/ops/string.ops
  
  Index: string.ops
  ===================================================================
  RCS file: /cvs/public/parrot/ops/string.ops,v
  retrieving revision 1.32
  retrieving revision 1.33
  diff -u -r1.32 -r1.33
  --- string.ops        28 Feb 2005 13:35:45 -0000      1.32
  +++ string.ops        28 Feb 2005 17:17:55 -0000      1.33
  @@ -655,6 +655,54 @@
     goto NEXT();
   }
   
  +=item B<is_whitespace>(out INT, in STR, in INT)
  +
  +Set $1 to 1 if the codepoint of string $2 at offset $3 is whitespace.
  +
  +=item B<is_wordchar>(out INT, in STR, in INT)
  +
  +Set $1 to 1 if the codepoint of string $2 at offset $3 is a wordchar.
  +
  +=item B<is_digit>(out INT, in STR, in INT)
  +
  +Set $1 to 1 if the codepoint of string $2 at offset $3 is a digit.
  +
  +=item B<is_punctuation>(out INT, in STR, in INT)
  +
  +Set $1 to 1 if the codepoint of string $2 at offset $3 is a punctuation char.
  +
  +=item B<is_newline>(out INT, in STR, in INT)
  +
  +Set $1 to 1 if the codepoint of string $2 at offset $3 is a newline char.
  +
  +=cut
  +
  +op is_whitespace(out INT, in STR, in INT) {
  +  $1 = Parrot_string_is_whitespace(interpreter, $2, $3);
  +  goto NEXT();
  +}
  +
  +op is_wordchar(out INT, in STR, in INT) {
  +  $1 = Parrot_string_is_wordchar(interpreter, $2, $3);
  +  goto NEXT();
  +}
  +
  +op is_digit(out INT, in STR, in INT) {
  +  $1 = Parrot_string_is_digit(interpreter, $2, $3);
  +  goto NEXT();
  +}
  +
  +op is_punctuation(out INT, in STR, in INT) {
  +  $1 = Parrot_string_is_punctuation(interpreter, $2, $3);
  +  goto NEXT();
  +}
  +
  +op is_newline(out INT, in STR, in INT) {
  +  $1 = Parrot_string_is_newline(interpreter, $2, $3);
  +  goto NEXT();
  +}
  +
  +
   =back
   
   =head1 COPYRIGHT
  
  
  
  1.236     +41 -1     parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.235
  retrieving revision 1.236
  diff -u -r1.235 -r1.236
  --- string.c  28 Feb 2005 13:35:46 -0000      1.235
  +++ string.c  28 Feb 2005 17:17:56 -0000      1.236
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: string.c,v 1.235 2005/02/28 13:35:46 leo Exp $
  +$Id: string.c,v 1.236 2005/02/28 17:17:56 leo Exp $
   
   =head1 NAME
   
  @@ -2503,6 +2503,46 @@
       return NULL;
   }
   
  +INTVAL
  +Parrot_string_is_whitespace(Interp *interpreter, STRING *s, INTVAL offset)
  +{
  +    if (!s)
  +        return 0;
  +    return CHARSET_IS_WHITESPACE(interpreter, s, offset);
  +}
  +
  +INTVAL
  +Parrot_string_is_digit(Interp *interpreter, STRING *s, INTVAL offset)
  +{
  +    if (!s)
  +        return 0;
  +    return CHARSET_IS_DIGIT(interpreter, s, offset);
  +}
  +
  +INTVAL
  +Parrot_string_is_wordchar(Interp *interpreter, STRING *s, INTVAL offset)
  +{
  +    if (!s)
  +        return 0;
  +    return CHARSET_IS_WORDCHAR(interpreter, s, offset);
  +}
  +
  +INTVAL
  +Parrot_string_is_punctuation(Interp *interpreter, STRING *s, INTVAL offset)
  +{
  +    if (!s)
  +        return 0;
  +    return CHARSET_IS_PUNCTUATION(interpreter, s, offset);
  +}
  +
  +INTVAL
  +Parrot_string_is_newline(Interp *interpreter, STRING *s, INTVAL offset)
  +{
  +    if (!s)
  +        return 0;
  +    return CHARSET_IS_NEWLINE(interpreter, s, offset);
  +}
  +
   /*
   
   =back
  
  
  
  1.4       +90 -2     parrot/t/op/string_cs.t
  
  Index: string_cs.t
  ===================================================================
  RCS file: /cvs/public/parrot/t/op/string_cs.t,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- string_cs.t       28 Feb 2005 15:10:58 -0000      1.3
  +++ string_cs.t       28 Feb 2005 17:17:57 -0000      1.4
  @@ -1,6 +1,6 @@
   #! perl -w
   # Copyright: 2001-2004 The Perl Foundation.  All Rights Reserved.
  -# $Id: string_cs.t,v 1.3 2005/02/28 15:10:58 leo Exp $
  +# $Id: string_cs.t,v 1.4 2005/02/28 17:17:57 leo Exp $
   
   =head1 NAME
   
  @@ -16,7 +16,7 @@
   
   =cut
   
  -use Parrot::Test tests => 7;
  +use Parrot::Test tests => 12;
   use Test::More;
   
   output_is( <<'CODE', <<OUTPUT, "basic syntax" );
  @@ -92,3 +92,91 @@
   Zaeiou_���
   OUTPUT
   
  +output_is( <<'CODE', <<OUTPUT, "is_whitespace");
  +    set S0, "a\t\n \xa0"
  +    is_whitespace I0, S0, 0
  +    is_whitespace I1, S0, 1
  +    is_whitespace I2, S0, 2
  +    is_whitespace I3, S0, 3
  +    set I4, 4
  +    is_whitespace I4, S0, I4
  +    print I0
  +    print I1
  +    print I2
  +    print I3
  +    print I4
  +    print "\n"
  +    set S0, ascii:"a\t\n "
  +    is_whitespace I0, S0, 0
  +    is_whitespace I1, S0, 1
  +    is_whitespace I2, S0, 2
  +    is_whitespace I3, S0, 3
  +    print I0
  +    print I1
  +    print I2
  +    print I3
  +    print "\n"
  +    end
  +CODE
  +01111
  +0111
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "is_wordchar");
  +    set S0, "az019-,._"
  +    length I1, S0
  +    set I2, 0
  +lp:
  +    is_wordchar I0, S0, I2
  +    print I0
  +    inc I2
  +    lt I2, I1, lp
  +    print "\n"
  +    end
  +CODE
  +111110001
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "is_digit");
  +    set S0, "az019-,._"
  +    length I1, S0
  +    set I2, 0
  +lp:
  +    is_digit I0, S0, I2
  +    print I0
  +    inc I2
  +    lt I2, I1, lp
  +    print "\n"
  +    end
  +CODE
  +001110000
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "is_punctuation");
  +    set S0, "az019-,._"
  +    length I1, S0
  +    set I2, 0
  +lp:
  +    is_punctuation I0, S0, I2
  +    print I0
  +    inc I2
  +    lt I2, I1, lp
  +    print "\n"
  +    end
  +CODE
  +000001110
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "is_newline");
  +    set S0, "a\n"
  +    is_newline I0, S0, 0
  +    print I0
  +    is_newline I0, S0, 1
  +    print I0
  +    print "\n"
  +    end
  +CODE
  +01
  +OUTPUT
  +
  +

cvs commit: parrot/t/op string_cs.t

Reply via email to