op string_cs.t

Leopold Toetsch Wed, 02 Mar 2005 05:47:40 -0800

cvsuser     05/03/02 05:47:33


  Modified:    charset  ascii.c iso-8859-1.c unicode.c
               ops      ops.num string.ops
               src      string.c string_primitives.c
               t/op     string_cs.t
  Log:
  Strings. Finally. 13 - some unicode conversions
  
  * convert to and from unicode - utf8 only
  * fix string unescaping, use a string iter
  * new bytelength opcode
  
  Revision  Changes    Path
  1.18      +29 -4     parrot/charset/ascii.c
  
  Index: ascii.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/ascii.c,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -r1.17 -r1.18
  --- ascii.c   2 Mar 2005 10:43:13 -0000       1.17
  +++ ascii.c   2 Mar 2005 13:47:28 -0000       1.18
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: ascii.c,v 1.17 2005/03/02 10:43:13 leo Exp $
  +$Id: ascii.c,v 1.18 2005/03/02 13:47:28 leo Exp $
   
   =head1 NAME
   
  @@ -137,10 +137,35 @@
   }
   
   STRING *
  -ascii_to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
  +ascii_to_unicode(Interp *interpreter, STRING *src, STRING *dest)
   {
  -    internal_exception(UNIMPLEMENTED,
  -            "to_unicode for iso-8859-1 not implemented");
  +    UINTVAL offs, c;
  +    String_iter iter;
  +
  +    if (dest) {
  +        dest->charset = Parrot_unicode_charset_ptr;
  +        dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, dest);
  +        Parrot_reallocate_string(interpreter, dest, src->strlen);
  +        ENCODING_ITER_INIT(interpreter, dest, &iter);
  +        for (offs = 0; offs < src->strlen; ++offs) {
  +            c = ENCODING_GET_BYTE(interpreter, src, offs);
  +            if (iter.bytepos >= PObj_buflen(dest) - 4) {
  +                UINTVAL need = (src->strlen - offs) * 1.5;
  +                if (need < 16)
  +                    need = 16;
  +                Parrot_reallocate_string(interpreter, dest,
  +                        PObj_buflen(dest) + need);
  +            }
  +            iter.set_and_advance(interpreter, &iter, c);
  +        }
  +        dest->bufused = iter.bytepos;
  +        dest->strlen  = iter.charpos;
  +        return dest;
  +    }
  +    else {
  +        internal_exception(UNIMPLEMENTED,
  +                "to_unicode inplace for iso-8859-1 not implemented");
  +    }
       return NULL;
   }
   
  
  
  
  1.13      +23 -4     parrot/charset/iso-8859-1.c
  
  Index: iso-8859-1.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
  retrieving revision 1.12
  retrieving revision 1.13
  diff -u -r1.12 -r1.13
  --- iso-8859-1.c      1 Mar 2005 17:25:44 -0000       1.12
  +++ iso-8859-1.c      2 Mar 2005 13:47:28 -0000       1.13
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: iso-8859-1.c,v 1.12 2005/03/01 17:25:44 leo Exp $
  +$Id: iso-8859-1.c,v 1.13 2005/03/02 13:47:28 leo Exp $
   
   =head1 NAME
   
  @@ -64,10 +64,29 @@
   }
   
   static STRING *
  -from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
  +from_charset(Interp *interpreter, STRING *src, STRING *dest)
   {
  -    internal_exception(UNIMPLEMENTED, "Can't do this yet");
  -    return NULL;
  +    UINTVAL offs, c;
  +    String_iter iter;
  +
  +    if (dest) {
  +        Parrot_reallocate_string(interpreter, dest, src->strlen);
  +        dest->bufused = src->strlen;
  +        dest->strlen  = src->strlen;
  +    }
  +    ENCODING_ITER_INIT(interpreter, src, &iter);
  +    for (offs = 0; offs < src->strlen; ++offs) {
  +        c = iter.get_and_advance(interpreter, &iter);
  +        if (c >= 0x100) {
  +            EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
  +        }
  +        if (dest)
  +            ENCODING_SET_BYTE(interpreter, dest, offs, c);
  +    }
  +    if (dest)
  +        return dest;
  +    src->charset = Parrot_ascii_charset_ptr;
  +    return src;
   }
   
   static STRING *
  
  
  
  1.2       +22 -5     parrot/charset/unicode.c
  
  Index: unicode.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/unicode.c,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- unicode.c 2 Mar 2005 09:03:25 -0000       1.1
  +++ unicode.c 2 Mar 2005 13:47:28 -0000       1.2
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2005 The Perl Foundation.  All Rights Reserved.
  -$Id: unicode.c,v 1.1 2005/03/02 09:03:25 leo Exp $
  +$Id: unicode.c,v 1.2 2005/03/02 13:47:28 leo Exp $
   
   =head1 NAME
   
  @@ -50,10 +50,19 @@
   }
   
   static STRING*
  -to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING 
*dest)
  +to_charset(Interp *interpreter, STRING *src,
  +        CHARSET *new_charset, STRING *dest)
   {
  -    UNIMPL;
  -    return NULL;
  +    charset_converter_t conversion_func;
  +
  +    if ((conversion_func = Parrot_find_charset_converter(interpreter,
  +                    src->charset, new_charset))) {
  +         return conversion_func(interpreter, src, dest);
  +    }
  +    else {
  +        return new_charset->from_charset(interpreter, src, dest);
  +
  +    }
   }
   
   static STRING*
  @@ -64,8 +73,16 @@
   }
   
   static STRING*
  -from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
  +from_charset(Interp *interpreter, STRING *src, STRING *dest)
   {
  +    if (src->charset == Parrot_unicode_charset_ptr) {
  +        if (!dest) {
  +            /* inplace ok */
  +            return src;
  +        }
  +        Parrot_reuse_COW_reference(interpreter, src, dest);
  +        return dest;
  +    }
       UNIMPL;
       return NULL;
   }
  
  
  
  1.61      +2 -0      parrot/ops/ops.num
  
  Index: ops.num
  ===================================================================
  RCS file: /cvs/public/parrot/ops/ops.num,v
  retrieving revision 1.60
  retrieving revision 1.61
  diff -u -r1.60 -r1.61
  --- ops.num   1 Mar 2005 14:19:47 -0000       1.60
  +++ ops.num   2 Mar 2005 13:47:29 -0000       1.61
  @@ -1428,3 +1428,5 @@
   trans_charset_s_s_ic           1398
   trans_charset_s_sc_i           1399
   trans_charset_s_sc_ic          1400
  +bytelength_i_s                 1401
  +bytelength_i_sc                1402
  
  
  
  1.36      +17 -0     parrot/ops/string.ops
  
  Index: string.ops
  ===================================================================
  RCS file: /cvs/public/parrot/ops/string.ops,v
  retrieving revision 1.35
  retrieving revision 1.36
  diff -u -r1.35 -r1.36
  --- string.ops        1 Mar 2005 14:19:47 -0000       1.35
  +++ string.ops        2 Mar 2005 13:47:29 -0000       1.36
  @@ -155,6 +155,10 @@
   
   Set $1 to the length (in characters) of the string in $2.
   
  +=item B<bytelength>(out INT, in STR)
  +
  +Set $1 to the length (in bytes) of the string in $2.
  +
   =cut
   
   inline op length(out INT, in STR) :base_mem {
  @@ -162,6 +166,19 @@
     goto NEXT();
   }
   
  +inline op bytelength(out INT, in STR) :base_mem {
  +  UINTVAL n;
  +  STRING *s = $2;
  +  if (!s)
  +    n = 0;
  +  else {
  +    n = s->bufused;
  +    assert(n == ENCODING_BYTES(interpreter, $2));
  +  }
  +  $1 = n;
  +  goto NEXT();
  +}
  +
   =item B<pin>(inout STR)
   
   Make the memory in $1 immobile. This memory will I<not> be moved by
  
  
  
  1.242     +22 -19    parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.241
  retrieving revision 1.242
  diff -u -r1.241 -r1.242
  --- string.c  2 Mar 2005 10:43:16 -0000       1.241
  +++ string.c  2 Mar 2005 13:47:32 -0000       1.242
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: string.c,v 1.241 2005/03/02 10:43:16 leo Exp $
  +$Id: string.c,v 1.242 2005/03/02 13:47:32 leo Exp $
   
   =head1 NAME
   
  @@ -318,12 +318,6 @@
               mem_sys_free((void*)data_dir); /* cast away the constness */
       }
   
  -/* --- Perhaps these should be uncommented - Leo
  -    encoding_init();
  -    chartype_init();
  -    string_native_type = chartype_lookup("usascii");
  -    string_unicode_type = chartype_lookup("unicode");
  -*/
       /*
        * initialize the constant string table
        */
  @@ -615,16 +609,8 @@
               "string_make: no charset name specified");
       }
   
  -    if (strcmp(charset_name, "iso-8859-1") == 0 ) {
  -        charset = Parrot_iso_8859_1_charset_ptr;
  -    }
  -    else if (strcmp(charset_name, "ascii") == 0 ) {
  -        charset = Parrot_ascii_charset_ptr;
  -    }
  -    else if (strcmp(charset_name, "binary") == 0 ) {
  -        charset = Parrot_binary_charset_ptr;
  -    }
  -    else {
  +    charset = Parrot_find_charset(interpreter, charset_name);
  +    if (!charset) {
           internal_exception(UNIMPLEMENTED,
                   "Can't make '%s' charset strings", charset_name);
       }
  @@ -2284,6 +2270,8 @@
       UINTVAL offs, d;
       Parrot_UInt4 r;
       UINTVAL flags;
  +    String_iter iter;
  +    ENCODING *encoding;
   
       if (delimiter && clength)
           --clength;
  @@ -2293,6 +2281,15 @@
       else
           flags |= PObj_private7_FLAG;  /* Pythonic unicode flag */
       result = string_make(interpreter, cstring, clength, charset, flags);
  +    result->strlen = clength;
  +
  +    ENCODING_ITER_INIT(interpreter, result, &iter);
  +    /*
  +     * reset encoding so that the destination encoding isn't used
  +     * TODO if an encoding is given too just use it
  +     */
  +    encoding = result->encoding;
  +    result->encoding = Parrot_fixed_8_encoding_ptr;
   
       for (offs = d = 0; offs < clength; ++offs) {
           r = CHARSET_GET_CODEPOINT(interpreter, result, offs);
  @@ -2306,13 +2303,19 @@
               --offs;
           }
           if (d == offs) {
  +            /* we did it in place - no action */
               ++d;
  +            iter.bytepos++;
  +            iter.charpos++;
               continue;
           }
  -        CHARSET_SET_CODEPOINT(interpreter, result, d++, r);
  +        assert(d < offs);
  +        iter.set_and_advance(interpreter, &iter, r);
  +        ++d;
       }
  +    result->encoding = encoding;
       result->strlen = d;
  -    result->bufused = string_max_bytes(interpreter, result, d);
  +    result->bufused = iter.bytepos;
   
       return result;
   }
  
  
  
  1.11      +11 -11    parrot/src/string_primitives.c
  
  Index: string_primitives.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string_primitives.c,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- string_primitives.c       27 Feb 2005 09:58:47 -0000      1.10
  +++ string_primitives.c       2 Mar 2005 13:47:32 -0000       1.11
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: string_primitives.c,v 1.10 2005/02/27 09:58:47 leo Exp $
  +$Id: string_primitives.c,v 1.11 2005/03/02 13:47:32 leo Exp $
   
   =head1 NAME
   
  @@ -154,11 +154,11 @@
       UINTVAL charcount = 0;
       UINTVAL len = string_length(interpreter, string);
       /* Well, not right now */
  -    codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +    codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
       switch (codepoint) {
       case 'x':
           ++*offset;
  -        codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +        codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
           if (codepoint >= '0' && codepoint <= '9') {
               workchar = codepoint - '0';
           } else if (codepoint >= 'a' && codepoint <= 'f') {
  @@ -171,7 +171,7 @@
           ++*offset;
           if (*offset < len) {
               workchar *= 16;
  -            codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +            codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
               if (codepoint >= '0' && codepoint <= '9') {
                   workchar += codepoint - '0';
               } else if (codepoint >= 'a' && codepoint <= 'f') {
  @@ -188,7 +188,7 @@
           return workchar;
       case 'c':
           ++*offset;
  -        codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +        codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
           if (codepoint >= 'A' && codepoint <= 'Z') {
               workchar = codepoint - 'A' + 1;
           } else {
  @@ -198,7 +198,7 @@
           return workchar;
       case 'u':
           ++*offset;
  -        codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +        codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
           if (codepoint >= '0' && codepoint <= '9') {
               workchar = codepoint - '0';
           } else if (codepoint >= 'a' && codepoint <= 'f') {
  @@ -212,7 +212,7 @@
           for (charcount = 1; charcount < 4; charcount++) {
               if (*offset < len) {
                   workchar *= 16;
  -                codepoint = CHARSET_GET_CODEPOINT(interpreter, string, 
*offset);
  +                codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
                   if (codepoint >= '0' && codepoint <= '9') {
                       workchar += codepoint - '0';
                   } else if (codepoint >= 'a' && codepoint <= 'f') {
  @@ -230,7 +230,7 @@
           return workchar;
       case 'U':
           ++*offset;
  -        codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +        codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
           if (codepoint >= '0' && codepoint <= '9') {
               workchar = codepoint - '0';
           } else if (codepoint >= 'a' && codepoint <= 'f') {
  @@ -244,7 +244,7 @@
           for (charcount = 1; charcount < 8; charcount++) {
               if (*offset < len) {
                   workchar *= 16;
  -                codepoint = CHARSET_GET_CODEPOINT(interpreter, string, 
*offset);
  +                codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
                   if (codepoint >= '0' && codepoint <= '9') {
                       workchar += codepoint - '0';
                   } else if (codepoint >= 'a' && codepoint <= 'f') {
  @@ -274,7 +274,7 @@
           ++*offset;
           if (*offset < len) {
               workchar *= 8;
  -            codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +            codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
               if (codepoint >= '0' && codepoint <= '7') {
                   workchar += codepoint - '0';
               } else {
  @@ -286,7 +286,7 @@
           ++*offset;
           if (*offset < len) {
               workchar *= 8;
  -            codepoint = CHARSET_GET_CODEPOINT(interpreter, string, *offset);
  +            codepoint = CHARSET_GET_BYTE(interpreter, string, *offset);
               if (codepoint >= '0' && codepoint <= '7') {
                   workchar += codepoint - '0';
               } else {
  
  
  
  1.8       +46 -2     parrot/t/op/string_cs.t
  
  Index: string_cs.t
  ===================================================================
  RCS file: /cvs/public/parrot/t/op/string_cs.t,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- string_cs.t       1 Mar 2005 15:41:31 -0000       1.7
  +++ string_cs.t       2 Mar 2005 13:47:33 -0000       1.8
  @@ -1,6 +1,6 @@
   #! perl -w
   # Copyright: 2001-2004 The Perl Foundation.  All Rights Reserved.
  -# $Id: string_cs.t,v 1.7 2005/03/01 15:41:31 leo Exp $
  +# $Id: string_cs.t,v 1.8 2005/03/02 13:47:33 leo Exp $
   
   =head1 NAME
   
  @@ -16,7 +16,7 @@
   
   =cut
   
  -use Parrot::Test tests => 26;
  +use Parrot::Test tests => 28;
   use Test::More;
   
   output_is( <<'CODE', <<OUTPUT, "basic syntax" );
  @@ -401,3 +401,47 @@
   iso-8859-1
   OUTPUT
   
  +output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i iso-8859-1 to unicode");
  +    set S0, "abc_�_"
  +    find_charset I0, "unicode"
  +    trans_charset S1, S0, I0
  +    print S1
  +    print "\n"
  +    charset I0, S1
  +    charsetname S2, I0
  +    print S2
  +    print "\n"
  +    length I2, S1
  +    print I2
  +    print "\n"
  +    end
  +CODE
  +abc_\xc3\xa4_
  +unicode
  +6
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i unicode to iso-8859-1");
  +    set S0, unicode:"abc_\xe4_"
  +    bytelength I2, S0        # XXX its 7 for utf8 only
  +    print I2
  +    print "\n"
  +    find_charset I0, "iso-8859-1"
  +    trans_charset S1, S0, I0
  +    print S1
  +    print "\n"
  +    charset I0, S1
  +    charsetname S2, I0
  +    print S2
  +    print "\n"
  +    length I2, S1
  +    print I2
  +    print "\n"
  +    end
  +CODE
  +7
  +abc_�_
  +iso-8859-1
  +6
  +OUTPUT
  +

cvs commit: parrot/t/op string_cs.t

Reply via email to