cvsuser     05/03/01 06:19:49

  Modified:    charset  ascii.c ascii.h binary.c binary.h iso-8859-1.c
                        iso-8859-1.h
               include/parrot charset.h exceptions.h string_funcs.h
               ops      ops.num string.ops
               src      charset.c string.c
               t/op     string_cs.t
  Log:
  Strings. Finally. 8 - charset conversion
  * trans_charset opcodes
  * Parrot_string_trans_charset() interface
  * charset converter registration and lookup
  * adapt converter function signature to take a dest STRING
  * iso-8859-1 to ascii conversion
  
  Revision  Changes    Path
  1.13      +13 -18    parrot/charset/ascii.c
  
  Index: ascii.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/ascii.c,v
  retrieving revision 1.12
  retrieving revision 1.13
  diff -u -r1.12 -r1.13
  --- ascii.c   1 Mar 2005 11:06:26 -0000       1.12
  +++ ascii.c   1 Mar 2005 14:19:45 -0000       1.13
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: ascii.c,v 1.12 2005/03/01 11:06:26 leo Exp $
  +$Id: ascii.c,v 1.13 2005/03/01 14:19:45 leo Exp $
   
   =head1 NAME
   
  @@ -95,37 +95,33 @@
               offset, count, dest_string);
   }
   
  -static void
  -to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
  +static STRING *
  +to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING 
*dest)
   {
       internal_exception(UNIMPLEMENTED, "to_charset for ascii not 
implemented");
  +    return NULL;
   }
   
  -static STRING *
  -copy_to_charset(Interp *interpreter, STRING *source_string,
  -        CHARSET *new_charset)
  -{
  -  STRING *return_string = NULL;
  -
  -  return return_string;
  -}
   
  -static void
  -to_unicode(Interp *interpreter, STRING *source_string)
  +static STRING *
  +to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "to_unicode for ascii not 
implemented");
  +    return NULL;
   }
   
  -static void
  -from_charset(Interp *interpreter, STRING *source_string)
  +static STRING *
  +from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "Can't do this yet");
  +    return NULL;
   }
   
  -static void
  -from_unicode(Interp *interpreter, STRING *source_string)
  +static STRING *
  +from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "Can't do this yet");
  +    return NULL;
   }
   
   /* A noop. can't compose ascii */
  @@ -511,7 +507,6 @@
         ascii_get_graphemes_inplace,
         set_graphemes,
         to_charset,
  -      copy_to_charset,
         to_unicode,
         from_charset,
         from_unicode,
  
  
  
  1.10      +1 -5      parrot/charset/ascii.h
  
  Index: ascii.h
  ===================================================================
  RCS file: /cvs/public/parrot/charset/ascii.h,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- ascii.h   1 Mar 2005 11:06:26 -0000       1.9
  +++ ascii.h   1 Mar 2005 14:19:45 -0000       1.10
  @@ -1,7 +1,7 @@
   /* ascii.h
    *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: ascii.h,v 1.9 2005/03/01 11:06:26 leo Exp $
  + *     $Id: ascii.h,v 1.10 2005/03/01 14:19:45 leo Exp $
    *  Overview:
    *     This is the header for the ascii charset functions
    *  Data Structure and Algorithms:
  @@ -40,10 +40,6 @@
           const STRING *search_string, UINTVAL offset);
   size_t ascii_compute_hash(Interp *, STRING *source_string);
   
  -static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, 
UINTVAL replace_count, STRING *insert_string);
  -static void to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  -static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  -static void to_unicode(Interp *, STRING *source_string);
   static void compose(Interp *, STRING *source_string);
   static void decompose(Interp *, STRING *source_string);
   static void upcase(Interp *, STRING *source_string);
  
  
  
  1.10      +13 -20    parrot/charset/binary.c
  
  Index: binary.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/binary.c,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- binary.c  1 Mar 2005 11:06:26 -0000       1.9
  +++ binary.c  1 Mar 2005 14:19:45 -0000       1.10
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: binary.c,v 1.9 2005/03/01 11:06:26 leo Exp $
  +$Id: binary.c,v 1.10 2005/03/01 14:19:45 leo Exp $
   
   =head1 NAME
   
  @@ -36,38 +36,32 @@
               replace_count, insert_string);
   }
   
  -static void
  -to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
  +static STRING*
  +to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING 
*dest)
   {
       internal_exception(UNIMPLEMENTED, "to_charset for binary not 
implemented");
  +    return NULL;
   }
   
  -static STRING *
  -copy_to_charset(Interp *interpreter, STRING *source_string,
  -        CHARSET *new_charset)
  -{
  -    STRING *return_string = NULL;
  -    internal_exception(UNIMPLEMENTED,
  -            "copy_to_charset for binary not implemented");
  -    return return_string;
  -}
  -
  -static void
  -to_unicode(Interp *interpreter, STRING *source_string)
  +static STRING*
  +to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "to_unicode for binary not 
implemented");
  +    return NULL;
   }
   
  -static void
  -from_charset(Interp *interpreter, STRING *source_string)
  +static STRING*
  +from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "Can't do this yet");
  +    return NULL;
   }
   
  -static void
  -from_unicode(Interp *interpreter, STRING *source_string)
  +static STRING *
  +from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "Can't do this yet");
  +    return NULL;
   }
   
   /* A noop. can't compose binary */
  @@ -262,7 +256,6 @@
         ascii_get_graphemes_inplace,
         set_graphemes,
         to_charset,
  -      copy_to_charset,
         to_unicode,
         from_charset,
         from_unicode,
  
  
  
  1.7       +1 -4      parrot/charset/binary.h
  
  Index: binary.h
  ===================================================================
  RCS file: /cvs/public/parrot/charset/binary.h,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- binary.h  1 Mar 2005 11:06:26 -0000       1.6
  +++ binary.h  1 Mar 2005 14:19:45 -0000       1.7
  @@ -1,7 +1,7 @@
   /* binary.h
    *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: binary.h,v 1.6 2005/03/01 11:06:26 leo Exp $
  + *     $Id: binary.h,v 1.7 2005/03/01 14:19:45 leo Exp $
    *  Overview:
    *     This is the header for the binary charset functions
    *  Data Structure and Algorithms:
  @@ -13,9 +13,6 @@
   #if !defined(PARROT_CHARSET_BINARY_H_GUARD)
   #define PARROT_CHARSET_BINARY_H_GUARD
   
  -static void to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  -static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  -static void to_unicode(Interp *, STRING *source_string);
   static void compose(Interp *, STRING *source_string);
   static void decompose(Interp *, STRING *source_string);
   static void upcase(Interp *, STRING *source_string);
  
  
  
  1.10      +56 -31    parrot/charset/iso-8859-1.c
  
  Index: iso-8859-1.c
  ===================================================================
  RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- iso-8859-1.c      1 Mar 2005 11:06:26 -0000       1.9
  +++ iso-8859-1.c      1 Mar 2005 14:19:45 -0000       1.10
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: iso-8859-1.c,v 1.9 2005/03/01 11:06:26 leo Exp $
  +$Id: iso-8859-1.c,v 1.10 2005/03/01 14:19:45 leo Exp $
   
   =head1 NAME
   
  @@ -18,6 +18,17 @@
   #include "iso-8859-1.h"
   #include "ascii.h"
   
  +#ifdef EXCEPTION
  +#  undef EXCEPTION
  +#endif
  +
  +/*
  + * TODO check interpreter error and warnings setting
  + */
  +
  +#define EXCEPTION(err, str) \
  +    real_exception(interpreter, NULL, err, str)
  +
   /* The encoding we prefer, given a choice */
   static ENCODING *preferred_encoding;
   
  @@ -55,51 +66,43 @@
               replace_count, insert_string);
   }
   
  -static void
  -from_charset(Interp *interpreter, STRING *source_string)
  +static STRING *
  +from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "Can't do this yet");
  +    return NULL;
   }
   
  -static void
  -from_unicode(Interp *interpreter, STRING *source_string)
  +static STRING *
  +from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
   {
       internal_exception(UNIMPLEMENTED, "Can't do this yet");
  +    return NULL;
   }
   
   
  -static void
  -to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
  +static STRING *
  +to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
   {
  -    charset_converter_t conversion_func;
  -    if ((conversion_func = Parrot_find_charset_converter(interpreter,
  -                    source_string->charset, new_charset))) {
  -        /*
  -         * XXX conversion_func has wrong signature ?
  -         *
  -         * conversion_func(interpreter, new_charset, source_string);
  -         */
  -    }
  -    else {
  -        to_unicode(interpreter, source_string);
  -        new_charset->from_charset(interpreter, source_string);
  -    }
  +    internal_exception(UNIMPLEMENTED,
  +            "to_unicode for iso-8859-1 not implemented");
  +    return NULL;
   }
   
   static STRING *
  -copy_to_charset(Interp *interpreter, STRING *source_string,
  -        CHARSET *new_charset)
  +to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING 
*dest)
   {
  -  STRING *return_string = NULL;
  +    charset_converter_t conversion_func;
   
  -  return return_string;
  -}
  +    if ((conversion_func = Parrot_find_charset_converter(interpreter,
  +                    src->charset, new_charset))) {
  +         return conversion_func(interpreter, src, dest);
  +    }
  +    else {
  +        STRING *res = to_unicode(interpreter, src, dest);
  +        return new_charset->from_charset(interpreter, res, dest);
   
  -static void
  -to_unicode(Interp *interpreter, STRING *source_string)
  -{
  -    internal_exception(UNIMPLEMENTED,
  -            "to_unicode for iso-8859-1 not implemented");
  +    }
   }
   
   /* A noop. can't compose iso-8859-1 */
  @@ -367,7 +370,6 @@
           ascii_get_graphemes_inplace,
           set_graphemes,
           to_charset,
  -        copy_to_charset,
           to_unicode,
           from_charset,
           from_unicode,
  @@ -417,6 +419,29 @@
       return return_set;
   }
   
  +STRING *
  +charset_cvt_iso_8859_1_to_ascii(Interp *interpreter, STRING *src, STRING 
*dest)
  +{
  +    UINTVAL offs, c;
  +    if (dest) {
  +        Parrot_reallocate_string(interpreter, dest, src->strlen);
  +        dest->bufused = src->bufused;
  +        dest->strlen  = src->strlen;
  +    }
  +    for (offs = 0; offs < src->strlen; ++offs) {
  +        c = ENCODING_GET_BYTE(interpreter, src, offs);
  +        if (c >= 0x80) {
  +            EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
  +        }
  +        if (dest)
  +            ENCODING_SET_BYTE(interpreter, dest, offs, c);
  +    }
  +    if (dest)
  +        return dest;
  +    src->charset = Parrot_ascii_charset_ptr;
  +    return src;
  +}
  +
   /*
    * Local variables:
    * c-indentation-style: bsd
  
  
  
  1.8       +3 -4      parrot/charset/iso-8859-1.h
  
  Index: iso-8859-1.h
  ===================================================================
  RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- iso-8859-1.h      1 Mar 2005 11:06:26 -0000       1.7
  +++ iso-8859-1.h      1 Mar 2005 14:19:45 -0000       1.8
  @@ -1,7 +1,7 @@
   /* iso_8859_1.h
    *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: iso-8859-1.h,v 1.7 2005/03/01 11:06:26 leo Exp $
  + *     $Id: iso-8859-1.h,v 1.8 2005/03/01 14:19:45 leo Exp $
    *  Overview:
    *     This is the header for the iso_8859-1 charset functions
    *  Data Structure and Algorithms:
  @@ -14,9 +14,6 @@
   #define PARROT_CHARSET_ISO_8859_1_H_GUARD
   
   static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, 
UINTVAL replace_count, STRING *insert_string);
  -static void to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  -static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET 
*new_charset);
  -static void to_unicode(Interp *, STRING *source_string);
   static void compose(Interp *, STRING *source_string);
   static void decompose(Interp *, STRING *source_string);
   static void upcase(Interp *, STRING *source_string);
  @@ -39,6 +36,8 @@
   static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL 
offset);
   static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL 
offset);
   
  +STRING *charset_cvt_iso_8859_1_to_ascii(Interp *, STRING *src, STRING *dest);
  +
   CHARSET *Parrot_charset_iso_8859_1_init(Interp *);
   
   #endif /* PARROT_CHARSET_ISO_8859_1_H_GUARD */
  
  
  
  1.9       +15 -11    parrot/include/parrot/charset.h
  
  Index: charset.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/charset.h,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- charset.h 1 Mar 2005 08:30:56 -0000       1.8
  +++ charset.h 1 Mar 2005 14:19:46 -0000       1.9
  @@ -1,7 +1,7 @@
   /* charset.h
    *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: charset.h,v 1.8 2005/03/01 08:30:56 leo Exp $
  + *     $Id: charset.h,v 1.9 2005/03/01 14:19:46 leo Exp $
    *  Overview:
    *     This is the header for the 8-bit fixed-width encoding
    *  Data Structure and Algorithms:
  @@ -35,11 +35,14 @@
   typedef STRING *(*charset_get_graphemes_t)(Interp *, STRING *source_string, 
UINTVAL offset, UINTVAL count);
   typedef STRING *(*charset_get_graphemes_inplace_t)(Interp *, STRING 
*source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);
   typedef void (*charset_set_graphemes_t)(Interp *, STRING *source_string, 
UINTVAL offset, UINTVAL replace_count, STRING *insert_string);
  -typedef void (*charset_to_charset_t)(Interp *, STRING *source_string, 
CHARSET *new_charset);
  -typedef STRING *(*charset_copy_to_charset_t)(Interp *, STRING 
*source_string, CHARSET *new_charset);
  -typedef void (*charset_to_unicode_t)(Interp *, STRING *source_string);
  -typedef void (*charset_from_charset_t)(Interp *, STRING *source_string);
  -typedef void (*charset_from_unicode_t)(Interp *, STRING *source_string);
  +
  +typedef STRING * (*charset_to_charset_t)(Interp *, STRING *source_string,
  +        CHARSET *new_charset, STRING *dest);
  +typedef STRING * (*charset_to_unicode_t)(Interp *, STRING *src, STRING 
*dest);
  +typedef STRING * (*charset_from_charset_t)(Interp *, STRING *source_string,
  +        STRING *dest);
  +typedef STRING * (*charset_from_unicode_t)(Interp *, STRING *source_string,
  +        STRING *dest);
   typedef void (*charset_compose_t)(Interp *, STRING *source_string);
   typedef void (*charset_decompose_t)(Interp *, STRING *source_string);
   typedef void (*charset_upcase_t)(Interp *, STRING *source_string);
  @@ -77,14 +80,17 @@
   INTVAL Parrot_register_charset(Interp *, const char *charsetname, CHARSET 
*charset);
   INTVAL Parrot_make_default_charset(Interp *, const char *charsetname, 
CHARSET *charset);
   CHARSET *Parrot_default_charset(Interp *);
  -typedef INTVAL (*charset_converter_t)(Interp *, CHARSET *lhs, CHARSET *rhs);
  +typedef STRING* (*charset_converter_t)(Interp *, STRING *src, STRING *dst);
   charset_converter_t Parrot_find_charset_converter(Interp *, CHARSET *lhs, 
CHARSET *rhs);
  +void Parrot_register_charset_converter(Interp *,
  +        CHARSET *lhs, CHARSET *rhs, charset_converter_t func);
   
   void Parrot_deinit_charsets(Interp *);
   INTVAL Parrot_charset_number(Interp *, STRING *charsetname);
   STRING* Parrot_charset_name(Interp *, INTVAL);
   const char* Parrot_charset_c_name(Interp *, INTVAL);
   INTVAL Parrot_charset_number_of_str(Interp *, STRING *src);
  +CHARSET* Parrot_get_charset(Interp *, INTVAL number_of_charset);
   
   struct _charset {
       const char *name;
  @@ -92,7 +98,6 @@
       charset_get_graphemes_inplace_t get_graphemes_inplace;
       charset_set_graphemes_t set_graphemes;
       charset_to_charset_t to_charset;
  -    charset_copy_to_charset_t copy_to_charset;
       charset_to_unicode_t to_unicode;
       charset_from_charset_t from_charset;
       charset_from_unicode_t from_unicode;
  @@ -132,9 +137,8 @@
   #define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((CHARSET 
*)source->charset)->get_graphemes(interpreter, source, offset, count)
   #define CHARSET_GET_GRAPHEMES_INPLACE(interp, source, dest, offset, count) 
((CHARSET *)source->charset)->get_graphemes(interpreter, source, dest, offset, 
count)
   #define CHARSET_SET_GRAPHEMES(interp, source, offset, replace_count, insert) 
((CHARSET *)source->charset)->set_graphemes(interpreter, source, offset, 
replace_count, insert)
  -#define CHARSET_TO_CHARSET(interp, source, new_charset) ((CHARSET 
*)source->charset)->to_charset(interpreter, source, new_charset)
  -#define CHARSET_COPY_TO_CHARSET(interp, source, new_charset) ((CHARSET 
*)source->charset)->copy_to_charset(interpreter, source, new_charset)
  -#define CHARSET_TO_UNICODE(interp, source) ((CHARSET 
*)source->charset)->to_unicode(interpreter, source)
  +#define CHARSET_TO_CHARSET(interp, source, new_charset, dest) ((CHARSET 
*)source->charset)->to_charset(interpreter, source, new_charset, dest)
  +#define CHARSET_TO_UNICODE(interp, source, dest) ((CHARSET 
*)source->charset)->to_unicode(interpreter, source, dest)
   #define CHARSET_COMPOSE(interp, source) ((CHARSET 
*)source->charset)->compose(interpreter, source)
   #define CHARSET_DECOMPOSE(interp, source) ((CHARSET 
*)source->charset)->decompose(interpreter, source)
   #define CHARSET_UPCASE(interp, source) ((CHARSET 
*)source->charset)->upcase(interpreter, source)
  
  
  
  1.54      +3 -2      parrot/include/parrot/exceptions.h
  
  Index: exceptions.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/exceptions.h,v
  retrieving revision 1.53
  retrieving revision 1.54
  diff -u -r1.53 -r1.54
  --- exceptions.h      6 Jan 2005 00:42:05 -0000       1.53
  +++ exceptions.h      1 Mar 2005 14:19:46 -0000       1.54
  @@ -1,7 +1,7 @@
   /* exceptions.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: exceptions.h,v 1.53 2005/01/06 00:42:05 rubys Exp $
  + *     $Id: exceptions.h,v 1.54 2005/03/01 14:19:46 leo Exp $
    *  Overview:
    *     define the internal interpreter exceptions
    *  Data Structure and Algorithms:
  @@ -119,7 +119,8 @@
           WRITE_TO_CONSTCLASS,
           NOSPAWN,
           INTERNAL_NOT_IMPLEMENTED,
  -        ERR_OVERFLOW
  +        ERR_OVERFLOW,
  +        LOSSY_CONVERSION
   } exception_type_enum;
   
   /* &end_gen */
  
  
  
  1.50      +3 -1      parrot/include/parrot/string_funcs.h
  
  Index: string_funcs.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
  retrieving revision 1.49
  retrieving revision 1.50
  diff -u -r1.49 -r1.50
  --- string_funcs.h    28 Feb 2005 18:01:22 -0000      1.49
  +++ string_funcs.h    1 Mar 2005 14:19:46 -0000       1.50
  @@ -1,7 +1,7 @@
   /* string_funcs.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: string_funcs.h,v 1.49 2005/02/28 18:01:22 leo Exp $
  + *     $Id: string_funcs.h,v 1.50 2005/03/01 14:19:46 leo Exp $
    *  Overview:
    *     This is the api header for the string subsystem
    *  Data Structure and Algorithms:
  @@ -115,6 +115,8 @@
   INTVAL Parrot_string_find_newline(Interp *, STRING *, INTVAL offset);
   INTVAL Parrot_string_find_word_boundary(Interp *, STRING *, INTVAL offset);
   
  +STRING* Parrot_string_trans_charset(Interp *, STRING *src,
  +        INTVAL charset_nr, STRING *dest);
   
   #endif /* PARROT_IN_CORE */
   #endif /* PARROT_STRING_FUNCS_H_GUARD */
  
  
  
  1.60      +6 -0      parrot/ops/ops.num
  
  Index: ops.num
  ===================================================================
  RCS file: /cvs/public/parrot/ops/ops.num,v
  retrieving revision 1.59
  retrieving revision 1.60
  diff -u -r1.59 -r1.60
  --- ops.num   28 Feb 2005 18:01:24 -0000      1.59
  +++ ops.num   1 Mar 2005 14:19:47 -0000       1.60
  @@ -1422,3 +1422,9 @@
   find_word_boundary_i_s_ic      1392
   find_word_boundary_i_sc_i      1393
   find_word_boundary_i_sc_ic     1394
  +trans_charset_s_i              1395
  +trans_charset_s_ic             1396
  +trans_charset_s_s_i            1397
  +trans_charset_s_s_ic           1398
  +trans_charset_s_sc_i           1399
  +trans_charset_s_sc_ic          1400
  
  
  
  1.35      +21 -0     parrot/ops/string.ops
  
  Index: string.ops
  ===================================================================
  RCS file: /cvs/public/parrot/ops/string.ops,v
  retrieving revision 1.34
  retrieving revision 1.35
  diff -u -r1.34 -r1.35
  --- string.ops        28 Feb 2005 18:01:24 -0000      1.34
  +++ string.ops        1 Mar 2005 14:19:47 -0000       1.35
  @@ -634,6 +634,16 @@
   Return the charset number of the charset named $2. If the charset doesn't
   exit, throw an exception.
   
  +=item B<trans_charset>(inout STR, in INT)
  +
  +Change the string to have the specified charset.
  +
  +=item B<trans_charset>(out STR, in STR, in INT)
  +
  +Create a string $1 from $2 with the specified charset.
  +
  +Both functions may throw an exception on information loss.
  +
   =cut
   
   op charset(out INT, in STR) :base_core {
  @@ -655,6 +665,17 @@
     goto NEXT();
   }
   
  +op trans_charset(inout STR, in INT) {
  +  $1 = Parrot_string_trans_charset(interpreter, $1, $2, NULL);
  +  goto NEXT();
  +}
  +
  +op trans_charset(out STR, in STR, in INT) {
  +  STRING *dest = new_string_header(interpreter, 0);
  +  $1 = Parrot_string_trans_charset(interpreter, $2, $3, dest);
  +  goto NEXT();
  +}
  +
   =item B<is_whitespace>(out INT, in STR, in INT)
   
   Set $1 to 1 if the codepoint of string $2 at offset $3 is whitespace.
  
  
  
  1.9       +60 -1     parrot/src/charset.c
  
  Index: charset.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/charset.c,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- charset.c 1 Mar 2005 08:31:02 -0000       1.8
  +++ charset.c 1 Mar 2005 14:19:48 -0000       1.9
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2004 The Perl Foundation.  All Rights Reserved.
  -$Id: charset.c,v 1.8 2005/03/01 08:31:02 leo Exp $
  +$Id: charset.c,v 1.9 2005/03/01 14:19:48 leo Exp $
   
   =head1 NAME
   
  @@ -14,6 +14,7 @@
   
   #define PARROT_NO_EXTERN_CHARSET_PTRS
   #include "parrot/parrot.h"
  +#include "../charset/iso-8859-1.h"
   
   CHARSET *Parrot_iso_8859_1_charset_ptr;
   CHARSET *Parrot_binary_charset_ptr;
  @@ -26,8 +27,15 @@
    */
   
   typedef struct {
  +    CHARSET *to;
  +    charset_converter_t func;
  +} To_converter;
  +
  +typedef struct {
       CHARSET *charset;
       STRING  *name;
  +    int n_converters;
  +    To_converter *to_converters;
   } One_charset;
   
   typedef struct {
  @@ -126,6 +134,14 @@
       return all_charsets->set[number_of_charset].name;
   }
   
  +CHARSET*
  +Parrot_get_charset(Interp *interpreter, INTVAL number_of_charset)
  +{
  +    if (number_of_charset >= all_charsets->n_charsets)
  +        return NULL;
  +    return all_charsets->set[number_of_charset].charset;
  +}
  +
   const char *
   Parrot_charset_c_name(Interp *interpreter, INTVAL number_of_charset)
   {
  @@ -153,6 +169,7 @@
       all_charsets->n_charsets++;
       all_charsets->set[n].charset = charset;
       all_charsets->set[n].name = const_string(interpreter, charsetname);
  +    all_charsets->set[n].n_converters = 0;
   
       return 1;
   }
  @@ -183,6 +200,9 @@
       }
       if (!strcmp("ascii", charsetname)) {
           Parrot_ascii_charset_ptr = charset;
  +        Parrot_register_charset_converter(interpreter,
  +                Parrot_iso_8859_1_charset_ptr, charset,
  +                charset_cvt_iso_8859_1_to_ascii);
           return register_charset(interpreter, charsetname, charset);
       }
       return 0;
  @@ -202,12 +222,51 @@
       return Parrot_default_charset_ptr;
   }
   
  +
   charset_converter_t
   Parrot_find_charset_converter(Interp *interpreter, CHARSET *lhs, CHARSET 
*rhs)
   {
  +    int i, j, n, nc;
  +
  +    n = all_charsets->n_charsets;
  +    for (i = 0; i < n; ++i) {
  +        if (lhs == all_charsets->set[i].charset) {
  +            One_charset *left = all_charsets->set + i;
  +
  +            nc = left->n_converters;
  +            for (j = 0; j < nc; ++j) {
  +                if (left->to_converters[j].to == rhs)
  +                    return left->to_converters[j].func;
  +            }
  +        }
  +    }
       return NULL;
   }
   
  +void
  +Parrot_register_charset_converter(Interp *interpreter,
  +        CHARSET *lhs, CHARSET *rhs, charset_converter_t func)
  +{
  +    int i, n, nc;
  +
  +    n = all_charsets->n_charsets;
  +    for (i = 0; i < n; ++i) {
  +        if (lhs == all_charsets->set[i].charset) {
  +            One_charset *left = all_charsets->set + i;
  +
  +            nc = left->n_converters++;
  +            if (nc) {
  +                left->to_converters = mem_sys_realloc(left->to_converters,
  +                        sizeof(To_converter) * (nc + 1));
  +            }
  +            else
  +                left->to_converters = mem_sys_allocate(sizeof(To_converter));
  +            left->to_converters[nc].to = rhs;
  +            left->to_converters[nc].func = func;
  +        }
  +    }
  +}
  +
   /*
    * Local variables:
    * c-indentation-style: bsd
  
  
  
  1.238     +36 -1     parrot/src/string.c
  
  Index: string.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/string.c,v
  retrieving revision 1.237
  retrieving revision 1.238
  diff -u -r1.237 -r1.238
  --- string.c  28 Feb 2005 18:01:28 -0000      1.237
  +++ string.c  1 Mar 2005 14:19:48 -0000       1.238
  @@ -1,6 +1,6 @@
   /*
   Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
  -$Id: string.c,v 1.237 2005/02/28 18:01:28 leo Exp $
  +$Id: string.c,v 1.238 2005/03/01 14:19:48 leo Exp $
   
   =head1 NAME
   
  @@ -2591,6 +2591,41 @@
       return CHARSET_FIND_WORD_BOUNDARY(interpreter, s, offset);
   }
   
  +STRING*
  +Parrot_string_trans_charset(Interp *interpreter, STRING *src,
  +        INTVAL charset_nr, STRING *dest)
  +{
  +    CHARSET *new_charset;
  +
  +    if (!src)
  +        return NULL;
  +    new_charset = Parrot_get_charset(interpreter, charset_nr);
  +    if (!new_charset)
  +        real_exception(interpreter, NULL, INVALID_CHARTYPE,
  +                "charset #%d not found", (int) charset_nr);
  +    /*
  +     * dest is an empty string header or NULL, if an inplace
  +     * operation is desired
  +     */
  +    if (dest) {
  +        if (new_charset == src->charset) {
  +            Parrot_reuse_COW_reference(interpreter, src, dest);
  +            dest->charset = new_charset;
  +            /* keep encoding */
  +            return dest;
  +        }
  +        dest->charset = new_charset;
  +        /* XXX prefered encoding for charset */
  +        dest->encoding = PARROT_DEFAULT_ENCODING;
  +    }
  +    else {
  +        if (new_charset == src->charset) {
  +            return src;
  +        }
  +    }
  +    return CHARSET_TO_CHARSET(interpreter, src, new_charset, dest);
  +}
  +
   /*
   
   =back
  
  
  
  1.6       +62 -2     parrot/t/op/string_cs.t
  
  Index: string_cs.t
  ===================================================================
  RCS file: /cvs/public/parrot/t/op/string_cs.t,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- string_cs.t       28 Feb 2005 18:01:30 -0000      1.5
  +++ string_cs.t       1 Mar 2005 14:19:49 -0000       1.6
  @@ -1,6 +1,6 @@
   #! perl -w
   # Copyright: 2001-2004 The Perl Foundation.  All Rights Reserved.
  -# $Id: string_cs.t,v 1.5 2005/02/28 18:01:30 leo Exp $
  +# $Id: string_cs.t,v 1.6 2005/03/01 14:19:49 leo Exp $
   
   =head1 NAME
   
  @@ -16,7 +16,7 @@
   
   =cut
   
  -use Parrot::Test tests => 16;
  +use Parrot::Test tests => 20;
   use Test::More;
   
   output_is( <<'CODE', <<OUTPUT, "basic syntax" );
  @@ -245,3 +245,63 @@
   CODE
   0 2 3 6 -1 ok
   OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i");
  +    set S0, "abc"
  +    find_charset I0, "ascii"
  +    trans_charset S1, S0, I0
  +    print S1
  +    print "\n"
  +    charset I0, S1
  +    charsetname S2, I0
  +    print S2
  +    print "\n"
  +    end
  +CODE
  +abc
  +ascii
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i");
  +    set S1, "abc"
  +    find_charset I0, "ascii"
  +    trans_charset S1, I0
  +    print S1
  +    print "\n"
  +    charset I0, S1
  +    charsetname S2, I0
  +    print S2
  +    print "\n"
  +    end
  +CODE
  +abc
  +ascii
  +OUTPUT
  +
  +
  +output_like( <<'CODE', <<OUTPUT, "trans_charset_s_i - lossy");
  +    set S1, "abc�"
  +    find_charset I0, "ascii"
  +    trans_charset S1, I0
  +    print "never\n"
  +    end
  +CODE
  +/lossy conversion to ascii/
  +OUTPUT
  +
  +output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i - same");
  +    set S1, ascii:"abc"
  +    find_charset I0, "ascii"
  +    trans_charset S1, I0
  +    print S1
  +    print "\n"
  +    charset I0, S1
  +    charsetname S2, I0
  +    print S2
  +    print "\n"
  +    end
  +CODE
  +abc
  +ascii
  +OUTPUT
  +
  
  
  

Reply via email to