[svn:parrot] r9901 - in trunk: charset encodings include/parrot io src

leo Fri, 11 Nov 2005 06:27:33 -0800

Author: leo
Date: Fri Nov 11 06:27:10 2005
New Revision: 9901

Modified:
   trunk/charset/unicode.c
   trunk/encodings/fixed_8.c
   trunk/encodings/fixed_8.h
   trunk/encodings/ucs2.c
   trunk/encodings/utf16.c
   trunk/encodings/utf8.c
   trunk/include/parrot/encoding.h
   trunk/io/io_utf8.c
   trunk/src/string.c
Log:
unicode encodings - simplify to_encoding


* actually to_encoding is a few lines longer now
* but does the work of copy_to_encoding now too
* this gets rid of code duplication and half
  of the unimplemented conversions
* start adding doc


Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c     (original)
+++ trunk/charset/unicode.c     Fri Nov 11 06:27:10 2005
@@ -123,7 +123,7 @@ downcase(Interp *interpreter, STRING *sr
     UErrorCode err;
     int dest_len, src_len;
 
-    Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+    src = Parrot_utf16_encoding_ptr->to_encoding(interpreter, src, NULL);
     /*
 U_CAPI int32_t U_EXPORT2
 u_strToLower(UChar *dest, int32_t destCapacity,

Modified: trunk/encodings/fixed_8.c
==============================================================================
--- trunk/encodings/fixed_8.c   (original)
+++ trunk/encodings/fixed_8.c   Fri Nov 11 06:27:10 2005
@@ -19,23 +19,14 @@ This file implements the encoding functi
 
 #define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl fixed_8")
 
-/* This function needs to go through and get all the code points one
-   by one and turn them into a byte */
-static void
-to_encoding(Interp *interpreter, STRING *source_string)
-{
-    UNIMPL;
-}
-
 static STRING *
-copy_to_encoding(Interp *interpreter, STRING *source_string)
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
 {
-    STRING *return_string = NULL;
-
     UNIMPL;
-    return return_string;
+    return NULL;
 }
 
+
 /* codepoints are bytes, so delegate */
 static UINTVAL
 get_codepoint(Interp *interpreter, const STRING *source_string,
@@ -214,7 +205,6 @@ Parrot_encoding_fixed_8_init(Interp *int
        "fixed_8",
        1, /* Max bytes per codepoint */
        to_encoding,
-       copy_to_encoding,
        get_codepoint,
        set_codepoint,
        get_byte,

Modified: trunk/encodings/fixed_8.h
==============================================================================
--- trunk/encodings/fixed_8.h   (original)
+++ trunk/encodings/fixed_8.h   Fri Nov 11 06:27:10 2005
@@ -13,8 +13,6 @@
 #if !defined(PARROT_ENCODING_FIXED_8_H_GUARD)
 #define PARROT_ENCODING_FIXED_8_H_GUARD
 
-static void to_encoding(Interp *interpreter, STRING *source_string);
-static STRING *copy_to_encoding(Interp *interpreter, STRING *source_string);
 static UINTVAL get_codepoint(Interp *interpreter, const STRING *source_string, 
UINTVAL offset);
 static void set_codepoint(Interp *interpreter, STRING *source_string, UINTVAL 
offset, UINTVAL codepoint);
 static UINTVAL get_byte(Interp *interpreter, const STRING *source_string, 
UINTVAL offset);

Modified: trunk/encodings/ucs2.c
==============================================================================
--- trunk/encodings/ucs2.c      (original)
+++ trunk/encodings/ucs2.c      Fri Nov 11 06:27:10 2005
@@ -33,37 +33,18 @@ UCS-2 encoding with the help of the ICU 
 static void iter_init(Interp *, String *src, String_iter *iter);
 
 
-static void
-to_encoding(Interp *interpreter, STRING *src)
-{
-    if (src->encoding == Parrot_ucs2_encoding_ptr)
-        return;
-    Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
-    /*
-     * conversion to utf16 downgrads to ucs-2 if possible - check result
-     */
-    if (src->encoding == Parrot_utf16_encoding_ptr) {
-        real_exception(interpreter, NULL, E_UnicodeError,
-            "can't convert string with surrogates to ucs2");
-    }
-}
-
 static STRING *
-copy_to_encoding(Interp *interpreter, STRING *src)
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
 {
-    STRING *dest;
-
-    if (src->encoding == Parrot_ucs2_encoding_ptr)
-        return string_copy(interpreter, src);
-    dest = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, src);
+    STRING *result = Parrot_utf16_encoding_ptr->to_encoding(interpreter, src, 
dest);
     /*
      * conversion to utf16 downgrads to ucs-2 if possible - check result
      */
-    if (dest->encoding == Parrot_utf16_encoding_ptr) {
+    if (result->encoding == Parrot_utf16_encoding_ptr) {
         real_exception(interpreter, NULL, E_UnicodeError,
             "can't convert string with surrogates to ucs2");
     }
-    return dest;
+    return result;
 }
 
 static UINTVAL
@@ -258,7 +239,6 @@ Parrot_encoding_ucs2_init(Interp *interp
        "ucs2",
        2, /* Max bytes per codepoint 0 .. 0x10ffff */
        to_encoding,
-       copy_to_encoding,
        get_codepoint,
        set_codepoint,
        get_byte,

Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c     (original)
+++ trunk/encodings/utf16.c     Fri Nov 11 06:27:10 2005
@@ -32,10 +32,20 @@ UTF-16 encoding with the help of the ICU
 
 
 static void iter_init(Interp *, String *src, String_iter *iter);
-/* This function needs to go through and get all the code points one
-   by one and turn them into a utf16 sequence */
-static void
-to_encoding(Interp *interpreter, STRING *src)
+
+/*
+
+=item C<static STRING *to_encoding(Interp *, STRING *src, STRING *dest)>
+
+Convert string C<src> to this encoding. If C<dest> is set
+fill it with the converted result, else operate inplace.
+
+=cut
+
+*/
+
+static STRING *
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
 {
 #if PARROT_HAS_ICU
     UErrorCode err;
@@ -43,18 +53,29 @@ to_encoding(Interp *interpreter, STRING 
     UChar *p;
 #endif
     int src_len;
+    int in_place = dest == NULL;
+    STRING *result;
 
     if (src->encoding == Parrot_utf16_encoding_ptr ||
-        src->encoding == Parrot_ucs2_encoding_ptr)
-        return;
+            src->encoding == Parrot_ucs2_encoding_ptr)
+        return in_place ? src : string_copy(interpreter, src);
     /*
      * TODO adapt string creation functions
      */
-    src->charset  = Parrot_unicode_charset_ptr;
-    src->encoding = Parrot_utf16_encoding_ptr;
     src_len = src->strlen;
-    if (!src_len)
-        return;
+    if (in_place) {
+        result = src;
+    }
+    else {
+        result = dest;
+    }
+    result->charset  = Parrot_unicode_charset_ptr;
+    result->encoding = Parrot_utf16_encoding_ptr;
+    result->strlen = src_len;
+    if (!src_len) {
+        result->encoding = Parrot_ucs2_encoding_ptr;
+        return result;
+    }
     /*
        u_strFromUTF8(UChar *dest,
        int32_t destCapacity,
@@ -62,11 +83,18 @@ to_encoding(Interp *interpreter, STRING 
        const char *src,
        int32_t srcLength,
        UErrorCode *pErrorCode);
-     */
+       */
 #if PARROT_HAS_ICU
-    /* need intermediate memory */
-    p = mem_sys_allocate(src_len * sizeof(UChar));
-    if (src->charset == Parrot_iso_8859_1_charset_ptr) {
+    if (in_place) {
+        /* need intermediate memory */
+        p = mem_sys_allocate(src_len * sizeof(UChar));
+    }
+    else {
+        Parrot_reallocate_string(interpreter, dest, sizeof(UChar) * src_len);
+        p = dest->strstart;
+    }
+    if (src->charset == Parrot_iso_8859_1_charset_ptr ||
+            src->charset == Parrot_ascii_charset_ptr) {
         for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len) {
             p[dest_len] = (UChar)((unsigned char*)src->strstart)[dest_len];
         }
@@ -79,80 +107,32 @@ to_encoding(Interp *interpreter, STRING 
             /*
              * have to resize - required len in UChars is in dest_len
              */
-            p = mem_sys_realloc(p, dest_len * sizeof(UChar));
+            if (in_place)
+                p = mem_sys_realloc(p, dest_len * sizeof(UChar));
+            else {
+                Parrot_reallocate_string(interpreter, dest, sizeof(UChar) * 
dest_len);
+                p = dest->strstart;
+            }
             u_strFromUTF8(p, dest_len,
                     &dest_len, src->strstart, src->bufused, &err);
             assert(U_SUCCESS(err));
         }
     }
-    src->bufused = dest_len * sizeof(UChar);
-    Parrot_reallocate_string(interpreter, src, src->bufused);
-    memcpy(src->strstart, p, src->bufused);
-    mem_sys_free(p);
-
-    /* downgrade if possible */
-    if (dest_len == (int)src->strlen)
-        src->encoding = Parrot_ucs2_encoding_ptr;
-#else
-    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
-            "no ICU lib loaded");
-#endif
-}
-
-static STRING *
-copy_to_encoding(Interp *interpreter, STRING *src)
-{
-    STRING *dest;
-#if PARROT_HAS_ICU
-    UErrorCode err;
-    int dest_len;
-#endif
-    int src_len;
-
-    if (src->encoding == Parrot_utf16_encoding_ptr ||
-            src->encoding == Parrot_ucs2_encoding_ptr)
-        return string_copy(interpreter, src);
-
-    src_len  = src->strlen;
-    if (!src_len) {
-        return string_make_direct(interpreter, NULL, 0,
-                Parrot_utf16_encoding_ptr,
-                Parrot_unicode_charset_ptr, 0);
-
-    }
-#if PARROT_HAS_ICU
-    dest = string_make_direct(interpreter, NULL, sizeof(UChar) * src_len,
-                Parrot_utf16_encoding_ptr,
-                Parrot_unicode_charset_ptr, 0);
-    dest->strlen = src_len;
-    if (src->charset == Parrot_iso_8859_1_charset_ptr) {
-        UChar *p = (UChar*) dest->strstart;
-        for (dest_len = 0; dest_len < (int)src->strlen; ++dest_len, ++p) {
-            *p = (UChar)((unsigned char*)src->strstart)[dest_len];
-        }
-    }
-    else {
-        err = U_ZERO_ERROR;
-        u_strFromUTF8(dest->strstart, src_len,
-                &dest_len, src->strstart, src->bufused, &err);
-        if (!U_SUCCESS(err)) {
-            Parrot_allocate_string(interpreter, dest, sizeof(UChar) * 
dest_len);
-            u_strFromUTF8(dest->strstart, dest_len,
-                    &dest_len, src->strstart, src->bufused, &err);
-            assert(U_SUCCESS(err));
-        }
+    result->bufused = dest_len * sizeof(UChar);
+    if (in_place) {
+        Parrot_reallocate_string(interpreter, src, src->bufused);
+        memcpy(src->strstart, p, src->bufused);
+        mem_sys_free(p);
     }
-    dest->bufused = dest_len * sizeof(UChar);
+
     /* downgrade if possible */
     if (dest_len == (int)src->strlen)
-        dest->encoding = Parrot_ucs2_encoding_ptr;
-
+        result->encoding = Parrot_ucs2_encoding_ptr;
 #else
     real_exception(interpreter, NULL, E_LibraryNotLoadedError,
             "no ICU lib loaded");
 #endif
-
-    return dest;
+    return result;
 }
 
 static UINTVAL
@@ -367,7 +347,6 @@ Parrot_encoding_utf16_init(Interp *inter
        "utf16",
        4, /* Max bytes per codepoint 0 .. 0x10ffff */
        to_encoding,
-       copy_to_encoding,
        get_codepoint,
        set_codepoint,
        get_byte,

Modified: trunk/encodings/utf8.c
==============================================================================
--- trunk/encodings/utf8.c      (original)
+++ trunk/encodings/utf8.c      Fri Nov 11 06:27:10 2005
@@ -307,54 +307,78 @@ utf8_set_position(Interp *interpreter, S
 }
 
 
-/* This function needs to go through and get all the code points one
-   by one and turn them into a utf8 sequence */
-static void
-to_encoding(Interp *interpreter, STRING *src)
-{
-    if (src->encoding == Parrot_utf8_encoding_ptr)
-        return;
-    UNIMPL;
-}
-
 static STRING *
-copy_to_encoding(Interp *interpreter, STRING *src)
+to_encoding(Interp *interpreter, STRING *src, STRING *dest)
 {
-    STRING *dest;
-    String_iter src_iter, dest_iter;
-    UINTVAL offs, c;
+    STRING *result;
+    String_iter src_iter;
+    UINTVAL offs, c, dest_len, dest_pos, src_len;
+    int in_place = dest == NULL;
+    unsigned char *new_pos, *pos, *p;
 
     if (src->encoding == Parrot_utf8_encoding_ptr)
-        return string_copy(interpreter, src);
+        return in_place ? src : string_copy(interpreter, src);
+    src_len = src->strlen;
+    if (in_place) {
+        result = src;
+    }
+    else {
+        result = dest;
+    }
 
-    /*
-     * TODO adapt string creation functions
-     */
-    dest = string_make_empty(interpreter, enum_stringrep_one, src->strlen);
-    dest->charset  = Parrot_unicode_charset_ptr;
-    dest->encoding = Parrot_utf8_encoding_ptr;
-    dest->strlen   = src->strlen;
+    result->charset  = Parrot_unicode_charset_ptr;
+    result->encoding = Parrot_utf8_encoding_ptr;
+    result->strlen   = src_len;
 
     if (!src->strlen)
         return dest;
 
-    ENCODING_ITER_INIT(interpreter, src, &src_iter);
-    ENCODING_ITER_INIT(interpreter, dest, &dest_iter);
-
-    for (offs = 0; offs < src->strlen; ++offs) {
-        c = src_iter.get_and_advance(interpreter, &src_iter);
-        if (dest_iter.bytepos >= PObj_buflen(dest) - 4) {
-            UINTVAL need = (src->strlen - offs) * 1.5;
-            if (need < 16)
-                need = 16;
-            Parrot_reallocate_string(interpreter, dest,
-                    PObj_buflen(dest) + need);
+    if (in_place) {
+        /* need intermediate memory */
+        p = mem_sys_allocate(src_len);
+    }
+    else {
+        Parrot_reallocate_string(interpreter, dest, src_len);
+        p = dest->strstart;
+    }
+    if (src->charset == Parrot_iso_8859_1_charset_ptr ||
+            src->charset == Parrot_ascii_charset_ptr) {
+        for (dest_len = 0; dest_len < src_len; ++dest_len) {
+            p[dest_len] = ((unsigned char*)src->strstart)[dest_len];
+        }
+        result->bufused = dest_len;
+    }
+    else {
+        ENCODING_ITER_INIT(interpreter, src, &src_iter);
+        dest_len = src_len;
+        dest_pos = 0;
+        for (offs = 0; offs < src_len; ++offs) {
+            c = src_iter.get_and_advance(interpreter, &src_iter);
+            if (dest_len - dest_pos < 6) {
+                UINTVAL need = (src->strlen - offs) * 1.5;
+                if (need < 16)
+                    need = 16;
+                dest_len += need;
+                if (in_place)
+                    p = mem_sys_realloc(p, dest_len);
+                else {
+                    Parrot_reallocate_string(interpreter, dest, dest_len);
+                    p = dest->strstart;
+                }
+            }
+
+            pos = p + dest_pos;
+            new_pos = utf8_encode(pos, c);
+            dest_pos += (new_pos - pos);
         }
-        dest_iter.set_and_advance(interpreter, &dest_iter, c);
+        result->bufused = dest_pos;
+    }
+    if (in_place) {
+        Parrot_reallocate_string(interpreter, src, src->bufused);
+        memcpy(src->strstart, p, src->bufused);
+        mem_sys_free(p);
     }
-    assert(dest->strlen  == dest_iter.charpos);
-    dest->bufused = dest_iter.bytepos;
-    return dest;
+    return result;
 }
 
 static UINTVAL
@@ -533,7 +557,6 @@ Parrot_encoding_utf8_init(Interp *interp
        "utf8",
        4, /* Max bytes per codepoint 0 .. 0x10ffff */
        to_encoding,
-       copy_to_encoding,
        get_codepoint,
        set_codepoint,
        get_byte,

Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h     (original)
+++ trunk/include/parrot/encoding.h     Fri Nov 11 06:27:10 2005
@@ -15,8 +15,7 @@
 
 #include "parrot/parrot.h"
 
-typedef void (*encoding_to_encoding_t)(Interp*, STRING *string);
-typedef STRING *(*encoding_copy_to_encoding_t)(Interp*, STRING *string);
+typedef STRING * (*encoding_to_encoding_t)(Interp*, STRING *src, STRING *dest);
 typedef UINTVAL (*encoding_get_codepoint_t)(Interp*, const STRING *src, 
UINTVAL offset);
 typedef void (*encoding_set_codepoint_t)(Interp*, STRING *src, UINTVAL offset, 
UINTVAL codepoint);
 typedef UINTVAL (*encoding_get_byte_t)(Interp*, const STRING *src, UINTVAL 
offset);
@@ -42,7 +41,6 @@ struct _encoding {
     const char *name;
     UINTVAL max_bytes_per_codepoint;
     encoding_to_encoding_t to_encoding;
-    encoding_copy_to_encoding_t copy_to_encoding;
     encoding_get_codepoint_t get_codepoint;
     encoding_set_codepoint_t  set_codepoint;
     encoding_get_byte_t  get_byte;
@@ -91,10 +89,6 @@ const char * Parrot_encoding_c_name(Inte
 
 #define ENCODING_MAX_BYTES_PER_CODEPOINT(i, src) \
     ((ENCODING *)src->encoding)->max_bytes_per_codepoint
-#define ENCODING_TO_ENCODING(i, src, offset, count) \
-    ((ENCODING *)src->encoding)->to_encoding(i, src, offset, count)
-#define ENCODING_COPY_TO_ENCODING(i, src) \
-    ((ENCODING *)src->encoding)->copy_to_encoding(i, src)
 #define ENCODING_GET_CODEPOINT(i, src, offset) \
     ((ENCODING *)src->encoding)->get_codepoint(i, src, offset)
 #define ENCODING_SET_CODEPOINT(i, src, offset, codepoint) \

Modified: trunk/io/io_utf8.c
==============================================================================
--- trunk/io/io_utf8.c  (original)
+++ trunk/io/io_utf8.c  Fri Nov 11 06:27:10 2005
@@ -52,7 +52,8 @@ PIO_utf8_write(theINTERP, ParrotIOLayer 
     if (s->encoding == Parrot_utf8_encoding_ptr)
         return PIO_write_down(interpreter, l->down, io, s);
 
-    dest = Parrot_utf8_encoding_ptr->copy_to_encoding(interpreter, s);
+    dest = Parrot_utf8_encoding_ptr->to_encoding(interpreter, s,
+            new_string_header(interpreter, 0));
     return PIO_write_down(interpreter, l->down, io, dest);
 }
 

Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c  (original)
+++ trunk/src/string.c  Fri Nov 11 06:27:10 2005
@@ -484,8 +484,9 @@ string_append(Interp *interpreter,
         a->charset = cs;
     else {
         /* upgrade to utf16 */
-        Parrot_utf16_encoding_ptr->to_encoding(interpreter, a);
-        b = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, b);
+        Parrot_utf16_encoding_ptr->to_encoding(interpreter, a, NULL);
+        b = Parrot_utf16_encoding_ptr->to_encoding(interpreter, b,
+                new_string_header(interpreter, 0));
         /*
          * result could be mixed ucs2 / utf16
          */
@@ -1178,8 +1179,9 @@ string_replace(Interp *interpreter, STRI
 
     /* may have different reps..... */
     if ( !(cs = string_rep_compatible(interpreter, src, rep, &enc))) {
-        Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
-        rep = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, rep);
+        Parrot_utf16_encoding_ptr->to_encoding(interpreter, src, NULL);
+        rep = Parrot_utf16_encoding_ptr->to_encoding(interpreter, rep,
+                new_string_header(interpreter, 0));
     }
     else {
         src->charset = cs;
@@ -2659,8 +2661,12 @@ Parrot_string_find_not_cclass(Interp *in
 Parrot_string_trans_charset(Interp *interpreter, STRING *src,
         INTVAL charset_nr, STRING *dest)>
 
-If C<dest> == NULL convert  C<src> to the given charset inplace, else
-return a copy of C<src> with the charset in dest.
+=item C< STRING*
+Parrot_string_trans_encoding(Interp *interpreter, STRING *src,
+        INTVAL charset_nr, STRING *dest)>
+
+If C<dest> == NULL convert  C<src> to the given charset or encoding inplace,
+else return a copy of C<src> with the charset/encoding in dest.
 
 =cut
 
@@ -2718,20 +2724,18 @@ Parrot_string_trans_encoding(Interp *int
      * operation is desired
      */
     if (dest) {
+        dest->encoding = new_encoding;
         if (new_encoding == src->encoding) {
             Parrot_reuse_COW_reference(interpreter, src, dest);
-            dest->encoding = new_encoding;
             return dest;
         }
-        return new_encoding->copy_to_encoding(interpreter, src);
     }
     else {
         if (new_encoding == src->encoding) {
             return src;
         }
     }
-    new_encoding->to_encoding(interpreter, src);
-    return src;
+    return new_encoding->to_encoding(interpreter, src, dest);
 }
 
 /*

[svn:parrot] r9901 - in trunk: charset encodings include/parrot io src

Reply via email to