Author: leo
Date: Thu Nov 10 01:54:28 2005
New Revision: 9875

Added:
   trunk/encodings/ucs2.c   (contents, props changed)
   trunk/encodings/ucs2.h   (contents, props changed)
Modified:
   trunk/MANIFEST
   trunk/encodings/utf16.c
   trunk/include/parrot/encoding.h
   trunk/src/charset.c
   trunk/src/encoding.c
Log:
unicode improvements - one more encoding

* implement ucs2 encoding
* downgrade utf16 to ucs2 if no surrogates


Modified: trunk/MANIFEST
==============================================================================
--- trunk/MANIFEST      (original)
+++ trunk/MANIFEST      Thu Nov 10 01:54:28 2005
@@ -445,8 +445,10 @@ editor/filetype_parrot.vim              
 editor/indent_imc.vim                             [devel]
 encodings/fixed_8.c                               []
 encodings/fixed_8.h                               []
-encodings/utf16.c                                  []
-encodings/utf16.h                                  []
+encodings/ucs2.c                                  []
+encodings/ucs2.h                                  []
+encodings/utf16.c                                 []
+encodings/utf16.h                                 []
 encodings/utf8.c                                  []
 encodings/utf8.h                                  []
 examples/README                                   [main]doc

Added: trunk/encodings/ucs2.c
==============================================================================
--- (empty file)
+++ trunk/encodings/ucs2.c      Thu Nov 10 01:54:28 2005
@@ -0,0 +1,285 @@
+/*
+Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
+$Id$
+
+=head1 NAME
+
+encodings/ucs2.c - UCS-2 encoding
+
+=head1 DESCRIPTION
+
+UCS-2 encoding with the help of the ICU library.
+
+=head2 Functions
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "parrot/unicode.h"
+
+#include "ucs2.h"
+
+#if PARROT_HAS_ICU
+#include <unicode/ustring.h>
+#endif
+
+#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl ucs2")
+
+
+static void iter_init(Interp *, String *src, String_iter *iter);
+
+
+/*
+ * to_encoding: convert 'src' in place to ucs2.
+ *
+ * A string that is already ucs2 is left untouched.  Anything else is handed
+ * to the utf16 to_encoding, which downgrades its result to ucs2 when that
+ * is possible.  If the string is still tagged utf16 afterwards it cannot be
+ * held in ucs2 and an E_UnicodeError is raised.
+ */
+static void
+to_encoding(Interp *interpreter, STRING *src)
+{
+    if (src->encoding != Parrot_ucs2_encoding_ptr) {
+        Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+        /* still utf16 here means the downgrade to ucs2 did not happen */
+        if (src->encoding == Parrot_utf16_encoding_ptr)
+            real_exception(interpreter, NULL, E_UnicodeError,
+                "can't convert string with surrogates to ucs2");
+    }
+}
+
+/*
+ * copy_to_encoding: return a ucs2 version of 'src' as a separate STRING.
+ *
+ * A ucs2 source is duplicated with string_copy().  Any other source goes
+ * through the utf16 copy_to_encoding; if the string that comes back is
+ * still tagged utf16, an E_UnicodeError is raised.
+ */
+static STRING *
+copy_to_encoding(Interp *interpreter, STRING *src)
+{
+    STRING *converted;
+
+    if (src->encoding == Parrot_ucs2_encoding_ptr) {
+        converted = string_copy(interpreter, src);
+    }
+    else {
+        converted = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter,
+                src);
+        /* the utf16 code downgrades to ucs2 where it can - check result */
+        if (converted->encoding == Parrot_utf16_encoding_ptr) {
+            real_exception(interpreter, NULL, E_UnicodeError,
+                "can't convert string with surrogates to ucs2");
+        }
+    }
+    return converted;
+}
+
+/*
+ * get_codepoint: return the UChar stored at index 'offset' of 'src'.
+ * 'offset' counts UChar units, not bytes; no bounds check is made here.
+ */
+static UINTVAL
+get_codepoint(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+    return ((const UChar *) src->strstart)[offset];
+}
+
+/*
+ * set_codepoint: store 'codepoint' in the UChar at index 'offset' of 'src'.
+ * The value is narrowed to UChar on assignment and 'offset' is not
+ * range-checked.
+ */
+static void
+set_codepoint(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL codepoint)
+{
+    ((UChar *) src->strstart)[offset] = (UChar) codepoint;
+}
+
+/*
+ * get_byte: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2"); 0 is the
+ * value returned should that call come back.
+ */
+static UINTVAL
+get_byte(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+    UNIMPL;
+    return 0;
+}
+
+/*
+ * set_byte: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2"); the string
+ * is never written.
+ */
+static void
+set_byte(Interp *interpreter, const STRING *src,
+       UINTVAL offset, UINTVAL byte)
+{
+    UNIMPL;
+}
+
+/*
+ * get_codepoints: return 'count' codepoints of 'src' starting at 'offset'.
+ *
+ * The result is obtained from Parrot_make_COW_reference(src); its strstart,
+ * bufused and strlen are then adjusted so that it describes only the
+ * requested range.  The byte positions of both ends of the range are read
+ * from a string iterator positioned at 'offset' and at 'offset + count'.
+ * The range is not checked against the length of 'src'.
+ */
+static STRING *
+get_codepoints(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count)
+{
+    String_iter pos;
+    UINTVAL first_byte, end_byte;
+    STRING *slice = Parrot_make_COW_reference(interpreter, src);
+
+    slice->encoding = src->encoding;
+    slice->charset = src->charset;
+
+    /* translate both ends of the range from codepoints to bytes */
+    iter_init(interpreter, src, &pos);
+    pos.set_position(interpreter, &pos, offset);
+    first_byte = pos.bytepos;
+    pos.set_position(interpreter, &pos, offset + count);
+    end_byte = pos.bytepos;
+
+    slice->strstart = (char *)slice->strstart + first_byte;
+    slice->bufused = end_byte - first_byte;
+    slice->strlen = count;
+    /* reset the hash value copied along with the reference */
+    slice->hashval = 0;
+    return slice;
+}
+
+/*
+ * get_bytes: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2"); NULL is
+ * the value returned should that call come back.
+ */
+static STRING *
+get_bytes(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count)
+{
+    UNIMPL;
+    return NULL;
+}
+
+
+/*
+ * get_codepoints_inplace: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2");
+ * 'dest_string' is never touched and NULL is the value returned should
+ * that call come back.
+ */
+static STRING *
+get_codepoints_inplace(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *dest_string)
+{
+    UNIMPL;
+    return NULL;
+}
+
+/*
+ * get_bytes_inplace: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2");
+ * 'return_string' is never touched and NULL is the value returned should
+ * that call come back.
+ */
+static STRING *
+get_bytes_inplace(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *return_string)
+{
+    UNIMPL;
+    return NULL;
+}
+
+/*
+ * set_codepoints: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2"); neither
+ * string is modified.
+ */
+static void
+set_codepoints(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *new_codepoints)
+{
+    UNIMPL;
+}
+
+/*
+ * set_bytes: not implemented for ucs2.
+ * UNIMPL calls internal_exception(UNIMPLEMENTED, "unimpl ucs2"); neither
+ * string is modified.
+ */
+static void
+set_bytes(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *new_bytes)
+{
+    UNIMPL;
+}
+
+/*
+ * become_encoding: meant to unconditionally make the string be in this
+ * encoding, if that's valid.  Not implemented for ucs2 yet: UNIMPL calls
+ * internal_exception(UNIMPLEMENTED, "unimpl ucs2") and 'src' is left as is.
+ */
+static void
+become_encoding(Interp *interpreter, STRING *src)
+{
+    UNIMPL;
+}
+
+
+/*
+ * codepoints: return the number of codepoints held in 'src'.
+ *
+ * This encoding is fixed width: the rest of this file addresses the buffer
+ * as an array of UChar (get_codepoint indexes it directly and
+ * ucs2_set_position maps position n to byte n * sizeof(UChar)).  The count
+ * is therefore the number of bytes in use divided by sizeof(UChar); no
+ * scan of the data is needed.
+ */
+static UINTVAL
+codepoints(Interp *interpreter, STRING *src)
+{
+    return src->bufused / sizeof(UChar);
+}
+
+/*
+ * bytes: return the number of bytes in use in 'src', i.e. src->bufused.
+ */
+static UINTVAL
+bytes(Interp *interpreter, STRING *src)
+{
+    const UINTVAL used = src->bufused;
+
+    return used;
+}
+
+#if PARROT_HAS_ICU
+/*
+ * ucs2_decode_and_advance: iterator callback that returns the UChar at the
+ * iterator's current position and then steps the iterator one codepoint
+ * forward (charpos is incremented, bytepos is set to the next UChar).
+ */
+static UINTVAL
+ucs2_decode_and_advance(Interp *interpreter, String_iter *i)
+{
+    const UChar *units = (const UChar *) i->str->strstart;
+    const UINTVAL idx = i->bytepos / sizeof(UChar);
+    UINTVAL unit;
+
+    /* TODO either make sure that we don't go past end or use SAFE
+     *      iter versions
+     */
+    unit = units[idx];
+    i->bytepos = (idx + 1) * sizeof(UChar);
+    i->charpos++;
+    return unit;
+}
+
+/*
+ * ucs2_encode_and_advance: iterator callback that stores 'c' (narrowed to
+ * UChar) at the iterator's current position and then steps the iterator
+ * one codepoint forward.  The write is not checked against the buffer size.
+ */
+static void
+ucs2_encode_and_advance(Interp *interpreter, String_iter *i, UINTVAL c)
+{
+    UChar *units = (UChar *) i->str->strstart;
+    const UINTVAL idx = i->bytepos / sizeof(UChar);
+
+    units[idx] = (UChar) c;
+    i->bytepos = (idx + 1) * sizeof(UChar);
+    i->charpos++;
+}
+
+/*
+ * ucs2_set_position: iterator callback that moves the iterator to
+ * codepoint 'n'.  The byte position is n * sizeof(UChar); 'n' is not
+ * checked against the string's length.
+ */
+static void
+ucs2_set_position(Interp *interpreter, String_iter *i, UINTVAL n)
+{
+    i->bytepos = n * sizeof(UChar);
+    i->charpos = n;
+}
+
+#endif
+/*
+ * iter_init: prepare 'iter' to walk 'src' from its first codepoint.
+ *
+ * The iterator is attached to 'src' and both positions are reset to 0.
+ * With ICU the three ucs2 iterator callbacks are installed; without ICU an
+ * E_LibraryNotLoadedError is raised instead.
+ */
+static void
+iter_init(Interp *interpreter, String *src, String_iter *iter)
+{
+    iter->str = src;
+    iter->charpos = 0;
+    iter->bytepos = 0;
+#if PARROT_HAS_ICU
+    iter->set_position    = ucs2_set_position;
+    iter->get_and_advance = ucs2_decode_and_advance;
+    iter->set_and_advance = ucs2_encode_and_advance;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+}
+
+/*
+ * Parrot_encoding_ucs2_init: create and register the "ucs2" encoding.
+ *
+ * A fresh ENCODING is obtained from Parrot_new_encoding(), filled from a
+ * static template that lists this file's functions, registered under the
+ * name "ucs2" and returned to the caller.
+ */
+ENCODING *
+Parrot_encoding_ucs2_init(Interp *interpreter)
+{
+    ENCODING *encoding = Parrot_new_encoding(interpreter);
+
+    /* initializer order must follow the member order of ENCODING */
+    static const ENCODING ucs2_template = {
+       "ucs2",
+       2, /* Max bytes per codepoint 0 .. 0xffff */
+       to_encoding,
+       copy_to_encoding,
+       get_codepoint,
+       set_codepoint,
+       get_byte,
+       set_byte,
+       get_codepoints,
+       get_codepoints_inplace,
+       get_bytes,
+       get_bytes_inplace,
+       set_codepoints,
+       set_bytes,
+       become_encoding,
+       codepoints,
+       bytes,
+       iter_init
+    };
+
+    memcpy(encoding, &ucs2_template, sizeof(ENCODING));
+    Parrot_register_encoding(interpreter, "ucs2", encoding);
+    return encoding;
+}
+
+/*
+
+=back
+
+=head1 SEE ALSO
+
+F<encodings/fixed_8.c>,
+F<encodings/utf8.c>,
+F<src/string.c>,
+F<include/parrot/string.h>,
+F<docs/string.pod>.
+
+=cut
+
+*/
+
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/

Added: trunk/encodings/ucs2.h
==============================================================================
--- (empty file)
+++ trunk/encodings/ucs2.h      Thu Nov 10 01:54:28 2005
@@ -0,0 +1,27 @@
+/* ucs2.h
+ *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
+ *  CVS Info
+ *     $Id$
+ *  Overview:
+ *     This is the header for the ucs2 fixed-width encoding.
+ *  Data Structure and Algorithms:
+ *  History:
+ *  Notes:
+ *  References:
+ */
+
+#if !defined(PARROT_ENCODING_UCS2_H_GUARD)
+#define PARROT_ENCODING_UCS2_H_GUARD
+
+ENCODING *Parrot_encoding_ucs2_init(Interp *);
+
+#endif /* PARROT_ENCODING_UCS2_H_GUARD */
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/

Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c     (original)
+++ trunk/encodings/utf16.c     Thu Nov 10 01:54:28 2005
@@ -81,6 +81,10 @@ to_encoding(Interp *interpreter, STRING 
     Parrot_reallocate_string(interpreter, src, src->bufused);
     memcpy(src->strstart, p, src->bufused);
     mem_sys_free(p);
+
+    /* downgrade if possible */
+    if (dest_len == (int)src->strlen)
+        src->encoding = Parrot_ucs2_encoding_ptr;
 #else
     real_exception(interpreter, NULL, E_LibraryNotLoadedError,
             "no ICU lib loaded");
@@ -124,7 +128,10 @@ copy_to_encoding(Interp *interpreter, ST
         assert(U_SUCCESS(err));
     }
     dest->bufused = dest_len * sizeof(UChar);
-    
+    /*
+     * downgrade if possible: one UChar per codepoint means no surrogate
+     * pairs were needed.  It is the converted copy that gets relabelled;
+     * the source string keeps its own encoding.
+     */
+    if (dest_len == (int)src->strlen)
+        dest->encoding = Parrot_ucs2_encoding_ptr;
+
 #else
     real_exception(interpreter, NULL, E_LibraryNotLoadedError,
             "no ICU lib loaded");
@@ -340,7 +347,7 @@ Parrot_encoding_utf16_init(Interp *inter
 
     static const ENCODING base_encoding = {
        "utf16",
-       2, /* Max bytes per codepoint 0 .. 0x10ffff */
+       4, /* Max bytes per codepoint 0 .. 0x10ffff */
        to_encoding,
        copy_to_encoding,
        get_codepoint,

Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h     (original)
+++ trunk/include/parrot/encoding.h     Thu Nov 10 01:54:28 2005
@@ -65,6 +65,7 @@ typedef struct _encoding ENCODING;
 extern ENCODING *Parrot_fixed_8_encoding_ptr;
 extern ENCODING *Parrot_utf8_encoding_ptr;
 extern ENCODING *Parrot_utf16_encoding_ptr;
+extern ENCODING *Parrot_ucs2_encoding_ptr;
 extern ENCODING *Parrot_default_encoding_ptr;
 #endif
 

Modified: trunk/src/charset.c
==============================================================================
--- trunk/src/charset.c (original)
+++ trunk/src/charset.c Thu Nov 10 01:54:28 2005
@@ -18,6 +18,7 @@ These are parrot's generic charset handl
 #include "../encodings/fixed_8.h"
 #include "../encodings/utf8.h"
 #include "../encodings/utf16.h"
+#include "../encodings/ucs2.h"
 
 #include "../charset/ascii.h"
 #include "../charset/binary.h"
@@ -249,6 +250,7 @@ Parrot_charsets_encodings_init(Interp *i
      */
     Parrot_encoding_fixed_8_init(interpreter);
     Parrot_encoding_utf8_init(interpreter);
+    Parrot_encoding_ucs2_init(interpreter);
     Parrot_encoding_utf16_init(interpreter);
 
     Parrot_charset_ascii_init(interpreter);

Modified: trunk/src/encoding.c
==============================================================================
--- trunk/src/encoding.c        (original)
+++ trunk/src/encoding.c        Thu Nov 10 01:54:28 2005
@@ -18,6 +18,7 @@ These are parrot's generic encoding hand
 ENCODING *Parrot_default_encoding_ptr;
 ENCODING *Parrot_fixed_8_encoding_ptr;
 ENCODING *Parrot_utf8_encoding_ptr;
+ENCODING *Parrot_ucs2_encoding_ptr;
 ENCODING *Parrot_utf16_encoding_ptr;
 
 typedef struct {
@@ -204,6 +205,10 @@ Parrot_register_encoding(Interp *interpr
         Parrot_utf16_encoding_ptr = encoding;
         return register_encoding(interpreter, encodingname, encoding);
     }
+    if (!strcmp("ucs2", encodingname)) {
+        Parrot_ucs2_encoding_ptr = encoding;
+        return register_encoding(interpreter, encodingname, encoding);
+    }
     return 0;
 }
 

Reply via email to