Author: leo
Date: Wed Nov  9 08:41:12 2005
New Revision: 9857

Added:
   trunk/encodings/utf16.c   (contents, props changed)
   trunk/encodings/utf16.h   (contents, props changed)
Modified:
   trunk/MANIFEST
   trunk/charset/unicode.c
   trunk/include/parrot/encoding.h
   trunk/src/charset.c
   trunk/src/encoding.c
   trunk/t/op/string_cs.t
Log:
unicode improvements

* implement parts of utf16 encoding (conversion, iterator)
* implement unicode.downcase
* a simple test


Modified: trunk/MANIFEST
==============================================================================
--- trunk/MANIFEST      (original)
+++ trunk/MANIFEST      Wed Nov  9 08:41:12 2005
@@ -445,6 +445,8 @@ editor/filetype_parrot.vim              
 editor/indent_imc.vim                             [devel]
 encodings/fixed_8.c                               []
 encodings/fixed_8.h                               []
+encodings/utf16.c                                  []
+encodings/utf16.h                                  []
 encodings/utf8.c                                  []
 encodings/utf8.h                                  []
 examples/README                                   [main]doc

Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c     (original)
+++ trunk/charset/unicode.c     Wed Nov  9 08:41:12 2005
@@ -121,9 +121,31 @@ upcase(Interp *interpreter, STRING *sour
 }
 
 static void
-downcase(Interp *interpreter, STRING *source_string)
+downcase(Interp *interpreter, STRING *src)
 {
-    UNIMPL;
+    /*
+     * Lowercase src in place.  The string is first converted to UTF-16,
+     * then ICU's u_strToLower() rewrites the same buffer (source and
+     * destination are the same memory) using the default locale.
+     * Without ICU an E_LibraryNotLoadedError exception is raised.
+     */
+#if PARROT_HAS_ICU
+
+    /* ICU requires the error code to be preset to a non-failure value */
+    UErrorCode err = U_ZERO_ERROR;
+    int result_len;
+
+    Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+    /*
+     * u_strToLower(dest, destCapacity, src, srcLength, locale, &err):
+     * both capacity and length are counted in UChar (16-bit) units,
+     * so the byte counts kept in the STRING are halved.
+     */
+    result_len = u_strToLower(src->strstart, PObj_buflen(src) / 2,
+            src->strstart, src->bufused / 2,
+            NULL,       /* locale = default */
+            &err);
+    /* ICU warnings are non-zero too; only real failures are fatal */
+    assert(U_SUCCESS(err));
+    src->bufused = result_len * 2;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
 }
 
 static void
@@ -349,7 +371,7 @@ find_not_cclass(Interp *interpreter, PAR
             real_exception(interpreter, NULL, E_LibraryNotLoadedError,
                     "no ICU lib loaded");
 #endif
-        } 
+        }
         else {
             if (!(Parrot_iso_8859_1_typetable[codepoint] & flags)) {
                 return pos;

Added: trunk/encodings/utf16.c
==============================================================================
--- (empty file)
+++ trunk/encodings/utf16.c     Wed Nov  9 08:41:12 2005
@@ -0,0 +1,365 @@
+/*
+Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
+$Id$
+
+=head1 NAME
+
+encodings/utf16.c - UTF-16 encoding
+
+=head1 DESCRIPTION
+
+UTF-16 encoding with the help of the ICU library.
+
+=head2 Functions
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "parrot/unicode.h"
+
+#include "utf16.h"
+
+#if PARROT_HAS_ICU
+#include <unicode/utf16.h>
+#include <unicode/ustring.h>
+#endif
+
+#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl utf16")
+
+
+static void iter_init(Interp *, String *src, String_iter *iter);
+/*
+ * to_encoding: convert src in place to UTF-16 (unicode charset).
+ *
+ * A string that is already UTF-16 is left untouched.  Otherwise the
+ * bytes of src are handed to ICU's u_strFromUTF8(), i.e. they are
+ * treated as UTF-8.  The conversion is done into a scratch buffer
+ * first, so the source bytes are still intact while ICU reads them,
+ * and the string is only relabelled as UTF-16 once its buffer really
+ * holds UTF-16 data.  src->strlen (codepoints) is not changed.
+ *
+ * Without ICU a non-empty string raises E_LibraryNotLoadedError
+ * before anything is modified.
+ */
+static void
+to_encoding(Interp *interpreter, STRING *src)
+{
+#if PARROT_HAS_ICU
+    UErrorCode err = U_ZERO_ERROR;
+    int dest_len = 0;
+    UChar *tmp;
+#endif
+
+    if (src->encoding == Parrot_utf16_encoding_ptr)
+        return;
+    /*
+     * TODO adapt string creation functions
+     */
+    if (!src->strlen) {
+        /* nothing to convert, only relabel the empty string */
+        Parrot_reallocate_string(interpreter, src, 0);
+        src->charset  = Parrot_unicode_charset_ptr;
+        src->encoding = Parrot_utf16_encoding_ptr;
+        return;
+    }
+#if PARROT_HAS_ICU
+    /*
+     * Every UTF-8 byte yields at most one UTF-16 unit, so bufused
+     * UChars is a safe upper bound - also for surrogate pairs.
+     */
+    tmp = mem_sys_allocate(2 * src->bufused);
+    u_strFromUTF8(tmp, src->bufused,
+            &dest_len, src->strstart, src->bufused, &err);
+    /* ICU warnings are non-zero too; only real failures are fatal */
+    assert(U_SUCCESS(err));       /* TODO raise an exception instead */
+
+    Parrot_reallocate_string(interpreter, src, dest_len * 2);
+    memcpy(src->strstart, tmp, dest_len * 2);
+    mem_sys_free(tmp);
+
+    src->bufused  = dest_len * 2;
+    src->charset  = Parrot_unicode_charset_ptr;
+    src->encoding = Parrot_utf16_encoding_ptr;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+}
+
+/*
+ * copy_to_encoding: return a new UTF-16 (unicode charset) string with
+ * the contents of src; src itself is not modified.
+ *
+ * A UTF-16 source is simply duplicated with string_copy().  Otherwise
+ * the bytes of src are handed to ICU's u_strFromUTF8(), i.e. they are
+ * treated as UTF-8.  Without ICU a non-empty source raises
+ * E_LibraryNotLoadedError.
+ */
+static STRING *
+copy_to_encoding(Interp *interpreter, STRING *src)
+{
+    STRING *dest;
+#if PARROT_HAS_ICU
+    UErrorCode err = U_ZERO_ERROR;
+    int dest_len = 0;
+#endif
+
+    if (src->encoding == Parrot_utf16_encoding_ptr)
+        return string_copy(interpreter, src);
+
+    /*
+     * TODO adapt string creation functions
+     *
+     * Every UTF-8 byte yields at most one UTF-16 unit, so 2 * bufused
+     * bytes always suffice - also for surrogate pairs.
+     */
+    dest = new_string_header(interpreter, 0);
+    Parrot_allocate_string(interpreter, dest,
+            src->strlen ? 2 * src->bufused : 0);
+    dest->charset  = Parrot_unicode_charset_ptr;
+    dest->encoding = Parrot_utf16_encoding_ptr;
+    dest->strlen   = src->strlen;
+    dest->bufused  = 0;
+    if (!src->strlen)
+        return dest;
+#if PARROT_HAS_ICU
+    /* the destination capacity is counted in UChars, not in bytes */
+    u_strFromUTF8(dest->strstart, PObj_buflen(dest) / 2,
+            &dest_len, src->strstart, src->bufused, &err);
+    /* ICU warnings are non-zero too; only real failures are fatal */
+    assert(U_SUCCESS(err));       /* TODO raise an exception instead */
+    dest->bufused = dest_len * 2;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+
+    return dest;
+}
+
+/*
+ * get_codepoint: not implemented for UTF-16 yet.  Raises the UNIMPL
+ * internal exception; the return value 0 is only a placeholder.
+ */
+static UINTVAL
+get_codepoint(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+    UNIMPL;
+    return 0;
+}
+
+/*
+ * set_codepoint: not implemented for UTF-16 yet.  Raises the UNIMPL
+ * internal exception and stores nothing.
+ */
+static void
+set_codepoint(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL codepoint)
+{
+    UNIMPL;
+}
+
+/*
+ * get_byte: return the raw byte at byte index offset of src.
+ * An offset at or beyond src->bufused reads as 0; no exception is
+ * raised for it.
+ */
+static UINTVAL
+get_byte(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+    const unsigned char *data = src->strstart;
+
+    return (offset < src->bufused) ? data[offset] : 0;
+}
+
+/*
+ * set_byte: store byte at byte index offset of src.
+ * An offset at or beyond src->bufused raises an internal exception.
+ */
+static void
+set_byte(Interp *interpreter, const STRING *src,
+       UINTVAL offset, UINTVAL byte)
+{
+    if (offset >= src->bufused) {
+       internal_exception(0, "set_byte past the end of the buffer");
+    }
+    ((unsigned char *)src->strstart)[offset] = byte;
+}
+
+/*
+ * get_codepoints: return a COW reference to src that covers count
+ * codepoints starting at codepoint index offset.  The byte range is
+ * found by positioning a string iterator at both ends of the slice.
+ * No bounds checking is done here.
+ */
+static STRING *
+get_codepoints(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count)
+{
+    String_iter it;
+    UINTVAL first_byte;
+    STRING *slice = Parrot_make_COW_reference(interpreter, src);
+
+    slice->encoding = src->encoding;
+    slice->charset  = src->charset;
+
+    /* byte position of the first codepoint of the slice */
+    iter_init(interpreter, src, &it);
+    it.set_position(interpreter, &it, offset);
+    first_byte = it.bytepos;
+    slice->strstart = (char *)slice->strstart + first_byte;
+
+    /* byte position just behind the last codepoint of the slice */
+    it.set_position(interpreter, &it, offset + count);
+    slice->bufused = it.bytepos - first_byte;
+
+    slice->strlen  = count;
+    slice->hashval = 0;
+    return slice;
+}
+
+/*
+ * get_bytes: return a COW reference to src that covers count bytes
+ * starting at byte index offset.  No bounds checking is done here.
+ *
+ * NOTE(review): strlen is set to the byte count, which is not the
+ * codepoint count for UTF-16 data - see the XXX below; confirm what
+ * callers expect.
+ */
+static STRING *
+get_bytes(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count)
+{
+    STRING *slice = Parrot_make_COW_reference(interpreter, src);
+
+    slice->encoding = src->encoding;    /* XXX */
+    slice->charset  = src->charset;
+
+    slice->strstart = (char *)slice->strstart + offset;
+    slice->bufused  = count;
+    slice->strlen   = count;
+    slice->hashval  = 0;
+
+    return slice;
+}
+
+
+/*
+ * get_codepoints_inplace: not implemented for UTF-16 yet.  Raises the
+ * UNIMPL internal exception; the NULL return is only a placeholder.
+ */
+static STRING *
+get_codepoints_inplace(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *dest_string)
+{
+    UNIMPL;
+    return NULL;
+}
+
+/*
+ * get_bytes_inplace: not implemented for UTF-16 yet.  Raises the
+ * UNIMPL internal exception; the NULL return is only a placeholder.
+ */
+static STRING *
+get_bytes_inplace(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *return_string)
+{
+    UNIMPL;
+    return NULL;
+}
+
+/*
+ * set_codepoints: not implemented for UTF-16 yet.  Raises the UNIMPL
+ * internal exception and changes nothing.
+ */
+static void
+set_codepoints(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *new_codepoints)
+{
+    UNIMPL;
+}
+
+/*
+ * set_bytes: not implemented for UTF-16 yet.  Raises the UNIMPL
+ * internal exception and changes nothing.
+ */
+static void
+set_bytes(Interp *interpreter, STRING *src,
+       UINTVAL offset, UINTVAL count, STRING *new_bytes)
+{
+    UNIMPL;
+}
+
+/*
+ * become_encoding: intended to unconditionally put the string into
+ * this encoding, if that is valid.  Not implemented for UTF-16 yet:
+ * raises the UNIMPL internal exception.
+ */
+static void
+become_encoding(Interp *interpreter, STRING *src)
+{
+    UNIMPL;
+}
+
+
+/*
+ * codepoints: count the codepoints in src.
+ *
+ * This is used to initially calculate src->strlen, so strlen cannot be
+ * trusted here: the whole buffer (src->bufused bytes) is walked with a
+ * string iterator and the iterator's final character position is
+ * returned.
+ */
+static UINTVAL
+codepoints(Interp *interpreter, STRING *src)
+{
+    String_iter it;
+
+    for (iter_init(interpreter, src, &it); it.bytepos < src->bufused; )
+        it.get_and_advance(interpreter, &it);
+
+    return it.charpos;
+}
+
+/*
+ * bytes: return the number of buffer bytes in use by src
+ * (src->bufused).
+ */
+static UINTVAL
+bytes(Interp *interpreter, STRING *src)
+{
+    const UINTVAL used = src->bufused;
+
+    return used;
+}
+
+#if PARROT_HAS_ICU
+/*
+ * utf16_decode_and_advance: iterator callback that reads the codepoint
+ * at the iterator's current position with ICU's U16_NEXT_UNSAFE and
+ * moves the iterator behind it.  charpos grows by one, bytepos by the
+ * bytes consumed.  The "unsafe" ICU macro does no bounds or
+ * well-formedness checking.
+ */
+static UINTVAL
+utf16_decode_and_advance(Interp *interpreter, String_iter *i)
+{
+    UChar *units = (UChar*) i->str->strstart;
+    UINTVAL unit_idx = i->bytepos / 2;      /* bytes -> UChar index */
+    UINTVAL cp;
+
+    U16_NEXT_UNSAFE(units, unit_idx, cp);
+    i->charpos++;
+    i->bytepos = unit_idx * 2;              /* UChar index -> bytes */
+    return cp;
+}
+
+/*
+ * utf16_encode_and_advance: iterator callback that writes codepoint c
+ * at the iterator's current position with ICU's U16_APPEND_UNSAFE and
+ * moves the iterator behind it.  charpos grows by one, bytepos by the
+ * bytes written.  The "unsafe" ICU macro does no bounds checking.
+ */
+static void
+utf16_encode_and_advance(Interp *interpreter, String_iter *i, UINTVAL c)
+{
+    UChar *units = (UChar*) i->str->strstart;
+    UINTVAL unit_idx = i->bytepos / 2;      /* bytes -> UChar index */
+
+    U16_APPEND_UNSAFE(units, unit_idx, c);
+    i->charpos++;
+    i->bytepos = unit_idx * 2;              /* UChar index -> bytes */
+}
+
+/*
+ * utf16_set_position: iterator callback that positions the iterator at
+ * codepoint index n.  The scan always restarts at the beginning of the
+ * string and skips n codepoints with ICU's U16_FWD_N_UNSAFE, which
+ * does no bounds checking.
+ */
+static void
+utf16_set_position(Interp *interpreter, String_iter *i, UINTVAL n)
+{
+    UChar *units = (UChar*) i->str->strstart;
+    UINTVAL unit_idx = 0;
+
+    U16_FWD_N_UNSAFE(units, unit_idx, n);
+    i->charpos = n;
+    i->bytepos = unit_idx * 2;              /* UChar index -> bytes */
+}
+
+#endif
+/*
+ * iter_init: set up iter to walk src from its start.  Both positions
+ * are reset to 0 and, with ICU, the UTF-16 callbacks are installed.
+ * Without ICU an E_LibraryNotLoadedError exception is raised after the
+ * positions have been reset.
+ */
+static void
+iter_init(Interp *interpreter, String *src, String_iter *iter)
+{
+    iter->str     = src;
+    iter->charpos = 0;
+    iter->bytepos = 0;
+#if PARROT_HAS_ICU
+    iter->get_and_advance = utf16_decode_and_advance;
+    iter->set_and_advance = utf16_encode_and_advance;
+    iter->set_position    = utf16_set_position;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+}
+
+/*
+ * Parrot_encoding_utf16_init: create the "utf16" encoding.
+ *
+ * A new ENCODING is obtained from Parrot_new_encoding(), filled from a
+ * static template holding this file's functions, registered under the
+ * name "utf16" and returned.
+ */
+ENCODING *
+Parrot_encoding_utf16_init(Interp *interpreter)
+{
+    ENCODING *return_encoding = Parrot_new_encoding(interpreter);
+
+    static const ENCODING base_encoding = {
+        "utf16",
+        /*
+         * Max bytes per codepoint 0 .. 0x10ffff: codepoints above
+         * 0xffff need a surrogate pair, i.e. two 2-byte units.
+         */
+        4,
+        to_encoding,
+        copy_to_encoding,
+        get_codepoint,
+        set_codepoint,
+        get_byte,
+        set_byte,
+        get_codepoints,
+        get_codepoints_inplace,
+        get_bytes,
+        get_bytes_inplace,
+        set_codepoints,
+        set_bytes,
+        become_encoding,
+        codepoints,
+        bytes,
+        iter_init
+    };
+    memcpy(return_encoding, &base_encoding, sizeof(ENCODING));
+    Parrot_register_encoding(interpreter, "utf16", return_encoding);
+    return return_encoding;
+}
+
+/*
+
+=back
+
+=head1 SEE ALSO
+
+F<encodings/fixed_8.c>,
+F<encodings/utf8.c>,
+F<src/string.c>,
+F<include/parrot/string.h>,
+F<docs/string.pod>.
+
+=cut
+
+*/
+
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/

Added: trunk/encodings/utf16.h
==============================================================================
--- (empty file)
+++ trunk/encodings/utf16.h     Wed Nov  9 08:41:12 2005
@@ -0,0 +1,27 @@
+/* utf16.h
+ *  Copyright: 2004 The Perl Foundation.  All Rights Reserved.
+ *  CVS Info
+ *     $Id$
+ *  Overview:
+ *     This is the header for the utf16 variable-width encoding.
+ *  Data Structure and Algorithms:
+ *  History:
+ *  Notes:
+ *  References:
+ */
+
+#if !defined(PARROT_ENCODING_UTF16_H_GUARD)
+#define PARROT_ENCODING_UTF16_H_GUARD
+
+ENCODING *Parrot_encoding_utf16_init(Interp *);
+
+#endif /* PARROT_ENCODING_UTF16_H_GUARD */
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/

Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h     (original)
+++ trunk/include/parrot/encoding.h     Wed Nov  9 08:41:12 2005
@@ -64,6 +64,7 @@ typedef struct _encoding ENCODING;
 #if !defined PARROT_NO_EXTERN_ENCODING_PTRS
 extern ENCODING *Parrot_fixed_8_encoding_ptr;
 extern ENCODING *Parrot_utf8_encoding_ptr;
+extern ENCODING *Parrot_utf16_encoding_ptr;
 extern ENCODING *Parrot_default_encoding_ptr;
 #endif
 

Modified: trunk/src/charset.c
==============================================================================
--- trunk/src/charset.c (original)
+++ trunk/src/charset.c Wed Nov  9 08:41:12 2005
@@ -17,6 +17,7 @@ These are parrot's generic charset handl
 
 #include "../encodings/fixed_8.h"
 #include "../encodings/utf8.h"
+#include "../encodings/utf16.h"
 
 #include "../charset/ascii.h"
 #include "../charset/binary.h"
@@ -246,6 +247,7 @@ Parrot_charsets_encodings_init(Interp *i
      */
     Parrot_encoding_fixed_8_init(interpreter);
     Parrot_encoding_utf8_init(interpreter);
+    Parrot_encoding_utf16_init(interpreter);
 
     Parrot_charset_ascii_init(interpreter);
     Parrot_charset_iso_8859_1_init(interpreter);

Modified: trunk/src/encoding.c
==============================================================================
--- trunk/src/encoding.c        (original)
+++ trunk/src/encoding.c        Wed Nov  9 08:41:12 2005
@@ -18,6 +18,7 @@ These are parrot's generic encoding hand
 ENCODING *Parrot_default_encoding_ptr;
 ENCODING *Parrot_fixed_8_encoding_ptr;
 ENCODING *Parrot_utf8_encoding_ptr;
+ENCODING *Parrot_utf16_encoding_ptr;
 
 /* Yep, this needs to be a char * parameter -- it's tough to load in
    encodings and such for strings if we can't be sure we've got enough
@@ -38,6 +39,9 @@ Parrot_find_encoding(Interp *interpreter
     if (!strcmp("utf8", encodingname)) {
         return Parrot_utf8_encoding_ptr;
     }
+    if (!strcmp("utf16", encodingname)) {
+        return Parrot_utf16_encoding_ptr;
+    }
     return NULL;
 }
 
@@ -64,6 +68,10 @@ Parrot_register_encoding(Interp *interpr
         Parrot_utf8_encoding_ptr = encoding;
         return 1;
     }
+    if (!strcmp("utf16", encodingname)) {
+        Parrot_utf16_encoding_ptr = encoding;
+        return 1;
+    }
     return 0;
 }
 

Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t      (original)
+++ trunk/t/op/string_cs.t      Wed Nov  9 08:41:12 2005
@@ -16,7 +16,7 @@ Tests charset support.
 
 =cut
 
-use Parrot::Test tests => 31;
+use Parrot::Test tests => 32;
 use Test::More;
 
 output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -504,3 +504,17 @@ CODE
 abcdefg
 abcdefg
 OUTPUT
+
+# The O-umlaut is written as the escape \xd6 so that the test does not
+# depend on the encoding this file is stored or transmitted in.
+output_is( <<'CODE', <<"OUTPUT", "unicode downcase");
+    set S0, iso-8859-1:"T\xd6TSCH"
+    find_charset I0, "unicode"
+    trans_charset S1, S0, I0
+    downcase S1
+    getstdout P0          # need to convert back to utf8
+    push P0, "utf8"       # push utf8 output layer
+    print S1
+    print "\n"
+    end
+CODE
+t\xc3\xb6tsch
+OUTPUT

Reply via email to