Author: leo
Date: Wed Nov 9 08:41:12 2005
New Revision: 9857
Added:
trunk/encodings/utf16.c (contents, props changed)
trunk/encodings/utf16.h (contents, props changed)
Modified:
trunk/MANIFEST
trunk/charset/unicode.c
trunk/include/parrot/encoding.h
trunk/src/charset.c
trunk/src/encoding.c
trunk/t/op/string_cs.t
Log:
unicode improvements
* implement parts of utf16 encoding (conversion, iterator)
* implement unicode.downcase
* a simple test
Modified: trunk/MANIFEST
==============================================================================
--- trunk/MANIFEST (original)
+++ trunk/MANIFEST Wed Nov 9 08:41:12 2005
@@ -445,6 +445,8 @@ editor/filetype_parrot.vim
editor/indent_imc.vim [devel]
encodings/fixed_8.c []
encodings/fixed_8.h []
+encodings/utf16.c []
+encodings/utf16.h []
encodings/utf8.c []
encodings/utf8.h []
examples/README [main]doc
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Wed Nov 9 08:41:12 2005
@@ -121,9 +121,31 @@ upcase(Interp *interpreter, STRING *sour
}
static void
-downcase(Interp *interpreter, STRING *source_string)
+downcase(Interp *interpreter, STRING *src)
{
- UNIMPL;
+    /*
+     * Lowercase src in place.
+     *
+     * src is first converted in place to UTF-16 through the utf16
+     * encoding's to_encoding entry, then ICU's u_strToLower rewrites
+     * the same buffer using the default locale.  Without ICU an
+     * E_LibraryNotLoadedError exception is raised instead.
+     *
+     * NOTE(review): src->strlen is left untouched although lowercasing
+     * can change the number of code points for some characters -
+     * confirm whether callers recompute it.
+     */
+#if PARROT_HAS_ICU
+    /* ICU functions do nothing when the status already holds a failure
+       code on entry, so it has to start out as U_ZERO_ERROR */
+    UErrorCode err = U_ZERO_ERROR;
+    int result_len;
+
+    Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+    /*
+     * u_strToLower counts both the capacity and the source length in
+     * UChars (16-bit units).  bufused / 2 is the unit count; strlen is
+     * the code point count and is smaller whenever the string holds
+     * surrogate pairs.
+     */
+    result_len = u_strToLower(src->strstart, PObj_buflen(src) / 2,
+            src->strstart, src->bufused / 2,
+            NULL,       /* locale = default */
+            &err);
+    /* warnings such as "result not NUL-terminated" are non-zero too;
+       only real failures are fatal here */
+    assert(U_SUCCESS(err));     /* TODO raise a proper exception */
+    src->bufused = result_len * 2;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
}
static void
@@ -349,7 +371,7 @@ find_not_cclass(Interp *interpreter, PAR
real_exception(interpreter, NULL, E_LibraryNotLoadedError,
"no ICU lib loaded");
#endif
- }
+ }
else {
if (!(Parrot_iso_8859_1_typetable[codepoint] & flags)) {
return pos;
Added: trunk/encodings/utf16.c
==============================================================================
--- (empty file)
+++ trunk/encodings/utf16.c Wed Nov 9 08:41:12 2005
@@ -0,0 +1,365 @@
+/*
+Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
+$Id$
+
+=head1 NAME
+
+encodings/utf16.c - UTF-16 encoding
+
+=head1 DESCRIPTION
+
+UTF-16 encoding with the help of the ICU library.
+
+=head2 Functions
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "parrot/unicode.h"
+
+#include "utf16.h"
+
+#if PARROT_HAS_ICU
+#include <unicode/utf16.h>
+#include <unicode/ustring.h>
+#endif
+
+/* Body used by the encoding entry points that are not written yet:
+   expands to an internal_exception() call with the UNIMPLEMENTED code. */
+#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl utf16")
+
+
+/* Forward declaration: iter_init is defined near the end of this file but
+   is already called by get_codepoints() and codepoints() before that. */
+static void iter_init(Interp *, String *src, String_iter *iter);
+/*
+ * to_encoding
+ *
+ * Converts src in place to UTF-16 and tags it with the unicode charset
+ * and the utf16 encoding.  A string that is already utf16 is left alone.
+ *
+ * The current contents are handed to ICU's u_strFromUTF8, i.e. they are
+ * read as UTF-8; the source encoding itself is not checked here.
+ * Without ICU an E_LibraryNotLoadedError exception is raised for any
+ * non-empty string, and the string keeps its old charset/encoding tags.
+ */
+static void
+to_encoding(Interp *interpreter, STRING *src)
+{
+#if PARROT_HAS_ICU
+    UErrorCode err;
+    int dest_len;
+    UChar *p;
+#endif
+
+    if (src->encoding == Parrot_utf16_encoding_ptr)
+        return;
+    /*
+     * TODO adapt string creation functions
+     *
+     * Reserve one UChar (2 bytes) per source byte.  A UTF-8 sequence of
+     * k bytes never produces more than k UTF-16 units, whereas one unit
+     * per code point is too small as soon as a code point above 0xFFFF
+     * needs a surrogate pair.
+     */
+    Parrot_reallocate_string(interpreter, src, 2 * src->bufused);
+    if (!src->strlen) {
+        /* nothing to convert, just retag the empty string */
+        src->charset = Parrot_unicode_charset_ptr;
+        src->encoding = Parrot_utf16_encoding_ptr;
+        return;
+    }
+#if PARROT_HAS_ICU
+    err = U_ZERO_ERROR;
+    /*
+     * XXX the conversion cannot run in place: decode into a scratch
+     * buffer of the same size and copy the result back afterwards.
+     */
+    p = mem_sys_allocate(PObj_buflen(src));
+    u_strFromUTF8(p, PObj_buflen(src) / 2,
+            &dest_len, src->strstart, src->bufused, &err);
+    /* ICU warnings (e.g. no room left for a trailing NUL) are non-zero
+       status values as well; only real failures are fatal */
+    assert(U_SUCCESS(err)); /* TODO raise a proper exception */
+    src->bufused = dest_len * 2;
+    memcpy(src->strstart, p, src->bufused);
+    mem_sys_free(p);
+    /* retag only after the buffer really holds UTF-16 data */
+    src->charset = Parrot_unicode_charset_ptr;
+    src->encoding = Parrot_utf16_encoding_ptr;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+}
+
+/*
+ * copy_to_encoding
+ *
+ * Returns a new string holding the contents of src converted to UTF-16;
+ * src itself is not modified.  A src that is already utf16 is simply
+ * duplicated with string_copy.
+ *
+ * As in to_encoding, the source bytes are handed to u_strFromUTF8 and
+ * are therefore read as UTF-8.  Without ICU an E_LibraryNotLoadedError
+ * exception is raised for any non-empty string.
+ */
+static STRING *
+copy_to_encoding(Interp *interpreter, STRING *src)
+{
+    STRING *dest;
+#if PARROT_HAS_ICU
+    UErrorCode err;
+    int dest_len;
+#endif
+
+    if (src->encoding == Parrot_utf16_encoding_ptr)
+        return string_copy(interpreter, src);
+
+    /*
+     * TODO adapt string creation functions
+     *
+     * One UChar (2 bytes) per source byte is always enough for UTF-8
+     * input, including code points that need a surrogate pair.
+     */
+    dest = new_string_header(interpreter, 0);
+    Parrot_allocate_string(interpreter, dest, 2 * src->bufused);
+    dest->charset = Parrot_unicode_charset_ptr;
+    dest->encoding = Parrot_utf16_encoding_ptr;
+    dest->strlen = src->strlen;
+    if (!src->strlen)
+        return dest;
+#if PARROT_HAS_ICU
+    err = U_ZERO_ERROR;
+    /* the capacity is counted in UChars, so derive it from the allocated
+       byte size rather than from the not-yet-set bufused */
+    u_strFromUTF8(dest->strstart, PObj_buflen(dest) / 2,
+            &dest_len, src->strstart, src->bufused, &err);
+    /* warnings are non-zero status values too; only failures are fatal */
+    assert(U_SUCCESS(err)); /* TODO raise a proper exception */
+    /* record how many bytes of the new buffer are in use */
+    dest->bufused = dest_len * 2;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+
+    return dest;
+}
+
+/*
+ * get_codepoint
+ *
+ * Not implemented yet: raises the UNIMPLEMENTED internal exception via
+ * UNIMPL.  The trailing return only matters if that call comes back.
+ */
+static UINTVAL
+get_codepoint(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+    UNIMPL;
+    return 0;
+}
+
+/*
+ * set_codepoint
+ *
+ * Not implemented yet: raises the UNIMPLEMENTED internal exception via
+ * UNIMPL.
+ */
+static void
+set_codepoint(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL codepoint)
+{
+    UNIMPL;
+}
+
+/*
+ * get_byte
+ *
+ * Returns the raw byte at byte index offset of src's buffer.  An offset
+ * at or beyond bufused does not raise an exception; 0 is returned.
+ */
+static UINTVAL
+get_byte(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+    unsigned char *raw = src->strstart;
+
+    if (offset < src->bufused)
+        return raw[offset];
+
+    /* out of range: quietly report a zero byte */
+    return 0;
+}
+
+/*
+ * set_byte
+ *
+ * Stores byte at byte index offset of src's buffer.  An offset at or
+ * beyond bufused is reported through internal_exception first.
+ */
+static void
+set_byte(Interp *interpreter, const STRING *src,
+        UINTVAL offset, UINTVAL byte)
+{
+    unsigned char *raw;
+
+    if (src->bufused <= offset) {
+        internal_exception(0, "set_byte past the end of the buffer");
+    }
+    raw = src->strstart;
+    raw[offset] = byte;
+}
+
+/*
+ * get_codepoints
+ *
+ * Returns a string header obtained from Parrot_make_COW_reference(src)
+ * that covers count code points starting at code point index offset.
+ * A string iterator over src translates both ends of the range into
+ * byte positions.
+ */
+static STRING *
+get_codepoints(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL count)
+{
+    String_iter it;
+    UINTVAL first_byte;
+    STRING *sub = Parrot_make_COW_reference(interpreter, src);
+
+    sub->charset = src->charset;
+    sub->encoding = src->encoding;
+    sub->strlen = count;
+    /* reset the hash value of the new reference */
+    sub->hashval = 0;
+
+    iter_init(interpreter, src, &it);
+
+    /* byte position of the first requested code point */
+    it.set_position(interpreter, &it, offset);
+    first_byte = it.bytepos;
+    sub->strstart = (char *)sub->strstart + first_byte ;
+
+    /* byte position just past the last requested code point */
+    it.set_position(interpreter, &it, offset + count);
+    sub->bufused = it.bytepos - first_byte;
+
+    return sub;
+}
+
+/*
+ * get_bytes
+ *
+ * Returns a string header obtained from Parrot_make_COW_reference(src)
+ * that covers count bytes starting at byte index offset.
+ *
+ * NOTE(review): strlen is set to the byte count, which is not the
+ * number of code points for UTF-16 data - confirm what callers expect.
+ */
+static STRING *
+get_bytes(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL count)
+{
+    STRING *slice = Parrot_make_COW_reference(interpreter, src);
+
+    slice->charset = src->charset;
+    slice->encoding = src->encoding;    /* XXX */
+    slice->hashval = 0;
+
+    slice->strstart = (char *)slice->strstart + offset ;
+    slice->bufused = count;
+    slice->strlen = count;
+
+    return slice;
+}
+
+
+/*
+ * get_codepoints_inplace
+ *
+ * Not implemented yet: raises the UNIMPLEMENTED internal exception via
+ * UNIMPL.  The NULL return only matters if that call comes back.
+ */
+static STRING *
+get_codepoints_inplace(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL count, STRING *dest_string)
+{
+    UNIMPL;
+    return NULL;
+}
+
+/*
+ * get_bytes_inplace
+ *
+ * Not implemented yet: raises the UNIMPLEMENTED internal exception via
+ * UNIMPL.  The NULL return only matters if that call comes back.
+ */
+static STRING *
+get_bytes_inplace(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL count, STRING *return_string)
+{
+    UNIMPL;
+    return NULL;
+}
+
+/*
+ * set_codepoints
+ *
+ * Not implemented yet: raises the UNIMPLEMENTED internal exception via
+ * UNIMPL.
+ */
+static void
+set_codepoints(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL count, STRING *new_codepoints)
+{
+    UNIMPL;
+}
+
+/*
+ * set_bytes
+ *
+ * Not implemented yet: raises the UNIMPLEMENTED internal exception via
+ * UNIMPL.
+ */
+static void
+set_bytes(Interp *interpreter, STRING *src,
+        UINTVAL offset, UINTVAL count, STRING *new_bytes)
+{
+    UNIMPL;
+}
+
+/*
+ * become_encoding
+ *
+ * Intended to make the string be in this encoding unconditionally, if
+ * that is valid.  Not implemented yet: raises the UNIMPLEMENTED
+ * internal exception via UNIMPL.
+ */
+static void
+become_encoding(Interp *interpreter, STRING *src)
+{
+    UNIMPL;
+}
+
+
+/*
+ * codepoints
+ *
+ * Counts the code points in src by iterating over the whole buffer.
+ * This is what initially computes src->strlen, so the stored length
+ * cannot be used as a shortcut here.
+ */
+static UINTVAL
+codepoints(Interp *interpreter, STRING *src)
+{
+    String_iter scan;
+
+    iter_init(interpreter, src, &scan);
+    for (;;) {
+        if (scan.bytepos >= src->bufused)
+            break;
+        /* the decoded value is irrelevant, only the position matters */
+        scan.get_and_advance(interpreter, &scan);
+    }
+    return scan.charpos;
+}
+
+/*
+ * bytes
+ *
+ * Returns the number of buffer bytes in use by src (its bufused field).
+ */
+static UINTVAL
+bytes(Interp *interpreter, STRING *src)
+{
+    const UINTVAL used = src->bufused;
+
+    return used;
+}
+
+#if PARROT_HAS_ICU
+/*
+ * utf16_decode_and_advance
+ *
+ * Iterator callback: decodes the code point at the iterator's current
+ * byte position, moves the iterator past it and returns the code point.
+ *
+ * The bounded U16_NEXT macro is used so that a lead surrogate in the
+ * last 16-bit unit of the buffer is returned as-is instead of reading
+ * a trail unit from beyond bufused.  The first unit is still read
+ * unconditionally, so callers must not call this at the end of the
+ * string (codepoints() checks bytepos against bufused beforehand).
+ */
+static UINTVAL
+utf16_decode_and_advance(Interp *interpreter, String_iter *i)
+{
+    UChar *s = (UChar*) i->str->strstart;
+    /* number of 16-bit units in use */
+    UINTVAL length = i->str->bufused / 2;
+    UINTVAL c, pos;
+
+    pos = i->bytepos / 2;
+    U16_NEXT(s, pos, length, c);
+    i->charpos++;
+    i->bytepos = pos * 2;
+    return c;
+}
+
+/*
+ * utf16_encode_and_advance
+ *
+ * Iterator callback: writes code point c as UTF-16 at the iterator's
+ * current byte position and moves the iterator past it.
+ * U16_APPEND_UNSAFE does no capacity check.
+ */
+static void
+utf16_encode_and_advance(Interp *interpreter, String_iter *i, UINTVAL c)
+{
+    UChar *units = (UChar*) i->str->strstart;
+    UINTVAL unit_idx = i->bytepos / 2;
+
+    U16_APPEND_UNSAFE(units, unit_idx, c);
+    /* unit index back to a byte offset */
+    i->bytepos = unit_idx * 2;
+    i->charpos++;
+}
+
+/*
+ * utf16_set_position
+ *
+ * Iterator callback: positions the iterator on code point index n by
+ * walking forward from the start of the string.
+ *
+ * The bounded U16_FWD_N macro stops at the end of the used buffer, so
+ * an n larger than the string no longer walks through memory beyond
+ * bufused; bytepos is then left at the end of the string.  charpos is
+ * set to n regardless, as before.
+ */
+static void
+utf16_set_position(Interp *interpreter, String_iter *i, UINTVAL n)
+{
+    UChar *s = (UChar*) i->str->strstart;
+    /* number of 16-bit units in use */
+    UINTVAL length = i->str->bufused / 2;
+    UINTVAL pos;
+
+    pos = 0;
+    U16_FWD_N(s, pos, length, n);
+    i->charpos = n;
+    i->bytepos = pos * 2;
+}
+
+#endif
+/*
+ * iter_init
+ *
+ * Prepares iter for walking src from its beginning and installs the
+ * UTF-16 decode, encode and set-position callbacks.  Without ICU there
+ * are no callbacks to install and an E_LibraryNotLoadedError exception
+ * is raised instead.
+ */
+static void
+iter_init(Interp *interpreter, String *src, String_iter *iter)
+{
+    iter->charpos = 0;
+    iter->bytepos = 0;
+    iter->str = src;
+#if PARROT_HAS_ICU
+    iter->set_position = utf16_set_position;
+    iter->get_and_advance = utf16_decode_and_advance;
+    iter->set_and_advance = utf16_encode_and_advance;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+#endif
+}
+
+/*
+ * Parrot_encoding_utf16_init
+ *
+ * Allocates a new ENCODING, fills it from the static table of UTF-16
+ * entry points defined in this file, registers it under the name
+ * "utf16" and returns it.
+ */
+ENCODING *
+Parrot_encoding_utf16_init(Interp *interpreter)
+{
+    ENCODING *return_encoding = Parrot_new_encoding(interpreter);
+
+    static const ENCODING base_encoding = {
+        "utf16",
+        /* Max bytes per codepoint 0 .. 0x10ffff: code points above
+           0xffff are stored as a surrogate pair of two 16-bit units */
+        4,
+        to_encoding,
+        copy_to_encoding,
+        get_codepoint,
+        set_codepoint,
+        get_byte,
+        set_byte,
+        get_codepoints,
+        get_codepoints_inplace,
+        get_bytes,
+        get_bytes_inplace,
+        set_codepoints,
+        set_bytes,
+        become_encoding,
+        codepoints,
+        bytes,
+        iter_init
+    };
+    memcpy(return_encoding, &base_encoding, sizeof(ENCODING));
+    Parrot_register_encoding(interpreter, "utf16", return_encoding);
+    return return_encoding;
+}
+
+/*
+
+=back
+
+=head1 SEE ALSO
+
+F<encodings/fixed_8.c>,
+F<encodings/utf8.c>,
+F<src/string.c>,
+F<include/parrot/string.h>,
+F<docs/string.pod>.
+
+=cut
+
+*/
+
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/
Added: trunk/encodings/utf16.h
==============================================================================
--- (empty file)
+++ trunk/encodings/utf16.h Wed Nov 9 08:41:12 2005
@@ -0,0 +1,27 @@
+/* utf16.h
+ * Copyright: 2004 The Perl Foundation. All Rights Reserved.
+ * CVS Info
+ * $Id$
+ * Overview:
+ * This is the header for the utf16 variable-width encoding.
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#if !defined(PARROT_ENCODING_UTF16_H_GUARD)
+#define PARROT_ENCODING_UTF16_H_GUARD
+
+/* Creates the "utf16" encoding, registers it with Parrot_register_encoding
+   and returns the new ENCODING (defined in encodings/utf16.c). */
+ENCODING *Parrot_encoding_utf16_init(Interp *);
+
+#endif /* PARROT_ENCODING_UTF16_H_GUARD */
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/
Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h (original)
+++ trunk/include/parrot/encoding.h Wed Nov 9 08:41:12 2005
@@ -64,6 +64,7 @@ typedef struct _encoding ENCODING;
#if !defined PARROT_NO_EXTERN_ENCODING_PTRS
extern ENCODING *Parrot_fixed_8_encoding_ptr;
extern ENCODING *Parrot_utf8_encoding_ptr;
+extern ENCODING *Parrot_utf16_encoding_ptr;
extern ENCODING *Parrot_default_encoding_ptr;
#endif
Modified: trunk/src/charset.c
==============================================================================
--- trunk/src/charset.c (original)
+++ trunk/src/charset.c Wed Nov 9 08:41:12 2005
@@ -17,6 +17,7 @@ These are parrot's generic charset handl
#include "../encodings/fixed_8.h"
#include "../encodings/utf8.h"
+#include "../encodings/utf16.h"
#include "../charset/ascii.h"
#include "../charset/binary.h"
@@ -246,6 +247,7 @@ Parrot_charsets_encodings_init(Interp *i
*/
Parrot_encoding_fixed_8_init(interpreter);
Parrot_encoding_utf8_init(interpreter);
+ Parrot_encoding_utf16_init(interpreter);
Parrot_charset_ascii_init(interpreter);
Parrot_charset_iso_8859_1_init(interpreter);
Modified: trunk/src/encoding.c
==============================================================================
--- trunk/src/encoding.c (original)
+++ trunk/src/encoding.c Wed Nov 9 08:41:12 2005
@@ -18,6 +18,7 @@ These are parrot's generic encoding hand
ENCODING *Parrot_default_encoding_ptr;
ENCODING *Parrot_fixed_8_encoding_ptr;
ENCODING *Parrot_utf8_encoding_ptr;
+ENCODING *Parrot_utf16_encoding_ptr;
/* Yep, this needs to be a char * parameter -- it's tough to load in
encodings and such for strings if we can't be sure we've got enough
@@ -38,6 +39,9 @@ Parrot_find_encoding(Interp *interpreter
if (!strcmp("utf8", encodingname)) {
return Parrot_utf8_encoding_ptr;
}
+ if (!strcmp("utf16", encodingname)) {
+ return Parrot_utf16_encoding_ptr;
+ }
return NULL;
}
@@ -64,6 +68,10 @@ Parrot_register_encoding(Interp *interpr
Parrot_utf8_encoding_ptr = encoding;
return 1;
}
+ if (!strcmp("utf16", encodingname)) {
+ Parrot_utf16_encoding_ptr = encoding;
+ return 1;
+ }
return 0;
}
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Wed Nov 9 08:41:12 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 31;
+use Parrot::Test tests => 32;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -504,3 +504,17 @@ CODE
abcdefg
abcdefg
OUTPUT
+
+output_is( <<'CODE', <<"OUTPUT", "unicode downcase");
+ set S0, iso-8859-1:"TÖTSCH"
+ find_charset I0, "unicode"
+ trans_charset S1, S0, I0
+ downcase S1
+ getstdout P0 # need to convert back to utf8
+ push P0, "utf8" # push utf8 output layer
+ print S1
+ print "\n"
+ end
+CODE
+t\xc3\xb6tsch
+OUTPUT