Author: leo
Date: Thu Nov 10 01:54:28 2005
New Revision: 9875
Added:
trunk/encodings/ucs2.c (contents, props changed)
trunk/encodings/ucs2.h (contents, props changed)
Modified:
trunk/MANIFEST
trunk/encodings/utf16.c
trunk/include/parrot/encoding.h
trunk/src/charset.c
trunk/src/encoding.c
Log:
unicode improvements - one more encoding
* implement ucs2 encoding
* downgrade utf16 to ucs2 if no surrogates
Modified: trunk/MANIFEST
==============================================================================
--- trunk/MANIFEST (original)
+++ trunk/MANIFEST Thu Nov 10 01:54:28 2005
@@ -445,8 +445,10 @@ editor/filetype_parrot.vim
editor/indent_imc.vim [devel]
encodings/fixed_8.c []
encodings/fixed_8.h []
-encodings/utf16.c []
-encodings/utf16.h []
+encodings/ucs2.c []
+encodings/ucs2.h []
+encodings/utf16.c []
+encodings/utf16.h []
encodings/utf8.c []
encodings/utf8.h []
examples/README [main]doc
Added: trunk/encodings/ucs2.c
==============================================================================
--- (empty file)
+++ trunk/encodings/ucs2.c Thu Nov 10 01:54:28 2005
@@ -0,0 +1,285 @@
+/*
+Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
+$Id$
+
+=head1 NAME
+
+encodings/ucs2.c - UCS-2 encoding
+
+=head1 DESCRIPTION
+
+UCS-2 encoding with the help of the ICU library.
+
+=head2 Functions
+
+=over 4
+
+=cut
+
+*/
+
+#include "parrot/parrot.h"
+#include "parrot/unicode.h"
+
+#include "ucs2.h"
+
+#if PARROT_HAS_ICU
+#include <unicode/ustring.h>
+#endif
+
+#define UNIMPL internal_exception(UNIMPLEMENTED, "unimpl ucs2")
+
+
+static void iter_init(Interp *, String *src, String_iter *iter);
+
+
+static void
+to_encoding(Interp *interpreter, STRING *src)
+{
+ if (src->encoding == Parrot_ucs2_encoding_ptr)
+ return;
+ Parrot_utf16_encoding_ptr->to_encoding(interpreter, src);
+ /*
+ * conversion to utf16 downgrades to ucs-2 if possible - check the result
+ */
+ if (src->encoding == Parrot_utf16_encoding_ptr) {
+ real_exception(interpreter, NULL, E_UnicodeError,
+ "can't convert string with surrogates to ucs2");
+ }
+}
+
+static STRING *
+copy_to_encoding(Interp *interpreter, STRING *src)
+{
+ STRING *dest;
+
+ if (src->encoding == Parrot_ucs2_encoding_ptr)
+ return string_copy(interpreter, src);
+ dest = Parrot_utf16_encoding_ptr->copy_to_encoding(interpreter, src);
+ /*
+ * conversion to utf16 downgrades to ucs-2 if possible - check the result
+ */
+ if (dest->encoding == Parrot_utf16_encoding_ptr) {
+ real_exception(interpreter, NULL, E_UnicodeError,
+ "can't convert string with surrogates to ucs2");
+ }
+ return dest;
+}
+
+static UINTVAL
+get_codepoint(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+ UChar *s = (UChar*) src->strstart;
+ return s[offset];
+}
+
+static void
+set_codepoint(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL codepoint)
+{
+ UChar *s = (UChar*) src->strstart;
+ s[offset] = codepoint;
+}
+
+static UINTVAL
+get_byte(Interp *interpreter, const STRING *src, UINTVAL offset)
+{
+ UNIMPL;
+ return 0;
+}
+
+static void
+set_byte(Interp *interpreter, const STRING *src,
+ UINTVAL offset, UINTVAL byte)
+{
+ UNIMPL;
+}
+
+static STRING *
+get_codepoints(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count)
+{
+ String_iter iter;
+ UINTVAL start;
+ STRING *return_string = Parrot_make_COW_reference(interpreter,
+ src);
+ return_string->encoding = src->encoding;
+ return_string->charset = src->charset;
+ iter_init(interpreter, src, &iter);
+ iter.set_position(interpreter, &iter, offset);
+ start = iter.bytepos;
+ return_string->strstart = (char *)return_string->strstart + start ;
+ iter.set_position(interpreter, &iter, offset + count);
+ return_string->bufused = iter.bytepos - start;
+ return_string->strlen = count;
+ return_string->hashval = 0;
+ return return_string;
+}
+
+static STRING *
+get_bytes(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count)
+{
+ UNIMPL;
+ return NULL;
+}
+
+
+static STRING *
+get_codepoints_inplace(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *dest_string)
+{
+
+ UNIMPL;
+ return NULL;
+}
+
+static STRING *
+get_bytes_inplace(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *return_string)
+{
+ UNIMPL;
+ return NULL;
+}
+
+static void
+set_codepoints(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *new_codepoints)
+{
+ UNIMPL;
+}
+
+static void
+set_bytes(Interp *interpreter, STRING *src,
+ UINTVAL offset, UINTVAL count, STRING *new_bytes)
+{
+ UNIMPL;
+}
+
+/* Unconditionally makes the string be in this encoding, if that's
+ valid */
+static void
+become_encoding(Interp *interpreter, STRING *src)
+{
+ UNIMPL;
+}
+
+
+static UINTVAL
+codepoints(Interp *interpreter, STRING *src)
+{
+ UNIMPL;
+ return 0;
+}
+
+static UINTVAL
+bytes(Interp *interpreter, STRING *src)
+{
+ return src->bufused;
+}
+
+#if PARROT_HAS_ICU
+static UINTVAL
+ucs2_decode_and_advance(Interp *interpreter, String_iter *i)
+{
+ UChar *s = (UChar*) i->str->strstart;
+ UINTVAL c, pos;
+ pos = i->bytepos / sizeof(UChar);
+ /* TODO: either make sure that we don't read past the end of the
+ * string, or use the SAFE iter versions
+ */
+ c = s[pos++];
+ i->charpos++;
+ i->bytepos = pos * sizeof(UChar);
+ return c;
+}
+
+static void
+ucs2_encode_and_advance(Interp *interpreter, String_iter *i, UINTVAL c)
+{
+ UChar *s = (UChar*) i->str->strstart;
+ UINTVAL pos;
+ pos = i->bytepos / sizeof(UChar);
+ s[pos++] = c;
+ i->charpos++;
+ i->bytepos = pos * sizeof(UChar);
+}
+
+static void
+ucs2_set_position(Interp *interpreter, String_iter *i, UINTVAL n)
+{
+ i->charpos = n;
+ i->bytepos = n * sizeof(UChar);
+}
+
+#endif
+static void
+iter_init(Interp *interpreter, String *src, String_iter *iter)
+{
+ iter->str = src;
+ iter->bytepos = iter->charpos = 0;
+#if PARROT_HAS_ICU
+ iter->get_and_advance = ucs2_decode_and_advance;
+ iter->set_and_advance = ucs2_encode_and_advance;
+ iter->set_position = ucs2_set_position;
+#else
+ real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+ "no ICU lib loaded");
+#endif
+}
+
+ENCODING *
+Parrot_encoding_ucs2_init(Interp *interpreter)
+{
+ ENCODING *return_encoding = Parrot_new_encoding(interpreter);
+
+ static const ENCODING base_encoding = {
+ "ucs2",
+ 2, /* Max bytes per codepoint 0 .. 0xffff */
+ to_encoding,
+ copy_to_encoding,
+ get_codepoint,
+ set_codepoint,
+ get_byte,
+ set_byte,
+ get_codepoints,
+ get_codepoints_inplace,
+ get_bytes,
+ get_bytes_inplace,
+ set_codepoints,
+ set_bytes,
+ become_encoding,
+ codepoints,
+ bytes,
+ iter_init
+ };
+ memcpy(return_encoding, &base_encoding, sizeof(ENCODING));
+ Parrot_register_encoding(interpreter, "ucs2", return_encoding);
+ return return_encoding;
+}
+
+/*
+
+=back
+
+=head1 SEE ALSO
+
+F<encodings/fixed_8.c>,
+F<encodings/utf8.c>,
+F<src/string.c>,
+F<include/parrot/string.h>,
+F<docs/string.pod>.
+
+=cut
+
+*/
+
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/
Added: trunk/encodings/ucs2.h
==============================================================================
--- (empty file)
+++ trunk/encodings/ucs2.h Thu Nov 10 01:54:28 2005
@@ -0,0 +1,27 @@
+/* ucs2.h
+ * Copyright: 2004 The Perl Foundation. All Rights Reserved.
+ * CVS Info
+ * $Id$
+ * Overview:
+ * This is the header for the ucs2 fixed-width encoding.
+ * Data Structure and Algorithms:
+ * History:
+ * Notes:
+ * References:
+ */
+
+#if !defined(PARROT_ENCODING_UCS2_H_GUARD)
+#define PARROT_ENCODING_UCS2_H_GUARD
+
+ENCODING *Parrot_encoding_ucs2_init(Interp *);
+
+#endif /* PARROT_ENCODING_UCS2_H_GUARD */
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+*/
Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c (original)
+++ trunk/encodings/utf16.c Thu Nov 10 01:54:28 2005
@@ -81,6 +81,10 @@ to_encoding(Interp *interpreter, STRING
Parrot_reallocate_string(interpreter, src, src->bufused);
memcpy(src->strstart, p, src->bufused);
mem_sys_free(p);
+
+ /* downgrade if possible */
+ if (dest_len == (int)src->strlen)
+ src->encoding = Parrot_ucs2_encoding_ptr;
#else
real_exception(interpreter, NULL, E_LibraryNotLoadedError,
"no ICU lib loaded");
@@ -124,7 +128,10 @@ copy_to_encoding(Interp *interpreter, ST
assert(U_SUCCESS(err));
}
dest->bufused = dest_len * sizeof(UChar);
-
+ /* downgrade if possible */
+ if (dest_len == (int)src->strlen)
+ src->encoding = Parrot_ucs2_encoding_ptr;
+
#else
real_exception(interpreter, NULL, E_LibraryNotLoadedError,
"no ICU lib loaded");
@@ -340,7 +347,7 @@ Parrot_encoding_utf16_init(Interp *inter
static const ENCODING base_encoding = {
"utf16",
- 2, /* Max bytes per codepoint 0 .. 0x10ffff */
+ 4, /* Max bytes per codepoint 0 .. 0x10ffff */
to_encoding,
copy_to_encoding,
get_codepoint,
Modified: trunk/include/parrot/encoding.h
==============================================================================
--- trunk/include/parrot/encoding.h (original)
+++ trunk/include/parrot/encoding.h Thu Nov 10 01:54:28 2005
@@ -65,6 +65,7 @@ typedef struct _encoding ENCODING;
extern ENCODING *Parrot_fixed_8_encoding_ptr;
extern ENCODING *Parrot_utf8_encoding_ptr;
extern ENCODING *Parrot_utf16_encoding_ptr;
+extern ENCODING *Parrot_ucs2_encoding_ptr;
extern ENCODING *Parrot_default_encoding_ptr;
#endif
Modified: trunk/src/charset.c
==============================================================================
--- trunk/src/charset.c (original)
+++ trunk/src/charset.c Thu Nov 10 01:54:28 2005
@@ -18,6 +18,7 @@ These are parrot's generic charset handl
#include "../encodings/fixed_8.h"
#include "../encodings/utf8.h"
#include "../encodings/utf16.h"
+#include "../encodings/ucs2.h"
#include "../charset/ascii.h"
#include "../charset/binary.h"
@@ -249,6 +250,7 @@ Parrot_charsets_encodings_init(Interp *i
*/
Parrot_encoding_fixed_8_init(interpreter);
Parrot_encoding_utf8_init(interpreter);
+ Parrot_encoding_ucs2_init(interpreter);
Parrot_encoding_utf16_init(interpreter);
Parrot_charset_ascii_init(interpreter);
Modified: trunk/src/encoding.c
==============================================================================
--- trunk/src/encoding.c (original)
+++ trunk/src/encoding.c Thu Nov 10 01:54:28 2005
@@ -18,6 +18,7 @@ These are parrot's generic encoding hand
ENCODING *Parrot_default_encoding_ptr;
ENCODING *Parrot_fixed_8_encoding_ptr;
ENCODING *Parrot_utf8_encoding_ptr;
+ENCODING *Parrot_ucs2_encoding_ptr;
ENCODING *Parrot_utf16_encoding_ptr;
typedef struct {
@@ -204,6 +205,10 @@ Parrot_register_encoding(Interp *interpr
Parrot_utf16_encoding_ptr = encoding;
return register_encoding(interpreter, encodingname, encoding);
}
+ if (!strcmp("ucs2", encodingname)) {
+ Parrot_ucs2_encoding_ptr = encoding;
+ return register_encoding(interpreter, encodingname, encoding);
+ }
return 0;
}