Author: leo
Date: Thu Nov 10 13:00:37 2005
New Revision: 9890
Modified:
trunk/charset/ascii.c
trunk/charset/binary.c
trunk/charset/iso-8859-1.c
trunk/charset/unicode.c
trunk/encodings/utf16.c
trunk/include/parrot/charset.h
trunk/src/string.c
Log:
charsets - again
* removed 3 unneeded additional conversion functions;
  removed the related macros and vtable hooks
* simplified to_charset for all charsets
Modified: trunk/charset/ascii.c
==============================================================================
--- trunk/charset/ascii.c (original)
+++ trunk/charset/ascii.c Thu Nov 10 13:00:37 2005
@@ -80,38 +80,34 @@ ascii_get_graphemes_inplace(Interp *inte
offset, count, dest_string);
}
-
static STRING *
-from_charset(Interp *interpreter, STRING *src, STRING *dest)
+to_ascii(Interp *interpreter, STRING *src, STRING *dest)
{
- UINTVAL offs, c;
String_iter iter;
+ UINTVAL c, len, offs;
+ unsigned char *p;
+ len = src->strlen;
if (dest) {
- Parrot_reallocate_string(interpreter, dest, src->strlen);
- dest->bufused = src->strlen;
- dest->strlen = src->strlen;
+ Parrot_reallocate_string(interpreter, dest, len);
+ }
+ else {
+ /* the string can't grow - replace inplace */
+ dest = src;
}
+ p = dest->strstart;
ENCODING_ITER_INIT(interpreter, src, &iter);
- for (offs = 0; offs < src->strlen; ++offs) {
+ for (offs = 0; offs < len; ++offs) {
c = iter.get_and_advance(interpreter, &iter);
- if (c >= 0x80) {
- EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
- }
- if (dest)
- ENCODING_SET_BYTE(interpreter, dest, offs, c);
- }
- if (dest)
- return dest;
- src->charset = Parrot_ascii_charset_ptr;
- return src;
-}
-
-static STRING *
-from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- internal_exception(UNIMPLEMENTED, "Can't do this yet");
- return NULL;
+ if (c >= 128)
+ real_exception(interpreter, NULL, LOSSY_CONVERSION,
+ "can't convert unicode string to ascii");
+ *p++ = (unsigned char)c;
+ }
+ dest->bufused = dest->strlen = len;
+ dest->charset = Parrot_ascii_charset_ptr;
+ dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, dest);
+ return dest;
}
static STRING *
@@ -131,19 +127,16 @@ to_unicode(Interp *interpreter, STRING *
}
static STRING *
-to_charset(Interp *interpreter, STRING *src,
- CHARSET *new_charset, STRING *dest)
+to_charset(Interp *interpreter, STRING *src, STRING *dest)
{
charset_converter_t conversion_func;
if ((conversion_func = Parrot_find_charset_converter(interpreter,
- src->charset, new_charset))) {
+ src->charset, Parrot_ascii_charset_ptr))) {
return conversion_func(interpreter, src, dest);
}
else {
- STRING *res = to_unicode(interpreter, src, dest);
- return new_charset->from_charset(interpreter, res, dest);
-
+ return to_ascii(interpreter, src, dest);
}
}
@@ -453,9 +446,6 @@ Parrot_charset_ascii_init(Interp *interp
ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- to_unicode,
- from_charset,
- from_unicode,
compose,
decompose,
upcase,
Modified: trunk/charset/binary.c
==============================================================================
--- trunk/charset/binary.c (original)
+++ trunk/charset/binary.c Thu Nov 10 13:00:37 2005
@@ -34,33 +34,17 @@ set_graphemes(Interp *interpreter, STRIN
}
static STRING*
-to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING *dest)
+to_charset(Interp *interpreter, STRING *src, STRING *dest)
{
+ charset_converter_t conversion_func;
+ if ((conversion_func = Parrot_find_charset_converter(interpreter,
+ src->charset, Parrot_binary_charset_ptr))) {
+ return conversion_func(interpreter, src, dest);
+ }
internal_exception(UNIMPLEMENTED, "to_charset for binary not implemented");
return NULL;
}
-static STRING*
-to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- internal_exception(UNIMPLEMENTED, "to_unicode for binary not implemented");
- return NULL;
-}
-
-static STRING*
-from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- internal_exception(UNIMPLEMENTED, "Can't do this yet");
- return NULL;
-}
-
-static STRING *
-from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- internal_exception(UNIMPLEMENTED, "Can't do this yet");
- return NULL;
-}
-
/* A noop. can't compose binary */
static void
compose(Interp *interpreter, STRING *source_string)
@@ -174,9 +158,6 @@ Parrot_charset_binary_init(Interp *inter
ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- to_unicode,
- from_charset,
- from_unicode,
compose,
decompose,
upcase,
Modified: trunk/charset/iso-8859-1.c
==============================================================================
--- trunk/charset/iso-8859-1.c (original)
+++ trunk/charset/iso-8859-1.c Thu Nov 10 13:00:37 2005
@@ -40,7 +40,7 @@ set_graphemes(Interp *interpreter, STRIN
}
static STRING *
-from_charset(Interp *interpreter, STRING *src, STRING *dest)
+to_latin1(Interp *interpreter, STRING *src, STRING *dest)
{
UINTVAL offs, c;
String_iter iter;
@@ -49,27 +49,21 @@ from_charset(Interp *interpreter, STRING
Parrot_reallocate_string(interpreter, dest, src->strlen);
dest->bufused = src->strlen;
dest->strlen = src->strlen;
+ dest->charset = Parrot_iso_8859_1_charset_ptr;
+ dest->encoding = Parrot_fixed_8_encoding_ptr;
}
+ else
+ internal_exception(UNIMPLEMENTED,
+ "to_charset inplace for latin1 not implemented");
ENCODING_ITER_INIT(interpreter, src, &iter);
for (offs = 0; offs < src->strlen; ++offs) {
c = iter.get_and_advance(interpreter, &iter);
if (c >= 0x100) {
EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
}
- if (dest)
- ENCODING_SET_BYTE(interpreter, dest, offs, c);
+ ENCODING_SET_BYTE(interpreter, dest, offs, c);
}
- if (dest)
- return dest;
- src->charset = Parrot_ascii_charset_ptr;
- return src;
-}
-
-static STRING *
-from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- internal_exception(UNIMPLEMENTED, "Can't do this yet");
- return NULL;
+ return dest;
}
static STRING *
@@ -106,19 +100,16 @@ to_unicode(Interp *interpreter, STRING *
}
static STRING *
-to_charset(Interp *interpreter, STRING *src,
- CHARSET *new_charset, STRING *dest)
+to_charset(Interp *interpreter, STRING *src, STRING *dest)
{
charset_converter_t conversion_func;
if ((conversion_func = Parrot_find_charset_converter(interpreter,
- src->charset, new_charset))) {
+ src->charset, Parrot_iso_8859_1_charset_ptr))) {
return conversion_func(interpreter, src, dest);
}
else {
- STRING *res = to_unicode(interpreter, src, dest);
- return new_charset->from_charset(interpreter, res, dest);
-
+ return to_latin1(interpreter, src, dest);
}
}
@@ -336,9 +327,6 @@ Parrot_charset_iso_8859_1_init(Interp *i
ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- to_unicode,
- from_charset,
- from_unicode,
compose,
decompose,
upcase,
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Thu Nov 10 13:00:37 2005
@@ -58,49 +58,44 @@ get_graphemes_inplace(Interp *interprete
}
static STRING*
-to_charset(Interp *interpreter, STRING *src,
- CHARSET *new_charset, STRING *dest)
+to_charset(Interp *interpreter, STRING *src, STRING *dest)
{
charset_converter_t conversion_func;
+ String_iter iter;
+ UINTVAL c, len, offs;
if ((conversion_func = Parrot_find_charset_converter(interpreter,
- src->charset, new_charset))) {
+ src->charset, Parrot_unicode_charset_ptr))) {
return conversion_func(interpreter, src, dest);
}
- else {
- return new_charset->from_charset(interpreter, src, dest);
-
- }
-}
-
-static STRING*
-to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- UNIMPL;
- return NULL;
-}
-
-static STRING*
-from_charset(Interp *interpreter, STRING *src, STRING *dest)
-{
- if (src->charset == Parrot_unicode_charset_ptr) {
- if (!dest) {
- /* inplace ok */
- return src;
+ len = src->strlen;
+ if (dest) {
+ Parrot_reallocate_string(interpreter, dest, len);
+ dest->charset = Parrot_unicode_charset_ptr;
+ dest->encoding = CHARSET_GET_PREFERRED_ENCODING(interpreter, dest);
+ ENCODING_ITER_INIT(interpreter, dest, &iter);
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = ENCODING_GET_CODEPOINT(interpreter, src, offs);
+ if (iter.bytepos >= PObj_buflen(dest) - 4) {
+ UINTVAL need = (UINTVAL)( (src->strlen - offs) * 1.5 );
+ if (need < 16)
+ need = 16;
+ Parrot_reallocate_string(interpreter, dest,
+ PObj_buflen(dest) + need);
+ }
+ iter.set_and_advance(interpreter, &iter, c);
}
- Parrot_reuse_COW_reference(interpreter, src, dest);
+ dest->bufused = iter.bytepos;
+ dest->strlen = iter.charpos;
return dest;
}
- UNIMPL;
+ else {
+ internal_exception(UNIMPLEMENTED,
+ "to_charset inplace for unicode not implemented");
+ }
return NULL;
}
-static STRING *
-from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
-{
- UNIMPL;
- return NULL;
-}
static void
compose(Interp *interpreter, STRING *source_string)
@@ -434,9 +429,6 @@ Parrot_charset_unicode_init(Interp *inte
get_graphemes_inplace,
set_graphemes,
to_charset,
- to_unicode,
- from_charset,
- from_unicode,
compose,
decompose,
upcase,
Modified: trunk/encodings/utf16.c
==============================================================================
--- trunk/encodings/utf16.c (original)
+++ trunk/encodings/utf16.c Thu Nov 10 13:00:37 2005
@@ -204,7 +204,7 @@ set_byte(Interp *interpreter, const STRI
internal_exception(0, "set_byte past the end of the buffer");
}
contents = src->strstart;
- contents[offset] = byte;
+ contents[offset] = (unsigned char)byte;
}
static STRING *
Modified: trunk/include/parrot/charset.h
==============================================================================
--- trunk/include/parrot/charset.h (original)
+++ trunk/include/parrot/charset.h Thu Nov 10 13:00:37 2005
@@ -38,9 +38,6 @@ typedef STRING *(*charset_get_graphemes_
typedef void (*charset_set_graphemes_t)(Interp *, STRING *source_string,
UINTVAL offset, UINTVAL replace_count, STRING *insert_string);
typedef STRING * (*charset_to_charset_t)(Interp *, STRING *source_string,
- CHARSET *new_charset, STRING *dest);
-typedef STRING * (*charset_to_unicode_t)(Interp *, STRING *src, STRING *dest);
-typedef STRING * (*charset_from_charset_t)(Interp *, STRING *source_string,
STRING *dest);
typedef STRING * (*charset_from_unicode_t)(Interp *, STRING *source_string,
STRING *dest);
@@ -104,9 +101,6 @@ struct _charset {
charset_get_graphemes_inplace_t get_graphemes_inplace;
charset_set_graphemes_t set_graphemes;
charset_to_charset_t to_charset;
- charset_to_unicode_t to_unicode;
- charset_from_charset_t from_charset;
- charset_from_unicode_t from_unicode;
charset_compose_t compose;
charset_decompose_t decompose;
charset_upcase_t upcase;
@@ -130,7 +124,6 @@ struct _charset {
#define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, offset, count)
#define CHARSET_GET_GRAPHEMES_INPLACE(interp, source, dest, offset, count) ((CHARSET *)source->charset)->get_graphemes(interpreter, source, dest, offset, count)
#define CHARSET_SET_GRAPHEMES(interp, source, offset, replace_count, insert) ((CHARSET *)source->charset)->set_graphemes(interpreter, source, offset, replace_count, insert)
-#define CHARSET_TO_CHARSET(interp, source, new_charset, dest) ((CHARSET *)source->charset)->to_charset(interpreter, source, new_charset, dest)
#define CHARSET_TO_UNICODE(interp, source, dest) ((CHARSET *)source->charset)->to_unicode(interpreter, source, dest)
#define CHARSET_COMPOSE(interp, source) ((CHARSET *)source->charset)->compose(interpreter, source)
#define CHARSET_DECOMPOSE(interp, source) ((CHARSET *)source->charset)->decompose(interpreter, source)
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Thu Nov 10 13:00:37 2005
@@ -2653,6 +2653,19 @@ Parrot_string_find_not_cclass(Interp *in
return CHARSET_FIND_NOT_CCLASS(interpreter, flags, s, offset, count);
}
+/*
+
+=item C< STRING*
+Parrot_string_trans_charset(Interp *interpreter, STRING *src,
+ INTVAL charset_nr, STRING *dest)>
+
+If C<dest> == NULL, convert C<src> to the given charset in place; otherwise
+return a copy of C<src>, converted to that charset, in C<dest>.
+
+=cut
+
+*/
+
STRING*
Parrot_string_trans_charset(Interp *interpreter, STRING *src,
INTVAL charset_nr, STRING *dest)
@@ -2685,7 +2698,7 @@ Parrot_string_trans_charset(Interp *inte
return src;
}
}
- return CHARSET_TO_CHARSET(interpreter, src, new_charset, dest);
+ return new_charset->to_charset(interpreter, src, dest);
}
STRING*