cvsuser 05/03/01 06:19:49
Modified: charset ascii.c ascii.h binary.c binary.h iso-8859-1.c
iso-8859-1.h
include/parrot charset.h exceptions.h string_funcs.h
ops ops.num string.ops
src charset.c string.c
t/op string_cs.t
Log:
Strings. Finally. 8 - charset conversion
* trans_charset opcodes
* Parrot_string_trans_charset() interface
* charset converter registration and lookup
* adapt converter function signature to take a dest STRING
* iso-8859-1 to ascii conversion
Revision Changes Path
1.13 +13 -18 parrot/charset/ascii.c
Index: ascii.c
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -r1.12 -r1.13
--- ascii.c 1 Mar 2005 11:06:26 -0000 1.12
+++ ascii.c 1 Mar 2005 14:19:45 -0000 1.13
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.12 2005/03/01 11:06:26 leo Exp $
+$Id: ascii.c,v 1.13 2005/03/01 14:19:45 leo Exp $
=head1 NAME
@@ -95,37 +95,33 @@
offset, count, dest_string);
}
-static void
-to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
+static STRING *
+to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING
*dest)
{
internal_exception(UNIMPLEMENTED, "to_charset for ascii not
implemented");
+ return NULL;
}
-static STRING *
-copy_to_charset(Interp *interpreter, STRING *source_string,
- CHARSET *new_charset)
-{
- STRING *return_string = NULL;
-
- return return_string;
-}
-static void
-to_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "to_unicode for ascii not
implemented");
+ return NULL;
}
-static void
-from_charset(Interp *interpreter, STRING *source_string)
+static STRING *
+from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void
-from_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
/* A noop. can't compose ascii */
@@ -511,7 +507,6 @@
ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- copy_to_charset,
to_unicode,
from_charset,
from_unicode,
1.10 +1 -5 parrot/charset/ascii.h
Index: ascii.h
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.h,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- ascii.h 1 Mar 2005 11:06:26 -0000 1.9
+++ ascii.h 1 Mar 2005 14:19:45 -0000 1.10
@@ -1,7 +1,7 @@
/* ascii.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: ascii.h,v 1.9 2005/03/01 11:06:26 leo Exp $
+ * $Id: ascii.h,v 1.10 2005/03/01 14:19:45 leo Exp $
* Overview:
* This is the header for the ascii charset functions
* Data Structure and Algorithms:
@@ -40,10 +40,6 @@
const STRING *search_string, UINTVAL offset);
size_t ascii_compute_hash(Interp *, STRING *source_string);
-static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset,
UINTVAL replace_count, STRING *insert_string);
-static void to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
-static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
-static void to_unicode(Interp *, STRING *source_string);
static void compose(Interp *, STRING *source_string);
static void decompose(Interp *, STRING *source_string);
static void upcase(Interp *, STRING *source_string);
1.10 +13 -20 parrot/charset/binary.c
Index: binary.c
===================================================================
RCS file: /cvs/public/parrot/charset/binary.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- binary.c 1 Mar 2005 11:06:26 -0000 1.9
+++ binary.c 1 Mar 2005 14:19:45 -0000 1.10
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: binary.c,v 1.9 2005/03/01 11:06:26 leo Exp $
+$Id: binary.c,v 1.10 2005/03/01 14:19:45 leo Exp $
=head1 NAME
@@ -36,38 +36,32 @@
replace_count, insert_string);
}
-static void
-to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
+static STRING*
+to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING
*dest)
{
internal_exception(UNIMPLEMENTED, "to_charset for binary not
implemented");
+ return NULL;
}
-static STRING *
-copy_to_charset(Interp *interpreter, STRING *source_string,
- CHARSET *new_charset)
-{
- STRING *return_string = NULL;
- internal_exception(UNIMPLEMENTED,
- "copy_to_charset for binary not implemented");
- return return_string;
-}
-
-static void
-to_unicode(Interp *interpreter, STRING *source_string)
+static STRING*
+to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "to_unicode for binary not
implemented");
+ return NULL;
}
-static void
-from_charset(Interp *interpreter, STRING *source_string)
+static STRING*
+from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void
-from_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
/* A noop. can't compose binary */
@@ -262,7 +256,6 @@
ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- copy_to_charset,
to_unicode,
from_charset,
from_unicode,
1.7 +1 -4 parrot/charset/binary.h
Index: binary.h
===================================================================
RCS file: /cvs/public/parrot/charset/binary.h,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- binary.h 1 Mar 2005 11:06:26 -0000 1.6
+++ binary.h 1 Mar 2005 14:19:45 -0000 1.7
@@ -1,7 +1,7 @@
/* binary.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: binary.h,v 1.6 2005/03/01 11:06:26 leo Exp $
+ * $Id: binary.h,v 1.7 2005/03/01 14:19:45 leo Exp $
* Overview:
* This is the header for the binary charset functions
* Data Structure and Algorithms:
@@ -13,9 +13,6 @@
#if !defined(PARROT_CHARSET_BINARY_H_GUARD)
#define PARROT_CHARSET_BINARY_H_GUARD
-static void to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
-static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
-static void to_unicode(Interp *, STRING *source_string);
static void compose(Interp *, STRING *source_string);
static void decompose(Interp *, STRING *source_string);
static void upcase(Interp *, STRING *source_string);
1.10 +56 -31 parrot/charset/iso-8859-1.c
Index: iso-8859-1.c
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- iso-8859-1.c 1 Mar 2005 11:06:26 -0000 1.9
+++ iso-8859-1.c 1 Mar 2005 14:19:45 -0000 1.10
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.9 2005/03/01 11:06:26 leo Exp $
+$Id: iso-8859-1.c,v 1.10 2005/03/01 14:19:45 leo Exp $
=head1 NAME
@@ -18,6 +18,17 @@
#include "iso-8859-1.h"
#include "ascii.h"
+#ifdef EXCEPTION
+# undef EXCEPTION
+#endif
+
+/*
+ * TODO check interpreter error and warnings setting
+ */
+
+#define EXCEPTION(err, str) \
+ real_exception(interpreter, NULL, err, str)
+
/* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
@@ -55,51 +66,43 @@
replace_count, insert_string);
}
-static void
-from_charset(Interp *interpreter, STRING *source_string)
+static STRING *
+from_charset(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void
-from_unicode(Interp *interpreter, STRING *source_string)
+static STRING *
+from_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
internal_exception(UNIMPLEMENTED, "Can't do this yet");
+ return NULL;
}
-static void
-to_charset(Interp *interpreter, STRING *source_string, CHARSET *new_charset)
+static STRING *
+to_unicode(Interp *interpreter, STRING *source_string, STRING *dest)
{
- charset_converter_t conversion_func;
- if ((conversion_func = Parrot_find_charset_converter(interpreter,
- source_string->charset, new_charset))) {
- /*
- * XXX conversion_func has wrong signature ?
- *
- * conversion_func(interpreter, new_charset, source_string);
- */
- }
- else {
- to_unicode(interpreter, source_string);
- new_charset->from_charset(interpreter, source_string);
- }
+ internal_exception(UNIMPLEMENTED,
+ "to_unicode for iso-8859-1 not implemented");
+ return NULL;
}
static STRING *
-copy_to_charset(Interp *interpreter, STRING *source_string,
- CHARSET *new_charset)
+to_charset(Interp *interpreter, STRING *src, CHARSET *new_charset, STRING
*dest)
{
- STRING *return_string = NULL;
+ charset_converter_t conversion_func;
- return return_string;
-}
+ if ((conversion_func = Parrot_find_charset_converter(interpreter,
+ src->charset, new_charset))) {
+ return conversion_func(interpreter, src, dest);
+ }
+ else {
+ STRING *res = to_unicode(interpreter, src, dest);
+ return new_charset->from_charset(interpreter, res, dest);
-static void
-to_unicode(Interp *interpreter, STRING *source_string)
-{
- internal_exception(UNIMPLEMENTED,
- "to_unicode for iso-8859-1 not implemented");
+ }
}
/* A noop. can't compose iso-8859-1 */
@@ -367,7 +370,6 @@
ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
- copy_to_charset,
to_unicode,
from_charset,
from_unicode,
@@ -417,6 +419,29 @@
return return_set;
}
+STRING *
+charset_cvt_iso_8859_1_to_ascii(Interp *interpreter, STRING *src, STRING
*dest)
+{
+ UINTVAL offs, c;
+ if (dest) {
+ Parrot_reallocate_string(interpreter, dest, src->strlen);
+ dest->bufused = src->bufused;
+ dest->strlen = src->strlen;
+ }
+ for (offs = 0; offs < src->strlen; ++offs) {
+ c = ENCODING_GET_BYTE(interpreter, src, offs);
+ if (c >= 0x80) {
+ EXCEPTION(LOSSY_CONVERSION, "lossy conversion to ascii");
+ }
+ if (dest)
+ ENCODING_SET_BYTE(interpreter, dest, offs, c);
+ }
+ if (dest)
+ return dest;
+ src->charset = Parrot_ascii_charset_ptr;
+ return src;
+}
+
/*
* Local variables:
* c-indentation-style: bsd
1.8 +3 -4 parrot/charset/iso-8859-1.h
Index: iso-8859-1.h
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- iso-8859-1.h 1 Mar 2005 11:06:26 -0000 1.7
+++ iso-8859-1.h 1 Mar 2005 14:19:45 -0000 1.8
@@ -1,7 +1,7 @@
/* iso_8859_1.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: iso-8859-1.h,v 1.7 2005/03/01 11:06:26 leo Exp $
+ * $Id: iso-8859-1.h,v 1.8 2005/03/01 14:19:45 leo Exp $
* Overview:
* This is the header for the iso_8859-1 charset functions
* Data Structure and Algorithms:
@@ -14,9 +14,6 @@
#define PARROT_CHARSET_ISO_8859_1_H_GUARD
static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset,
UINTVAL replace_count, STRING *insert_string);
-static void to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
-static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
-static void to_unicode(Interp *, STRING *source_string);
static void compose(Interp *, STRING *source_string);
static void decompose(Interp *, STRING *source_string);
static void upcase(Interp *, STRING *source_string);
@@ -39,6 +36,8 @@
static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL
offset);
static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL
offset);
+STRING *charset_cvt_iso_8859_1_to_ascii(Interp *, STRING *src, STRING *dest);
+
CHARSET *Parrot_charset_iso_8859_1_init(Interp *);
#endif /* PARROT_CHARSET_ISO_8859_1_H_GUARD */
1.9 +15 -11 parrot/include/parrot/charset.h
Index: charset.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/charset.h,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- charset.h 1 Mar 2005 08:30:56 -0000 1.8
+++ charset.h 1 Mar 2005 14:19:46 -0000 1.9
@@ -1,7 +1,7 @@
/* charset.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: charset.h,v 1.8 2005/03/01 08:30:56 leo Exp $
+ * $Id: charset.h,v 1.9 2005/03/01 14:19:46 leo Exp $
* Overview:
* This is the header for the 8-bit fixed-width encoding
* Data Structure and Algorithms:
@@ -35,11 +35,14 @@
typedef STRING *(*charset_get_graphemes_t)(Interp *, STRING *source_string,
UINTVAL offset, UINTVAL count);
typedef STRING *(*charset_get_graphemes_inplace_t)(Interp *, STRING
*source_string, STRING *dest_string, UINTVAL offset, UINTVAL count);
typedef void (*charset_set_graphemes_t)(Interp *, STRING *source_string,
UINTVAL offset, UINTVAL replace_count, STRING *insert_string);
-typedef void (*charset_to_charset_t)(Interp *, STRING *source_string,
CHARSET *new_charset);
-typedef STRING *(*charset_copy_to_charset_t)(Interp *, STRING
*source_string, CHARSET *new_charset);
-typedef void (*charset_to_unicode_t)(Interp *, STRING *source_string);
-typedef void (*charset_from_charset_t)(Interp *, STRING *source_string);
-typedef void (*charset_from_unicode_t)(Interp *, STRING *source_string);
+
+typedef STRING * (*charset_to_charset_t)(Interp *, STRING *source_string,
+ CHARSET *new_charset, STRING *dest);
+typedef STRING * (*charset_to_unicode_t)(Interp *, STRING *src, STRING
*dest);
+typedef STRING * (*charset_from_charset_t)(Interp *, STRING *source_string,
+ STRING *dest);
+typedef STRING * (*charset_from_unicode_t)(Interp *, STRING *source_string,
+ STRING *dest);
typedef void (*charset_compose_t)(Interp *, STRING *source_string);
typedef void (*charset_decompose_t)(Interp *, STRING *source_string);
typedef void (*charset_upcase_t)(Interp *, STRING *source_string);
@@ -77,14 +80,17 @@
INTVAL Parrot_register_charset(Interp *, const char *charsetname, CHARSET
*charset);
INTVAL Parrot_make_default_charset(Interp *, const char *charsetname,
CHARSET *charset);
CHARSET *Parrot_default_charset(Interp *);
-typedef INTVAL (*charset_converter_t)(Interp *, CHARSET *lhs, CHARSET *rhs);
+typedef STRING* (*charset_converter_t)(Interp *, STRING *src, STRING *dst);
charset_converter_t Parrot_find_charset_converter(Interp *, CHARSET *lhs,
CHARSET *rhs);
+void Parrot_register_charset_converter(Interp *,
+ CHARSET *lhs, CHARSET *rhs, charset_converter_t func);
void Parrot_deinit_charsets(Interp *);
INTVAL Parrot_charset_number(Interp *, STRING *charsetname);
STRING* Parrot_charset_name(Interp *, INTVAL);
const char* Parrot_charset_c_name(Interp *, INTVAL);
INTVAL Parrot_charset_number_of_str(Interp *, STRING *src);
+CHARSET* Parrot_get_charset(Interp *, INTVAL number_of_charset);
struct _charset {
const char *name;
@@ -92,7 +98,6 @@
charset_get_graphemes_inplace_t get_graphemes_inplace;
charset_set_graphemes_t set_graphemes;
charset_to_charset_t to_charset;
- charset_copy_to_charset_t copy_to_charset;
charset_to_unicode_t to_unicode;
charset_from_charset_t from_charset;
charset_from_unicode_t from_unicode;
@@ -132,9 +137,8 @@
#define CHARSET_GET_GRAPEMES(interp, source, offset, count) ((CHARSET
*)source->charset)->get_graphemes(interpreter, source, offset, count)
#define CHARSET_GET_GRAPHEMES_INPLACE(interp, source, dest, offset, count)
((CHARSET *)source->charset)->get_graphemes(interpreter, source, dest, offset,
count)
#define CHARSET_SET_GRAPHEMES(interp, source, offset, replace_count, insert)
((CHARSET *)source->charset)->set_graphemes(interpreter, source, offset,
replace_count, insert)
-#define CHARSET_TO_CHARSET(interp, source, new_charset) ((CHARSET
*)source->charset)->to_charset(interpreter, source, new_charset)
-#define CHARSET_COPY_TO_CHARSET(interp, source, new_charset) ((CHARSET
*)source->charset)->copy_to_charset(interpreter, source, new_charset)
-#define CHARSET_TO_UNICODE(interp, source) ((CHARSET
*)source->charset)->to_unicode(interpreter, source)
+#define CHARSET_TO_CHARSET(interp, source, new_charset, dest) ((CHARSET
*)source->charset)->to_charset(interpreter, source, new_charset, dest)
+#define CHARSET_TO_UNICODE(interp, source, dest) ((CHARSET
*)source->charset)->to_unicode(interpreter, source, dest)
#define CHARSET_COMPOSE(interp, source) ((CHARSET
*)source->charset)->compose(interpreter, source)
#define CHARSET_DECOMPOSE(interp, source) ((CHARSET
*)source->charset)->decompose(interpreter, source)
#define CHARSET_UPCASE(interp, source) ((CHARSET
*)source->charset)->upcase(interpreter, source)
1.54 +3 -2 parrot/include/parrot/exceptions.h
Index: exceptions.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/exceptions.h,v
retrieving revision 1.53
retrieving revision 1.54
diff -u -r1.53 -r1.54
--- exceptions.h 6 Jan 2005 00:42:05 -0000 1.53
+++ exceptions.h 1 Mar 2005 14:19:46 -0000 1.54
@@ -1,7 +1,7 @@
/* exceptions.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: exceptions.h,v 1.53 2005/01/06 00:42:05 rubys Exp $
+ * $Id: exceptions.h,v 1.54 2005/03/01 14:19:46 leo Exp $
* Overview:
* define the internal interpreter exceptions
* Data Structure and Algorithms:
@@ -119,7 +119,8 @@
WRITE_TO_CONSTCLASS,
NOSPAWN,
INTERNAL_NOT_IMPLEMENTED,
- ERR_OVERFLOW
+ ERR_OVERFLOW,
+ LOSSY_CONVERSION
} exception_type_enum;
/* &end_gen */
1.50 +3 -1 parrot/include/parrot/string_funcs.h
Index: string_funcs.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
retrieving revision 1.49
retrieving revision 1.50
diff -u -r1.49 -r1.50
--- string_funcs.h 28 Feb 2005 18:01:22 -0000 1.49
+++ string_funcs.h 1 Mar 2005 14:19:46 -0000 1.50
@@ -1,7 +1,7 @@
/* string_funcs.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string_funcs.h,v 1.49 2005/02/28 18:01:22 leo Exp $
+ * $Id: string_funcs.h,v 1.50 2005/03/01 14:19:46 leo Exp $
* Overview:
* This is the api header for the string subsystem
* Data Structure and Algorithms:
@@ -115,6 +115,8 @@
INTVAL Parrot_string_find_newline(Interp *, STRING *, INTVAL offset);
INTVAL Parrot_string_find_word_boundary(Interp *, STRING *, INTVAL offset);
+STRING* Parrot_string_trans_charset(Interp *, STRING *src,
+ INTVAL charset_nr, STRING *dest);
#endif /* PARROT_IN_CORE */
#endif /* PARROT_STRING_FUNCS_H_GUARD */
1.60 +6 -0 parrot/ops/ops.num
Index: ops.num
===================================================================
RCS file: /cvs/public/parrot/ops/ops.num,v
retrieving revision 1.59
retrieving revision 1.60
diff -u -r1.59 -r1.60
--- ops.num 28 Feb 2005 18:01:24 -0000 1.59
+++ ops.num 1 Mar 2005 14:19:47 -0000 1.60
@@ -1422,3 +1422,9 @@
find_word_boundary_i_s_ic 1392
find_word_boundary_i_sc_i 1393
find_word_boundary_i_sc_ic 1394
+trans_charset_s_i 1395
+trans_charset_s_ic 1396
+trans_charset_s_s_i 1397
+trans_charset_s_s_ic 1398
+trans_charset_s_sc_i 1399
+trans_charset_s_sc_ic 1400
1.35 +21 -0 parrot/ops/string.ops
Index: string.ops
===================================================================
RCS file: /cvs/public/parrot/ops/string.ops,v
retrieving revision 1.34
retrieving revision 1.35
diff -u -r1.34 -r1.35
--- string.ops 28 Feb 2005 18:01:24 -0000 1.34
+++ string.ops 1 Mar 2005 14:19:47 -0000 1.35
@@ -634,6 +634,16 @@
Return the charset number of the charset named $2. If the charset doesn't
exit, throw an exception.
+=item B<trans_charset>(inout STR, in INT)
+
+Change the string to have the specified charset.
+
+=item B<trans_charset>(out STR, in STR, in INT)
+
+Create a string $1 from $2 with the specified charset.
+
+Both functions may throw an exception on information loss.
+
=cut
op charset(out INT, in STR) :base_core {
@@ -655,6 +665,17 @@
goto NEXT();
}
+op trans_charset(inout STR, in INT) {
+ $1 = Parrot_string_trans_charset(interpreter, $1, $2, NULL);
+ goto NEXT();
+}
+
+op trans_charset(out STR, in STR, in INT) {
+ STRING *dest = new_string_header(interpreter, 0);
+ $1 = Parrot_string_trans_charset(interpreter, $2, $3, dest);
+ goto NEXT();
+}
+
=item B<is_whitespace>(out INT, in STR, in INT)
Set $1 to 1 if the codepoint of string $2 at offset $3 is whitespace.
1.9 +60 -1 parrot/src/charset.c
Index: charset.c
===================================================================
RCS file: /cvs/public/parrot/src/charset.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- charset.c 1 Mar 2005 08:31:02 -0000 1.8
+++ charset.c 1 Mar 2005 14:19:48 -0000 1.9
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: charset.c,v 1.8 2005/03/01 08:31:02 leo Exp $
+$Id: charset.c,v 1.9 2005/03/01 14:19:48 leo Exp $
=head1 NAME
@@ -14,6 +14,7 @@
#define PARROT_NO_EXTERN_CHARSET_PTRS
#include "parrot/parrot.h"
+#include "../charset/iso-8859-1.h"
CHARSET *Parrot_iso_8859_1_charset_ptr;
CHARSET *Parrot_binary_charset_ptr;
@@ -26,8 +27,15 @@
*/
typedef struct {
+ CHARSET *to;
+ charset_converter_t func;
+} To_converter;
+
+typedef struct {
CHARSET *charset;
STRING *name;
+ int n_converters;
+ To_converter *to_converters;
} One_charset;
typedef struct {
@@ -126,6 +134,14 @@
return all_charsets->set[number_of_charset].name;
}
+CHARSET*
+Parrot_get_charset(Interp *interpreter, INTVAL number_of_charset)
+{
+ if (number_of_charset >= all_charsets->n_charsets)
+ return NULL;
+ return all_charsets->set[number_of_charset].charset;
+}
+
const char *
Parrot_charset_c_name(Interp *interpreter, INTVAL number_of_charset)
{
@@ -153,6 +169,7 @@
all_charsets->n_charsets++;
all_charsets->set[n].charset = charset;
all_charsets->set[n].name = const_string(interpreter, charsetname);
+ all_charsets->set[n].n_converters = 0;
return 1;
}
@@ -183,6 +200,9 @@
}
if (!strcmp("ascii", charsetname)) {
Parrot_ascii_charset_ptr = charset;
+ Parrot_register_charset_converter(interpreter,
+ Parrot_iso_8859_1_charset_ptr, charset,
+ charset_cvt_iso_8859_1_to_ascii);
return register_charset(interpreter, charsetname, charset);
}
return 0;
@@ -202,12 +222,51 @@
return Parrot_default_charset_ptr;
}
+
charset_converter_t
Parrot_find_charset_converter(Interp *interpreter, CHARSET *lhs, CHARSET
*rhs)
{
+ int i, j, n, nc;
+
+ n = all_charsets->n_charsets;
+ for (i = 0; i < n; ++i) {
+ if (lhs == all_charsets->set[i].charset) {
+ One_charset *left = all_charsets->set + i;
+
+ nc = left->n_converters;
+ for (j = 0; j < nc; ++j) {
+ if (left->to_converters[j].to == rhs)
+ return left->to_converters[j].func;
+ }
+ }
+ }
return NULL;
}
+void
+Parrot_register_charset_converter(Interp *interpreter,
+ CHARSET *lhs, CHARSET *rhs, charset_converter_t func)
+{
+ int i, n, nc;
+
+ n = all_charsets->n_charsets;
+ for (i = 0; i < n; ++i) {
+ if (lhs == all_charsets->set[i].charset) {
+ One_charset *left = all_charsets->set + i;
+
+ nc = left->n_converters++;
+ if (nc) {
+ left->to_converters = mem_sys_realloc(left->to_converters,
+ sizeof(To_converter) * (nc + 1));
+ }
+ else
+ left->to_converters = mem_sys_allocate(sizeof(To_converter));
+ left->to_converters[nc].to = rhs;
+ left->to_converters[nc].func = func;
+ }
+ }
+}
+
/*
* Local variables:
* c-indentation-style: bsd
1.238 +36 -1 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.237
retrieving revision 1.238
diff -u -r1.237 -r1.238
--- string.c 28 Feb 2005 18:01:28 -0000 1.237
+++ string.c 1 Mar 2005 14:19:48 -0000 1.238
@@ -1,6 +1,6 @@
/*
Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-$Id: string.c,v 1.237 2005/02/28 18:01:28 leo Exp $
+$Id: string.c,v 1.238 2005/03/01 14:19:48 leo Exp $
=head1 NAME
@@ -2591,6 +2591,41 @@
return CHARSET_FIND_WORD_BOUNDARY(interpreter, s, offset);
}
+STRING*
+Parrot_string_trans_charset(Interp *interpreter, STRING *src,
+ INTVAL charset_nr, STRING *dest)
+{
+ CHARSET *new_charset;
+
+ if (!src)
+ return NULL;
+ new_charset = Parrot_get_charset(interpreter, charset_nr);
+ if (!new_charset)
+ real_exception(interpreter, NULL, INVALID_CHARTYPE,
+ "charset #%d not found", (int) charset_nr);
+ /*
+ * dest is an empty string header or NULL, if an inplace
+ * operation is desired
+ */
+ if (dest) {
+ if (new_charset == src->charset) {
+ Parrot_reuse_COW_reference(interpreter, src, dest);
+ dest->charset = new_charset;
+ /* keep encoding */
+ return dest;
+ }
+ dest->charset = new_charset;
+ /* XXX prefered encoding for charset */
+ dest->encoding = PARROT_DEFAULT_ENCODING;
+ }
+ else {
+ if (new_charset == src->charset) {
+ return src;
+ }
+ }
+ return CHARSET_TO_CHARSET(interpreter, src, new_charset, dest);
+}
+
/*
=back
1.6 +62 -2 parrot/t/op/string_cs.t
Index: string_cs.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string_cs.t,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- string_cs.t 28 Feb 2005 18:01:30 -0000 1.5
+++ string_cs.t 1 Mar 2005 14:19:49 -0000 1.6
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
-# $Id: string_cs.t,v 1.5 2005/02/28 18:01:30 leo Exp $
+# $Id: string_cs.t,v 1.6 2005/03/01 14:19:49 leo Exp $
=head1 NAME
@@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 16;
+use Parrot::Test tests => 20;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -245,3 +245,63 @@
CODE
0 2 3 6 -1 ok
OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_s_i");
+ set S0, "abc"
+ find_charset I0, "ascii"
+ trans_charset S1, S0, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ end
+CODE
+abc
+ascii
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i");
+ set S1, "abc"
+ find_charset I0, "ascii"
+ trans_charset S1, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ end
+CODE
+abc
+ascii
+OUTPUT
+
+
+output_like( <<'CODE', <<OUTPUT, "trans_charset_s_i - lossy");
+ set S1, "abc�"
+ find_charset I0, "ascii"
+ trans_charset S1, I0
+ print "never\n"
+ end
+CODE
+/lossy conversion to ascii/
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "trans_charset_s_i - same");
+ set S1, ascii:"abc"
+ find_charset I0, "ascii"
+ trans_charset S1, I0
+ print S1
+ print "\n"
+ charset I0, S1
+ charsetname S2, I0
+ print S2
+ print "\n"
+ end
+CODE
+abc
+ascii
+OUTPUT
+