Author: leo
Date: Wed Nov 16 10:21:29 2005
New Revision: 10028
Modified:
trunk/charset/ascii.c
trunk/charset/ascii.h
trunk/charset/binary.c
trunk/charset/iso-8859-1.c
trunk/charset/iso-8859-1.h
trunk/charset/unicode.c
trunk/include/parrot/charset.h
trunk/include/parrot/string_funcs.h
trunk/ops/experimental.ops
trunk/src/string.c
trunk/t/op/string_cs.t
Log:
unicode charsets - new opcode compose
* new API string_compose
* test
Modified: trunk/charset/ascii.c
==============================================================================
--- trunk/charset/ascii.c (original)
+++ trunk/charset/ascii.c Wed Nov 16 10:21:29 2005
@@ -141,15 +141,17 @@ to_charset(Interp *interpreter, STRING *
}
/* A noop. can't compose ascii */
-static void
-compose(Interp *interpreter, STRING *source_string)
+static STRING*
+compose(Interp *interpreter, STRING *src)
{
+ return string_copy(interpreter, src);
}
/* A noop. can't decompose ascii */
-static void
-decompose(Interp *interpreter, STRING *source_string)
+static STRING*
+decompose(Interp *interpreter, STRING *src)
{
+ return string_copy(interpreter, src);
}
static void
Modified: trunk/charset/ascii.h
==============================================================================
--- trunk/charset/ascii.h (original)
+++ trunk/charset/ascii.h Wed Nov 16 10:21:29 2005
@@ -36,27 +36,6 @@ INTVAL ascii_cs_rindex(Interp *, STRING
size_t ascii_compute_hash(Interp *, STRING *source_string, size_t seed);
INTVAL mixed_cs_index(Interp *, STRING *src, STRING *search, UINTVAL offs);
-static void compose(Interp *, STRING *source_string);
-static void decompose(Interp *, STRING *source_string);
-static void upcase(Interp *, STRING *source_string);
-static void downcase(Interp *, STRING *source_string);
-static void titlecase(Interp *, STRING *source_string);
-static void upcase_first(Interp *, STRING *source_string);
-static void downcase_first(Interp *, STRING *source_string);
-static void titlecase_first(Interp *, STRING *source_string);
-static UINTVAL validate(Interp *, STRING *source_string);
-static INTVAL is_wordchar(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_wordchar(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_wordchar(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL is_whitespace(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_whitespace(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_whitespace(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL is_digit(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_digit(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_digit(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL is_punctuation(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL offset);
CHARSET *Parrot_charset_ascii_init(Interp *);
STRING *charset_cvt_ascii_to_binary(Interp *, STRING *src, STRING *dest);
Modified: trunk/charset/binary.c
==============================================================================
--- trunk/charset/binary.c (original)
+++ trunk/charset/binary.c Wed Nov 16 10:21:29 2005
@@ -45,16 +45,20 @@ to_charset(Interp *interpreter, STRING *
return NULL;
}
-/* A noop. can't compose binary */
-static void
+/* A err. can't compose binary */
+static STRING*
compose(Interp *interpreter, STRING *source_string)
{
+ EXCEPTION(INVALID_CHARTYPE, "Can't compose binary data");
+ return NULL;
}
-/* A noop. can't decompose binary */
-static void
+/* A err. can't decompose binary */
+static STRING*
decompose(Interp *interpreter, STRING *source_string)
{
+ EXCEPTION(INVALID_CHARTYPE, "Can't decompose binary data");
+ return NULL;
}
static void
Modified: trunk/charset/iso-8859-1.c
==============================================================================
--- trunk/charset/iso-8859-1.c (original)
+++ trunk/charset/iso-8859-1.c Wed Nov 16 10:21:29 2005
@@ -117,15 +117,17 @@ to_charset(Interp *interpreter, STRING *
/* A noop. can't compose iso-8859-1 */
-static void
-compose(Interp *interpreter, STRING *source_string)
+static STRING*
+compose(Interp *interpreter, STRING *src)
{
+ return string_copy(interpreter, src);
}
/* A noop. can't decompose iso-8859-1 */
-static void
-decompose(Interp *interpreter, STRING *source_string)
+static STRING*
+decompose(Interp *interpreter, STRING *src)
{
+ return string_copy(interpreter, src);
}
static void
Modified: trunk/charset/iso-8859-1.h
==============================================================================
--- trunk/charset/iso-8859-1.h (original)
+++ trunk/charset/iso-8859-1.h Wed Nov 16 10:21:29 2005
@@ -13,17 +13,6 @@
#if !defined(PARROT_CHARSET_ISO_8859_1_H_GUARD)
#define PARROT_CHARSET_ISO_8859_1_H_GUARD
-static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);
-static void compose(Interp *, STRING *source_string);
-static void decompose(Interp *, STRING *source_string);
-static void upcase(Interp *, STRING *source_string);
-static void downcase(Interp *, STRING *source_string);
-static void titlecase(Interp *, STRING *source_string);
-static void upcase_first(Interp *, STRING *source_string);
-static void downcase_first(Interp *, STRING *source_string);
-static void titlecase_first(Interp *, STRING *source_string);
-static UINTVAL validate(Interp *, STRING *source_string);
-
STRING *charset_cvt_iso_8859_1_to_ascii(Interp *, STRING *src, STRING *dest);
CHARSET *Parrot_charset_iso_8859_1_init(Interp *);
Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c (original)
+++ trunk/charset/unicode.c Wed Nov 16 10:21:29 2005
@@ -28,6 +28,7 @@ This file implements the charset functio
#include <unicode/utypes.h>
#include <unicode/uchar.h>
#include <unicode/ustring.h>
+#include <unicode/unorm.h>
#endif
#define EXCEPTION(err, str) \
real_exception(interpreter, NULL, err, str)
@@ -70,16 +71,55 @@ to_charset(Interp *interpreter, STRING *
}
-static void
-compose(Interp *interpreter, STRING *source_string)
+static STRING*
+compose(Interp *interpreter, STRING *src)
{
- UNIMPL;
+#if PARROT_HAS_ICU
+ STRING *dest;
+ int src_len, dest_len;
+ UErrorCode err;
+ /*
+ U_STABLE int32_t U_EXPORT2
+ unorm_normalize(const UChar *source, int32_t sourceLength,
+ UNormalizationMode mode, int32_t options,
+ UChar *result, int32_t resultLength,
+ UErrorCode *status);
+ */
+ dest_len = src_len = src->strlen;
+ dest = string_make_direct(interpreter, NULL, src_len,
+ src->encoding, src->charset, 0);
+ err = U_ZERO_ERROR;
+ dest_len = unorm_normalize(src->strstart, src_len,
+ UNORM_DEFAULT, /* default is NFC */
+ 0, /* options 0 default - no specific icu version */
+ dest->strstart, dest_len,
+ &err);
+ dest->bufused = dest_len * sizeof(UChar);
+ if (!U_SUCCESS(err)) {
+ err = U_ZERO_ERROR;
+ Parrot_reallocate_string(interpreter, dest, dest->bufused);
+ dest_len = unorm_normalize(src->strstart, src_len,
+ UNORM_DEFAULT, /* default is NFC */
+ 0, /* options 0 default - no specific icu version */
+ dest->strstart, dest_len,
+ &err);
+ assert(U_SUCCESS(err));
+ dest->bufused = dest_len * sizeof(UChar);
+ }
+ dest->strlen = dest_len;
+ return dest;
+#else
+ real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+ "no ICU lib loaded");
+ return NULL;
+#endif
}
-static void
-decompose(Interp *interpreter, STRING *source_string)
+static STRING*
+decompose(Interp *interpreter, STRING *src)
{
UNIMPL;
+ return NULL;
}
static void
Modified: trunk/include/parrot/charset.h
==============================================================================
--- trunk/include/parrot/charset.h (original)
+++ trunk/include/parrot/charset.h Wed Nov 16 10:21:29 2005
@@ -41,8 +41,8 @@ typedef STRING * (*charset_to_charset_t)
STRING *dest);
typedef STRING * (*charset_from_unicode_t)(Interp *, STRING *source_string,
STRING *dest);
-typedef void (*charset_compose_t)(Interp *, STRING *source_string);
-typedef void (*charset_decompose_t)(Interp *, STRING *source_string);
+typedef STRING* (*charset_compose_t)(Interp *, STRING *source_string);
+typedef STRING* (*charset_decompose_t)(Interp *, STRING *source_string);
typedef void (*charset_upcase_t)(Interp *, STRING *source_string);
typedef void (*charset_downcase_t)(Interp *, STRING *source_string);
typedef void (*charset_titlecase_t)(Interp *, STRING *source_string);
Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h (original)
+++ trunk/include/parrot/string_funcs.h Wed Nov 16 10:21:29 2005
@@ -96,6 +96,7 @@ STRING * string_unescape_cstring(Interp
const char *cstring, char delimiter, const char *enc_or_charset);
STRING * string_escape_string(Interp *, STRING *);
STRING * string_escape_string_delimited(Interp *, STRING *, UINTVAL len);
+STRING * string_compose(Interp *, STRING *);
STRING *string_upcase(Interp *, const STRING *);
STRING *string_downcase(Interp *, const STRING *);
Modified: trunk/ops/experimental.ops
==============================================================================
--- trunk/ops/experimental.ops (original)
+++ trunk/ops/experimental.ops Wed Nov 16 10:21:29 2005
@@ -241,6 +241,10 @@ inline op newclosure(out PMC, in PMC) {
Escape all non-ascii chars to backslashed escape sequences. A
string with charset I<ascii> is created as result.
+=item B<compose>(out STR, in STR)
+
+Compose (normalize) a string.
+
=cut
op escape(out STR, invar STR) {
@@ -248,6 +252,11 @@ op escape(out STR, invar STR) {
goto NEXT();
}
+op compose(out STR, in STR) {
+ $1 = string_compose(interpreter, $2);
+ goto NEXT();
+}
+
=back
=head1 COPYRIGHT
Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c (original)
+++ trunk/src/string.c Wed Nov 16 10:21:29 2005
@@ -2895,6 +2895,15 @@ Parrot_string_trans_encoding(Interp *int
return new_encoding->to_encoding(interpreter, src, dest);
}
+STRING *
+string_compose(Interp * interpreter, STRING *src)
+{
+ if (!src)
+ return NULL;
+ if (!src->strlen)
+ return string_make_empty(interpreter, enum_stringrep_one, 0);
+ return CHARSET_COMPOSE(interpreter, src);
+}
/*
=back
Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t (original)
+++ trunk/t/op/string_cs.t Wed Nov 16 10:21:29 2005
@@ -16,7 +16,7 @@ Tests charset support.
=cut
-use Parrot::Test tests => 50;
+use Parrot::Test tests => 51;
use Parrot::Config;
use Test::More;
@@ -507,7 +507,7 @@ abcdefg
OUTPUT
SKIP: {
- skip('no ICU lib', 14) unless $PConfig{has_icu};
+ skip('no ICU lib', 16) unless $PConfig{has_icu};
output_is( <<'CODE', <<"OUTPUT", "unicode downcase");
set S0, iso-8859-1:"TÖTSCH"
find_charset I0, "unicode"
@@ -772,6 +772,30 @@ CODE
T\x{c3}\x{b6}tsch Leo
OUTPUT
+output_is( <<'CODE', <<OUTPUT, "combose combined char" );
+ set S1, unicode:"___\u01f0___"
+ length I0, S1
+ upcase S1 # decompose J+hacek
+ length I1, S1 # 1 longer
+ downcase S1 # j+hacek
+ length I2, S1
+ compose S1, S1
+ length I3, S1 # back at original string
+ getstdout P0 # need to convert back to utf8
+ push P0, "utf8" # push utf8 output layer
+ print S1
+ print "\n"
+ print_item I0
+ print_item I1
+ print_item I2
+ print_item I3
+ print_newline
+ end
+CODE
+___\x{c7}\x{b0}___
+7 8 8 7
+OUTPUT
+
} # SKIP
output_is( <<'CODE', <<'OUTPUT', "escape ascii" );