Author: leo
Date: Wed Nov 16 10:21:29 2005
New Revision: 10028

Modified:
   trunk/charset/ascii.c
   trunk/charset/ascii.h
   trunk/charset/binary.c
   trunk/charset/iso-8859-1.c
   trunk/charset/iso-8859-1.h
   trunk/charset/unicode.c
   trunk/include/parrot/charset.h
   trunk/include/parrot/string_funcs.h
   trunk/ops/experimental.ops
   trunk/src/string.c
   trunk/t/op/string_cs.t
Log:
unicode charsets - new opcode compose

* new API string_compose
* test


Modified: trunk/charset/ascii.c
==============================================================================
--- trunk/charset/ascii.c       (original)
+++ trunk/charset/ascii.c       Wed Nov 16 10:21:29 2005
@@ -141,15 +141,17 @@ to_charset(Interp *interpreter, STRING *
 }
 
 /* A noop. can't compose ascii */
-static void
-compose(Interp *interpreter, STRING *source_string)
+static STRING*
+compose(Interp *interpreter, STRING *src)
 {
+    return string_copy(interpreter, src);
 }
 
 /* A noop. can't decompose ascii */
-static void
-decompose(Interp *interpreter, STRING *source_string)
+static STRING*
+decompose(Interp *interpreter, STRING *src)
 {
+    return string_copy(interpreter, src);
 }
 
 static void

Modified: trunk/charset/ascii.h
==============================================================================
--- trunk/charset/ascii.h       (original)
+++ trunk/charset/ascii.h       Wed Nov 16 10:21:29 2005
@@ -36,27 +36,6 @@ INTVAL ascii_cs_rindex(Interp *, STRING 
 size_t ascii_compute_hash(Interp *, STRING *source_string, size_t seed);
 INTVAL mixed_cs_index(Interp *, STRING *src, STRING *search, UINTVAL offs);
 
-static void compose(Interp *, STRING *source_string);
-static void decompose(Interp *, STRING *source_string);
-static void upcase(Interp *, STRING *source_string);
-static void downcase(Interp *, STRING *source_string);
-static void titlecase(Interp *, STRING *source_string);
-static void upcase_first(Interp *, STRING *source_string);
-static void downcase_first(Interp *, STRING *source_string);
-static void titlecase_first(Interp *, STRING *source_string);
-static UINTVAL validate(Interp *, STRING *source_string);
-static INTVAL is_wordchar(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_wordchar(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_wordchar(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL is_whitespace(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_whitespace(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_whitespace(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL is_digit(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_digit(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_digit(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL is_punctuation(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_punctuation(Interp *, STRING *source_string, UINTVAL offset);
-static INTVAL find_not_punctuation(Interp *, STRING *source_string, UINTVAL offset);
 CHARSET *Parrot_charset_ascii_init(Interp *);
 
 STRING *charset_cvt_ascii_to_binary(Interp *, STRING *src, STRING *dest);

Modified: trunk/charset/binary.c
==============================================================================
--- trunk/charset/binary.c      (original)
+++ trunk/charset/binary.c      Wed Nov 16 10:21:29 2005
@@ -45,16 +45,20 @@ to_charset(Interp *interpreter, STRING *
     return NULL;
 }
 
-/* A noop. can't compose binary */
-static void
+/* A err. can't compose binary */
+static STRING*
 compose(Interp *interpreter, STRING *source_string)
 {
+    EXCEPTION(INVALID_CHARTYPE, "Can't compose binary data");
+    return NULL;
 }
 
-/* A noop. can't decompose binary */
-static void
+/* A err. can't decompose binary */
+static STRING*
 decompose(Interp *interpreter, STRING *source_string)
 {
+    EXCEPTION(INVALID_CHARTYPE, "Can't decompose binary data");
+    return NULL;
 }
 
 static void

Modified: trunk/charset/iso-8859-1.c
==============================================================================
--- trunk/charset/iso-8859-1.c  (original)
+++ trunk/charset/iso-8859-1.c  Wed Nov 16 10:21:29 2005
@@ -117,15 +117,17 @@ to_charset(Interp *interpreter, STRING *
 
 
 /* A noop. can't compose iso-8859-1 */
-static void
-compose(Interp *interpreter, STRING *source_string)
+static STRING*
+compose(Interp *interpreter, STRING *src)
 {
+    return string_copy(interpreter, src);
 }
 
 /* A noop. can't decompose iso-8859-1 */
-static void
-decompose(Interp *interpreter, STRING *source_string)
+static STRING*
+decompose(Interp *interpreter, STRING *src)
 {
+    return string_copy(interpreter, src);
 }
 
 static void

Modified: trunk/charset/iso-8859-1.h
==============================================================================
--- trunk/charset/iso-8859-1.h  (original)
+++ trunk/charset/iso-8859-1.h  Wed Nov 16 10:21:29 2005
@@ -13,17 +13,6 @@
 #if !defined(PARROT_CHARSET_ISO_8859_1_H_GUARD)
 #define PARROT_CHARSET_ISO_8859_1_H_GUARD
 
-static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset, UINTVAL replace_count, STRING *insert_string);
-static void compose(Interp *, STRING *source_string);
-static void decompose(Interp *, STRING *source_string);
-static void upcase(Interp *, STRING *source_string);
-static void downcase(Interp *, STRING *source_string);
-static void titlecase(Interp *, STRING *source_string);
-static void upcase_first(Interp *, STRING *source_string);
-static void downcase_first(Interp *, STRING *source_string);
-static void titlecase_first(Interp *, STRING *source_string);
-static UINTVAL validate(Interp *, STRING *source_string);
-
 STRING *charset_cvt_iso_8859_1_to_ascii(Interp *, STRING *src, STRING *dest);
 
 CHARSET *Parrot_charset_iso_8859_1_init(Interp *);

Modified: trunk/charset/unicode.c
==============================================================================
--- trunk/charset/unicode.c     (original)
+++ trunk/charset/unicode.c     Wed Nov 16 10:21:29 2005
@@ -28,6 +28,7 @@ This file implements the charset functio
 #include <unicode/utypes.h>
 #include <unicode/uchar.h>
 #include <unicode/ustring.h>
+#include <unicode/unorm.h>
 #endif
 #define EXCEPTION(err, str) \
     real_exception(interpreter, NULL, err, str)
@@ -70,16 +71,55 @@ to_charset(Interp *interpreter, STRING *
 }
 
 
-static void
-compose(Interp *interpreter, STRING *source_string)
+static STRING*
+compose(Interp *interpreter, STRING *src)
 {
-    UNIMPL;
+#if PARROT_HAS_ICU
+    STRING *dest;
+    int src_len, dest_len;
+    UErrorCode err;
+    /*
+       U_STABLE int32_t U_EXPORT2 
+       unorm_normalize(const UChar *source, int32_t sourceLength,
+       UNormalizationMode mode, int32_t options,
+       UChar *result, int32_t resultLength,
+       UErrorCode *status);
+       */
+    dest_len = src_len = src->strlen;
+    dest = string_make_direct(interpreter, NULL, src_len,
+            src->encoding, src->charset, 0);
+    err = U_ZERO_ERROR;
+    dest_len = unorm_normalize(src->strstart, src_len,
+            UNORM_DEFAULT,      /* default is NFC */
+            0,                  /* options 0 default - no specific icu version */
+            dest->strstart, dest_len,
+            &err);
+    dest->bufused = dest_len * sizeof(UChar);
+    if (!U_SUCCESS(err)) {
+        err = U_ZERO_ERROR;
+        Parrot_reallocate_string(interpreter, dest, dest->bufused);
+        dest_len = unorm_normalize(src->strstart, src_len,
+                UNORM_DEFAULT,      /* default is NFC */
+                0,                  /* options 0 default - no specific icu version */
+                dest->strstart, dest_len,
+                &err);
+        assert(U_SUCCESS(err));
+        dest->bufused = dest_len * sizeof(UChar);
+    }
+    dest->strlen = dest_len;
+    return dest;
+#else
+    real_exception(interpreter, NULL, E_LibraryNotLoadedError,
+            "no ICU lib loaded");
+    return NULL;
+#endif
 }
 
-static void
-decompose(Interp *interpreter, STRING *source_string)
+static STRING*
+decompose(Interp *interpreter, STRING *src)
 {
     UNIMPL;
+    return NULL;
 }
 
 static void

Modified: trunk/include/parrot/charset.h
==============================================================================
--- trunk/include/parrot/charset.h      (original)
+++ trunk/include/parrot/charset.h      Wed Nov 16 10:21:29 2005
@@ -41,8 +41,8 @@ typedef STRING * (*charset_to_charset_t)
         STRING *dest);
 typedef STRING * (*charset_from_unicode_t)(Interp *, STRING *source_string,
         STRING *dest);
-typedef void (*charset_compose_t)(Interp *, STRING *source_string);
-typedef void (*charset_decompose_t)(Interp *, STRING *source_string);
+typedef STRING* (*charset_compose_t)(Interp *, STRING *source_string);
+typedef STRING* (*charset_decompose_t)(Interp *, STRING *source_string);
 typedef void (*charset_upcase_t)(Interp *, STRING *source_string);
 typedef void (*charset_downcase_t)(Interp *, STRING *source_string);
 typedef void (*charset_titlecase_t)(Interp *, STRING *source_string);

Modified: trunk/include/parrot/string_funcs.h
==============================================================================
--- trunk/include/parrot/string_funcs.h (original)
+++ trunk/include/parrot/string_funcs.h Wed Nov 16 10:21:29 2005
@@ -96,6 +96,7 @@ STRING * string_unescape_cstring(Interp 
         const char *cstring, char delimiter, const char *enc_or_charset);
 STRING * string_escape_string(Interp *, STRING *);
 STRING * string_escape_string_delimited(Interp *, STRING *, UINTVAL len);
+STRING * string_compose(Interp *, STRING *);
 
 STRING *string_upcase(Interp *, const STRING *);
 STRING *string_downcase(Interp *, const STRING *);

Modified: trunk/ops/experimental.ops
==============================================================================
--- trunk/ops/experimental.ops  (original)
+++ trunk/ops/experimental.ops  Wed Nov 16 10:21:29 2005
@@ -241,6 +241,10 @@ inline op newclosure(out PMC, in PMC) {
 Escape all non-ascii chars to backslashed escape sequences. A
 string with charset I<ascii> is created as result.
 
+=item B<compose>(out STR, in STR) 
+
+Compose (normalize) a string.
+
 =cut
 
 op escape(out STR, invar STR) {
@@ -248,6 +252,11 @@ op escape(out STR, invar STR) {
   goto NEXT();
 }
 
+op compose(out STR, in STR) {
+  $1 = string_compose(interpreter, $2);
+  goto NEXT();
+}
+
 =back
 
 =head1 COPYRIGHT

Modified: trunk/src/string.c
==============================================================================
--- trunk/src/string.c  (original)
+++ trunk/src/string.c  Wed Nov 16 10:21:29 2005
@@ -2895,6 +2895,15 @@ Parrot_string_trans_encoding(Interp *int
     return new_encoding->to_encoding(interpreter, src, dest);
 }
 
+STRING *
+string_compose(Interp * interpreter, STRING *src)
+{
+    if (!src)
+        return NULL;
+    if (!src->strlen)
+        return string_make_empty(interpreter, enum_stringrep_one, 0);
+    return CHARSET_COMPOSE(interpreter, src);
+}
 /*
 
 =back

Modified: trunk/t/op/string_cs.t
==============================================================================
--- trunk/t/op/string_cs.t      (original)
+++ trunk/t/op/string_cs.t      Wed Nov 16 10:21:29 2005
@@ -16,7 +16,7 @@ Tests charset support.
 
 =cut
 
-use Parrot::Test tests => 50;
+use Parrot::Test tests => 51;
 use Parrot::Config;
 use Test::More;
 
@@ -507,7 +507,7 @@ abcdefg
 OUTPUT
 
 SKIP: {
-  skip('no ICU lib', 14) unless $PConfig{has_icu};
+  skip('no ICU lib', 16) unless $PConfig{has_icu};
 output_is( <<'CODE', <<"OUTPUT", "unicode downcase");
     set S0, iso-8859-1:"TÖTSCH"
     find_charset I0, "unicode"
@@ -772,6 +772,30 @@ CODE
 T\x{c3}\x{b6}tsch Leo
 OUTPUT
 
+output_is( <<'CODE', <<OUTPUT, "combose combined char" );
+    set S1, unicode:"___\u01f0___"
+    length I0, S1
+    upcase S1        # decompose J+hacek
+    length I1, S1    # 1 longer
+    downcase S1      # j+hacek
+    length I2, S1
+    compose S1, S1
+    length I3, S1    # back at original string
+    getstdout P0          # need to convert back to utf8
+    push P0, "utf8"       # push utf8 output layer
+    print S1
+    print "\n"
+    print_item I0
+    print_item I1
+    print_item I2
+    print_item I3
+    print_newline
+    end
+CODE
+___\x{c7}\x{b0}___
+7 8 8 7
+OUTPUT
+
 }  # SKIP
 
 output_is( <<'CODE', <<'OUTPUT', "escape ascii" );

Reply via email to