cvsuser 05/02/28 07:10:58
Modified: charset ascii.c ascii.h iso-8859-1.c iso-8859-1.h
config/inter charset.pl
src charset.c
t/op string_cs.t
Log:
Strings. Finally. 3 - iso-8859-1 upcase ...
* upcase, downcase, titlecase for iso-8859-1
* moved some common code to ascii.c
Revision Changes Path
1.9 +46 -8 parrot/charset/ascii.c
Index: ascii.c
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- ascii.c 28 Feb 2005 10:41:16 -0000 1.8
+++ ascii.c 28 Feb 2005 15:10:55 -0000 1.9
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: ascii.c,v 1.8 2005/02/28 10:41:16 leo Exp $
+$Id: ascii.c,v 1.9 2005/02/28 15:10:55 leo Exp $
=head1 NAME
@@ -8,7 +8,8 @@
=head1 DESCRIPTION
-This file implements the charset functions for ascii data
+This file implements the charset functions for ascii data and common
+charset functionality for similar charsets like iso-8859-1.
=cut
@@ -39,9 +40,46 @@
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 240-255 */
};
+INTVAL
+ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
+ unsigned char type, const unsigned char *table)
+{
+ INTVAL retval = -1;
+ INTVAL found = 0;
+
+ for (; start < string->strlen; start++) {
+ if (table[ENCODING_GET_CODEPOINT(interpreter, string, start)] ==
type) {
+ found = 1;
+ break;
+ }
+ }
+ if (found) {
+ retval = start;
+ }
+ return retval;
+}
-static STRING *
-get_graphemes(Interp *interpreter, STRING *source_string,
+INTVAL
+ascii_find_not_thing(Interp *interpreter, STRING *string, UINTVAL start,
+ unsigned char type, const unsigned char *table)
+{
+ INTVAL retval = -1;
+ INTVAL found = 0;
+
+ for (; start < string->strlen; start++) {
+ if (table[ENCODING_GET_CODEPOINT(interpreter, string, start)] !=
type) {
+ found = 1;
+ break;
+ }
+ }
+ if (found) {
+ retval = start;
+ }
+ return retval;
+}
+
+STRING *
+ascii_get_graphemes(Interp *interpreter, STRING *source_string,
UINTVAL offset, UINTVAL count)
{
return ENCODING_GET_BYTES(interpreter, source_string, offset, count);
@@ -56,8 +94,8 @@
}
-static STRING *
-get_graphemes_inplace(Interp *interpreter, STRING *source_string,
+STRING *
+ascii_get_graphemes_inplace(Interp *interpreter, STRING *source_string,
STRING *dest_string, UINTVAL offset, UINTVAL count)
{
return ENCODING_GET_BYTES_INPLACE(interpreter, source_string,
@@ -349,8 +387,8 @@
CHARSET *return_set = Parrot_new_charset(interpreter);
CHARSET base_set = {
"ascii",
- get_graphemes,
- get_graphemes_inplace,
+ ascii_get_graphemes,
+ ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
copy_to_charset,
1.7 +16 -3 parrot/charset/ascii.h
Index: ascii.h
===================================================================
RCS file: /cvs/public/parrot/charset/ascii.h,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- ascii.h 27 Feb 2005 11:03:38 -0000 1.6
+++ ascii.h 28 Feb 2005 15:10:55 -0000 1.7
@@ -1,7 +1,7 @@
/* ascii.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: ascii.h,v 1.6 2005/02/27 11:03:38 leo Exp $
+ * $Id: ascii.h,v 1.7 2005/02/28 15:10:55 leo Exp $
* Overview:
* This is the header for the ascii charset functions
* Data Structure and Algorithms:
@@ -13,8 +13,21 @@
#if !defined(PARROT_CHARSET_ASCII_H_GUARD)
#define PARROT_CHARSET_ASCII_H_GUARD
-static STRING *get_graphemes(Interp *, STRING *source_string, UINTVAL
offset, UINTVAL count);
-static STRING *get_graphemes_inplace(Interp *, STRING *source_string, STRING
*dest_string, UINTVAL offset, UINTVAL count);
+/*
+ * common functions for ascii-ish charsets
+ */
+
+INTVAL
+ascii_find_thing(Interp *interpreter, STRING *string, UINTVAL start,
+ unsigned char type, const unsigned char *table);
+INTVAL
+ascii_find_not_thing(Interp *interpreter, STRING *string, UINTVAL start,
+ unsigned char type, const unsigned char *table);
+STRING *ascii_get_graphemes(Interp *, STRING *source_string,
+ UINTVAL offset, UINTVAL count);
+STRING *ascii_get_graphemes_inplace(Interp *, STRING *source_string,
+ STRING *dest_string, UINTVAL offset, UINTVAL count);
+
static void set_graphemes(Interp *, STRING *source_string, UINTVAL offset,
UINTVAL replace_count, STRING *insert_string);
static void to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
static STRING *copy_to_charset(Interp *, STRING *source_string, CHARSET
*new_charset);
1.7 +71 -84 parrot/charset/iso-8859-1.c
Index: iso-8859-1.c
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- iso-8859-1.c 28 Feb 2005 10:41:16 -0000 1.6
+++ iso-8859-1.c 28 Feb 2005 15:10:55 -0000 1.7
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: iso-8859-1.c,v 1.6 2005/02/28 10:41:16 leo Exp $
+$Id: iso-8859-1.c,v 1.7 2005/02/28 15:10:55 leo Exp $
=head1 NAME
@@ -16,6 +16,7 @@
#include "parrot/parrot.h"
#include "iso-8859-1.h"
+#include "ascii.h"
/* The encoding we prefer, given a choice */
static ENCODING *preferred_encoding;
@@ -39,64 +40,12 @@
1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 160-175 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, /* 176-191 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 192-207 */
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 207-223 */
+ 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, /* 208-223 */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 224-239 */
- 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 240-255 */
+ 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, /* 240-255 */
};
-static INTVAL
-find_thing(Interp *interpreter, STRING *string, UINTVAL start, UINTVAL type)
-{
- INTVAL retval = -1;
- UINTVAL offset = start;
- INTVAL found = 0;
- for (; offset < string->strlen; offset++) {
- if (typetable[ENCODING_GET_CODEPOINT(interpreter, string, offset)]
- == type) {
- found = 1;
- break;
- }
- }
- if (found) {
- retval = offset;
- }
- return retval;
-}
-
-static INTVAL
-find_not_thing(Interp *interpreter, STRING *string, UINTVAL start,
- UINTVAL type)
-{
- INTVAL retval = -1;
- UINTVAL offset = start;
- INTVAL found = 0;
- for (; offset < string->strlen; offset++) {
- if (typetable[ENCODING_GET_CODEPOINT(interpreter, string, offset)]
- != type) {
- found = 1;
- break;
- }
- }
- if (found) {
- retval = offset;
- }
- return retval;
-}
-
-static STRING *
-get_graphemes(Interp *interpreter, STRING *source_string,
- UINTVAL offset, UINTVAL count)
-{
- return ENCODING_GET_BYTES(interpreter, source_string, offset, count);
-}
-static STRING *
-get_graphemes_inplace(Interp *interpreter, STRING *source_string,
- STRING *dest_string, UINTVAL offset, UINTVAL count)
-{
- return ENCODING_GET_BYTES_INPLACE(interpreter, source_string,
- offset, count, dest_string);
-}
static void
set_graphemes(Interp *interpreter, STRING *source_string,
@@ -168,7 +117,7 @@
static void
upcase(Interp *interpreter, STRING *source_string)
{
- char *buffer;
+ unsigned char *buffer;
UINTVAL offset = 0;
if (!source_string->strlen) {
@@ -178,7 +127,12 @@
Parrot_unmake_COW(interpreter, source_string);
buffer = source_string->strstart;
for (offset = 0; offset < source_string->strlen; offset++) {
- buffer[offset] = toupper(buffer[offset]);
+ unsigned int c = buffer[offset]; /* XXX use encoding ? */
+ if (c >= 0xe0 && c != 0xf7)
+ c &= ~0x20;
+ else
+ c = toupper(c);
+ buffer[offset] = c;
}
}
@@ -186,67 +140,94 @@
downcase(Interp *interpreter, STRING *source_string)
{
UINTVAL offset = 0;
- char *buffer;
+ unsigned char *buffer;
if (!source_string->strlen) {
return;
}
Parrot_unmake_COW(interpreter, source_string);
buffer = source_string->strstart;
for (offset = 0; offset < source_string->strlen; offset++) {
- buffer[offset] = tolower(buffer[offset]);
+ unsigned int c = buffer[offset];
+ if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
+ c |= 0x20;
+ else
+ c = tolower(c);
+ buffer[offset] = c;
}
}
static void
titlecase(Interp *interpreter, STRING *source_string)
{
- char *buffer;
- UINTVAL offset = 0;
+ unsigned char *buffer;
+ unsigned int c;
+ UINTVAL offset;
+
if (!source_string->strlen) {
return;
}
Parrot_unmake_COW(interpreter, source_string);
buffer = source_string->strstart;
- buffer[0] = toupper(buffer[0]);
+ c = buffer[0];
+ if (c >= 0xe0 && c != 0xf7)
+ c &= ~0x20;
+ else
+ c = toupper(c);
+ buffer[0] = c;
+
for (offset = 1; offset < source_string->strlen; offset++) {
- buffer[offset] = tolower(buffer[offset]);
+ c = buffer[offset];
+ if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
+ c |= 0x20;
+ else
+ c = tolower(c);
+ buffer[offset] = c;
}
}
static void
upcase_first(Interp *interpreter, STRING *source_string)
{
- char *buffer;
+ unsigned char *buffer;
+ unsigned int c;
+
if (!source_string->strlen) {
return;
}
Parrot_unmake_COW(interpreter, source_string);
buffer = source_string->strstart;
- buffer[0] = toupper(buffer[0]);
+ c = buffer[0];
+ if (c >= 0xe0 && c != 0xf7)
+ c &= ~0x20;
+ else
+ c = toupper(c);
+ buffer[0] = c;
}
+/*
+ * Lower-case the first byte of source_string in place.
+ * An empty string is left untouched.
+ */
static void
downcase_first(Interp *interpreter, STRING *source_string)
{
- char *buffer;
+ unsigned char *buffer;
+ unsigned int c;
+
if (!source_string->strlen) {
return;
}
Parrot_unmake_COW(interpreter, source_string);
buffer = source_string->strstart;
+ c = buffer[0];
+ /*
+ * 0xc0-0xde, except 0xd7 (multiplication sign), are the upper-case
+ * letters of iso-8859-1; setting bit 0x20 gives the lower-case form.
+ * This must match the test used in downcase() and titlecase().
+ */
+ if (c >= 0xc0 && c != 0xd7 && c <= 0xde)
+ c |= 0x20;
+ else
+ c = tolower(c);
+ buffer[0] = c;
- buffer[0] = toupper(buffer[0]);
}
static void
titlecase_first(Interp *interpreter, STRING *source_string)
{
- char *buffer;
- if (!source_string->strlen) {
- return;
- }
- Parrot_unmake_COW(interpreter, source_string);
- buffer = source_string->strstart;
- buffer[0] = toupper(buffer[0]);
+ upcase_first(interpreter, source_string);
}
static INTVAL
@@ -312,13 +293,14 @@
static INTVAL
find_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return find_thing(interpreter, source_string, offset, WORDCHAR);
+ return ascii_find_thing(interpreter, source_string, offset, WORDCHAR,
typetable);
}
static INTVAL
find_not_wordchar(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return find_not_thing(interpreter, source_string, offset, WORDCHAR);
+ return ascii_find_not_thing(interpreter, source_string, offset, WORDCHAR,
+ typetable);
}
static INTVAL
@@ -332,14 +314,16 @@
static INTVAL
find_whitespace(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return find_thing(interpreter, source_string, offset, WHITESPACE);
+ return ascii_find_thing(interpreter, source_string, offset, WHITESPACE,
+ typetable);
}
static INTVAL
find_not_whitespace(Interp *interpreter, STRING *source_string,
UINTVAL offset)
{
- return find_not_thing(interpreter, source_string, offset, WHITESPACE);
+ return ascii_find_not_thing(interpreter, source_string, offset,
+ WHITESPACE, typetable);
}
static INTVAL
@@ -353,13 +337,15 @@
static INTVAL
find_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return find_thing(interpreter, source_string, offset, DIGIT);
+ return ascii_find_thing(interpreter, source_string, offset, DIGIT,
+ typetable);
}
static INTVAL
find_not_digit(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return find_not_thing(interpreter, source_string, offset, DIGIT);
+ return ascii_find_not_thing(interpreter, source_string, offset, DIGIT,
+ typetable);
}
static INTVAL
@@ -373,15 +359,16 @@
static INTVAL
find_punctuation(Interp *interpreter, STRING *source_string, UINTVAL offset)
{
- return find_thing(interpreter, source_string, offset, PUNCTUATION);
+ return ascii_find_thing(interpreter, source_string, offset, PUNCTUATION,
+ typetable);
}
static INTVAL
find_not_punctuation(Interp *interpreter, STRING *source_string,
UINTVAL offset)
{
- return find_not_thing(interpreter, source_string, offset, PUNCTUATION);
-
+ return ascii_find_not_thing(interpreter, source_string, offset,
+ PUNCTUATION, typetable);
}
static INTVAL
@@ -441,8 +428,8 @@
CHARSET *return_set = Parrot_new_charset(interpreter);
CHARSET base_set = {
"iso-8859-1",
- get_graphemes,
- get_graphemes_inplace,
+ ascii_get_graphemes,
+ ascii_get_graphemes_inplace,
set_graphemes,
to_charset,
copy_to_charset,
1.5 +1 -2 parrot/charset/iso-8859-1.h
Index: iso-8859-1.h
===================================================================
RCS file: /cvs/public/parrot/charset/iso-8859-1.h,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- iso-8859-1.h 27 Feb 2005 11:03:38 -0000 1.4
+++ iso-8859-1.h 28 Feb 2005 15:10:55 -0000 1.5
@@ -1,7 +1,7 @@
/* iso_8859_1.h
* Copyright: 2004 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: iso-8859-1.h,v 1.4 2005/02/27 11:03:38 leo Exp $
+ * $Id: iso-8859-1.h,v 1.5 2005/02/28 15:10:55 leo Exp $
* Overview:
* This is the header for the iso_8859-1 charset functions
* Data Structure and Algorithms:
@@ -13,7 +13,6 @@
#if !defined(PARROT_CHARSET_ISO_8859_1_H_GUARD)
#define PARROT_CHARSET_ISO_8859_1_H_GUARD
-static STRING *get_graphemes(Interp *interpreter, STRING *source_string,
UINTVAL offset, UINTVAL count);
static void set_graphemes(Interp *interpreter, STRING *source_string,
UINTVAL offset, UINTVAL replace_count, STRING *insert_string);
static void to_charset(Interp *interpreter, STRING *source_string, CHARSET
*new_charset);
static STRING *copy_to_charset(Interp *interpreter, STRING *source_string,
CHARSET *new_charset);
1.2 +2 -2 parrot/config/inter/charset.pl
Index: charset.pl
===================================================================
RCS file: /cvs/public/parrot/config/inter/charset.pl,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- charset.pl 4 Nov 2004 18:37:22 -0000 1.1
+++ charset.pl 28 Feb 2005 15:10:56 -0000 1.2
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
-# $Id: charset.pl,v 1.1 2004/11/04 18:37:22 dan Exp $
+# $Id: charset.pl,v 1.2 2005/02/28 15:10:56 leo Exp $
=head1 NAME
@@ -54,7 +54,7 @@
foreach my $charset (split(/\s+/, $charset_list)) {
$charset =~ s/\.c$//;
$TEMP_charset_build .= <<END
-charset/$charset\$(O): charset/$charset.h charset/$charset.c
\$(NONGEN_HEADERS)
+charset/$charset\$(O): charset/$charset.h charset/ascii.h charset/$charset.c
\$(NONGEN_HEADERS)
END
1.7 +11 -3 parrot/src/charset.c
Index: charset.c
===================================================================
RCS file: /cvs/public/parrot/src/charset.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- charset.c 28 Feb 2005 13:35:46 -0000 1.6
+++ charset.c 28 Feb 2005 15:10:57 -0000 1.7
@@ -1,6 +1,6 @@
/*
Copyright: 2004 The Perl Foundation. All Rights Reserved.
-$Id: charset.c,v 1.6 2005/02/28 13:35:46 leo Exp $
+$Id: charset.c,v 1.7 2005/02/28 15:10:57 leo Exp $
=head1 NAME
@@ -47,6 +47,15 @@
void
Parrot_deinit_charsets(Interp *interpreter)
{
+ int i, n;
+
+ n = all_charsets->n_charsets;
+ for (i = 0; i < n; ++i) {
+ mem_sys_free(all_charsets->set[i].charset);
+ }
+ mem_sys_free(all_charsets->set);
+ mem_sys_free(all_charsets);
+ all_charsets = NULL;
}
CHARSET *
@@ -135,8 +144,7 @@
sizeof(One_charset));
all_charsets->n_charsets++;
all_charsets->set[n].charset = charset;
- all_charsets->set[n].name = string_from_cstring(interpreter,
- charsetname, 0);
+ all_charsets->set[n].name = const_string(interpreter, charsetname);
return 1;
}
1.3 +30 -2 parrot/t/op/string_cs.t
Index: string_cs.t
===================================================================
RCS file: /cvs/public/parrot/t/op/string_cs.t,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- string_cs.t 28 Feb 2005 13:35:47 -0000 1.2
+++ string_cs.t 28 Feb 2005 15:10:58 -0000 1.3
@@ -1,6 +1,6 @@
#! perl -w
# Copyright: 2001-2004 The Perl Foundation. All Rights Reserved.
-# $Id: string_cs.t,v 1.2 2005/02/28 13:35:47 leo Exp $
+# $Id: string_cs.t,v 1.3 2005/02/28 15:10:58 leo Exp $
=head1 NAME
@@ -16,7 +16,7 @@
=cut
-use Parrot::Test tests => 4;
+use Parrot::Test tests => 7;
use Test::More;
output_is( <<'CODE', <<OUTPUT, "basic syntax" );
@@ -64,3 +64,31 @@
CODE
/charset 'no_such' not found/
OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "downcase" );
+ set S0, "AEIOU_ÄÖÜ\n"
+ downcase S1, S0
+ print S1
+ end
+CODE
+aeiou_äöü
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "upcase" );
+ set S0, "aeiou_äöüß\n"
+ upcase S1, S0
+ print S1
+ end
+CODE
+AEIOU_ÄÖÜß
+OUTPUT
+
+output_is( <<'CODE', <<OUTPUT, "titlecase" );
+ set S0, "zAEIOU_ÄÖÜ\n"
+ titlecase S1, S0
+ print S1
+ end
+CODE
+Zaeiou_äöü
+OUTPUT
+