cvsuser 03/11/14 12:27:02
Modified: encodings dbcs.c singlebyte.c utf16.c utf32.c utf8.c
include/parrot encoding.h string.h string_funcs.h
src chartype.c string.c
Log:
Implement string iterator and decode_and_advance function
Revision Changes Path
1.2 +20 -3 parrot/encodings/dbcs.c
Index: dbcs.c
===================================================================
RCS file: /cvs/public/parrot/encodings/dbcs.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -w -r1.1 -r1.2
--- dbcs.c 3 Nov 2003 15:04:58 -0000 1.1
+++ dbcs.c 14 Nov 2003 20:26:38 -0000 1.2
@@ -1,7 +1,7 @@
/* dbcs.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: dbcs.c,v 1.1 2003/11/03 15:04:58 petergibbs Exp $
+ * $Id: dbcs.c,v 1.2 2003/11/14 20:26:38 petergibbs Exp $
* Overview:
* This defines the DBCS encoding routines.
* Data Structure and Algorithms:
@@ -90,15 +90,32 @@
return ptr;
}
+/*
+ * Decode the DBCS character at the iterator's current byte offset and
+ * step the iterator past it.
+ *
+ * A first byte below 128 is a complete one-byte character; any other
+ * first byte is combined with the byte after it into a 16-bit value
+ * (first byte in the high eight bits).
+ *
+ * Returns the decoded value.  bytepos grows by 1 or 2, charpos by 1.
+ */
+static UINTVAL
+dbcs_decode_and_advance(struct string_iterator_t *i)
+{
+    const byte_t *lead = (byte_t *)i->str->strstart + i->bytepos;
+    UINTVAL width = 1;
+    UINTVAL c = lead[0];
+
+    if (!(lead[0] < 128)) {
+        /* two-byte character: the trail byte supplies the low eight bits */
+        c = (lead[0] << 8) | lead[1];
+        width = 2;
+    }
+
+    i->bytepos += width;
+    i->charpos++;
+    return c;
+}
+
const ENCODING dbcs_encoding = {
enum_encoding_dbcs,
"dbcs",
- 1,
+ 2,
dbcs_characters,
dbcs_decode,
dbcs_encode,
dbcs_skip_forward,
- dbcs_skip_backward
+ dbcs_skip_backward,
+ dbcs_decode_and_advance
};
/*
1.17 +12 -2 parrot/encodings/singlebyte.c
Index: singlebyte.c
===================================================================
RCS file: /cvs/public/parrot/encodings/singlebyte.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -w -r1.16 -r1.17
--- singlebyte.c 21 Jul 2003 18:00:37 -0000 1.16
+++ singlebyte.c 14 Nov 2003 20:26:40 -0000 1.17
@@ -1,7 +1,7 @@
/* singlebyte.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: singlebyte.c,v 1.16 2003/07/21 18:00:37 chromatic Exp $
+ * $Id: singlebyte.c,v 1.17 2003/11/14 20:26:40 petergibbs Exp $
* Overview:
* This defines the single byte encoding routines.
* Data Structure and Algorithms:
@@ -59,6 +59,15 @@
return bptr - n;
}
+/*
+ * Return the byte at the iterator's current offset as a character and
+ * move the iterator on by exactly one byte and one character.
+ */
+static UINTVAL
+singlebyte_decode_and_advance(struct string_iterator_t *i)
+{
+    const byte_t *base = (byte_t *)i->str->strstart;
+    const UINTVAL c = base[i->bytepos];
+
+    i->charpos++;
+    i->bytepos++;
+    return c;
+}
+
const ENCODING singlebyte_encoding = {
enum_encoding_singlebyte,
"singlebyte",
@@ -67,7 +76,8 @@
singlebyte_decode,
singlebyte_encode,
singlebyte_skip_forward,
- singlebyte_skip_backward
+ singlebyte_skip_backward,
+ singlebyte_decode_and_advance
};
/*
1.14 +31 -2 parrot/encodings/utf16.c
Index: utf16.c
===================================================================
RCS file: /cvs/public/parrot/encodings/utf16.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -w -r1.13 -r1.14
--- utf16.c 21 Jul 2003 18:00:37 -0000 1.13
+++ utf16.c 14 Nov 2003 20:26:40 -0000 1.14
@@ -1,7 +1,7 @@
/* utf16.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: utf16.c,v 1.13 2003/07/21 18:00:37 chromatic Exp $
+ * $Id: utf16.c,v 1.14 2003/11/14 20:26:40 petergibbs Exp $
* Overview:
* This defines the UTF-16 encoding routines.
* Data Structure and Algorithms:
@@ -131,6 +131,34 @@
return u16ptr;
}
+/*
+ * Decode the UTF-16 character at the iterator's current byte offset and
+ * step the iterator past it.
+ *
+ * i - iterator; the bytes at i->str->strstart + i->bytepos are read as
+ *     UTF-16 code units.
+ *
+ * Returns the decoded code point.  bytepos grows by 2 for a single code
+ * unit or by 4 for a surrogate pair; charpos grows by 1.
+ *
+ * Raises MALFORMED_UTF16 through internal_exception() when a high
+ * surrogate is not followed by a low surrogate, or when the character
+ * starts with a low surrogate.
+ * NOTE(review): bytepos is not advanced on the lone-low-surrogate path;
+ * this presumes internal_exception() does not return -- confirm.
+ */
+static UINTVAL
+utf16_decode_and_advance(struct string_iterator_t *i)
+{
+    /* bytepos is a byte offset, so do the arithmetic on a char pointer and
+     * convert explicitly to code units (as the utf32 decoder does) */
+    const utf16_t *u16ptr =
+        (const utf16_t *)((const char *)i->str->strstart + i->bytepos);
+    UINTVAL c = *u16ptr++;
+
+    if (UNICODE_IS_HIGH_SURROGATE(c)) {
+        utf16_t low = *u16ptr++;
+
+        if (!UNICODE_IS_LOW_SURROGATE(low)) {
+            internal_exception(MALFORMED_UTF16,
+                               "Malformed UTF-16 surrogate\n");
+        }
+
+        c = UNICODE_DECODE_SURROGATE(c, low);
+        i->bytepos += 4;
+    }
+    else if (UNICODE_IS_LOW_SURROGATE(c)) {
+        internal_exception(MALFORMED_UTF16, "Malformed UTF-16 surrogate\n");
+    }
+    else {
+        i->bytepos += 2;
+    }
+
+    i->charpos++;
+    return c;
+}
+
const ENCODING utf16_encoding = {
enum_encoding_utf16,
"utf16",
@@ -139,7 +167,8 @@
utf16_decode,
utf16_encode,
utf16_skip_forward,
- utf16_skip_backward
+ utf16_skip_backward,
+ utf16_decode_and_advance
};
/*
1.12 +12 -2 parrot/encodings/utf32.c
Index: utf32.c
===================================================================
RCS file: /cvs/public/parrot/encodings/utf32.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -w -r1.11 -r1.12
--- utf32.c 21 Jul 2003 18:00:37 -0000 1.11
+++ utf32.c 14 Nov 2003 20:26:40 -0000 1.12
@@ -1,7 +1,7 @@
/* utf32.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: utf32.c,v 1.11 2003/07/21 18:00:37 chromatic Exp $
+ * $Id: utf32.c,v 1.12 2003/11/14 20:26:40 petergibbs Exp $
* Overview:
* This defines the UTF-32 encoding routines.
* Data Structure and Algorithms:
@@ -64,6 +64,15 @@
return u32ptr - n;
}
+/*
+ * Read the UTF-32 code unit at the iterator's current byte offset and
+ * move the iterator on by four bytes and one character.
+ *
+ * Returns the code unit unchanged.
+ */
+static UINTVAL
+utf32_decode_and_advance(struct string_iterator_t *i)
+{
+    const char *unit = (const char *)i->str->strstart + i->bytepos;
+    const UINTVAL c = *(const utf32_t *)unit;
+
+    i->charpos++;
+    i->bytepos += 4;
+    return c;
+}
+
const ENCODING utf32_encoding = {
enum_encoding_utf32,
"utf32",
@@ -72,7 +81,8 @@
utf32_decode,
utf32_encode,
utf32_skip_forward,
- utf32_skip_backward
+ utf32_skip_backward,
+ utf32_decode_and_advance
};
/*
1.15 +39 -2 parrot/encodings/utf8.c
Index: utf8.c
===================================================================
RCS file: /cvs/public/parrot/encodings/utf8.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -w -r1.14 -r1.15
--- utf8.c 21 Jul 2003 18:00:37 -0000 1.14
+++ utf8.c 14 Nov 2003 20:26:40 -0000 1.15
@@ -1,7 +1,7 @@
/* utf8.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: utf8.c,v 1.14 2003/07/21 18:00:37 chromatic Exp $
+ * $Id: utf8.c,v 1.15 2003/11/14 20:26:40 petergibbs Exp $
* Overview:
* This defines the UTF-8 encoding routines.
* Data Structure and Algorithms:
@@ -133,6 +133,42 @@
return u8ptr;
}
+/*
+ * Decode the UTF-8 character at the iterator's current byte offset and
+ * step the iterator past it.
+ *
+ * i - iterator; the bytes at i->str->strstart + i->bytepos are read as
+ *     UTF-8.
+ *
+ * Returns the decoded code point.  bytepos grows by the sequence length
+ * given by UTF8SKIP() for a start byte, or by 1 for an invariant byte;
+ * charpos grows by 1.
+ *
+ * Raises MALFORMED_UTF8 through internal_exception() when a byte after the
+ * start byte is not a continuation byte, when the decoded value is a
+ * surrogate, or when the first byte is neither a start byte nor invariant.
+ * NOTE(review): bytepos is advanced before the continuation bytes are
+ * validated, and not at all on the bad-first-byte path; this presumes
+ * internal_exception() does not return -- confirm.
+ */
+static UINTVAL
+utf8_decode_and_advance(struct string_iterator_t *i)
+{
+    /* bytepos is a byte offset; convert explicitly to the utf8_t view */
+    const utf8_t *u8ptr =
+        (const utf8_t *)((const char *)i->str->strstart + i->bytepos);
+    UINTVAL c = *u8ptr;
+
+    if (UTF8_IS_START(c)) {
+        UINTVAL len = UTF8SKIP(u8ptr);
+
+        c &= UTF8_START_MASK(len);
+        i->bytepos += len;
+        /* fold in the len - 1 bytes that follow the start byte */
+        for (len--; len; len--) {
+            u8ptr++;
+            if (!UTF8_IS_CONTINUATION(*u8ptr)) {
+                internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
+            }
+            c = UTF8_ACCUMULATE(c, *u8ptr);
+        }
+
+        if (UNICODE_IS_SURROGATE(c)) {
+            internal_exception(MALFORMED_UTF8, "Surrogate in UTF-8 string\n");
+        }
+    }
+    else if (!UNICODE_IS_INVARIANT(c)) {
+        internal_exception(MALFORMED_UTF8, "Malformed UTF-8 string\n");
+    }
+    else {
+        i->bytepos++;
+    }
+
+    i->charpos++;
+    return c;
+}
+
const ENCODING utf8_encoding = {
enum_encoding_utf8,
"utf8",
@@ -141,7 +177,8 @@
utf8_decode,
utf8_encode,
utf8_skip_forward,
- utf8_skip_backward
+ utf8_skip_backward,
+ utf8_decode_and_advance
};
/*
1.23 +6 -3 parrot/include/parrot/encoding.h
Index: encoding.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/encoding.h,v
retrieving revision 1.22
retrieving revision 1.23
diff -u -w -r1.22 -r1.23
--- encoding.h 3 Nov 2003 15:05:01 -0000 1.22
+++ encoding.h 14 Nov 2003 20:26:55 -0000 1.23
@@ -1,7 +1,7 @@
/* encoding.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: encoding.h,v 1.22 2003/11/03 15:05:01 petergibbs Exp $
+ * $Id: encoding.h,v 1.23 2003/11/14 20:26:55 petergibbs Exp $
* Overview:
* This is the api header for the string encoding subsystem
* Data Structure and Algorithms:
@@ -25,6 +25,8 @@
/* &end_gen */
+struct string_iterator_t;
+
struct parrot_encoding_t {
INTVAL index;
const char *name;
@@ -34,6 +36,7 @@
void *(*encode) (void *ptr, Parrot_UInt c);
const void *(*skip_forward) (const void *ptr, Parrot_UInt n);
const void *(*skip_backward) (const void *ptr, Parrot_UInt n);
+ Parrot_UInt(*decode_and_advance) (struct string_iterator_t *i);
};
typedef struct parrot_encoding_t* Parrot_Encoding;
1.55 +8 -1 parrot/include/parrot/string.h
Index: string.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/string.h,v
retrieving revision 1.54
retrieving revision 1.55
diff -u -w -r1.54 -r1.55
--- string.h 21 Jul 2003 18:00:42 -0000 1.54
+++ string.h 14 Nov 2003 20:26:57 -0000 1.55
@@ -1,7 +1,7 @@
/* string.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string.h,v 1.54 2003/07/21 18:00:42 chromatic Exp $
+ * $Id: string.h,v 1.55 2003/11/14 20:26:57 petergibbs Exp $
* Overview:
* This is the api header for the string subsystem
* Data Structure and Algorithms:
@@ -35,6 +35,13 @@
TAIL_moved_FLAG = 1 << 0
} TAIL_flags;
+/* String iterator: cursor state for reading a string one character at a time */
+typedef struct string_iterator_t {
+ String *str; /* string being read; set by string_iterator_init() */
+ UINTVAL bytepos; /* byte offset from str->strstart of the next character */
+ UINTVAL charpos; /* number of characters decoded so far */
+ UINTVAL (*decode_and_advance)(struct string_iterator_t *i); /* decoder copied from the string's encoding; returns the next character and advances both positions */
+} string_iterator;
/* stringinfo parameters */
1.30 +2 -1 parrot/include/parrot/string_funcs.h
Index: string_funcs.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/string_funcs.h,v
retrieving revision 1.29
retrieving revision 1.30
diff -u -w -r1.29 -r1.30
--- string_funcs.h 14 Nov 2003 08:35:47 -0000 1.29
+++ string_funcs.h 14 Nov 2003 20:26:58 -0000 1.30
@@ -1,7 +1,7 @@
/* string_funcs.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string_funcs.h,v 1.29 2003/11/14 08:35:47 leo Exp $
+ * $Id: string_funcs.h,v 1.30 2003/11/14 20:26:58 petergibbs Exp $
* Overview:
* This is the api header for the string subsystem
* Data Structure and Algorithms:
@@ -69,6 +69,7 @@
STRING *s2, STRING **dest);
STRING *string_bitwise_xor(struct Parrot_Interp *interpreter, STRING *s1,
STRING *s2, STRING **dest);
+void string_iterator_init(struct string_iterator_t *i, STRING *s);
#endif
1.21 +2 -2 parrot/src/chartype.c
Index: chartype.c
===================================================================
RCS file: /cvs/public/parrot/src/chartype.c,v
retrieving revision 1.20
retrieving revision 1.21
diff -u -w -r1.20 -r1.21
--- chartype.c 6 Nov 2003 20:56:06 -0000 1.20
+++ chartype.c 14 Nov 2003 20:27:02 -0000 1.21
@@ -1,7 +1,7 @@
/* chartype.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: chartype.c,v 1.20 2003/11/06 20:56:06 petergibbs Exp $
+ * $Id: chartype.c,v 1.21 2003/11/14 20:27:02 petergibbs Exp $
* Overview:
* This defines the string character type subsystem
* Data Structure and Algorithms:
@@ -115,7 +115,7 @@
}
}
internal_exception(INVALID_CHARACTER,
- "Invalid character for chartype\n");
+ "Invalid character <%X> for chartype\n",c);
return 0;
}
}
1.158 +18 -13 parrot/src/string.c
Index: string.c
===================================================================
RCS file: /cvs/public/parrot/src/string.c,v
retrieving revision 1.157
retrieving revision 1.158
diff -u -w -r1.157 -r1.158
--- string.c 14 Nov 2003 08:35:50 -0000 1.157
+++ string.c 14 Nov 2003 20:27:02 -0000 1.158
@@ -1,7 +1,7 @@
/* string.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: string.c,v 1.157 2003/11/14 08:35:50 leo Exp $
+ * $Id: string.c,v 1.158 2003/11/14 20:27:02 petergibbs Exp $
* Overview:
* This is the api definitions for the string subsystem
* Data Structure and Algorithms:
@@ -1013,9 +1013,9 @@
INTVAL
hash_string_equal(struct Parrot_Interp *interpreter, STRING *s1, STRING *s2)
{
- const char *s1start, *s1end;
- const char *s2start;
- size_t len;
+ struct string_iterator_t i1;
+ struct string_iterator_t i2;
+
/*
* both strings aren't null
*/
@@ -1026,23 +1026,19 @@
/*
* both strings have equal amount of chars
*/
- s1start = s1->strstart;
- s2start = s2->strstart;
- len = (size_t) s1->bufused;
/* speed up ascii, slow down general case
*/
if (s1->encoding->index == enum_encoding_singlebyte &&
s2->encoding->index == enum_encoding_singlebyte) {
- return memcmp(s1start, s2start, s1->bufused);
+ return memcmp(s1->strstart, s2->strstart, s1->bufused);
}
- s1end = s1start + len;
- while (s1start < s1end) {
- if (s1->encoding->decode(s1start) != s2->encoding->decode(s2start))
+ string_iterator_init(&i1, s1);
+ string_iterator_init(&i2, s2);
+ while (i1.charpos < s1->strlen) {
+ if (i1.decode_and_advance(&i1) != i2.decode_and_advance(&i2))
return 1;
- s1start = s1->encoding->skip_forward(s1start, 1);
- s2start = s2->encoding->skip_forward(s2start, 1);
}
return 0;
}
@@ -1651,6 +1647,15 @@
PObj_sysmem_CLEAR(s);
/* Free up the memory */
mem_sys_free(memory);
+}
+
+/*
+ * Prepare iterator i to read string s from its beginning.
+ *
+ * The iterator remembers s, takes the decode_and_advance routine from
+ * s->encoding, and starts with both its byte and character positions
+ * at zero.
+ */
+void
+string_iterator_init(struct string_iterator_t *i, STRING *s)
+{
+    i->decode_and_advance = s->encoding->decode_and_advance;
+    i->str = s;
+    i->charpos = i->bytepos = 0;
}
/*