cvsuser 03/11/03 07:05:09
Modified: . MANIFEST
config/gen/makefiles root.in
include/parrot encoding.h
src chartype.c encoding.c
Added: encodings dbcs.c
Log:
Implement DBCS encoding (minus skip_backward)
Extend dynamic chartype loading to support DBCS
Revision Changes Path
1.496 +1 -0 parrot/MANIFEST
Index: MANIFEST
===================================================================
RCS file: /cvs/public/parrot/MANIFEST,v
retrieving revision 1.495
retrieving revision 1.496
diff -u -w -r1.495 -r1.496
--- MANIFEST 1 Nov 2003 17:28:16 -0000 1.495
+++ MANIFEST 3 Nov 2003 15:04:52 -0000 1.496
@@ -238,6 +238,7 @@
editor/ops2vim.pl [devel]
editor/pasm.el [devel]
editor/pasm.vim [devel]
+encodings/dbcs.c []
encodings/singlebyte.c []
encodings/utf16.c []
encodings/utf32.c []
1.166 +3 -1 parrot/config/gen/makefiles/root.in
Index: root.in
===================================================================
RCS file: /cvs/public/parrot/config/gen/makefiles/root.in,v
retrieving revision 1.165
retrieving revision 1.166
diff -u -w -r1.165 -r1.166
--- root.in 29 Oct 2003 15:05:11 -0000 1.165
+++ root.in 3 Nov 2003 15:04:55 -0000 1.166
@@ -160,7 +160,7 @@
CLASS_O_FILES = ${pmc_classes_o}
ENCODING_O_FILES = encodings/singlebyte$(O) encodings/utf8$(O) \
- encodings/utf16$(O) encodings/utf32$(O)
+ encodings/utf16$(O) encodings/utf32$(O) encodings/dbcs$(O)
CHARTYPE_O_FILES = chartypes/unicode$(O) chartypes/usascii$(O)
@@ -502,6 +502,8 @@
encoding/utf16$(O) : $(GENERAL_H_FILES)
encoding/utf32$(O) : $(GENERAL_H_FILES)
+
+encoding/dbcs$(O) : $(GENERAL_H_FILES)
$(SRC)/events$(O) : $(GENERAL_H_FILES)
1.1 parrot/encodings/dbcs.c
Index: dbcs.c
===================================================================
/* dbcs.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
* $Id: dbcs.c,v 1.1 2003/11/03 15:04:58 petergibbs Exp $
* Overview:
* This defines the DBCS encoding routines.
* Data Structure and Algorithms:
* History:
* Notes:
* All byte values above 127 are assumed to be lead bytes
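* skip_backward cannot be implemented for DBCS: trail bytes are not
* restricted to a distinct range, so when scanning backwards a byte
* cannot be classified as a lead byte or a trail byte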
* References:
*/
#include "parrot/parrot.h"
/* Raw byte type used by all DBCS routines in this file. */
typedef unsigned char byte_t;

/*
 * dbcs_characters
 *
 * Count the characters held in the first `bytes` bytes at `ptr`.
 * A byte above 127 is treated as a lead byte and consumes two bytes;
 * any other byte is a complete character on its own.
 *
 * A lead byte that is the very last byte of the buffer (a truncated
 * character) is still counted as one character, but the scan stops
 * exactly at the end of the buffer instead of stepping past it.
 */
static UINTVAL
dbcs_characters(const void *ptr, UINTVAL bytes)
{
    const byte_t *bptr = ptr;
    UINTVAL remaining = bytes;
    UINTVAL characters = 0;

    while (remaining > 0) {
        UINTVAL width = (*bptr > 127) ? 2 : 1;

        /* Never advance beyond the buffer: forming a pointer more than
         * one past the end of the object is undefined behaviour. */
        if (width > remaining)
            width = remaining;

        bptr += width;
        remaining -= width;
        characters++;
    }

    return characters;
}
/*
 * dbcs_decode
 *
 * Decode the character starting at `ptr`.
 * A first byte below 128 is returned unchanged. Otherwise the first
 * byte is a lead byte and the result is (lead << 8) | trail, where
 * trail is the byte that immediately follows it.
 *
 * No length is passed in, so the caller must guarantee that a lead
 * byte is followed by a readable trail byte.
 */
static UINTVAL
dbcs_decode(const void *ptr)
{
    const byte_t *src = (const byte_t *)ptr;
    byte_t lead = src[0];
    byte_t trail;

    /* Single-byte character: the byte value is the code itself. */
    if (lead <= 127)
        return lead;

    trail = src[1];
    return (lead << 8) | trail;
}
/*
 * dbcs_encode
 *
 * Write character `c` at `ptr` and return the position just after the
 * bytes written.
 *   - c < 128: one byte holding c.
 *   - otherwise the high part (c >> 8) must lie in 128..255; it is
 *     written as the lead byte, followed by the low eight bits of c.
 *   - any other value raises INVALID_CHARACTER via internal_exception;
 *     nothing is written and, should that call return, the unchanged
 *     position is handed back.
 */
static void *
dbcs_encode(void *ptr, UINTVAL c)
{
    byte_t *out = (byte_t *)ptr;
    UINTVAL lead;

    if (c < 128) {
        *out++ = (byte_t)c;
        return out;
    }

    lead = c >> 8;
    if (lead < 128 || lead > 255) {
        internal_exception(INVALID_CHARACTER,
                           "Invalid character for DBCS encoding\n");
        return out;
    }

    *out++ = lead;
    *out++ = c & 0xFF;
    return out;
}
/*
 * dbcs_skip_forward
 *
 * Advance over `n` characters starting at `ptr` and return the new
 * position. Each byte above 127 found at a character boundary is taken
 * as a lead byte and skips two bytes; every other byte skips one.
 *
 * No buffer length is available here, so the caller must ensure that
 * at least `n` complete characters are present.
 */
static const void *
dbcs_skip_forward(const void *ptr, UINTVAL n)
{
    const byte_t *cursor = (const byte_t *)ptr;
    UINTVAL skipped;

    for (skipped = 0; skipped < n; skipped++)
        cursor += (*cursor > 127) ? 2 : 1;

    return cursor;
}
/*
 * dbcs_skip_backward
 *
 * Backward skipping is not supported for this encoding: the request is
 * always rejected by raising INVALID_OPERATION through
 * internal_exception. `n` is ignored. Should the exception call
 * return, the position is handed back unmoved.
 */
static const void *
dbcs_skip_backward(const void *ptr, UINTVAL n)
{
    (void)n;    /* deliberately unused */

    internal_exception(INVALID_OPERATION,
                       "Undefined operation for DBCS encoding\n");
    return ptr;
}
/*
 * Encoding descriptor for DBCS, exported for use outside this file.
 * The initializer is positional, so the order of the entries below
 * must match the field order of ENCODING (declared elsewhere).
 */
const ENCODING dbcs_encoding = {
/* identifier of this encoding in the encoding enum */
enum_encoding_dbcs,
/* name of this encoding */
"dbcs",
/* NOTE(review): the meaning of this field is not visible from here.
 * If it is the maximum number of bytes per character, it should
 * presumably be 2 for DBCS - confirm against the ENCODING definition. */
1,
/* routines implementing the encoding, all defined in this file */
dbcs_characters,
dbcs_decode,
dbcs_encode,
dbcs_skip_forward,
/* always raises INVALID_OPERATION */
dbcs_skip_backward
};
/*
* Local variables:
* c-indentation-style: bsd
* c-basic-offset: 4
* indent-tabs-mode: nil
* End:
*
* vim: expandtab shiftwidth=4:
*/
1.22 +2 -1 parrot/include/parrot/encoding.h
Index: encoding.h
===================================================================
RCS file: /cvs/public/parrot/include/parrot/encoding.h,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -w -r1.21 -r1.22
--- encoding.h 28 Oct 2003 07:32:38 -0000 1.21
+++ encoding.h 3 Nov 2003 15:05:01 -0000 1.22
@@ -1,7 +1,7 @@
/* encoding.h
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: encoding.h,v 1.21 2003/10/28 07:32:38 leo Exp $
+ * $Id: encoding.h,v 1.22 2003/11/03 15:05:01 petergibbs Exp $
* Overview:
* This is the api header for the string encoding subsystem
* Data Structure and Algorithms:
@@ -19,6 +19,7 @@
enum_encoding_utf8,
enum_encoding_utf16,
enum_encoding_utf32,
+ enum_encoding_dbcs,
enum_encoding_MAX
};
1.18 +48 -17 parrot/src/chartype.c
Index: chartype.c
===================================================================
RCS file: /cvs/public/parrot/src/chartype.c,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -w -r1.17 -r1.18
--- chartype.c 3 Nov 2003 12:54:48 -0000 1.17
+++ chartype.c 3 Nov 2003 15:05:09 -0000 1.18
@@ -1,7 +1,7 @@
/* chartype.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: chartype.c,v 1.17 2003/11/03 12:54:48 petergibbs Exp $
+ * $Id: chartype.c,v 1.18 2003/11/03 15:05:09 petergibbs Exp $
* Overview:
* This defines the string character type subsystem
* Data Structure and Algorithms:
@@ -18,9 +18,9 @@
struct chartype_unicode_map_t {
UINTVAL n1;
INTVAL *cparray;
+ INTVAL *cparray2;
};
-
extern CHARTYPE usascii_chartype;
extern CHARTYPE unicode_chartype;
@@ -35,7 +35,8 @@
struct chartype_digit_map_t default_digit_map = { 0x30, 0x39, 0 };
/*
- * Register a chartype entry and TODO its transcoders
+ * Register a chartype entry
+ * XXX Register transcode functions
*/
static void
chartype_register(CHARTYPE *type)
@@ -77,14 +78,17 @@
}
static UINTVAL
-chartype_to_unicode_cparray(const CHARTYPE *from, const CHARTYPE *to, UINTVAL c)
+chartype_to_unicode_cparray(const CHARTYPE *from, const CHARTYPE *to,
+ UINTVAL c)
{
const struct chartype_unicode_map_t *map = from->unicode_map;
+
if (c < map->n1)
return c;
- else {
+ else if (c < 256)
return map->cparray[c - map->n1];
- }
+ else
+ return map->cparray2[c - 128*256];
}
static UINTVAL
@@ -92,15 +96,24 @@
UINTVAL c)
{
const struct chartype_unicode_map_t *map = to->unicode_map;
+
if (c < map->n1) {
return c;
}
else {
UINTVAL i;
+ if (map->cparray) {
for (i = 0; i < 256 - map->n1; i++) {
if (map->cparray[i] == (INTVAL)c)
return i + map->n1;
}
+ }
+ if (map->cparray2) {
+ for (i = 0; i < 128*256; i++) {
+ if (map->cparray2[i] == (INTVAL)c)
+ return i + 128*256;
+ }
+ }
internal_exception(INVALID_CHARACTER,
"Invalid character for chartype\n");
return 0;
@@ -110,9 +123,9 @@
/*
* Create chartype from mapping file
* Still TODO:
- * Handle encodings other than singlebyte
+ * Handle more encodings (singlebyte & dbcs implemented so far)
* Create proper digit mapping table (currently always ascii)
- * Create other variants of unicode mapping table
+ * -> this is REQUIRED for DBCS!
* Path is hardcoded to "runtime/parrot/chartypes/<name>.TXT"
* Does direct file system IO - should probably use Parrot IO
* Better parsing code - e.g. handle erroneous input!
@@ -127,6 +140,7 @@
INTVAL typecode;
INTVAL unicode;
INTVAL *cparray = NULL;
+ INTVAL *cparray2 = NULL;
struct chartype_unicode_map_t *map;
int one2one = 0;
@@ -142,11 +156,13 @@
char *p = fgets(line, 80, f);
if (line[0] != '#') {
int n = sscanf(line, "%li\t%li", &typecode, &unicode);
- if (n == 2 && typecode >= 0 && typecode < 256) {
- if (typecode == one2one && unicode == typecode) {
+ if (n == 2 && typecode >= 0) {
+ if (typecode < 256 && typecode == one2one &&
+ unicode == typecode)
+ {
one2one++;
}
- else {
+ else if (typecode < 256) {
if (!cparray) {
int size = (256 - one2one) * sizeof(INTVAL);
cparray = mem_sys_allocate(size);
@@ -154,6 +170,15 @@
}
cparray[typecode-one2one] = unicode;
}
+ /* XXX Should abort loading if invalid value found */
+ else if (typecode >= 128*256) {
+ if (!cparray2) {
+ int size = 128 * 256 * sizeof(INTVAL);
+ cparray2 = mem_sys_allocate(size);
+ memset(cparray2, 0xFF, size);
+ }
+ cparray2[typecode - (128*256)] = unicode;
+ }
}
}
}
@@ -162,16 +187,22 @@
type = mem_sys_allocate(sizeof(CHARTYPE));
type->index = -1; /* will be allocated during registration */
type->name = malloc_and_strcpy(name);
+ if (cparray2) {
+ type->default_encoding = malloc_and_strcpy("dbcs");
+ }
+ else {
type->default_encoding = malloc_and_strcpy("singlebyte");
+ }
+ type->from_unicode = chartype_from_unicode_cparray;
+ type->to_unicode = chartype_to_unicode_cparray;
type->is_digit = chartype_is_digit_map1;
type->get_digit = chartype_get_digit_map1;
type->digit_map = &default_digit_map;
map = mem_sys_allocate(sizeof(struct chartype_unicode_map_t));
map->n1 = one2one;
map->cparray = cparray;
+ map->cparray2 = cparray2;
type->unicode_map = map;
- type->from_unicode = chartype_from_unicode_cparray;
- type->to_unicode = chartype_to_unicode_cparray;
type->transcoders = NULL;
chartype_register(type);
return type;
1.17 +3 -1 parrot/src/encoding.c
Index: encoding.c
===================================================================
RCS file: /cvs/public/parrot/src/encoding.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -w -r1.16 -r1.17
--- encoding.c 23 Oct 2003 17:48:59 -0000 1.16
+++ encoding.c 3 Nov 2003 15:05:09 -0000 1.17
@@ -1,7 +1,7 @@
/* encoding.c
* Copyright: 2001-2003 The Perl Foundation. All Rights Reserved.
* CVS Info
- * $Id: encoding.c,v 1.16 2003/10/23 17:48:59 robert Exp $
+ * $Id: encoding.c,v 1.17 2003/11/03 15:05:09 petergibbs Exp $
* Overview:
* This defines the string encoding subsystem
* Data Structure and Algorithms:
@@ -16,6 +16,7 @@
extern const ENCODING utf8_encoding;
extern const ENCODING utf16_encoding;
extern const ENCODING utf32_encoding;
+extern const ENCODING dbcs_encoding;
static const ENCODING **encoding_array = NULL;
static int encoding_count = 0;
@@ -29,6 +30,7 @@
encoding_array[enum_encoding_utf8] = &utf8_encoding;
encoding_array[enum_encoding_utf16] = &utf16_encoding;
encoding_array[enum_encoding_utf32] = &utf32_encoding;
+ encoding_array[enum_encoding_dbcs] = &dbcs_encoding;
}
void