cvs commit: parrot/src chartype.c encoding.c

Peter Gibbs Mon, 03 Nov 2003 08:09:54 -0800

cvsuser     03/11/03 07:05:09


  Modified:    .        MANIFEST
               config/gen/makefiles root.in
               include/parrot encoding.h
               src      chartype.c encoding.c
  Added:       encodings dbcs.c
  Log:
  Implement DBCS encoding (minus skip_backward)
  Extend dynamic chartype loading to support DBCS
  
  Revision  Changes    Path
  1.496     +1 -0      parrot/MANIFEST
  
  Index: MANIFEST
  ===================================================================
  RCS file: /cvs/public/parrot/MANIFEST,v
  retrieving revision 1.495
  retrieving revision 1.496
  diff -u -w -r1.495 -r1.496
  --- MANIFEST  1 Nov 2003 17:28:16 -0000       1.495
  +++ MANIFEST  3 Nov 2003 15:04:52 -0000       1.496
  @@ -238,6 +238,7 @@
   editor/ops2vim.pl                                 [devel]
   editor/pasm.el                                    [devel]
   editor/pasm.vim                                   [devel]
  +encodings/dbcs.c                                  []
   encodings/singlebyte.c                            []
   encodings/utf16.c                                 []
   encodings/utf32.c                                 []
  
  
  
  1.166     +3 -1      parrot/config/gen/makefiles/root.in
  
  Index: root.in
  ===================================================================
  RCS file: /cvs/public/parrot/config/gen/makefiles/root.in,v
  retrieving revision 1.165
  retrieving revision 1.166
  diff -u -w -r1.165 -r1.166
  --- root.in   29 Oct 2003 15:05:11 -0000      1.165
  +++ root.in   3 Nov 2003 15:04:55 -0000       1.166
  @@ -160,7 +160,7 @@
   CLASS_O_FILES = ${pmc_classes_o}
   
   ENCODING_O_FILES = encodings/singlebyte$(O) encodings/utf8$(O) \
  -     encodings/utf16$(O) encodings/utf32$(O)
  +     encodings/utf16$(O) encodings/utf32$(O) encodings/dbcs$(O)
   
   CHARTYPE_O_FILES = chartypes/unicode$(O) chartypes/usascii$(O)
   
  @@ -502,6 +502,8 @@
   encoding/utf16$(O) : $(GENERAL_H_FILES)
   
   encoding/utf32$(O) : $(GENERAL_H_FILES)
  +
  +encoding/dbcs$(O) : $(GENERAL_H_FILES)
   
   $(SRC)/events$(O) : $(GENERAL_H_FILES)
   
  
  
  
  1.1                  parrot/encodings/dbcs.c
  
  Index: dbcs.c
  ===================================================================
  /* dbcs.c
   *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
   *  CVS Info
   *     $Id: dbcs.c,v 1.1 2003/11/03 15:04:58 petergibbs Exp $
   *  Overview:
   *     This defines the DBCS encoding routines.
   *  Data Structure and Algorithms:
   *  History:
   *  Notes:
   *     All byte values above 127 are assumed to be lead bytes
   *     skip_backward cannot be implemented for DBCS
   *  References:
   */
  
  #include "parrot/parrot.h"
  
  /* Unsigned octet type used by the routines in this file to walk raw
   * DBCS data one byte at a time. */
  typedef unsigned char byte_t;
  
  /*
   * Count the characters held in a DBCS buffer.
   *
   *   ptr   - start of the encoded data
   *   bytes - length of the data in bytes
   *
   * A byte value above 127 is treated as a lead byte that opens a two-byte
   * character; any other byte is a character on its own.  A lead byte that
   * happens to be the very last byte of the buffer (truncated data) is
   * still counted as one character, but the scan now stops exactly at the
   * end of the buffer instead of stepping the pointer beyond it.
   *
   * Returns the number of characters found.
   */
  static UINTVAL
  dbcs_characters(const void *ptr, UINTVAL bytes)
  {
      const byte_t *bptr = ptr;
      UINTVAL remaining = bytes;
      UINTVAL characters = 0;
  
      while (remaining > 0) {
          UINTVAL width = (*bptr > 127) ? 2 : 1;
  
          /* Truncated trailing lead byte: consume only what is left so the
           * pointer never moves past one-past-the-end of the buffer. */
          if (width > remaining)
              width = remaining;
  
          bptr += width;
          remaining -= width;
          characters++;
      }
  
      return characters;
  }
  
  /*
   * Decode the single character that starts at ptr.
   *
   * A first byte below 128 is returned as-is.  Otherwise the first byte is
   * taken as a lead byte and combined with the byte that follows it:
   * (lead << 8) | trail.  The caller must make sure that following byte
   * is readable; no length is passed in.
   */
  static UINTVAL
  dbcs_decode(const void *ptr)
  {
      const byte_t *src = (const byte_t *)ptr;
      byte_t lead = src[0];
  
      if (lead >= 128) {
          byte_t trail = src[1];
          return (lead << 8) | trail;
      }
  
      return lead;
  }
  
  /*
   * Store character c at ptr in DBCS form.
   *
   * Values below 128 are written as one byte.  Larger values are written
   * as two bytes (high byte first) provided the high byte, c >> 8, lies in
   * the range 128..255; anything else raises INVALID_CHARACTER through
   * internal_exception and writes nothing.
   *
   * Returns the position just after the bytes written (ptr itself when
   * nothing was written).
   */
  static void *
  dbcs_encode(void *ptr, UINTVAL c)
  {
      byte_t *out = (byte_t *)ptr;
      UINTVAL high = c >> 8;
  
      /* Single-byte range */
      if (c < 128) {
          out[0] = (byte_t)c;
          return out + 1;
      }
  
      /* The high byte must itself be a valid lead byte */
      if (high < 128 || high > 255) {
          internal_exception(INVALID_CHARACTER,
                             "Invalid character for DBCS encoding\n");
          return out;
      }
  
      out[0] = (byte_t)high;
      out[1] = (byte_t)(c & 0xFF);
      return out + 2;
  }
  
  /*
   * Advance over n characters starting at ptr and return the resulting
   * position.  Each byte above 127 is taken as a lead byte and skips two
   * bytes; every other byte skips one.  No end-of-buffer check is made,
   * as no length is passed in.
   */
  static const void *
  dbcs_skip_forward(const void *ptr, UINTVAL n)
  {
      const byte_t *cursor = (const byte_t *)ptr;
      UINTVAL left;
  
      for (left = n; left > 0; left--) {
          cursor += (*cursor > 127) ? 2 : 1;
      }
  
      return cursor;
  }
  
  /*
   * Backward skipping is not supported for DBCS.  This routine always
   * raises INVALID_OPERATION through internal_exception; should that call
   * return, ptr is handed back unchanged.  The count n is ignored.
   */
  static const void *
  dbcs_skip_backward(const void *ptr, UINTVAL n)
  {
      (void)n;    /* count is never consulted */
  
      internal_exception(
          INVALID_OPERATION, "Undefined operation for DBCS encoding\n");
  
      return ptr;
  }
  
  /*
   * Encoding descriptor for DBCS.  It bundles the encoding's enum index,
   * its name and the routines defined above.  The initializer is
   * positional, so the order of the entries must match the declaration of
   * ENCODING (not visible in this file).
   */
  const ENCODING dbcs_encoding = {
      enum_encoding_dbcs,
      "dbcs",
      /* NOTE(review): if this slot is the maximum number of bytes per
       * character, DBCS characters can occupy 2 bytes - confirm against
       * the ENCODING declaration whether 1 is intended here. */
      1,
      dbcs_characters,
      dbcs_decode,
      dbcs_encode,
      dbcs_skip_forward,
      dbcs_skip_backward
  };
  
  /*
   * Local variables:
   * c-indentation-style: bsd
   * c-basic-offset: 4
   * indent-tabs-mode: nil
   * End:
   *
   * vim: expandtab shiftwidth=4:
  */
  
  
  
  1.22      +2 -1      parrot/include/parrot/encoding.h
  
  Index: encoding.h
  ===================================================================
  RCS file: /cvs/public/parrot/include/parrot/encoding.h,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -w -r1.21 -r1.22
  --- encoding.h        28 Oct 2003 07:32:38 -0000      1.21
  +++ encoding.h        3 Nov 2003 15:05:01 -0000       1.22
  @@ -1,7 +1,7 @@
   /* encoding.h
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: encoding.h,v 1.21 2003/10/28 07:32:38 leo Exp $
  + *     $Id: encoding.h,v 1.22 2003/11/03 15:05:01 petergibbs Exp $
    *  Overview:
    *     This is the api header for the string encoding subsystem
    *  Data Structure and Algorithms:
  @@ -19,6 +19,7 @@
       enum_encoding_utf8,
       enum_encoding_utf16,
       enum_encoding_utf32,
  +    enum_encoding_dbcs,
       enum_encoding_MAX
   };
   
  
  
  
  1.18      +48 -17    parrot/src/chartype.c
  
  Index: chartype.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/chartype.c,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -w -r1.17 -r1.18
  --- chartype.c        3 Nov 2003 12:54:48 -0000       1.17
  +++ chartype.c        3 Nov 2003 15:05:09 -0000       1.18
  @@ -1,7 +1,7 @@
   /* chartype.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: chartype.c,v 1.17 2003/11/03 12:54:48 petergibbs Exp $
  + *     $Id: chartype.c,v 1.18 2003/11/03 15:05:09 petergibbs Exp $
    *  Overview:
    *     This defines the string character type subsystem
    *  Data Structure and Algorithms:
  @@ -18,9 +18,9 @@
   struct chartype_unicode_map_t {
       UINTVAL n1;
       INTVAL *cparray;
  +    INTVAL *cparray2;
   };
   
  -
   extern CHARTYPE usascii_chartype;
   extern CHARTYPE unicode_chartype;
   
  @@ -35,7 +35,8 @@
   struct chartype_digit_map_t default_digit_map = { 0x30, 0x39, 0 };
   
   /*
  - * Register a chartype entry and TODO its transcoders
  + * Register a chartype entry
  + * XXX Register transcode functions
    */
   static void
   chartype_register(CHARTYPE *type)
  @@ -77,14 +78,17 @@
   }
   
   static UINTVAL
  -chartype_to_unicode_cparray(const CHARTYPE *from, const CHARTYPE *to, UINTVAL c)
  +chartype_to_unicode_cparray(const CHARTYPE *from, const CHARTYPE *to, 
  +                            UINTVAL c)
   {
       const struct chartype_unicode_map_t *map = from->unicode_map;
  +
       if (c < map->n1)
           return c;
  -    else {
  +    else if (c < 256) 
           return map->cparray[c - map->n1];
  -    }
  +    else
  +        return map->cparray2[c - 128*256];
   }
   
   static UINTVAL
  @@ -92,15 +96,24 @@
                                 UINTVAL c)
   {
       const struct chartype_unicode_map_t *map = to->unicode_map;
  +
       if (c < map->n1) {
           return c;
       }
       else {
           UINTVAL i;
  +        if (map->cparray) {
           for (i = 0; i < 256 - map->n1; i++) {
               if (map->cparray[i] == (INTVAL)c)
                   return i + map->n1;
           }
  +        }
  +        if (map->cparray2) {
  +            for (i = 0; i < 128*256; i++) {
  +                if (map->cparray2[i] == (INTVAL)c)
  +                    return i + 128*256;
  +            }
  +        }
           internal_exception(INVALID_CHARACTER,
                              "Invalid character for chartype\n");
           return 0;
  @@ -110,9 +123,9 @@
   /*
    * Create chartype from mapping file
    * Still TODO:
  - *   Handle encodings other than singlebyte
  + *   Handle more encodings (singlebyte & dbcs implemented so far)
    *   Create proper digit mapping table (currently always ascii)
  - *   Create other variants of unicode mapping table
  + *   -> this is REQUIRED for DBCS!
    *   Path is hardcoded to "runtime/parrot/chartypes/<name>.TXT"
    *   Does direct file system IO - should probably use Parrot IO
    *   Better parsing code - e.g. handle erroneous input!
  @@ -127,6 +140,7 @@
       INTVAL typecode;
       INTVAL unicode;
       INTVAL *cparray = NULL;
  +    INTVAL *cparray2 = NULL;
       struct chartype_unicode_map_t *map;
       int one2one = 0;
   
  @@ -142,11 +156,13 @@
           char *p = fgets(line, 80, f);
           if (line[0] != '#') {
               int n = sscanf(line, "%li\t%li", &typecode, &unicode);
  -            if (n == 2 && typecode >= 0 && typecode < 256) {
  -                if (typecode == one2one && unicode == typecode) {
  +            if (n == 2 && typecode >= 0) {
  +                if (typecode < 256 && typecode == one2one && 
  +                    unicode == typecode) 
  +                {
                       one2one++;
                   }
  -                else {
  +                else if (typecode < 256) {
                       if (!cparray) {
                           int size = (256 - one2one) * sizeof(INTVAL);
                           cparray = mem_sys_allocate(size);
  @@ -154,6 +170,15 @@
                       }
                       cparray[typecode-one2one] = unicode;
                   }
  +                /* XXX Should abort loading if invalid value found */
  +                else if (typecode >= 128*256) {
  +                    if (!cparray2) {
  +                        int size = 128 * 256 * sizeof(INTVAL);
  +                        cparray2 = mem_sys_allocate(size);
  +                        memset(cparray2, 0xFF, size);
  +                    }
  +                    cparray2[typecode - (128*256)] = unicode;
  +                }
               }
           }
       }
  @@ -162,16 +187,22 @@
       type = mem_sys_allocate(sizeof(CHARTYPE));
       type->index = -1;    /* will be allocated during registration */
       type->name = malloc_and_strcpy(name);
  +    if (cparray2) {
  +        type->default_encoding = malloc_and_strcpy("dbcs");
  +    }
  +    else {
       type->default_encoding = malloc_and_strcpy("singlebyte");
  +    }
  +    type->from_unicode = chartype_from_unicode_cparray;
  +    type->to_unicode = chartype_to_unicode_cparray;
       type->is_digit = chartype_is_digit_map1;
       type->get_digit = chartype_get_digit_map1;
       type->digit_map = &default_digit_map;
       map = mem_sys_allocate(sizeof(struct chartype_unicode_map_t));
       map->n1 = one2one;
       map->cparray = cparray;
  +    map->cparray2 = cparray2;
       type->unicode_map = map;
  -    type->from_unicode = chartype_from_unicode_cparray;
  -    type->to_unicode = chartype_to_unicode_cparray;
       type->transcoders = NULL;
       chartype_register(type);
       return type;
  
  
  
  1.17      +3 -1      parrot/src/encoding.c
  
  Index: encoding.c
  ===================================================================
  RCS file: /cvs/public/parrot/src/encoding.c,v
  retrieving revision 1.16
  retrieving revision 1.17
  diff -u -w -r1.16 -r1.17
  --- encoding.c        23 Oct 2003 17:48:59 -0000      1.16
  +++ encoding.c        3 Nov 2003 15:05:09 -0000       1.17
  @@ -1,7 +1,7 @@
   /* encoding.c
    *  Copyright: 2001-2003 The Perl Foundation.  All Rights Reserved.
    *  CVS Info
  - *     $Id: encoding.c,v 1.16 2003/10/23 17:48:59 robert Exp $
  + *     $Id: encoding.c,v 1.17 2003/11/03 15:05:09 petergibbs Exp $
    *  Overview:
    *     This defines the string encoding subsystem
    *  Data Structure and Algorithms:
  @@ -16,6 +16,7 @@
   extern const ENCODING utf8_encoding;
   extern const ENCODING utf16_encoding;
   extern const ENCODING utf32_encoding;
  +extern const ENCODING dbcs_encoding;
   
   static const ENCODING **encoding_array = NULL;
   static int encoding_count = 0;
  @@ -29,6 +30,7 @@
       encoding_array[enum_encoding_utf8] = &utf8_encoding;
       encoding_array[enum_encoding_utf16] = &utf16_encoding;
       encoding_array[enum_encoding_utf32] = &utf32_encoding;
  +    encoding_array[enum_encoding_dbcs] = &dbcs_encoding;
   }
   
   void

cvs commit: parrot/src chartype.c encoding.c

Reply via email to