Re: [Ecls-list] UTF-8 sequence decoding errors [Was: Upcoming changes]

Matthew Mondor Wed, 16 Feb 2011 00:12:22 -0800

On Sun, 13 Feb 2011 04:41:27 -0500
Matthew Mondor <mm_li...@pulsar-zone.net> wrote:


> Yes I think that supporting that encoding would be very easy too.  The
> only possibly tricky part is that users of that encoding may need to
> output a more conventional UTF-8 stream to some streams, such as for
> display, possibly with bad sequences converted to LATIN-1.  But a
> program could read data from a UTF-8B external format stream and write
> it back to another UTF-8B stream and be sure that the original data was
> transparently copied as-is, without being bothered by decoding/encoding
> errors on streams with that external format.
> 
> I'm not sure if ECL should itself treat those invalid octets
> transparently as LATIN-1 when doing the output on a UTF-8
> external-format stream, however.  It's possible that without this some
> problems would occur in the debugger, slime, etc., which would be
> presented with invalid UTF-8 characters in the UTF-16 surrogate range.

So I had some time tonight and wanted to write a test implementation.

However, there is indeed a problem at decoding time: more than one byte
in a row might be an invalid octet, in which case more than one UTF-16
surrogate must be used to represent the multiple literal octets.

You said that the streams lack push/pop buffers, and it seems that at
least a minimal one would be necessary to implement this (i.e. if the
stream has a UTF-8B external format, the decoding_error() function
would, instead of signaling an error with the octets, insert them into
the push-buffer).  The stream reading routine would then have to first
issue the contents of that buffer before processing more characters...

My first idea was to replace the (ecl_read_byte8(stream, buffer+1,
nbytes) < nbytes) call with a one-byte read in the following loop,
but that would still lose bytes if the second, third, etc. octet was
invalid, and could only really return a character for the last one.

Attached is the attempt, but it's by no means complete.
Thanks and good night,
-- 
Matt

diff --git a/src/c/file.d b/src/c/file.d
index 20f079e..b6bb9e8 100755
--- a/src/c/file.d
+++ b/src/c/file.d
@@ -1048,6 +1048,8 @@ user_multistate_encoder(cl_object stream, unsigned char 
*buffer, ecl_character c
 static ecl_character
 utf_8_decoder(cl_object stream)
 {
+       int utf8b = (stream->stream.format == @':UTF-8B' ? 1 : 0);
+
        /* In understanding this code:
         * 0x8 = 1000, 0xC = 1100, 0xE = 1110, 0xF = 1111
         * 0x1 = 0001, 0x3 = 0011, 0x7 = 0111, 0xF = 1111
@@ -1060,8 +1062,11 @@ utf_8_decoder(cl_object stream)
        if ((buffer[0] & 0x80) == 0) {
                return buffer[0];
        }
-       unlikely_if ((buffer[0] & 0x40) == 0)
+       unlikely_if ((buffer[0] & 0x40) == 0) {
+               if (utf8b)
+                       return (0xdc00 | buffer[0]);
                 return decoding_error(stream, buffer, 1);
+       }
        if ((buffer[0] & 0x20) == 0) {
                cum = buffer[0] & 0x1F;
                nbytes = 1;
@@ -1099,6 +1104,8 @@ static int
 utf_8_encoder(cl_object stream, unsigned char *buffer, ecl_character c)
 {
        int nbytes;
+       int utf8b = (stream->stream.format == @':UTF-8B' ? 1 : 0);
+
        if (c < 0) {
                return 0;
        } else if (c <= 0x7F) {
@@ -1109,6 +1116,18 @@ utf_8_encoder(cl_object stream, unsigned char *buffer, 
ecl_character c)
                buffer[0] = c | 0xC0;
                /*printf("\n; %04x ;: %04x :: %04x :\n", c_orig, buffer[0], 
buffer[1]);*/
                nbytes = 2;
+       } else if (c <= 0xdcff && c >= 0xdc80) {
+               /* Special UTF-16 surrogate range used to implement UTF-8B */
+               if (utf8b) {
+                       /* Literal octet */
+                       buffer[0] = c;
+                       nbytes = 1;
+               } else {
+                       /* Treat octet like LATIN-1 */
+                       buffer[1] = c - 0x20;
+                       buffer[0] = 0xc3;
+                       nbytes = 2;
+               }
        } else if (c <= 0xFFFF) {
                buffer[2] = (c & 0x3f) | 0x80; c >>= 6;
                buffer[1] = (c & 0x3f) | 0x80; c >>= 6;
@@ -2936,6 +2955,9 @@ parse_external_format(cl_object stream, cl_object format, 
int flags)
        if (format == @':UTF-8') {
                return (flags & ~ECL_STREAM_FORMAT) | ECL_STREAM_UTF_8; 
        }
+       if (format == @':UTF-8B') {
+               return (flags & ~ECL_STREAM_FORMAT) | ECL_STREAM_UTF_8B; 
+       }
        if (format == @':UCS-2') {
                return (flags & ~ECL_STREAM_FORMAT) | ECL_STREAM_UCS_2;
        }
@@ -3019,6 +3041,13 @@ set_stream_elt_type(cl_object stream, cl_fixnum 
byte_size, int flags,
                stream->stream.encoder = utf_8_encoder;
                stream->stream.decoder = utf_8_decoder;
                break;
+       case ECL_STREAM_UTF_8B:
+               IO_STREAM_ELT_TYPE(stream) = @'character';
+               byte_size = 8;
+               stream->stream.format = @':utf-8b';
+               stream->stream.encoder = utf_8_encoder;
+               stream->stream.decoder = utf_8_decoder;
+               break;
        case ECL_STREAM_UCS_2:
                IO_STREAM_ELT_TYPE(stream) = @'character';
                byte_size = 8*2;
diff --git a/src/c/symbols_list.h b/src/c/symbols_list.h
index e93452c..0c3330d 100755
--- a/src/c/symbols_list.h
+++ b/src/c/symbols_list.h
@@ -1824,6 +1824,7 @@ cl_symbols[] = {
 {KEY_ "LATIN-1", KEYWORD, NULL, -1, OBJNULL},
 {KEY_ "ISO-8859-1", KEYWORD, NULL, -1, OBJNULL},
 {KEY_ "UTF-8", KEYWORD, NULL, -1, OBJNULL},
+{KEY_ "UTF-8B", KEYWORD, NULL, -1, OBJNULL},
 {KEY_ "UCS-2", KEYWORD, NULL, -1, OBJNULL},
 {KEY_ "UCS-4", KEYWORD, NULL, -1, OBJNULL},
 
diff --git a/src/c/symbols_list2.h b/src/c/symbols_list2.h
index 90db2e8..9a25151 100644
--- a/src/c/symbols_list2.h
+++ b/src/c/symbols_list2.h
@@ -1824,6 +1824,7 @@ cl_symbols[] = {
 {KEY_ "LATIN-1",NULL},
 {KEY_ "ISO-8859-1",NULL},
 {KEY_ "UTF-8",NULL},
+{KEY_ "UTF-8B",NULL},
 {KEY_ "UCS-2",NULL},
 {KEY_ "UCS-4",NULL},
 
diff --git a/src/h/object.h b/src/h/object.h
index 6de2792..ffc853d 100644
--- a/src/h/object.h
+++ b/src/h/object.h
@@ -595,6 +595,7 @@ enum {
        ECL_STREAM_ISO_8859_1 = 1,
        ECL_STREAM_LATIN_1 = 1,
        ECL_STREAM_UTF_8 = 2,
+       ECL_STREAM_UTF_8B = 2048,
        ECL_STREAM_UCS_2 = 3,
        ECL_STREAM_UCS_2LE = 5 + 128,
        ECL_STREAM_UCS_2BE = 5,
diff --git a/src/lsp/iolib.lsp b/src/lsp/iolib.lsp
index ebe5fda..84f8417 100644
--- a/src/lsp/iolib.lsp
+++ b/src/lsp/iolib.lsp
@@ -268,7 +268,7 @@ the one used internally by ECL compiled files."
 
 (let* ((basic-encodings
         #+unicode
-         '(:UTF-8 :UCS-2 :UCS-2BE :UCS-2LE :UCS-4 :UCS-4BE
+         '(:UTF-8 :UTF-8B :UCS-2 :UCS-2BE :UCS-2LE :UCS-4 :UCS-4BE
            :ISO-8859-1 :LATIN-1 :US-ASCII :DEFAULT)
          #-unicode
          '(:DEFAULT))

------------------------------------------------------------------------------
The ultimate all-in-one performance toolkit: Intel(R) Parallel Studio XE:
Pinpoint memory and threading errors before they happen.
Find and fix more than 250 security defects in the development cycle.
Locate bottlenecks in serial and parallel code that limit performance.
http://p.sf.net/sfu/intel-dev2devfeb

_______________________________________________
Ecls-list mailing list
Ecls-list@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/ecls-list

Re: [Ecls-list] UTF-8 sequence decoding errors [Was: Upcoming changes]

Reply via email to