[elephant-devel] revised UTF serializer/deserializer patch

Hiroyuki Komatsu Fri, 07 Aug 2009 19:16:17 -0700

This patch does the following:

o Big-endian machines should not be affected by this patch, but I
  could not verify that because I do not have a big-endian machine.
 
o On little-endian machines:
  + UTF strings are serialized as UTF-16LE or UTF-32LE with a BOM.
  + The deserializers test for the presence of the BOM and choose whether
    to deserialize from big endian or little endian.
  + The comparators in libberkeley-db also test for the BOM, and
    create a temporary buffer when the string was serialized as big
    endian.


o Old store images are preserved.
o The sort order is corrected when an old store is migrated to a new store.

I did not test any other backing stores.

diff -rN -u old-elephant/src/db-bdb/libberkeley-db.c 
new-elephant/src/db-bdb/libberkeley-db.c
--- old-elephant/src/db-bdb/libberkeley-db.c    2009-08-08 10:51:25.000000000 
+0900
+++ new-elephant/src/db-bdb/libberkeley-db.c    2009-08-08 10:51:25.000000000 
+0900
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
+#include <stdlib.h>
 
 /* Some utility stuff used to be here but has been placed in
    libmemutil.c  */
@@ -920,7 +921,7 @@
   case S1_UCS4_SYMBOL:
   case S1_UCS4_STRING:
   case S1_UCS4_PATHNAME:
-    return wcs_cmp((wchar_t*)ad+9, read_int(ad, 5), (wchar_t*)bd+9, 
read_int(bd, 5)); 
+  return wcs_cmp((wchar_t*)(ad+9), read_int(ad, 5), (wchar_t*)(bd+9), 
read_int(bd, 5)); 
   default:
     return lex_cmp(ad+5, (a->size)-5, bd+5, (b->size)-5);
   }
@@ -1130,7 +1131,7 @@
     /*****
     printf("Doing a 32-bit compare\n");
     *****/
-    return wcs_cmp((wchar_t*)ad+5+offset, read_int32(ad+offset, 1), 
(wchar_t*)bd+5+offset, read_int32(bd+offset, 1)); 
+    return wcs_cmp((wchar_t*)(ad+5+offset), read_int32(ad+offset, 1), 
(wchar_t*)(bd+5+offset), read_int32(bd+offset, 1)); 
   default:
     /*****
     printf("Doing a lex compare\n");
@@ -1306,6 +1307,18 @@
 #define strncasecmp _strnicmp
 typedef unsigned short uint16_t;
 #endif
+#define ENDIAN_BIG 0
+#define ENDIAN_LITTLE 1
+
+/*
+ * Report the byte order of the running machine: ENDIAN_BIG when the most
+ * significant byte of a 32-bit word is stored first, else ENDIAN_LITTLE.
+ */
+int machine_endian(void)
+{
+       const uint32_t probe = 0x01020304;
+       return *(const uint8_t *)&probe == 0x01 ? ENDIAN_BIG : ENDIAN_LITTLE;
+}
 
 int case_cmp(const unsigned char *a, int32_t length1, const unsigned char *b, 
int32_t length2) {
   int min, sizediff, diff;
@@ -1316,12 +1329,72 @@
   return diff;
 }
 
+wchar_t utf32_char(const wchar_t *c)
+{      /* Read the 4 bytes at c as little-endian; uint32_t keeps the <<24 out of int's sign bit. */
+       const uint8_t *cp = (const uint8_t *)c;
+       return (wchar_t)(cp[0] | (cp[1] << 8) | (cp[2] << 16) | ((uint32_t)cp[3] << 24));
+}
+
+/* Return a malloc'd copy of str with each 4-byte unit byte-reversed (NULL on failure); caller frees. */
+wchar_t *swap32_string(const wchar_t *str, int32_t length)
+{
+       int32_t i;
+       const uint8_t *sp = (const uint8_t *)str;       /* source is only read, never written */
+       uint8_t *dp = length > 0 ? malloc(4 * (size_t)length) : NULL;
+       for (i = 0; dp != NULL && i < length; ++i, sp += 4) {
+               dp[4 * (size_t)i + 0] = sp[3];
+               dp[4 * (size_t)i + 1] = sp[2];
+               dp[4 * (size_t)i + 2] = sp[1];
+               dp[4 * (size_t)i + 3] = sp[0];
+       }
+       return (wchar_t *)dp;
+}
+
+#if 0
+void dump_string(int size, uint8_t *str, int32_t length, char *prefix)
+{
+       int i;
+       printf("%s: ", prefix);
+       for (i = 0; i < length * size; i += 2)
+               printf("%02x%02x ", str[i], str[i + 1]);
+       printf("\n");
+}
+#endif
+
 int wcs_cmp(const wchar_t *a, int32_t length1, 
            const wchar_t *b, int32_t length2) {
   int min, sizediff, diff;
+  wchar_t *swap_a = NULL, *swap_b = NULL;
+
+#if 0
+  dump_string(4, a, length1, "A");
+  dump_string(4, b, length2, "B");
+#endif
+  if (machine_endian() == ENDIAN_LITTLE) {
+         if (utf32_char(a) != 0xfffe) {/* BIG-ENDIAN */
+                 swap_a = swap32_string(a, length1);
+                 if (swap_a)
+                         a = swap_a;
+         } else {              /* LITTLE-ENDIAN */
+                 ++a;
+                 --length1;
+         }
+         if (utf32_char(b) != 0xfffe) {/* BIG-ENDIAN */
+                 swap_b = swap32_string(b, length2);
+                 if (swap_b)
+                         b = swap_b;
+         } else {              /* LITTLE-ENDIAN */
+                 ++b;
+                 --length2;
+         }
+  }
   sizediff = length1 - length2;
   min = sizediff > 0 ? length2 : length1;
-  diff = wcsncmp(a, b, min /4);
+  diff = wcsncmp(a, b, min);
+  if (swap_a)
+         free(swap_a);
+  if (swap_b)
+         free(swap_b);
   if (diff == 0) return sizediff;
   return diff;
 }
@@ -1351,6 +1424,22 @@
 #define UTF_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
 #define UTF_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
 
+uint16_t utf16_char(const uint8_t *str)
+{
+       return (uint16_t)(str[0] | (str[1] << 8));      /* two bytes at str, low byte first */
+}
+
+/* Return a malloc'd copy of src with each 2-byte unit byte-swapped (NULL on failure); caller frees. */
+uint8_t *swap16_string(const uint8_t *src, int32_t length)
+{
+       uint8_t *swap_buff = length > 0 ? malloc(2 * (size_t)length) : NULL, *dst;
+       for (dst = swap_buff; dst != NULL && length > 0; --length, src += 2, dst += 2) {
+               dst[0] = src[1];
+               dst[1] = src[0];        /* the original copied src[i + 1] into both bytes */
+       }
+       return swap_buff;
+}
+
 /* compare UTF-16 strings */
 /* memcmp/UnicodeString style, both length-specified */
 /* don't assume byte-aligned! */
@@ -1359,7 +1448,29 @@
   const unsigned char *start1, *start2, *limit1, *limit2;
   UChar c1, c2;
   int32_t lengthResult;
-
+  uint8_t *swap_s1 = NULL, *swap_s2 = NULL;
+#if 0
+  dump_string(2, s1, length1, "S1");
+  dump_string(2, s2, length2, "S2");
+#endif
+  if (machine_endian() == ENDIAN_LITTLE) {
+         if (utf16_char(s1) != 0xfffe) {/* BIG-ENDIAN */
+                 swap_s1 = swap16_string(s1, length1);
+                 if (swap_s1)
+                         s1 = swap_s1;
+         } else {              /* LITTLE-ENDIAN */
+                 s1 += 2;
+                 length1 -= 1;
+         }
+         if (utf16_char(s2) != 0xfffe) {/* BIG-ENDIAN */
+                 swap_s2 = swap16_string(s2, length2);
+                 if (swap_s2)
+                         s2 = swap_s2;
+         } else {              /* LITTLE-ENDIAN */
+                 s2 += 2;
+                 length2 -= 1;
+         }
+  }
   if(length1<length2) {
     lengthResult=-1;
     limit1=s1+2*length1;
@@ -1415,6 +1526,10 @@
       }*/
   }
 
+  if (swap_s1)
+         free(swap_s1);
+  if (swap_s2)
+         free(swap_s2);
   return (int32_t)c1-(int32_t)c2;
 }
 
diff -rN -u old-elephant/src/elephant/unicode.lisp 
new-elephant/src/elephant/unicode.lisp
--- old-elephant/src/elephant/unicode.lisp      2009-08-08 10:51:25.000000000 
+0900
+++ new-elephant/src/elephant/unicode.lisp      2009-08-08 10:51:25.000000000 
+0900
@@ -41,7 +41,7 @@
 
 ;; #+allegro
 ;; (defun serialize-string (string bstream)
-;;   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
+;;   (e(lephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
 ;;                                     (size buffer-stream-size)
 ;;                                     (allocated buffer-stream-length))
 ;;       bstream
@@ -59,20 +59,20 @@
   (declare (type buffer-stream bstream)
           (type string string))
   (cond ((and (not (equal "" string)) (> (char-code (char string 0)) #xFFFF))
-        (serialize-to-utf32le string bstream))
+        (serialize-to-utf32 string bstream))
        ;; Accelerate the common case where a character set is not Latin-1
        ((and (not (equal "" string)) (> (char-code (char string 0)) #xFF))
-        (or (serialize-to-utf16le string bstream)
-            (serialize-to-utf32le string bstream)))
+        (or (serialize-to-utf16 string bstream)
+            (serialize-to-utf32 string bstream)))
        ;; Actually code pages > 0 are rare; so we can pay an extra cost
        (t (or (serialize-to-utf8 string bstream)
-              (serialize-to-utf16le string bstream)
-              (serialize-to-utf32le string bstream)))))
+              (serialize-to-utf16 string bstream)
+              (serialize-to-utf32 string bstream)))))
 
 (defun serialize-to-utf8 (string bstream)
   "Standard serialization"
   (declare (type buffer-stream bstream)
-          (type string string))
+          (type simple-string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
                                        (size buffer-stream-size)
                                        (allocated buffer-stream-length))
@@ -117,73 +117,105 @@
            (setf (buffer-stream-size bstream) needed)
            (succeed))))))
 
-(defun serialize-to-utf16le (string bstream)
-  "Serialize to utf16le compliant format unless contains code pages > 0"
+(defvar *machine-endian*
+  (let* ((probe (make-buffer-stream))
+        (bytes (buffer-stream-buffer probe))
+        (start (buffer-stream-size probe)))
+    (buffer-write-int32 #x01020304 probe)
+    (let ((layout (loop for offset from 0 below 4
+                       collect (uffi:deref-array bytes '(:array :unsigned-char)
+                                                 (the fixnum (+ start offset))))))
+      (cond ((equal layout '(1 2 3 4)) 'endian-big)
+           ((equal layout '(4 3 2 1)) 'endian-little)
+           (t 'unknown))))
+  "Byte order found by writing #x01020304 with BUFFER-WRITE-INT32 and reading back its four bytes.")
+
+(defun machine-endian ()           ; reader for the value cached in *MACHINE-ENDIAN*
+  *machine-endian*)
+
+(defun write-utf-char-to-buffer (char char-index char-size buffer base endian)
+  "Store the low CHAR-SIZE bytes of integer CHAR as code unit CHAR-INDEX of BUFFER, counted from byte BASE, in ENDIAN order."
+  (declare (type (signed-byte 31) char-index) (type (integer 1 4) char-size))
+  (let ((unit-start (+ (* char-index char-size) base)))
+    (dotimes (i char-size)
+      (setf (uffi:deref-array buffer '(:array :unsigned-char)
+                                     (+ unit-start
+                                        (the (integer 0 3)
+                                          (if (eq endian 'endian-little)
+                                              i (- char-size 1 i)))))
+            (ldb (byte 8 (* 8 i)) char)))))
+  
+(defun serialize-to-utf16 (string bstream)
+  "Serialize to utf16 compliant format unless contains code pages > 0"
   (declare (type buffer-stream bstream)
           (type string string))
+  (progn
+    (format *debug-io* "LSIP-ENTER: ")
+    (loop for i from 0 below (length string)
+         do (format *debug-io* "~4,'0X " (char-code (char string i))))
+    (format *debug-io* "~%"))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
                                        (size buffer-stream-size)
                                        (allocated buffer-stream-length))
       bstream
       (let* ((saved-size (buffer-stream-size bstream))
             (saved-pos (elephant-memutil::buffer-stream-position bstream))
-            (characters (length string)))
+            (characters (length string))
+            (endian (machine-endian))
+            (bom-length (if (eq endian 'endian-big) 0 1)))
        (labels ((fail () 
                   (setf (buffer-stream-size bstream) saved-size)
                   (setf (elephant-memutil::buffer-stream-position bstream) 
saved-pos)
-                  (return-from serialize-to-utf16le nil))
+                  (return-from serialize-to-utf16 nil))
                 (succeed ()
-                  (return-from serialize-to-utf16le t)))
+                  (return-from serialize-to-utf16 t)))
          (buffer-write-byte +utf16-string+ bstream)
-         (buffer-write-int32 characters bstream)
-         (let ((needed (+ size (* characters 2)))
-                (char (etypecase string
+         (buffer-write-int32  (+ characters bom-length) bstream)
+         (let ((needed (+ size (* (+ characters bom-length) 2)))
+                 (char (etypecase string
                         (simple-string #'schar)
                         (string #'char))))
             (when (> needed allocated)
               (resize-buffer-stream bstream needed))
-            (loop for i fixnum from 0 below characters do
-                  (let ((code (char-code (funcall char string i))))
-                    (when (> code #xFFFF) (fail))
-                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ 
(* i 2) size))
-                          ;;                     (coerce (ldb (byte 8 8) code) 
'(signed 8)))
-                          (ldb (byte 8 8) code))
-                    (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ 
(* i 2) size 1))
-                          ;;                     (coerce (ldb (byte 8 0) code) 
'(signed 8))))))
-                          (ldb (byte 8 0) code))))
+           (when (eq endian 'endian-little)
+             (write-utf-char-to-buffer #xfffe 0 2 buffer size endian)
+             (incf size 2))
+            (loop for i fixnum from 0 below characters
+                 do (let ((code (char-code (funcall char string i))))
+                      (when (> code #xFFFF) (fail))
+                      (write-utf-char-to-buffer code i 2 buffer size endian)))
             (incf size (* characters 2))
             (succeed))))))
 
-(defun serialize-to-utf32le (string bstream)
+(defun serialize-to-utf32 (string bstream)
   "Serialize to utf32 compliant format unless contains code pages > 0"
-   (declare (type buffer-stream bstream)
-           (type string string))
+  (declare (type buffer-stream bstream)
+          (type string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
                                        (size buffer-stream-size)
                                        (allocated buffer-stream-length))
       bstream
-      (let* ((characters (length string)))
-         (buffer-write-byte +utf32-string+ bstream)
-         (buffer-write-int32 characters bstream)
-         (let ((needed (+ size (* 4 characters)))
-                (char (etypecase string
-                        (simple-string #'schar)
-                        (string #'char))))
-           (when (> needed allocated)
-             (resize-buffer-stream bstream needed))
-            (loop for i fixnum from 0 below characters do
-                 (let ((code (char-code (funcall char string i))))
-                   (when (> code #x10FFFF) (error "Invalid unicode code type"))
-                   (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ 
(* i 4) size 0))
-                         (ldb (byte 8 24) code))
-                   (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ 
(* i 4) size 1))
-                         (ldb (byte 8 16) code))
-                   (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ 
(* i 4) size 2))
-                         (ldb (byte 8 8) code))
-                   (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ 
(* i 4) size 3))
-                         (ldb (byte 8 0) code)))))
+      (let* ((characters (length string))
+            (endian (machine-endian))
+            (bom-length (if (eq endian 'endian-big) 0 1)))
+       (buffer-write-byte +utf32-string+ bstream)
+       (buffer-write-int32 (+ characters bom-length) bstream)
+       (let ((needed (+ size (* 4 (+ characters bom-length))))
+             (char (etypecase string
+                     (simple-string #'schar)
+                     (string #'char))))
+         (when (> needed allocated)
+           (resize-buffer-stream bstream needed))
+         (when (eq endian 'endian-little)
+           (write-utf-char-to-buffer #xfffe 0 4 buffer size endian)
+           (incf size 4))
+         (loop for i fixnum from 0 below characters
+               do (let ((code (char-code (funcall char string i))))
+                    (when (> code #x10FFFF)
+                      (error "Invalid unicode code type"))
+                    (write-utf-char-to-buffer code i 4 buffer size endian)))
          (incf size (* characters 4))
-         t)))
+         t))))
 
 ;;
 ;; Deserialization of Strings 
@@ -260,50 +292,67 @@
                                                    (+ pos i)))))))
        string))))
 
+(defun read-utf-char-from-buffer (char-index char-size buffer position endian)
+  "Return code unit CHAR-INDEX (CHAR-SIZE bytes wide) read from BUFFER, counted from byte POSITION, in ENDIAN order."
+  (declare (type (integer 1 4) char-size) (type (signed-byte 31) char-index)
+          (type fixnum position))
+  (let ((code 0))
+    (macrolet ((next-byte (offset)
+                `(uffi:deref-array buffer '(:array :unsigned-byte)
+                                   ;; Stride must be CHAR-SIZE; a fixed 2 misreads 4-byte units.
+                                   (+ (* char-index char-size) position ,offset))))
+      (loop for i from 0 below char-size
+           do (setf code (dpb (next-byte (if (eq endian 'endian-little)
+                                             i (- char-size i 1)))
+                              (byte 8 (* i 8)) code)))
+      code)))
+
 (defmethod deserialize-string ((type (eql :utf16le)) bstream &optional 
temp-string)
   "All returned strings are simple-strings for, uh, simplicity"
   (declare (type buffer-stream bstream))
   (let* ((length (buffer-read-int32 bstream))
         (string (or temp-string (make-string length :element-type 'character)))
         (pos (elephant-memutil::buffer-stream-position bstream))
-        (code 0))
-    (macrolet ((next-byte (offset)
-                `(uffi:deref-array (buffer-stream-buffer bstream) '(:array 
:unsigned-byte) (+ (* i 2) pos ,offset))))
-      (declare (type simple-string string)
-              (type fixnum length pos code))
-      (assert (subtypep (type-of string) 'simple-string))
-      (assert (compatible-unicode-support-p :utf16le))
-      (loop for i fixnum from 0 below length do
-          (setf code (dpb (next-byte 0) (byte 8 8) 0))
-          (setf code (dpb (next-byte 1) (byte 8 0) code))
-          (setf (schar string i) (code-char code)))
-      (incf (elephant-memutil::buffer-stream-position bstream)
-           (* length 2)))
-    (the simple-string string)))
+        (code 0) (endian 'endian-big))
+    (declare (type simple-string string)
+            (type fixnum length pos code))
+    (assert (subtypep (type-of string) 'simple-string))
+    (assert (compatible-unicode-support-p :utf16le))
+    (when (= (read-utf-char-from-buffer 0 2 (buffer-stream-buffer bstream)
+                                       pos (machine-endian)) #xfffe)
+      (setf endian 'endian-little)
+      (decf length)
+      (incf pos 2)
+      (incf (elephant-memutil::buffer-stream-position bstream) 2))
+    (loop for i fixnum from 0 below length
+         do (setf code
+                  (read-utf-char-from-buffer i 2 (buffer-stream-buffer bstream)
+                       pos endian))
+            (setf (schar string i) (code-char code)))
+    (incf (elephant-memutil::buffer-stream-position bstream)
+         (* length 2))
+    (the simple-string (subseq string 0 length))))
 
 (defmethod deserialize-string ((type (eql :utf32le)) bstream  &optional 
temp-string)
   (declare (type buffer-stream bstream))
-  (macrolet ((next-byte (offset)
-              `(uffi:deref-array (buffer-stream-buffer bstream) '(:array 
:unsigned-byte) (+ (* i 4) pos ,offset))))
   (let* ((length (buffer-read-int32 bstream))
         (string (or temp-string (make-string length :element-type 'character)))
         (pos (elephant-memutil::buffer-stream-position bstream))
-        (code 0))
+        (code 0) (endian 'endian-big))
     (declare (type string string)
             (type fixnum length pos code))
     (assert (subtypep (type-of string) 'simple-string))
     (assert (compatible-unicode-support-p :utf32le))
+    (when (= (read-utf-char-from-buffer 0 4 (buffer-stream-buffer bstream)
+                                       pos (machine-endian)) #xfffe)
+       (setf endian 'endian-little)
+       (decf length)
+       (incf pos 4)
+       (incf (elephant-memutil::buffer-stream-position bstream) 4))
     (loop for i fixnum from 0 below length do
-        (setf code (dpb (next-byte 0) (byte 8 24) 0))
-        (setf code (dpb (next-byte 1) (byte 8 16) code))
-        (setf code (dpb (next-byte 2) (byte 8 8) code))
-        (setf code (dpb (next-byte 3) (byte 8 0) code))
-        (setf (char string i) (code-char code)))
+      (setf code (read-utf-char-from-buffer i 4 (buffer-stream-buffer bstream)
+                     pos endian))
+      (setf (char string i) (code-char code)))
     (incf (elephant-memutil::buffer-stream-position bstream)
          (* length 4))
-    (the simple-string string))))
-
-
-  
-  
-
+    (the simple-string (subseq string 0 length))))

_______________________________________________
elephant-devel site list
elephant-devel@common-lisp.net
http://common-lisp.net/mailman/listinfo/elephant-devel

[elephant-devel] revised UTF serializer/deserializer patch

Reply via email to