This patch does the following:
o Big-endian machines should not be affected by this patch, but I do not have a big-endian machine to test on.
o On little-endian machines:
  + UTF strings are serialized as UTF-16LE or UTF-32LE with a BOM.
  + The deserializers test for the existence of the BOM and choose whether to deserialize as big-endian or little-endian.
  + The comparators in libberkeley-db also test for the BOM, and create a temporary buffer when the string was serialized as big-endian.
o Old store images are preserved.
o The sort order is corrected when migrating an old store to a new store.

I did not test any other backing store.
diff -rN -u old-elephant/src/db-bdb/libberkeley-db.c new-elephant/src/db-bdb/libberkeley-db.c
--- old-elephant/src/db-bdb/libberkeley-db.c 2009-08-08 10:51:25.000000000 +0900
+++ new-elephant/src/db-bdb/libberkeley-db.c 2009-08-08 10:51:25.000000000 +0900
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
+#include <stdlib.h>
 
 /* Some utility stuff used to be here but has been placed in libmemutil.c */
 
@@ -920,7 +921,7 @@
   case S1_UCS4_SYMBOL:
   case S1_UCS4_STRING:
   case S1_UCS4_PATHNAME:
-    return wcs_cmp((wchar_t*)ad+9, read_int(ad, 5), (wchar_t*)bd+9, read_int(bd, 5));
+    return wcs_cmp((wchar_t*)(ad+9), read_int(ad, 5), (wchar_t*)(bd+9), read_int(bd, 5));
   default:
     return lex_cmp(ad+5, (a->size)-5, bd+5, (b->size)-5);
   }
@@ -1130,7 +1131,7 @@
 /*****
       printf("Doing a 32-bit compare\n");
 *****/
-    return wcs_cmp((wchar_t*)ad+5+offset, read_int32(ad+offset, 1), (wchar_t*)bd+5+offset, read_int32(bd+offset, 1));
+    return wcs_cmp((wchar_t*)(ad+5+offset), read_int32(ad+offset, 1), (wchar_t*)(bd+5+offset), read_int32(bd+offset, 1));
   default:
 /*****
       printf("Doing a lex compare\n");
@@ -1306,6 +1307,19 @@
 #define strncasecmp _strnicmp
 typedef unsigned short uint16_t;
 #endif
+#define ENDIAN_BIG 0
+#define ENDIAN_LITTLE 1
+
+/* Detect the host byte order from the first byte of a known 32-bit value. */
+int machine_endian(void)
+{
+  uint32_t x = 0x01020304;
+  uint8_t *xp = (uint8_t *)&x;
+  if (*xp == 0x01)
+    return ENDIAN_BIG;
+  else
+    return ENDIAN_LITTLE;
+}
 
 int case_cmp(const unsigned char *a, int32_t length1, const unsigned char *b, int32_t length2) {
   int min, sizediff, diff;
@@ -1316,12 +1330,77 @@
   return diff;
 }
 
+/* Read the 32-bit unit at c as little-endian, one byte at a time. */
+wchar_t utf32_char(const wchar_t *c)
+{
+  const uint8_t *cp = (const uint8_t *)c;
+  return ((uint32_t)cp[3] << 24) | (cp[2] << 16) | (cp[1] << 8) | cp[0];
+}
+
+/* Return a malloc'ed copy of str (length 32-bit units) with the four bytes of
+   every unit reversed, or NULL if allocation fails.  The caller frees it. */
+wchar_t *swap32_string(const wchar_t *str, int32_t length)
+{
+  int i;
+  wchar_t *swap_buff = malloc(4 * length);
+  if (swap_buff == NULL)
+    return NULL;
+  for (i = 0; i < length; ++i) {
+    const uint8_t *sp = (const uint8_t *)&str[i];
+    uint8_t *dp = (uint8_t *)&swap_buff[i];
+    dp[0] = sp[3];
+    dp[1] = sp[2];
+    dp[2] = sp[1];
+    dp[3] = sp[0];
+  }
+  return swap_buff;
+}
+
+#if 0
+void dump_string(int size, uint8_t *str, int32_t length, char *prefix)
+{
+  int i;
+  printf("%s: ", prefix);
+  for (i = 0; i < length * size; i += 2)
+    printf("%02x%02x ", str[i], str[i + 1]);
+  printf("\n");
+}
+#endif
+
 int wcs_cmp(const wchar_t *a, int32_t length1,
             const wchar_t *b, int32_t length2) {
   int min, sizediff, diff;
+  wchar_t *swap_a = NULL, *swap_b = NULL;
+
+#if 0
+  dump_string(4, a, length1, "A");
+  dump_string(4, b, length2, "B");
+#endif
+  /* A leading 0xfffe unit marks a little-endian string: skip the marker.
+     A string without it is big-endian: compare a byte-swapped copy. */
+  if (machine_endian() == ENDIAN_LITTLE) {
+    if (utf32_char(a) != 0xfffe) {/* BIG-ENDIAN */
+      swap_a = swap32_string(a, length1);
+      if (swap_a)
+        a = swap_a;
+    } else {  /* LITTLE-ENDIAN */
+      ++a;
+      --length1;
+    }
+    if (utf32_char(b) != 0xfffe) {/* BIG-ENDIAN */
+      swap_b = swap32_string(b, length2);
+      if (swap_b)
+        b = swap_b;
+    } else {  /* LITTLE-ENDIAN */
+      ++b;
+      --length2;
+    }
+  }
   sizediff = length1 - length2;
   min = sizediff > 0 ? length2 : length1;
-  diff = wcsncmp(a, b, min /4);
+  diff = wcsncmp(a, b, min);
+  free(swap_a);
+  free(swap_b);
   if (diff == 0) return sizediff;
   return diff;
 }
@@ -1351,6 +1430,27 @@
 #define UTF_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
 #define UTF_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
 
+/* Read the 16-bit unit at str as little-endian. */
+uint16_t utf16_char(const uint8_t *str)
+{
+  return (str[1] << 8) | str[0];
+}
+
+/* Return a malloc'ed copy of src (length 16-bit units) with the two bytes of
+   every unit exchanged, or NULL if allocation fails.  The caller frees it. */
+uint8_t *swap16_string(const uint8_t *src, int32_t length)
+{
+  int i;
+  uint8_t *swap_buff = malloc(2 * length);
+  if (swap_buff == NULL)
+    return NULL;
+  for (i = 0; i < length * 2; i += 2) {
+    swap_buff[i + 0] = src[i + 1];
+    swap_buff[i + 1] = src[i + 0];
+  }
+  return swap_buff;
+}
+
 /* compare UTF-16 strings */
 /* memcmp/UnicodeString style, both length-specified */
 /* don't assume byte-aligned! */
@@ -1359,7 +1459,29 @@
   const unsigned char *start1, *start2, *limit1, *limit2;
   UChar c1, c2;
   int32_t lengthResult;
-
+  uint8_t *swap_s1 = NULL, *swap_s2 = NULL;
+#if 0
+  dump_string(2, s1, length1, "S1");
+  dump_string(2, s2, length2, "S2");
+#endif
+  if (machine_endian() == ENDIAN_LITTLE) {
+    if (utf16_char(s1) != 0xfffe) {/* BIG-ENDIAN */
+      swap_s1 = swap16_string(s1, length1);
+      if (swap_s1)
+        s1 = swap_s1;
+    } else {  /* LITTLE-ENDIAN */
+      s1 += 2;
+      length1 -= 1;
+    }
+    if (utf16_char(s2) != 0xfffe) {/* BIG-ENDIAN */
+      swap_s2 = swap16_string(s2, length2);
+      if (swap_s2)
+        s2 = swap_s2;
+    } else {  /* LITTLE-ENDIAN */
+      s2 += 2;
+      length2 -= 1;
+    }
+  }
   if(length1<length2) {
     lengthResult=-1;
     limit1=s1+2*length1;
@@ -1415,6 +1537,8 @@
     }*/
   }
 
+  free(swap_s1);
+  free(swap_s2);
   return (int32_t)c1-(int32_t)c2;
 }
 
diff -rN -u old-elephant/src/elephant/unicode.lisp new-elephant/src/elephant/unicode.lisp
--- old-elephant/src/elephant/unicode.lisp 2009-08-08 10:51:25.000000000 +0900
+++ new-elephant/src/elephant/unicode.lisp 2009-08-08 10:51:25.000000000 +0900
@@ -59,20 +59,20 @@
   (declare (type buffer-stream bstream)
            (type string string))
   (cond ((and (not (equal "" string)) (> (char-code (char string 0)) #xFFFF))
-         (serialize-to-utf32le string bstream))
+         (serialize-to-utf32 string bstream))
         ;; Accelerate the common case where a character set is not Latin-1
         ((and (not (equal "" string)) (> (char-code (char string 0)) #xFF))
-         (or (serialize-to-utf16le string bstream)
-             (serialize-to-utf32le string bstream)))
+         (or (serialize-to-utf16 string bstream)
+             (serialize-to-utf32 string bstream)))
         ;; Actually code pages > 0 are rare; so we can pay an extra cost
         (t (or (serialize-to-utf8 string bstream)
-               (serialize-to-utf16le string bstream)
-               (serialize-to-utf32le string bstream)))))
+               (serialize-to-utf16 string bstream)
+               (serialize-to-utf32 string bstream)))))
 
 (defun serialize-to-utf8 (string bstream)
   "Standard serialization"
   (declare (type buffer-stream bstream)
-           (type string string))
+           (type simple-string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
                                         (size buffer-stream-size)
                                         (allocated buffer-stream-length))
@@ -117,73 +117,105 @@
         (setf (buffer-stream-size bstream) needed)
         (succeed))))))
 
-(defun serialize-to-utf16le (string bstream)
-  "Serialize to utf16le compliant format unless contains code pages > 0"
+(defvar *machine-endian*
+  (let* ((bstream (make-buffer-stream))
+         (buffer (buffer-stream-buffer bstream))
+         (size (buffer-stream-size bstream)))
+    (buffer-write-int32 #x01020304 bstream)
+    (let ((byte-image
+           (loop for i from 0 to 3
+              collect (uffi:deref-array buffer '(:array :unsigned-char)
+                                        (the fixnum (+ size i))))))
+      (cond ((equal byte-image '(4 3 2 1)) 'endian-little)
+            ((equal byte-image '(1 2 3 4)) 'endian-big)
+            (t 'unknown))))
+  "Host byte order as written by BUFFER-WRITE-INT32: ENDIAN-LITTLE, ENDIAN-BIG or UNKNOWN.")
+
+(defun machine-endian ()
+  "Return the host byte order cached in *MACHINE-ENDIAN*."
+  *machine-endian*)
+
+(defun write-utf-char-to-buffer (char char-index char-size buffer base endian)
+  "Store the low CHAR-SIZE bytes of the integer CHAR into BUFFER at byte offset
+BASE + CHAR-INDEX * CHAR-SIZE, least significant byte first when ENDIAN is
+ENDIAN-LITTLE and most significant byte first otherwise."
+  (declare (type (signed-byte 31) char-index)
+           (type (integer 1 4) char-size))
+  (loop for i from 0 below char-size do
+       (setf (uffi:deref-array buffer '(:array :unsigned-char)
+                               (+ (* char-index char-size) base
+                                  (the (integer 0 3)
+                                    (if (eq endian 'endian-little)
+                                        i
+                                        (- char-size 1 i)))))
+             (ldb (byte 8 (* 8 i)) char))))
+
+(defun serialize-to-utf16 (string bstream)
+  "Serialize to utf16 compliant format unless contains code pages > 0"
   (declare (type buffer-stream bstream)
            (type string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
                                         (size buffer-stream-size)
                                         (allocated buffer-stream-length))
     bstream
     (let* ((saved-size (buffer-stream-size bstream))
            (saved-pos (elephant-memutil::buffer-stream-position bstream))
-           (characters (length string)))
+           (characters (length string))
+           (endian (machine-endian))
+           (bom-length (if (eq endian 'endian-little) 1 0)))
       (labels ((fail ()
                  (setf (buffer-stream-size bstream) saved-size)
                  (setf (elephant-memutil::buffer-stream-position bstream) saved-pos)
-                 (return-from serialize-to-utf16le nil))
+                 (return-from serialize-to-utf16 nil))
                (succeed ()
-                 (return-from serialize-to-utf16le t)))
+                 (return-from serialize-to-utf16 t)))
         (buffer-write-byte +utf16-string+ bstream)
-        (buffer-write-int32 characters bstream)
-        (let ((needed (+ size (* characters 2)))
-              (char (etypecase string
+        (buffer-write-int32 (+ characters bom-length) bstream)
+        (let ((needed (+ size (* (+ characters bom-length) 2)))
+              (char (etypecase string
                       (simple-string #'schar)
                       (string #'char))))
           (when (> needed allocated)
             (resize-buffer-stream bstream needed))
-          (loop for i fixnum from 0 below characters do
-               (let ((code (char-code (funcall char string i))))
-                 (when (> code #xFFFF) (fail))
-                 (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size))
-;;                     (coerce (ldb (byte 8 8) code) '(signed 8)))
-                       (ldb (byte 8 8) code))
-                 (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 2) size 1))
-;;                     (coerce (ldb (byte 8 0) code) '(signed 8))))))
-                       (ldb (byte 8 0) code))))
+          (when (eq endian 'endian-little)
+            (write-utf-char-to-buffer #xfffe 0 2 buffer size endian)
+            (incf size 2))
+          (loop for i fixnum from 0 below characters
+             do (let ((code (char-code (funcall char string i))))
+                  (when (> code #xFFFF) (fail))
+                  (write-utf-char-to-buffer code i 2 buffer size endian)))
           (incf size (* characters 2))
           (succeed))))))
 
-(defun serialize-to-utf32le (string bstream)
+(defun serialize-to-utf32 (string bstream)
   "Serialize to utf32 compliant format unless contains code pages > 0"
-  (declare (type buffer-stream bstream)
-           (type string string))
+  (declare (type buffer-stream bstream)
+           (type string string))
   (elephant-memutil::with-struct-slots ((buffer buffer-stream-buffer)
                                         (size buffer-stream-size)
                                         (allocated buffer-stream-length))
     bstream
-    (let* ((characters (length string)))
-      (buffer-write-byte +utf32-string+ bstream)
-      (buffer-write-int32 characters bstream)
-      (let ((needed (+ size (* 4 characters)))
-            (char (etypecase string
-                    (simple-string #'schar)
-                    (string #'char))))
-        (when (> needed allocated)
-          (resize-buffer-stream bstream needed))
-        (loop for i fixnum from 0 below characters do
-             (let ((code (char-code (funcall char string i))))
-               (when (> code #x10FFFF) (error "Invalid unicode code type"))
-               (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 0))
-                     (ldb (byte 8 24) code))
-               (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 1))
-                     (ldb (byte 8 16) code))
-               (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 2))
-                     (ldb (byte 8 8) code))
-               (setf (uffi:deref-array buffer '(:array :unsigned-char) (+ (* i 4) size 3))
-                     (ldb (byte 8 0) code)))))
+    (let* ((characters (length string))
+           (endian (machine-endian))
+           (bom-length (if (eq endian 'endian-little) 1 0)))
+        (buffer-write-byte +utf32-string+ bstream)
+        (buffer-write-int32 (+ characters bom-length) bstream)
+        (let ((needed (+ size (* 4 (+ characters bom-length))))
+              (char (etypecase string
+                      (simple-string #'schar)
+                      (string #'char))))
+          (when (> needed allocated)
+            (resize-buffer-stream bstream needed))
+          (when (eq endian 'endian-little)
+            (write-utf-char-to-buffer #xfffe 0 4 buffer size endian)
+            (incf size 4))
+          (loop for i fixnum from 0 below characters
+             do (let ((code (char-code (funcall char string i))))
+                  (when (> code #x10FFFF)
+                    (error "Invalid unicode code type"))
+                  (write-utf-char-to-buffer code i 4 buffer size endian)))
         (incf size (* characters 4))
-        t)))
+        t))))
 
 ;;
 ;; Deserialization of Strings
@@ -260,50 +292,70 @@
                                                (+ pos i)))))))
         string))))
 
+(defun read-utf-char-from-buffer (char-index char-size buffer position endian)
+  "Assemble an integer from the CHAR-SIZE bytes of BUFFER at byte offset
+POSITION + CHAR-INDEX * CHAR-SIZE, least significant byte first when ENDIAN is
+ENDIAN-LITTLE and most significant byte first otherwise."
+  (declare (type (integer 1 4) char-size)
+           (type (signed-byte 31) char-index)
+           (type fixnum position))
+  (let ((code 0))
+    (macrolet ((next-byte (offset)
+                 `(uffi:deref-array buffer
+                                    '(:array :unsigned-byte)
+                                    (+ (* char-index char-size) position ,offset))))
+      (loop for i from 0 below char-size
+         do (setf code (dpb (next-byte (if (eq endian 'endian-little)
+                                           i (- char-size i 1)))
+                            (byte 8 (* i 8)) code)))
+      code)))
+
 (defmethod deserialize-string ((type (eql :utf16le)) bstream &optional temp-string)
   "All returned strings are simple-strings for, uh, simplicity"
   (declare (type buffer-stream bstream))
   (let* ((length (buffer-read-int32 bstream))
          (string (or temp-string (make-string length :element-type 'character)))
          (pos (elephant-memutil::buffer-stream-position bstream))
-         (code 0))
-    (macrolet ((next-byte (offset)
-                 `(uffi:deref-array (buffer-stream-buffer bstream) '(:array :unsigned-byte) (+ (* i 2) pos ,offset))))
-      (declare (type simple-string string)
-               (type fixnum length pos code))
-      (assert (subtypep (type-of string) 'simple-string))
-      (assert (compatible-unicode-support-p :utf16le))
-      (loop for i fixnum from 0 below length do
-           (setf code (dpb (next-byte 0) (byte 8 8) 0))
-           (setf code (dpb (next-byte 1) (byte 8 0) code))
-           (setf (schar string i) (code-char code)))
-      (incf (elephant-memutil::buffer-stream-position bstream)
-            (* length 2)))
-    (the simple-string string)))
+         (code 0) (endian 'endian-big))
+    (declare (type simple-string string)
+             (type fixnum length pos code))
+    (assert (subtypep (type-of string) 'simple-string))
+    (assert (compatible-unicode-support-p :utf16le))
+    (when (= (read-utf-char-from-buffer 0 2 (buffer-stream-buffer bstream)
+                                        pos (machine-endian)) #xfffe)
+      (setf endian 'endian-little)
+      (decf length)
+      (incf pos 2)
+      (incf (elephant-memutil::buffer-stream-position bstream) 2))
+    (loop for i fixnum from 0 below length
+       do (setf code
+                (read-utf-char-from-buffer i 2 (buffer-stream-buffer bstream)
+                                           pos endian))
+         (setf (schar string i) (code-char code)))
+    (incf (elephant-memutil::buffer-stream-position bstream)
+          (* length 2))
+    (the simple-string (subseq string 0 length))))
 
 (defmethod deserialize-string ((type (eql :utf32le)) bstream &optional temp-string)
   (declare (type buffer-stream bstream))
-  (macrolet ((next-byte (offset)
-               `(uffi:deref-array (buffer-stream-buffer bstream) '(:array :unsigned-byte) (+ (* i 4) pos ,offset))))
   (let* ((length (buffer-read-int32 bstream))
          (string (or temp-string (make-string length :element-type 'character)))
          (pos (elephant-memutil::buffer-stream-position bstream))
-         (code 0))
+         (code 0) (endian 'endian-big))
     (declare (type string string)
              (type fixnum length pos code))
     (assert (subtypep (type-of string) 'simple-string))
     (assert (compatible-unicode-support-p :utf32le))
+    (when (= (read-utf-char-from-buffer 0 4 (buffer-stream-buffer bstream)
+                                        pos (machine-endian)) #xfffe)
+      (setf endian 'endian-little)
+      (decf length)
+      (incf pos 4)
+      (incf (elephant-memutil::buffer-stream-position bstream) 4))
     (loop for i fixnum from 0 below length do
-         (setf code (dpb (next-byte 0) (byte 8 24) 0))
-         (setf code (dpb (next-byte 1) (byte 8 16) code))
-         (setf code (dpb (next-byte 2) (byte 8 8) code))
-         (setf code (dpb (next-byte 3) (byte 8 0) code))
-         (setf (char string i) (code-char code)))
+         (setf code (read-utf-char-from-buffer i 4 (buffer-stream-buffer bstream)
+                                               pos endian))
+         (setf (char string i) (code-char code)))
     (incf (elephant-memutil::buffer-stream-position bstream)
           (* length 4))
-    (the simple-string string))))
-
-
-
-
-
+    (the simple-string (subseq string 0 length))))
_______________________________________________ elephant-devel site list elephant-devel@common-lisp.net http://common-lisp.net/mailman/listinfo/elephant-devel