MonetDB: pythonudf - Proper unicode handling for UCS2 Python distributions

Mark Raasveldt Tue, 10 May 2016 09:31:33 -0700

Changeset: 9ba567019857 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9ba567019857
Modified Files:
        monetdb5/extras/pyapi/pyapi.c
        monetdb5/extras/pyapi/type_conversion.c
        monetdb5/extras/pyapi/unicode.c
        monetdb5/extras/pyapi/unicode.h
Branch: pythonudf
Log Message:


Proper unicode handling for UCS2 Python distributions.


diffs (122 lines):

diff --git a/monetdb5/extras/pyapi/pyapi.c b/monetdb5/extras/pyapi/pyapi.c
--- a/monetdb5/extras/pyapi/pyapi.c
+++ b/monetdb5/extras/pyapi/pyapi.c
@@ -2543,8 +2543,12 @@ BAT *PyObject_ConvertToBAT(PyReturn *ret
                             } else if (PyUnicode_CheckExact(obj)) {
 #ifndef IS_PY3K
                                 Py_UNICODE *str = 
(Py_UNICODE*)((PyUnicodeObject*)obj)->str;
+#if Py_UNICODE_SIZE >= 4
                                 utf32_to_utf8(0, 
((PyUnicodeObject*)obj)->length, utf8_string, str);
 #else
+                                ucs2_to_utf8(0, 
((PyUnicodeObject*)obj)->length, utf8_string, str);
+#endif
+#else
                                 char *str = PyUnicode_AsUTF8(obj);
                                 if (!string_copy(str, utf8_string, strlen(str) 
+ 1, true)) {
                                     msg = createException(MAL, "pyapi.eval", 
"Invalid string encoding used. Please return a regular ASCII string, or a 
Numpy_Unicode object.\n");
diff --git a/monetdb5/extras/pyapi/type_conversion.c 
b/monetdb5/extras/pyapi/type_conversion.c
--- a/monetdb5/extras/pyapi/type_conversion.c
+++ b/monetdb5/extras/pyapi/type_conversion.c
@@ -161,7 +161,7 @@ str pyobject_to_##type(PyObject **pyobj,
     str unicode_to_##tpe(Py_UNICODE *ptr, size_t maxsize, tpe *value) \
     {                                                              \
         char utf8[255];                                            \
-        utf32_to_utf8(0, 255, utf8, ptr);                          \
+        unicode_to_utf8(0, 255, utf8, ptr);                          \
         return str_to_##tpe(utf8, maxsize, value);                 \
     }                                                              \
     PY_TO_(tpe, inttpe);
diff --git a/monetdb5/extras/pyapi/unicode.c b/monetdb5/extras/pyapi/unicode.c
--- a/monetdb5/extras/pyapi/unicode.c
+++ b/monetdb5/extras/pyapi/unicode.c
@@ -46,15 +46,13 @@ int utf8_length(unsigned char utf8_char)
     else return -1; //invalid utf8 character, the maximum value of the first 
byte is 0xf7
 }
 
-int utf32_char_to_utf8_char(size_t position, char *utf8_storage, Py_UNICODE 
utf32_char)
+int utf32_char_to_utf8_char(size_t position, char *utf8_storage, unsigned int 
utf32_char)
 {
     int utf8_size = 4;
     if      (utf32_char < 0x80)        utf8_size = 1;
     else if (utf32_char < 0x800)       utf8_size = 2;
-#if Py_UNICODE_SIZE >= 4
     else if (utf32_char < 0x10000)     utf8_size = 3;
     else if (utf32_char > 0x0010FFFF)  return -1; //utf32 character is out of 
legal range
-#endif
     
     switch(utf8_size)
     {
@@ -79,13 +77,36 @@ int utf32_char_to_utf8_char(size_t posit
     }
 }
 
-bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *utf32)
+bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *ucs2)
 {
     size_t i = 0;
     int position = 0;
     int shift;
     for(i = 0; i < size; i++)
     {
+        if (ucs2[offset + i] == 0) 
+        {
+            utf8_storage[position] = '\0';
+            return true;
+        }
+        shift = utf32_char_to_utf8_char(position, utf8_storage, ucs2[offset + 
i]);
+        if (shift < 0) return false;
+        position += shift;
+    }
+    utf8_storage[position] = '\0';
+    return true;
+}
+
+
+bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *utf32_input)
+{
+    size_t i = 0;
+    int position = 0;
+    int shift;
+    unsigned int *utf32 = (unsigned int*) utf32_input;
+
+    for(i = 0; i < size; i++)
+    {
         if (utf32[offset + i] == 0) 
         {
             utf8_storage[position] = '\0';
@@ -100,6 +121,13 @@ bool utf32_to_utf8(size_t offset, size_t
     return true;
 }
 
+bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *unicode) {
+#if Py_UNICODE_SIZE == 2
+    return ucs2_to_utf8(offset, size, utf8_storage, unicode);
+#else
+    return utf32_to_utf8(offset, size, utf8_storage, unicode);
+#endif
+}
 
 int utf8_char_to_utf32_char(size_t position, Py_UNICODE *utf32_storage, int 
offset, const unsigned char *utf8_char)
 {
diff --git a/monetdb5/extras/pyapi/unicode.h b/monetdb5/extras/pyapi/unicode.h
--- a/monetdb5/extras/pyapi/unicode.h
+++ b/monetdb5/extras/pyapi/unicode.h
@@ -43,6 +43,9 @@ int utf8_length(unsigned char utf8_char)
 */
 bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *utf32);
 
+bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *ucs2);
+
+bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage, const 
Py_UNICODE *unicode);
 //! Converts a utf8 string to a utf32 string, returns TRUE on success and 
FALSE on failure
 /* Arguments:
        offset: The offset in the utf8 array in bytes
@@ -67,7 +70,7 @@ int utf8_char_to_utf32_char(size_t posit
        utf32_storage:
        utf8_char:
 */
-int utf32_char_to_utf8_char(size_t position, char *utf8_storage, Py_UNICODE 
utf32_char);
+int utf32_char_to_utf8_char(size_t position, char *utf8_storage, unsigned int 
utf32_char);
 
 void _unicode_init(void);
 
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: pythonudf - Proper unicode handling for UCS2 Python distributions

Reply via email to