Changeset: 9ba567019857 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9ba567019857
Modified Files:
monetdb5/extras/pyapi/pyapi.c
monetdb5/extras/pyapi/type_conversion.c
monetdb5/extras/pyapi/unicode.c
monetdb5/extras/pyapi/unicode.h
Branch: pythonudf
Log Message:
Proper unicode handling for UCS2 Python distributions.
diffs (122 lines):
diff --git a/monetdb5/extras/pyapi/pyapi.c b/monetdb5/extras/pyapi/pyapi.c
--- a/monetdb5/extras/pyapi/pyapi.c
+++ b/monetdb5/extras/pyapi/pyapi.c
@@ -2543,8 +2543,12 @@ BAT *PyObject_ConvertToBAT(PyReturn *ret
} else if (PyUnicode_CheckExact(obj)) {
#ifndef IS_PY3K
Py_UNICODE *str =
(Py_UNICODE*)((PyUnicodeObject*)obj)->str;
+#if Py_UNICODE_SIZE >= 4
utf32_to_utf8(0,
((PyUnicodeObject*)obj)->length, utf8_string, str);
#else
+ ucs2_to_utf8(0,
((PyUnicodeObject*)obj)->length, utf8_string, str);
+#endif
+#else
char *str = PyUnicode_AsUTF8(obj);
if (!string_copy(str, utf8_string, strlen(str)
+ 1, true)) {
msg = createException(MAL, "pyapi.eval",
"Invalid string encoding used. Please return a regular ASCII string, or a
Numpy_Unicode object.\n");
diff --git a/monetdb5/extras/pyapi/type_conversion.c b/monetdb5/extras/pyapi/type_conversion.c
--- a/monetdb5/extras/pyapi/type_conversion.c
+++ b/monetdb5/extras/pyapi/type_conversion.c
@@ -161,7 +161,7 @@ str pyobject_to_##type(PyObject **pyobj,
str unicode_to_##tpe(Py_UNICODE *ptr, size_t maxsize, tpe *value) \
{ \
char utf8[255]; \
- utf32_to_utf8(0, 255, utf8, ptr); \
+ unicode_to_utf8(0, 255, utf8, ptr); \
return str_to_##tpe(utf8, maxsize, value); \
} \
PY_TO_(tpe, inttpe);
diff --git a/monetdb5/extras/pyapi/unicode.c b/monetdb5/extras/pyapi/unicode.c
--- a/monetdb5/extras/pyapi/unicode.c
+++ b/monetdb5/extras/pyapi/unicode.c
@@ -46,15 +46,13 @@ int utf8_length(unsigned char utf8_char)
else return -1; //invalid utf8 character, the maximum value of the first
byte is 0xf7
}
-int utf32_char_to_utf8_char(size_t position, char *utf8_storage, Py_UNICODE
utf32_char)
+int utf32_char_to_utf8_char(size_t position, char *utf8_storage, unsigned int
utf32_char)
{
int utf8_size = 4;
if (utf32_char < 0x80) utf8_size = 1;
else if (utf32_char < 0x800) utf8_size = 2;
-#if Py_UNICODE_SIZE >= 4
else if (utf32_char < 0x10000) utf8_size = 3;
else if (utf32_char > 0x0010FFFF) return -1; //utf32 character is out of
legal range
-#endif
switch(utf8_size)
{
@@ -79,13 +77,36 @@ int utf32_char_to_utf8_char(size_t posit
}
}
-bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *utf32)
+bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *ucs2)
{
size_t i = 0;
int position = 0;
int shift;
for(i = 0; i < size; i++)
{
+ if (ucs2[offset + i] == 0)
+ {
+ utf8_storage[position] = '\0';
+ return true;
+ }
+ shift = utf32_char_to_utf8_char(position, utf8_storage, ucs2[offset +
i]);
+ if (shift < 0) return false;
+ position += shift;
+ }
+ utf8_storage[position] = '\0';
+ return true;
+}
+
+
+bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *utf32_input)
+{
+ size_t i = 0;
+ int position = 0;
+ int shift;
+ unsigned int *utf32 = (unsigned int*) utf32_input;
+
+ for(i = 0; i < size; i++)
+ {
if (utf32[offset + i] == 0)
{
utf8_storage[position] = '\0';
@@ -100,6 +121,13 @@ bool utf32_to_utf8(size_t offset, size_t
return true;
}
+bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *unicode) {
+#if Py_UNICODE_SIZE == 2
+ return ucs2_to_utf8(offset, size, utf8_storage, unicode);
+#else
+ return utf32_to_utf8(offset, size, utf8_storage, unicode);
+#endif
+}
int utf8_char_to_utf32_char(size_t position, Py_UNICODE *utf32_storage, int
offset, const unsigned char *utf8_char)
{
diff --git a/monetdb5/extras/pyapi/unicode.h b/monetdb5/extras/pyapi/unicode.h
--- a/monetdb5/extras/pyapi/unicode.h
+++ b/monetdb5/extras/pyapi/unicode.h
@@ -43,6 +43,9 @@ int utf8_length(unsigned char utf8_char)
*/
bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *utf32);
+bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *ucs2);
+
+bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage, const
Py_UNICODE *unicode);
//! Converts a utf8 string to a utf32 string, returns TRUE on success and
FALSE on failure
/* Arguments:
offset: The offset in the utf8 array in bytes
@@ -67,7 +70,7 @@ int utf8_char_to_utf32_char(size_t posit
utf32_storage:
utf8_char:
*/
-int utf32_char_to_utf8_char(size_t position, char *utf8_storage, Py_UNICODE
utf32_char);
+int utf32_char_to_utf8_char(size_t position, char *utf8_storage, unsigned int
utf32_char);
void _unicode_init(void);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list