Enlightenment CVS committal Author : dj2 Project : e17 Module : libs/ewl
Dir : e17/libs/ewl/src/lib Modified Files: ewl_text.c Log Message: - patch from pfritz to make the utf8 stuff a bit smarter =================================================================== RCS file: /cvs/e/e17/libs/ewl/src/lib/ewl_text.c,v retrieving revision 1.127 retrieving revision 1.128 diff -u -3 -r1.127 -r1.128 --- ewl_text.c 14 Sep 2006 05:35:38 -0000 1.127 +++ ewl_text.c 23 Sep 2006 21:12:03 -0000 1.128 @@ -18,6 +18,21 @@ */ static Ecore_Hash *context_hash = NULL; +static const char ewl_text_trailing_bytes[256] = { + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 +}; + +/* returns length of next utf-8 sequence */ +#define EWL_TEXT_CHAR_BYTE_LEN(s) \ + (ewl_text_trailing_bytes[(unsigned int)(unsigned char)((s)[0])]) + static void ewl_text_context_cb_free(void *data); static void ewl_text_context_print(Ewl_Text_Context *tx, const char *indent); static char *ewl_text_context_name_get(Ewl_Text_Context *tx, @@ -68,8 +83,11 @@ unsigned int byte_len, unsigned int *char_idx, unsigned int *char_len); -static unsigned int ewl_text_char_length_get(const char *text); -static char *ewl_text_text_next_char(const char *text, unsigned int *idx); +static char *ewl_text_text_next_char(const char *text, + unsigned int *idx); +static char *ewl_text_text_utf8_validate(const char *text, + unsigned int *char_len, + unsigned int *byte_len); /** @@ -331,9 +349,9 @@ } else { - evas_textblock_cursor_char_geometry_get(cursor, + evas_textblock_cursor_char_geometry_get(cursor, &cx, &cy, &cw, &ch); - if (tx > (cx + ((cw + 1) >> 1))) + if (tx > (cx + ((cw + 1) >> 1))) char_idx ++; } @@ -450,16 +468,15 @@ int char_len = 0; int byte_len = 0; unsigned int byte_idx; + char *valid_text = NULL; DENTER_FUNCTION(DLEVEL_STABLE); DCHECK_PARAM_PTR("t", t); DCHECK_TYPE("t", t, EWL_TEXT_TYPE); - if (text) - { - byte_len = strlen(text); - char_len = ewl_text_char_length_get(text); - } + if (text) + valid_text = ewl_text_text_utf8_validate(text, &char_len, + &byte_len); /* Limit the index to be within safe boundaries */ if (char_idx > t->length.chars + 1) @@ -497,7 +514,8 @@ t->text + byte_idx, t->length.bytes - byte_idx); - memcpy(t->text + byte_idx, text, byte_len); + memcpy(t->text + byte_idx, valid_text, byte_len); + FREE(valid_text); t->length.chars += char_len; t->length.bytes += byte_len; t->text[t->length.bytes] = '\0'; @@ -2348,7 +2366,7 @@ DCHECK_PARAM_PTR("t", t); DCHECK_TYPE("t", t, EWL_TEXT_TYPE); - child = ewl_text_tree_node_get(t->formatting.tree, char_idx, TRUE); /* XXX TRUE or FALSE? */ + child = ewl_text_tree_node_get(t->formatting.tree, char_idx, TRUE); parent = child->parent; while (parent) { @@ -2422,7 +2440,7 @@ DCHECK_TYPE("t", t, EWL_TEXT_TYPE); child = ewl_text_tree_node_in_bytes_get(t->formatting.tree, - byte_idx, TRUE); /* XXX TRUE or FALSE? */ + byte_idx, TRUE); parent = child->parent; while (parent) { @@ -2479,67 +2497,144 @@ DLEAVE_FUNCTION(DLEVEL_STABLE); } -/* Counts the number of characters in the given piece of text. Assume the - * text is utf8 so take that into account when counting. */ -static unsigned int -ewl_text_char_length_get(const char *text) +/* + * This function checks if a given character is a utf character. + * It only checks the first character in the string. + */ +static int +ewl_text_char_is_legal_utf8(const char *c) { - unsigned int length = 0, idx; - const char *t; + unsigned const char *t; DENTER_FUNCTION(DLEVEL_STABLE); + DCHECK_PARAM_PTR_RET("c", c, FALSE); - if (!text || (strlen(text) == 0)) - DRETURN_INT(length, DLEVEL_STABLE); - - t = text; - while ((t = ewl_text_text_next_char(t, &idx))) - length ++; + t = c; + if (!t) DRETURN_INT(FALSE, DLEVEL_STABLE); + + if (t[0] < 0x80) + { + /* + * this a noraml 7-bit ASCII character + * -> legal utf8 + */ + DRETURN_INT(TRUE, DLEVEL_STABLE); + } - DRETURN_INT(length, DLEVEL_STABLE); + switch (EWL_TEXT_CHAR_BYTE_LEN(t)) + { + case 2: + /* 2 byte */ + if ((t[1] & 0xc0) != 0x80) + DRETURN_INT(FALSE, DLEVEL_STABLE); + break; + + case 3: + /* 3 byte */ + if (((t[1] & 0xc0) != 0x80) + || ((t[2] & 0xc0) != 0x80)) + DRETURN_INT(FALSE, DLEVEL_STABLE); + break; + + case 4: + /* 4 byte */ + if (((t[1] & 0xc0) != 0x80) + || ((t[2] & 0xc0) != 0x80) + || ((t[3] & 0xc0) != 0x80)) + DRETURN_INT(FALSE, DLEVEL_STABLE); + break; + + default: + /* + * this is actually: + * case 1: + * We already checked if it is a 7-bit ASCII character, + * so anything else with the length of 1 byte is not + * a valid utf8 character + * case 5: case 6: + * Although a character sequences of the length 5 or 6 + * is possible it is not a legal utf8 character + */ + return FALSE; + } + + DRETURN_INT(TRUE, DLEVEL_STABLE); } -/* This is stolen from evas_common_font_utf8_get_next() */ +/* + * This function return the next character of a utf string. + * The text pointer should point on the leading byte of the + * current character, otherwise it will return the adress of + * the next byte. + */ static char * ewl_text_text_next_char(const char *text, unsigned int *idx) { - unsigned char d, d2, d3, d4; - + int len; + DENTER_FUNCTION(DLEVEL_STABLE); + DCHECK_PARAM_PTR_RET("text", text, NULL); - *idx = 0; + len = EWL_TEXT_CHAR_BYTE_LEN(text); + if (idx) *idx = len; - if (!text || (text[0] == '\0')) DRETURN_PTR(NULL, DLEVEL_STABLE); + DRETURN_PTR(text + len, DLEVEL_STABLE); +} - d = text[(*idx)++]; - if (!d) DRETURN_PTR(text + *idx, DLEVEL_STABLE); /* error .. */ +/* + * This function valdiates a a given utf-string and return a copy + * of it. Should the string contain illegal bytes, it will replace + * them with a question mark. This function doesn't check if the + * correspondending unicode exists for the single character nor + * if the font provides it. + */ +static char * +ewl_text_text_utf8_validate(const char *text, unsigned int *char_len, + unsigned int *byte_len) +{ + char *t, *new_t; + unsigned int idx; + unsigned int c_len; - if (d < 0x80) - DRETURN_PTR(text + *idx, DLEVEL_STABLE); + DENTER_FUNCTION(DLEVEL_STABLE); + DCHECK_PARAM_PTR_RET("text", text, NULL); + + new_t = t = strdup(text); + c_len = 0; - if ((d & 0xe0) == 0xc0) - { - /* 2 byte */ - if (((d2 = text[(*idx)++]) & 0xc0) != 0x80) - DRETURN_PTR(text + *idx, DLEVEL_STABLE); /* error .. */ - } - else if ((d & 0xf0) == 0xe0) + while (*t) { - /* 3 byte */ - if ((((d2 = text[(*idx)++]) & 0xc0) != 0x80) - || (((d3 = text[(*idx)++]) & 0xc0) != 0x80)) - DRETURN_PTR(text + *idx, DLEVEL_STABLE); /* error .. */ - } - else - { - /* 4 byte */ - if ((((d2 = text[(*idx)++]) & 0xc0) != 0x80) - || (((d3 = text[(*idx)++]) & 0xc0) != 0x80) - || (((d4 = text[(*idx)++]) & 0xc0) != 0x80)) - DRETURN_PTR(text + *idx, DLEVEL_STABLE); /* error .. */ + if (ewl_text_char_is_legal_utf8(t)) + { + /* + * the current character is valid utf-character + * so we can jump to the next character + */ + t = ewl_text_text_next_char(t, &idx); + } + else + { + /* + * oops, we found a illegal utf-character, or better + * something else. Replace this byte and hope + * the next one will be better :) + */ + *t = '?'; + t++; + + printf("found a non utf8 character\n"); + } + c_len++; } - - DRETURN_PTR(text + *idx, DLEVEL_STABLE); + + /* + * Well this is just a by-product, so we can use it + * without doing this loop again + */ + if (char_len) *char_len = c_len; + if (byte_len) *byte_len = t - new_t; + + DRETURN_PTR(new_t, DLEVEL_STABLE); } static void @@ -2597,7 +2692,7 @@ txt = ewl_text_text_next_char(tmp, &idx); } - else if (*tmp == '\r' && *(ewl_text_text_next_char(tmp, &idx)) == '\n') + else if (*tmp == '\r' && *(tmp + 1) == '\n') { *tmp = '\0'; if (*txt) evas_textblock_cursor_text_append(cursor, txt); ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ enlightenment-cvs mailing list enlightenment-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/enlightenment-cvs