Re: strcasecmp and multibyte encodings
Thanks for feedback to Stephan and Matthew. Updating patch with all your feedback, and having into account that n in strncmp counts bytes on s1 (as stated in the thread of the POSIX list sent byt Matthew). On Tue, Apr 09, 2013 at 11:17:26AM -0700, Matthew Dempsky wrote: These strlen() calls are also wrong, because they could read past the n bytes allowed for strncasecmp(). O It seems strncasecmp() cannot return errors according to POSIX. Only the _l variants have errors defined. The convention for methods like these to return errors is the caller is required to set errno = 0; before the call, and then check for errno != 0 after the call. :( While I recognize that POSIX mandates the current locale to be used for case conversion, I'm not sure this change is worth the extra complexity. As you point out, the standard seems to throw its hands up in the air when the question about conversion errors with strcasecmp() and strncasecmp() is raised, possibly because these functions are older than the locale concept. The definition of strncasecmp() has actually been a pretty contentious topic on the POSIX mailing list for the past few weeks, albeit mostly focused on whether LC_CTYPE=POSIX allows for UTF-8 or not. E.g., http://austingroupbugs.net/view.php?id=663 I'm inclined to leave the functions as is for now. -- Dios, gracias por tu amor infinito. -- Vladimir Támara Patiño. http://vtamara.pasosdeJesus.org/ http://www.pasosdejesus.org/dominio_publico_colombia.html --- src53/lib/libc/string/strcasecmp.c Mon Mar 25 18:28:29 2013 +++ src/lib/libc/string/strcasecmp.cWed Apr 10 04:20:02 2013 @@ -29,7 +29,10 @@ * SUCH DAMAGE. */ +#include stdlib.h #include string.h +#include wchar.h +#include wctype.h typedef unsigned char u_char; @@ -76,18 +79,11 @@ int strcasecmp(const char *s1, const char *s2) { - const u_char *cm = charmap; - const u_char *us1 = (const u_char *)s1; - const u_char *us2 = (const u_char *)s2; - - while (cm[*us1] == cm[*us2++]) - if (*us1++ == '\0') - return (0); - return (cm[*us1] - cm[*--us2]); + return strncasecmp(s1, s2, strlen(s1) + 1); } int -strncasecmp(const char *s1, const char *s2, size_t n) +sbstrncasecmp(const char *s1, const char *s2, size_t n) { if (n != 0) { const u_char *cm = charmap; @@ -100,6 +96,42 @@ if (*us1++ == '\0') break; } while (--n != 0); + } + return (0); +} + +int +strncasecmp(const char *s1, const char *s2, size_t n) +{ + mbstate_t mb1, mb2; + + bzero(mb1, sizeof(mb1)); + bzero(mb2, sizeof(mb2)); + if (n != 0) { + const u_char *us1 = (const u_char *)s1; + const u_char *us2 = (const u_char *)s2; + size_t d1, d2; + + do { + wchar_t w1, w2, l1, l2; + size_t ml = n MB_CUR_MAX ? n : MB_CUR_MAX; + d1 = mbrtowc(w1, us1, ml, mb1); + if (d1 == (size_t)-1 || d1 == (size_t)-2) { + return sbstrncasecmp(s1, s2, n); + } + d2 = mbrtowc(w2, us2, ml, mb2); + if (d2 == (size_t)-1 || d2 == (size_t)-2) { + return sbstrncasecmp(s1, s2, n); + } + if ((l1 = towlower(w1)) != (l2 = towlower(w2))) { + return l1 - l2; + } + if (*us1 == '\0') + break; + us2 += d2; + us1 += d1; + n -= d1; + } while (n != 0); } return (0); }
Re: strcasecmp and multibyte encodings
On Wed, Apr 10, 2013 at 06:12:01AM -0500, Vladimir Támara Patiño wrote: Thanks for feedback to Stephan and Matthew. Updating patch with all your feedback, and having into account that n in strncmp counts bytes on s1 (as stated in the thread of the POSIX list sent byt Matthew). I think you misunderstood. We don't want to change the strcasecmp() and strcasencmp() functions because their behaviour with respect to locales and non-ASCII character sets aren't well defined by POSIX. The bits of review of your code were just for your information.
strcasecmp and multibyte encodings
Although the behavior of strcasecmp is unsepecified for multibyte encodings (Is that right?) http://pubs.opengroup.org/onlinepubs/9699919799/ I wish the attached test (encoded in UTF-8) would pass, so I'm also attaching a patch for strcasecmp and strncasecmp that makes this test pass, it uses only LC_CTYPE (but not LC_COLLATE) and a simple numeric comparision when the strings are different (as the previous version was doing). -- Dios, gracias por tu amor infinito. -- Vladimir Támara Patiño. http://vtamara.pasosdeJesus.org/ http://www.pasosdejesus.org/dominio_publico_colombia.html #include locale.h #include stdio.h #include string.h int main() { char *nl = setlocale(LC_ALL, es_CO.UTF-8); if (strcasecmp(ñ, Ñ) == 0) { printf(OK); } else { printf(Error); } return 0; } --- src53/lib/libc/string/strcasecmp.c Mon Mar 25 18:28:29 2013 +++ src/lib/libc/string/strcasecmp.cTue Apr 9 11:34:44 2013 @@ -30,6 +30,8 @@ */ #include string.h +#include wchar.h +#include wctype.h typedef unsigned char u_char; @@ -76,29 +78,36 @@ int strcasecmp(const char *s1, const char *s2) { - const u_char *cm = charmap; - const u_char *us1 = (const u_char *)s1; - const u_char *us2 = (const u_char *)s2; - - while (cm[*us1] == cm[*us2++]) - if (*us1++ == '\0') - return (0); - return (cm[*us1] - cm[*--us2]); + return strncasecmp(s1, s2, strlen(s1) + 1); } +/** Uses LC_CTYPE but not LC_COLLATE */ int strncasecmp(const char *s1, const char *s2, size_t n) { + mbstate_t mb1, mb2; + bzero(mb1, sizeof(mb1)); + bzero(mb2, sizeof(mb2)); + mbsinit(mb1); + mbsinit(mb2); + if (n != 0) { - const u_char *cm = charmap; const u_char *us1 = (const u_char *)s1; const u_char *us2 = (const u_char *)s2; + size_t lus1 = strlen(us1); + size_t lus2 = strlen(us2); do { - if (cm[*us1] != cm[*us2++]) - return (cm[*us1] - cm[*--us2]); - if (*us1++ == '\0') + wchar_t w1, w2, l1, l2; + size_t d1 = mbrtowc(w1, us1, lus1, mb1); + size_t d2 = mbrtowc(w2, us2, lus2, mb2); + if ((l1 = towlower(w1)) != (l2 = towlower(w2))) { + return l1 - l2; + } + if (*us1 == '\0') break; + us2 += d2; + us1 += d1; } while (--n != 0); } return (0);
Re: strcasecmp and multibyte encodings
On Tue, Apr 09, 2013 at 12:25:23PM -0500, Vladimir Támara Patiño wrote: Although the behavior of strcasecmp is unsepecified for multibyte encodings (Is that right?) http://pubs.opengroup.org/onlinepubs/9699919799/ I wish the attached test (encoded in UTF-8) would pass, so I'm also attaching a patch for strcasecmp and strncasecmp that makes this test pass, it uses only LC_CTYPE (but not LC_COLLATE) and a simple numeric comparision when the strings are different (as the previous version was doing). -- Dios, gracias por tu amor infinito. -- Vladimir Támara Patiño. http://vtamara.pasosdeJesus.org/ http://www.pasosdejesus.org/dominio_publico_colombia.html #include locale.h #include stdio.h #include string.h int main() { char *nl = setlocale(LC_ALL, es_CO.UTF-8); if (strcasecmp(ñ, Ñ) == 0) { printf(OK); } else { printf(Error); } return 0; } --- src53/lib/libc/string/strcasecmp.cMon Mar 25 18:28:29 2013 +++ src/lib/libc/string/strcasecmp.c Tue Apr 9 11:34:44 2013 @@ -30,6 +30,8 @@ */ #include string.h +#include wchar.h +#include wctype.h typedef unsigned char u_char; @@ -76,29 +78,36 @@ int strcasecmp(const char *s1, const char *s2) { - const u_char *cm = charmap; - const u_char *us1 = (const u_char *)s1; - const u_char *us2 = (const u_char *)s2; - - while (cm[*us1] == cm[*us2++]) - if (*us1++ == '\0') - return (0); - return (cm[*us1] - cm[*--us2]); + return strncasecmp(s1, s2, strlen(s1) + 1); } +/** Uses LC_CTYPE but not LC_COLLATE */ int strncasecmp(const char *s1, const char *s2, size_t n) { + mbstate_t mb1, mb2; Please put a blank line between declarations and code. + bzero(mb1, sizeof(mb1)); + bzero(mb2, sizeof(mb2)); + mbsinit(mb1); + mbsinit(mb2); These mbsinit() calls make no sense. mbsinit does not initialize anything. See the mbsinit man page. The bzero() calls are sufficient. + if (n != 0) { - const u_char *cm = charmap; const u_char *us1 = (const u_char *)s1; const u_char *us2 = (const u_char *)s2; + size_t lus1 = strlen(us1); + size_t lus2 = strlen(us2); do { - if (cm[*us1] != cm[*us2++]) - return (cm[*us1] - cm[*--us2]); - if (*us1++ == '\0') + wchar_t w1, w2, l1, l2; + size_t d1 = mbrtowc(w1, us1, lus1, mb1); + size_t d2 = mbrtowc(w2, us2, lus2, mb2); How are you going to handle errors from mbrtowc()? It seems strncasecmp() cannot return errors according to POSIX. Only the _l variants have errors defined. + if ((l1 = towlower(w1)) != (l2 = towlower(w2))) { Here, you could be comparing (size_t)-1 to (size_t)-2, for example. Should this code fall back to byte-to-byte comparison if mbrtowc() fails? While I recognize that POSIX mandates the current locale to be used for case conversion, I'm not sure this change is worth the extra complexity. As you point out, the standard seems to throw its hands up in the air when the question about conversion errors with strcasecmp() and strncasecmp() is raised, possibly because these functions are older than the locale concept. I would say leave these functions as they are, and rely on applications to use wide characters or the _l variants of these functions instead. + return l1 - l2; + } + if (*us1 == '\0') break; + us2 += d2; + us1 += d1; } while (--n != 0); } return (0);
Re: strcasecmp and multibyte encodings
On Tue, Apr 9, 2013 at 11:10 AM, Stefan Sperling s...@openbsd.org wrote: + size_t lus1 = strlen(us1); + size_t lus2 = strlen(us2); These strlen() calls are also wrong, because they could read past the n bytes allowed for strncasecmp(). It seems strncasecmp() cannot return errors according to POSIX. Only the _l variants have errors defined. The convention for methods like these to return errors is the caller is required to set errno = 0; before the call, and then check for errno != 0 after the call. :( While I recognize that POSIX mandates the current locale to be used for case conversion, I'm not sure this change is worth the extra complexity. As you point out, the standard seems to throw its hands up in the air when the question about conversion errors with strcasecmp() and strncasecmp() is raised, possibly because these functions are older than the locale concept. The definition of strncasecmp() has actually been a pretty contentious topic on the POSIX mailing list for the past few weeks, albeit mostly focused on whether LC_CTYPE=POSIX allows for UTF-8 or not. E.g., http://austingroupbugs.net/view.php?id=663 I'm inclined to leave the functions as is for now.