Hello, a few months ago I mentioned[1] an issue with the _isctype/iswctype 
functions when it comes to handling the TAB character.

I'm attaching simple programs which can be used to demonstrate inconsistencies 
between the generic _isctype/iswctype and class-specific functions such as 
isblank/iswblank. The first argument to each program is the locale to test; 
for example, you can pass "C" or "english".

Notable differences include:

1. is[w]ctype(_BLANK) returns inconsistent results for the TAB character, as 
was mentioned in [1].
2. isdigit returns non-zero only for digits 0-9, while _isctype may return 
non-zero for other code points such as 0xB2, 0xB3 and 0xB9 in ISO-8859-1 (code 
page 28591). Behavior of _isctype is consistent with iswctype and iswdigit 
though.

While writing those simple test programs I realized that mingw-w64's wctype.c 
does not handle "blank" class, so this definitely needs to be fixed.

The main source of issues comes from _BLANK macro in [w]ctype.h. Microsoft 
header files mention that _BLANK is only used for *space* characters, so TAB 
must be handled explicitly. In practice, usage of _BLANK produces inconsistent 
results.

Also note that _BLANK is used in macro versions of is[w]print, which means they 
may return non-zero for TAB in the same situations when is[w]ctype would. 
Library versions of is[w]print seem to follow behavior of is[w]ctype(_BLANK) 
for TAB character:

1. When iswctype(L'\t', _BLANK) is non-zero, iswprint(L'\t') is also non-zero. 
This is behavior of CRTs which do not have is[w]blank functions: msvcr110.dll 
and older.
2. When iswctype(L'\t', _BLANK) is zero, iswprint(L'\t') is also zero. This is 
behavior of CRTs which have is[w]blank functions: msvcr120.dll and UCRT.

However, library's isprint ('\t') is always zero in "C" locale and non-zero 
otherwise.

In addition to adding support for "blank" in wctype.c, we may want to replace 
the is[w]ctype functions for all CRTs, which sounds like overkill to me. I 
would like to know what others think about this matter and whether it is worth 
replacing the is[w]ctype functions to properly handle the TAB character.

- Kirill Makurin

[1] https://sourceforge.net/p/mingw-w64/mailman/message/59189945/
#define __USE_MINGW_ANSI_STDIO 0

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <io.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>

#define WIN32_LEAN_AND_MEAN
#include <windows.h>

/*
 * Remove any macro definitions of the narrow classification functions that
 * the headers included above may have provided, so that the calls in this
 * file refer to the library functions instead of macro expansions.
 */
#undef iscntrl
#undef isprint
#undef isgraph
#undef isalnum
#undef isalpha
#undef isupper
#undef islower
#undef isdigit
#undef ispunct
#undef isspace
#undef isblank

/*
 * Writes one row of the mismatch table to stdout.
 *
 *   class    - name of the character class being compared
 *   c        - code point for which the two results disagree
 *   specific - normalized result of the class-specific function
 *   generic  - normalized result of the generic function
 *   print    - when non-zero, the code point is additionally shown as a
 *              character in parentheses; when zero only its hex value is shown
 */
static void Mismatch(const wchar_t *class, int c, int specific, int generic,
                     int print) {
  /* Hex-only row: the code point column is padded to the full width. */
  if (!print) {
    wprintf(L"0x%-8.2X | %s | %-8d %-7d\n", c, class, specific, generic);
    return;
  }

  /* Row with the character itself appended to the code point column. */
  wprintf(L"0x%-4.2X (%hc) | %s | %-8d %-7d\n", c, c, class, specific,
          generic);
}

/*
 * Compares each class-specific <ctype.h> function (isalnum, isalpha, ...)
 * with the generic _isctype() for every code point in 0x00..0xFF and prints
 * one table row per disagreement.
 *
 * argv[1], when present, is the locale name passed to _wsetlocale(); when it
 * is absent the empty string L"" is passed instead.
 *
 * Terminates via exit(): EXIT_FAILURE when the locale cannot be set,
 * EXIT_SUCCESS after the whole range has been checked.
 */
int wmain(int argc, wchar_t **argv) {
  /*
   * The condition must stay on one logical line: a preprocessing directive
   * ends at the newline, so a wrapped condition needs a backslash.
   */
#if defined(_UCRT) || \
    (defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ >= 0x0800)
  _setmode(_fileno(stdout), _O_U8TEXT);
  _setmode(_fileno(stderr), _O_U8TEXT);
#endif

  if (_wsetlocale(LC_ALL, argc > 1 ? argv[1] : L"") == NULL) {
    fwprintf(stderr, L"Failed to set locale\n");
    exit(EXIT_FAILURE);
  }

  /* Look up the mask for every class; a zero result means "unknown class". */
  wctype_t alnum = wctype("alnum");
  assert(alnum != (wctype_t)0);
  wctype_t alpha = wctype("alpha");
  assert(alpha != (wctype_t)0);
  /*
   * Only query "blank" by name on the newer CRT versions selected here;
   * otherwise fall back to the _BLANK mask directly.
   */
#if defined(_UCRT) || \
    (defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ >= 0x0C00)
  wctype_t blank = wctype("blank");
  assert(blank != (wctype_t)0);
#else
  wctype_t blank = _BLANK;
#endif
  wctype_t cntrl = wctype("cntrl");
  assert(cntrl != (wctype_t)0);
  wctype_t digit = wctype("digit");
  assert(digit != (wctype_t)0);
  wctype_t graph = wctype("graph");
  assert(graph != (wctype_t)0);
  wctype_t lower = wctype("lower");
  assert(lower != (wctype_t)0);
  wctype_t print = wctype("print");
  assert(print != (wctype_t)0);
  wctype_t punct = wctype("punct");
  assert(punct != (wctype_t)0);
  wctype_t space = wctype("space");
  assert(space != (wctype_t)0);
  wctype_t upper = wctype("upper");
  assert(upper != (wctype_t)0);

  wprintf(L"Code point | Class | Specific Generic\n");

  for (int c = 0; c < 0x100; ++c) {
    int specific = 0;
    int generic = 0;

    /*
     * "!!" collapses any non-zero result to 1 so that the two functions are
     * compared as booleans rather than by their raw return values.
     */
    if ((specific = !!isalnum(c)) != (generic = !!_isctype(c, alnum))) {
      Mismatch(L"alnum", c, specific, generic, 1);
    }
    if ((specific = !!isalpha(c)) != (generic = !!_isctype(c, alpha))) {
      Mismatch(L"alpha", c, specific, generic, 1);
    }
    /* blank and cntrl rows are reported without echoing the character. */
    if ((specific = !!isblank(c)) != (generic = !!_isctype(c, blank))) {
      Mismatch(L"blank", c, specific, generic, 0);
    }
    if ((specific = !!iscntrl(c)) != (generic = !!_isctype(c, cntrl))) {
      Mismatch(L"cntrl", c, specific, generic, 0);
    }
    if ((specific = !!isdigit(c)) != (generic = !!_isctype(c, digit))) {
      Mismatch(L"digit", c, specific, generic, 1);
    }
    if ((specific = !!isgraph(c)) != (generic = !!_isctype(c, graph))) {
      Mismatch(L"graph", c, specific, generic, 1);
    }
    if ((specific = !!islower(c)) != (generic = !!_isctype(c, lower))) {
      Mismatch(L"lower", c, specific, generic, 1);
    }
    if ((specific = !!isprint(c)) != (generic = !!_isctype(c, print))) {
      Mismatch(L"print", c, specific, generic, 1);
    }
    if ((specific = !!ispunct(c)) != (generic = !!_isctype(c, punct))) {
      Mismatch(L"punct", c, specific, generic, 1);
    }
    if ((specific = !!isspace(c)) != (generic = !!_isctype(c, space))) {
      Mismatch(L"space", c, specific, generic, 1);
    }
    if ((specific = !!isupper(c)) != (generic = !!_isctype(c, upper))) {
      Mismatch(L"upper", c, specific, generic, 1);
    }
  }

  exit(EXIT_SUCCESS);
}
#define __USE_MINGW_ANSI_STDIO 0

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <io.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>

#define WIN32_LEAN_AND_MEAN
#include <windows.h>

/*
 * Remove any macro definitions of the wide classification functions that
 * the headers included above may have provided, so that the calls in this
 * file refer to the library functions instead of macro expansions.
 */
#undef iswcntrl
#undef iswprint
#undef iswgraph
#undef iswalnum
#undef iswalpha
#undef iswupper
#undef iswlower
#undef iswdigit
#undef iswpunct
#undef iswspace
#undef iswblank

/*
 * Writes one row of the mismatch table to stdout.
 *
 *   class    - name of the character class being compared
 *   wc       - wide code point for which the two results disagree
 *   specific - normalized result of the class-specific function
 *   generic  - normalized result of the generic function
 *   print    - when non-zero, the code point is additionally shown as a
 *              character in parentheses; when zero only its hex value is shown
 */
static void Mismatch(const wchar_t *class, wint_t wc, int specific, int generic,
                     int print) {
  /* Hex-only row: the code point column is padded to the full width. */
  if (!print) {
    wprintf(L"U+%-8.4X | %s | %-8d %-7d\n", wc, class, specific, generic);
    return;
  }

  /* Row with the character itself appended to the code point column. */
  wprintf(L"U+%-4.4X (%c) | %s | %-8d %-7d\n", wc, wc, class, specific,
          generic);
}

/*
 * Compares each class-specific <wctype.h> function (iswalnum, iswalpha, ...)
 * with the generic iswctype() for every wint_t value from 0 up to and
 * including WEOF, and prints one table row per disagreement.
 *
 * argv[1], when present, is the locale name passed to _wsetlocale(); when it
 * is absent the empty string L"" is passed instead.
 *
 * Terminates via exit(): EXIT_FAILURE when the locale cannot be set,
 * EXIT_SUCCESS after the whole range has been checked.
 */
int wmain(int argc, wchar_t **argv) {
  /*
   * The condition must stay on one logical line: a preprocessing directive
   * ends at the newline, so a wrapped condition needs a backslash.
   */
#if defined(_UCRT) || \
    (defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ >= 0x0800)
  _setmode(_fileno(stdout), _O_U8TEXT);
  _setmode(_fileno(stderr), _O_U8TEXT);
#endif

  if (_wsetlocale(LC_ALL, argc > 1 ? argv[1] : L"") == NULL) {
    fwprintf(stderr, L"Failed to set locale\n");
    exit(EXIT_FAILURE);
  }

  /* Look up the mask for every class; a zero result means "unknown class". */
  wctype_t alnum = wctype("alnum");
  assert(alnum != (wctype_t)0);
  wctype_t alpha = wctype("alpha");
  assert(alpha != (wctype_t)0);
  /*
   * Only query "blank" by name on the newer CRT versions selected here;
   * otherwise fall back to the _BLANK mask directly.
   */
#if defined(_UCRT) || \
    (defined(__MSVCRT_VERSION__) && __MSVCRT_VERSION__ >= 0x0C00)
  wctype_t blank = wctype("blank");
  assert(blank != (wctype_t)0);
#else
  wctype_t blank = _BLANK;
#endif
  wctype_t cntrl = wctype("cntrl");
  assert(cntrl != (wctype_t)0);
  wctype_t digit = wctype("digit");
  assert(digit != (wctype_t)0);
  wctype_t graph = wctype("graph");
  assert(graph != (wctype_t)0);
  wctype_t lower = wctype("lower");
  assert(lower != (wctype_t)0);
  wctype_t print = wctype("print");
  assert(print != (wctype_t)0);
  wctype_t punct = wctype("punct");
  assert(punct != (wctype_t)0);
  wctype_t space = wctype("space");
  assert(space != (wctype_t)0);
  wctype_t upper = wctype("upper");
  assert(upper != (wctype_t)0);

  wprintf(L"Code point | Class | Specific Generic\n");

  /*
   * The loop has no condition in its header; the exit test sits at the end
   * of the body so that WEOF itself is checked before the loop stops.
   */
  for (wint_t wc = 0;; ++wc) {
    int specific = 0;
    int generic = 0;

    /*
     * "!!" collapses any non-zero result to 1 so that the two functions are
     * compared as booleans rather than by their raw return values.
     */
    if ((specific = !!iswalnum(wc)) != (generic = !!iswctype(wc, alnum))) {
      Mismatch(L"alnum", wc, specific, generic, 1);
    }
    if ((specific = !!iswalpha(wc)) != (generic = !!iswctype(wc, alpha))) {
      Mismatch(L"alpha", wc, specific, generic, 1);
    }
    /* blank and cntrl rows are reported without echoing the character. */
    if ((specific = !!iswblank(wc)) != (generic = !!iswctype(wc, blank))) {
      Mismatch(L"blank", wc, specific, generic, 0);
    }
    if ((specific = !!iswcntrl(wc)) != (generic = !!iswctype(wc, cntrl))) {
      Mismatch(L"cntrl", wc, specific, generic, 0);
    }
    if ((specific = !!iswdigit(wc)) != (generic = !!iswctype(wc, digit))) {
      Mismatch(L"digit", wc, specific, generic, 1);
    }
    if ((specific = !!iswgraph(wc)) != (generic = !!iswctype(wc, graph))) {
      Mismatch(L"graph", wc, specific, generic, 1);
    }
    if ((specific = !!iswlower(wc)) != (generic = !!iswctype(wc, lower))) {
      Mismatch(L"lower", wc, specific, generic, 1);
    }
    if ((specific = !!iswprint(wc)) != (generic = !!iswctype(wc, print))) {
      Mismatch(L"print", wc, specific, generic, 1);
    }
    if ((specific = !!iswpunct(wc)) != (generic = !!iswctype(wc, punct))) {
      Mismatch(L"punct", wc, specific, generic, 1);
    }
    if ((specific = !!iswspace(wc)) != (generic = !!iswctype(wc, space))) {
      Mismatch(L"space", wc, specific, generic, 1);
    }
    if ((specific = !!iswupper(wc)) != (generic = !!iswctype(wc, upper))) {
      Mismatch(L"upper", wc, specific, generic, 1);
    }

    if (wc == WEOF) {
      break;
    }
  }

  exit(EXIT_SUCCESS);
}
_______________________________________________
Mingw-w64-public mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public

Reply via email to