Jim Meyering wrote:
> Bruno Haible wrote:
>
>> Hi Jim,
>>
>>> diff --git a/src/dfa.c b/src/dfa.c
>>> index e28726d..8f79508 100644
>>> --- a/src/dfa.c
>>> +++ b/src/dfa.c
>>> @@ -1071,8 +1071,18 @@ parse_bracket_exp (void)
>>>    return CSET + charclass_index(ccl);
>>>  }
>>>
>>> +/* Add this to the test for whether a byte is word-constituent, since on
>>> +   BSD-based systems, many values in the 128..255 range are classified as
>>> +   alphabetic, while on glibc-based systems, they are not. */
>>> +#ifdef __GLIBC__
>>> +# define octet_valid_as_wide_char(c) 1
>>> +#else
>>> +# define octet_valid_as_wide_char(c) (MBS_SUPPORT && btowc (c) != WEOF)
>>> +#endif
>>> +
>>>  /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
>>> -#define IS_WORD_CONSTITUENT(C) (isalnum(C) || (C) == '_')
>>> +#define IS_WORD_CONSTITUENT(C) \
>>> +  (octet_valid_as_wide_char(C) && (isalnum(C) || (C) == '_'))
>>>
>>
>> This code would do the job.
>>
>> Only, I find this macro name 'octet_valid_as_wide_char' confusing -
>> because values such as 0xC3 are valid octets and also valid wide characters.
>> I would call this macro 'is_valid_single_byte_character' or
>> 'is_valid_unibyte_character'. Then it's clear why it has to map 0xC3 to false
>> in UTF-8 encoding.
>
> Thanks. I prefer your names, too.
> I'll use is_valid_unibyte_character.
I pushed this:

From f3d95e96a371111f8b9b4941f1075933c904142a Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Mon, 21 Nov 2011 18:38:17 +0100
Subject: [PATCH] portability: work consistently on *BSD systems

* src/dfa.c (is_valid_unibyte_character): Define.
(IS_WORD_CONSTITUENT): Use it here, to make grep work consistently
even on *BSD systems, which use different tables for ctype macros
like isalpha.
http://thread.gmane.org/gmane.comp.gnu.grep.bugs/4022
With help from Bruno Haible.
---
 src/dfa.c |   12 +++++++++++-
 1 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index e28726d..9e7c136 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -1071,8 +1071,18 @@ parse_bracket_exp (void)
   return CSET + charclass_index(ccl);
 }

+/* Add this to the test for whether a byte is word-constituent, since on
+   BSD-based systems, many values in the 128..255 range are classified as
+   alphabetic, while on glibc-based systems, they are not. */
+#ifdef __GLIBC__
+# define is_valid_unibyte_character(c) 1
+#else
+# define is_valid_unibyte_character(c) (MBS_SUPPORT && btowc (c) != WEOF)
+#endif
+
 /* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
-#define IS_WORD_CONSTITUENT(C) (isalnum(C) || (C) == '_')
+#define IS_WORD_CONSTITUENT(C) \
+  (is_valid_unibyte_character(C) && (isalnum(C) || (C) == '_'))

 static token
 lex (void)
--
1.7.8.rc2.3.g0911
