Paul Eggert wrote:
> I'll look into writing patches for that, one for grep (which affects dfa.c), one for gawk (which will use the same patches to dfa.c).
OK, I got it to work, and it simplifies grep (not surprising) and gawk as well (a bit surprising, but there it is). I'm attaching the patches, one for each program. These patches include Paolo's original suggestion at the start of bug#17157, plus several other simplifications to dfa.c.
From 3a0e92f05691137bd95130df296956e548876f39 Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Thu, 3 Apr 2014 18:14:15 -0700 Subject: [PATCH] grep: simplify dfa.c by having it not include mbsupport.h directly * src/mbsupport.h: Remove. * src/Makefile.am (noinst_HEADERS): Remove mbsupport.h. * src/dfa.c, src/grep.c, src/search.h: Don't include mbsupport.h. * src/dfa.c: Include wchar.h and wctype.h unconditionally, as this simplifies the use of dfa.c in grep, and it does no harm in gawk. (setlocale, static_assert): Remove gawk-specific hacks, as gawk now does these itself. (struct dfa, dfambcache, mbs_to_wchar) (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): * src/dfasearch.c (EGexecute): * src/grep.c (main): * src/searchutils.c (mbtoupper): Assume MBS_SUPPORT. --- src/Makefile.am | 2 +- src/dfa.c | 94 +++++++++---------------------------------------------- src/dfasearch.c | 3 -- src/grep.c | 3 -- src/mbsupport.h | 29 ----------------- src/search.h | 3 -- src/searchutils.c | 2 -- 7 files changed, 16 insertions(+), 120 deletions(-) delete mode 100644 src/mbsupport.h diff --git a/src/Makefile.am b/src/Makefile.am index 3487848..f8c9415 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -27,7 +27,7 @@ grep_SOURCES = grep.c searchutils.c \ dfa.c dfasearch.c \ kwset.c kwsearch.c \ pcresearch.c -noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h +noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h # Sometimes, the expansion of $(LIBINTL) includes -lc which may # include modules defining variables like 'optind', so libgreputils.a diff --git a/src/dfa.c b/src/dfa.c index b6fbd58..0d7eab5 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -34,16 +34,6 @@ #include <locale.h> #include <stdbool.h> -/* Gawk doesn't use Gnulib, so don't assume that setlocale and - static_assert are present. 
*/ -#ifndef LC_ALL -# define setlocale(category, locale) NULL -#endif -#ifndef static_assert -# define static_assert(cond, diagnostic) \ - extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] -#endif - #define STREQ(a, b) (strcmp (a, b) == 0) /* ISASCIIDIGIT differs from isdigit, as follows: @@ -60,12 +50,8 @@ #include "gettext.h" #define _(str) gettext (str) -#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */ -#if MBS_SUPPORT -/* We can handle multibyte strings. */ -# include <wchar.h> -# include <wctype.h> -#endif +#include <wchar.h> +#include <wctype.h> #if HAVE_LANGINFO_CODESET # include <langinfo.h> @@ -376,13 +362,11 @@ struct dfa size_t nmultibyte_prop; int *multibyte_prop; -#if MBS_SUPPORT /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is the leading byte of a multibyte character. Invalid and null bytes are mapped to themselves. */ wint_t mbrtowc_cache[NOTCHAR]; -#endif /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; @@ -488,7 +472,6 @@ static void regexp (void); static void dfambcache (struct dfa *d) { -#if MBS_SUPPORT int i; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { @@ -505,10 +488,8 @@ dfambcache (struct dfa *d) } d->mbrtowc_cache[uc] = wi; } -#endif } -#if MBS_SUPPORT /* Given the dfa D, store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, updating the conversion state in *MBS. On conversion error, convert just a @@ -542,7 +523,6 @@ mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n, *pwc = wc; return 1; } -#endif #ifdef DEBUG @@ -712,7 +692,7 @@ static charclass newline; #ifdef __GLIBC__ # define is_valid_unibyte_character(c) 1 #else -# define is_valid_unibyte_character(c) (! 
(MBS_SUPPORT && btowc (c) == WEOF)) +# define is_valid_unibyte_character(c) (btowc (c) != WEOF) #endif /* Return non-zero if C is a "word-constituent" byte; zero otherwise. */ @@ -773,17 +753,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) static bool setbit_wc (wint_t wc, charclass c) { -#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -#else - abort (); - /*NOTREACHED*/ return false; -#endif } /* Set a bit for B and its case variants in the charclass C. @@ -808,7 +783,7 @@ using_utf8 (void) static int utf8 = -1; if (utf8 == -1) { -#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT +#if defined HAVE_LANGINFO_CODESET utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8")); #else utf8 = 0; @@ -897,7 +872,6 @@ static unsigned char const *buf_begin; /* reference to begin in dfaexec. */ static unsigned char const *buf_end; /* reference to end in dfaexec. */ -#if MBS_SUPPORT /* Note that characters become unsigned here. */ # define FETCH_WC(c, wc, eoferr) \ do { \ @@ -920,23 +894,6 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ } \ } while (0) -#else -/* Note that characters become unsigned here. */ -# define FETCH_WC(c, unused, eoferr) \ - do { \ - if (! lexleft) \ - { \ - if ((eoferr) != 0) \ - dfaerror (eoferr); \ - else \ - return lasttok = END; \ - } \ - (c) = to_uchar (*lexptr++); \ - --lexleft; \ - } while (0) - -#endif /* MBS_SUPPORT */ - #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif @@ -1720,7 +1677,6 @@ addtok (token t) } } -#if MBS_SUPPORT /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. 
@@ -1752,17 +1708,10 @@ addtok_wc (wint_t wc) addtok (CAT); } } -#else -static void -addtok_wc (wint_t wc) -{ -} -#endif static void add_utf8_anychar (void) { -#if MBS_SUPPORT static const charclass utf8_classes[5] = { {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-leading bytes */ {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */ @@ -1807,7 +1756,6 @@ add_utf8_anychar (void) addtok (CAT); addtok (OR); } -#endif } /* The grammar understood by the parser is as follows. @@ -1848,7 +1796,7 @@ add_utf8_anychar (void) static void atom (void) { - if (MBS_SUPPORT && tok == WCHAR) + if (tok == WCHAR) { addtok_wc (wctok); @@ -1865,7 +1813,7 @@ atom (void) tok = lex (); } - else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ()) + else if (tok == ANYCHAR && using_utf8 ()) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1879,9 +1827,7 @@ atom (void) } else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD -#if MBS_SUPPORT || tok == ANYCHAR || tok == MBCSET -#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok (tok); @@ -2164,11 +2110,9 @@ state_index (struct dfa *d, position_set const *s, int context) d->states[i].backref = 0; d->states[i].constraint = 0; d->states[i].first_end = 0; - if (MBS_SUPPORT) - { - d->states[i].mbps.nelem = 0; - d->states[i].mbps.elems = NULL; - } + d->states[i].mbps.nelem = 0; + d->states[i].mbps.elems = NULL; + for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { @@ -2206,10 +2150,8 @@ epsclosure (position_set * s, struct dfa const *d) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF -#if MBS_SUPPORT && d->tokens[s->elems[i].index] != ANYCHAR && d->tokens[s->elems[i].index] != MBCSET -#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; @@ 
-2526,9 +2468,7 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. */ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF -#if MBS_SUPPORT || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET -#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -2638,9 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (MBS_SUPPORT - && (d->tokens[pos.index] == ANYCHAR - || d->tokens[pos.index] == MBCSET)) + else if (d->tokens[pos.index] == ANYCHAR + || d->tokens[pos.index] == MBCSET) /* MB_CUR_MAX > 1 */ { /* ANYCHAR and MBCSET must match with a single character, so we @@ -2814,7 +2753,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ if (d->searchflag - && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte))) + && (d->mb_cur_max == 1 || !next_isnt_1st_byte)) for (j = 0; j < d->states[0].elems.nelem; ++j) insert (d->states[0].elems.elems[j], &follows); @@ -3366,7 +3305,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) static void prepare_wc_buf (struct dfa *d, const char *begin, const char *end) { -#if MBS_SUPPORT unsigned char eol = eolbyte; size_t i; size_t ilim = end - begin + 1; @@ -3390,7 +3328,6 @@ prepare_wc_buf (struct dfa *d, const char *begin, const char *end) buf_end = (unsigned char *) (begin + i); mblen_buf[i] = 0; inputwcs[i] = 0; /* sentinel */ -#endif /* MBS_SUPPORT */ } /* Search through a buffer looking for a match to the given struct dfa. 
@@ -3613,7 +3550,7 @@ dfaoptimize (struct dfa *d) { size_t i; - if (!MBS_SUPPORT || !using_utf8 ()) + if (!using_utf8 ()) return; for (i = 0; i < d->tindex; ++i) @@ -3663,8 +3600,7 @@ dfafree (struct dfa *d) for (i = 0; i < d->sindex; ++i) { free (d->states[i].elems.elems); - if (MBS_SUPPORT) - free (d->states[i].mbps.elems); + free (d->states[i].mbps.elems); } free (d->states); for (i = 0; i < d->tindex; ++i) @@ -4139,7 +4075,7 @@ dfamust (struct dfa *d) /* not on *my* shift */ goto done; } - else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET) + else if (t >= CSET || t == ANYCHAR || t == MBCSET) { /* easy enough */ resetmust (mp); diff --git a/src/dfasearch.c b/src/dfasearch.c index d098a9b..5665b82 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -239,9 +239,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size, char const *dfa_start = beg; if (kwsm.index < kwset_exact_matches) { - if (!MBS_SUPPORT) - goto success; - if (mb_start < beg) mb_start = beg; if (MB_CUR_MAX == 1 diff --git a/src/grep.c b/src/grep.c index a1bccdb..7033730 100644 --- a/src/grep.c +++ b/src/grep.c @@ -21,7 +21,6 @@ #include <config.h> #include <sys/types.h> #include <sys/stat.h> -#include "mbsupport.h" #include <wchar.h> #include <wctype.h> #include <fcntl.h> @@ -2461,10 +2460,8 @@ main (int argc, char **argv) } } -#if MBS_SUPPORT if (MB_CUR_MAX > 1) build_mbclen_cache (); -#endif compile (keys, keycc); free (keys); diff --git a/src/mbsupport.h b/src/mbsupport.h deleted file mode 100644 index 49c7926..0000000 --- a/src/mbsupport.h +++ /dev/null @@ -1,29 +0,0 @@ -/* mbsupport.h --- Localize determination of whether we have multibyte stuff. - - Copyright (C) 2004-2005, 2007, 2009-2014 Free Software Foundation, Inc. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA - 02110-1301, USA. */ - -#include <stdlib.h> - -#ifndef MBS_SUPPORT -# define MBS_SUPPORT 1 -#endif - -#if ! MBS_SUPPORT -# undef MB_CUR_MAX -# define MB_CUR_MAX 1 -#endif diff --git a/src/search.h b/src/search.h index 69e3afd..871b7d5 100644 --- a/src/search.h +++ b/src/search.h @@ -23,9 +23,6 @@ #include <sys/types.h> #include <stdint.h> - -#include "mbsupport.h" - #include <wchar.h> #include <wctype.h> #include <regex.h> diff --git a/src/searchutils.c b/src/searchutils.c index babb31f..6749945 100644 --- a/src/searchutils.c +++ b/src/searchutils.c @@ -48,7 +48,6 @@ kwsinit (kwset_t *kwset) xalloc_die (); } -#if MBS_SUPPORT /* Convert BEG, an *N-byte string, to uppercase, and write the NUL-terminated result into malloc'd storage. Upon success, set *N to the length (in bytes) of the resulting string (not including the @@ -276,4 +275,3 @@ is_mb_middle (const char **good, const char *buf, const char *end, /* P == BUF here. */ return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state); } -#endif /* MBS_SUPPORT */ -- 1.9.0
From 7e2b51d00133ab8a0dbcd21b5e0f39a6984f858f Mon Sep 17 00:00:00 2001 From: Paul Eggert <[email protected]> Date: Thu, 3 Apr 2014 18:04:52 -0700 Subject: [PATCH] awk: simplify dfa.c by having it not include mbsupport.h directly This syncs dfa.c better with 'grep'. * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h. * custom.h: Include mbsupport.h here instead. (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the number of differences between grep's dfa.c and ours. * dfa.c: Include wchar.h and wctype.h unconditionally, as this simplifies the use of dfa.c in grep, and it does no harm in gawk. (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]: Move to mbsupport.h (needed for consistency in all uses), and fix mbrtowc to return size_t. (setlocale, static_assert): Likewise. (struct dfa, dfambcache, mbs_to_wchar) (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): * dfasearch.c (EGexecute): * grep.c (main): * searchutils.c (mbtoupper): Assume MBS_SUPPORT. * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h before overriding their definitions. (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper) (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll): #undef before #defining. (btowc): Parenthesize properly. (mbrtowc, wcrtomb): New macros. (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid collisions with standard library. * missing_d/wcmisc.c: Remove now-unnecessary ifdefs.
--- ChangeLog | 32 +++++++++++++++ awk.h | 2 - custom.h | 9 +++++ dfa.c | 111 +++++++--------------------------------------------- mbsupport.h | 54 +++++++++++++++++++++++-- missing_d/ChangeLog | 4 ++ missing_d/wcmisc.c | 10 ----- regex_internal.h | 2 - 8 files changed, 110 insertions(+), 114 deletions(-) diff --git a/ChangeLog b/ChangeLog index a0efd89..36fb0f4 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,35 @@ +2014-04-03 Paul Eggert <[email protected]> + + awk: simplify dfa.c by having it not include mbsupport.h directly + This syncs dfa.c better with 'grep'. + * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h. + * custom.h: Include mbsupport.h here instead. + (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the + number of differences between grep's dfa.c and ours. + * dfa.c: Include wchar.h and wctype.h unconditionally, as + this simplifies the use of dfa.c in grep, and it does no harm + in gawk. + (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]: + Move to mbsupport.h (needed for consistency in all uses), + and fix mbrtowc to return size_t. + (struct dfa, dfambcache, mbs_to_wchar) + (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC) + (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure) + (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust): + * dfasearch.c (EGexecute): + * grep.c (main): + * searchutils.c (mbtoupper): + Assume MBS_SUPPORT. + * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h + before overriding their definitions. + (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper) + (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll): + #undef before #defining. + (btowc): Parenthesize properly. + (mbrtowc, wcrtomb): New macros. + (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid + collisions with standard library. + 2014-04-03 Arnold D. 
Robbins <[email protected]> * regcomp.c (parse_bracket_exp): Move a call to `re_free' inside diff --git a/awk.h b/awk.h index aefdd07..cdba7a8 100644 --- a/awk.h +++ b/awk.h @@ -95,8 +95,6 @@ extern int errno; #include "missing_d/gawkbool.h" #endif -#include "mbsupport.h" /* defines MBS_SUPPORT */ - #if MBS_SUPPORT /* We can handle multibyte strings. */ #include <wchar.h> diff --git a/custom.h b/custom.h index 36b4aa0..bade4cf 100644 --- a/custom.h +++ b/custom.h @@ -76,3 +76,12 @@ extern int setenv(const char *name, const char *value, int rewrite); extern int unsetenv(const char *name); #endif + +/* The __pure__ attribute was added in gcc 2.96. */ +#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) +# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__)) +#else +# define _GL_ATTRIBUTE_PURE /* empty */ +#endif + +#include "mbsupport.h" diff --git a/dfa.c b/dfa.c index 378305d..ee6edd8 100644 --- a/dfa.c +++ b/dfa.c @@ -43,16 +43,6 @@ #include "missing_d/gawkbool.h" #endif /* HAVE_STDBOOL_H */ -/* Gawk doesn't use Gnulib, so don't assume that setlocale and - static_assert are present. */ -#ifndef LC_ALL -# define setlocale(category, locale) NULL -#endif -#ifndef static_assert -# define static_assert(cond, diagnostic) \ - extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] -#endif - #define STREQ(a, b) (strcmp (a, b) == 0) /* ISASCIIDIGIT differs from isdigit, as follows: @@ -69,21 +59,8 @@ #include "gettext.h" #define _(str) gettext (str) -#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate. */ -#if MBS_SUPPORT -/* We can handle multibyte strings. */ -# include <wchar.h> -# include <wctype.h> -#endif - -#ifdef GAWK -/* The __pure__ attribute was added in gcc 2.96. 
*/ -#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96) -# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__)) -#else -# define _GL_ATTRIBUTE_PURE /* empty */ -#endif -#endif /* GAWK */ +#include <wchar.h> +#include <wctype.h> #if HAVE_LANGINFO_CODESET # include <langinfo.h> @@ -101,14 +78,6 @@ is_blank (int c) } #endif /* GAWK */ -#ifdef LIBC_IS_BORKED -extern int gawk_mb_cur_max; -#undef MB_CUR_MAX -#define MB_CUR_MAX gawk_mb_cur_max -#undef mbrtowc -#define mbrtowc(a, b, c, d) (-1) -#endif - /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit # undef setbit @@ -412,13 +381,11 @@ struct dfa size_t nmultibyte_prop; int *multibyte_prop; -#if MBS_SUPPORT /* A table indexed by byte values that contains the corresponding wide character (if any) for that byte. WEOF means the byte is the leading byte of a multibyte character. Invalid and null bytes are mapped to themselves. */ wint_t mbrtowc_cache[NOTCHAR]; -#endif /* Array of the bracket expression in the DFA. */ struct mb_char_classes *mbcsets; @@ -525,7 +492,6 @@ static void regexp (void); static void dfambcache (struct dfa *d) { -#if MBS_SUPPORT int i; for (i = CHAR_MIN; i <= CHAR_MAX; ++i) { @@ -542,10 +508,8 @@ dfambcache (struct dfa *d) } d->mbrtowc_cache[uc] = wi; } -#endif } -#if MBS_SUPPORT /* Given the dfa D, store into *PWC the result of converting the leading bytes of the multibyte buffer S of length N bytes, updating the conversion state in *MBS. On conversion error, convert just a @@ -579,7 +543,6 @@ mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n, *pwc = wc; return 1; } -#endif #ifdef DEBUG @@ -749,7 +712,7 @@ static charclass newline; #ifdef __GLIBC__ # define is_valid_unibyte_character(c) 1 #else -# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF)) +# define is_valid_unibyte_character(c) (btowc (c) != WEOF) #endif /* Return non-zero if C is a "word-constituent" byte; zero otherwise. 
*/ @@ -810,17 +773,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) static bool setbit_wc (wint_t wc, charclass c) { -#if MBS_SUPPORT int b = wctob (wc); if (b == EOF) return false; setbit (b, c); return true; -#else - abort (); - /*NOTREACHED*/ return false; -#endif } /* Set a bit for B and its case variants in the charclass C. @@ -845,7 +803,7 @@ using_utf8 (void) static int utf8 = -1; if (utf8 == -1) { -#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT +#if defined HAVE_LANGINFO_CODESET utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8")); #else utf8 = 0; @@ -938,7 +896,6 @@ static unsigned char const *buf_begin; /* reference to begin in dfaexec. */ static unsigned char const *buf_end; /* reference to end in dfaexec. */ -#if MBS_SUPPORT /* Note that characters become unsigned here. */ # define FETCH_WC(c, wc, eoferr) \ do { \ @@ -961,23 +918,6 @@ static unsigned char const *buf_end; /* reference to end in dfaexec. */ } \ } while (0) -#else -/* Note that characters become unsigned here. */ -# define FETCH_WC(c, unused, eoferr) \ - do { \ - if (! lexleft) \ - { \ - if ((eoferr) != 0) \ - dfaerror (eoferr); \ - else \ - return lasttok = END; \ - } \ - (c) = to_uchar (*lexptr++); \ - --lexleft; \ - } while (0) - -#endif /* MBS_SUPPORT */ - #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif @@ -1761,7 +1701,6 @@ addtok (token t) } } -#if MBS_SUPPORT /* We treat a multibyte character as a single atom, so that DFA can treat a multibyte character as a single expression. @@ -1793,17 +1732,10 @@ addtok_wc (wint_t wc) addtok (CAT); } } -#else -static void -addtok_wc (wint_t wc) -{ -} -#endif static void add_utf8_anychar (void) { -#if MBS_SUPPORT static const charclass utf8_classes[5] = { {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-leading bytes */ {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */ @@ -1848,7 +1780,6 @@ add_utf8_anychar (void) addtok (CAT); addtok (OR); } -#endif } /* The grammar understood by the parser is as follows. 
@@ -1889,7 +1820,7 @@ add_utf8_anychar (void) static void atom (void) { - if (MBS_SUPPORT && tok == WCHAR) + if (tok == WCHAR) { addtok_wc (wctok); @@ -1906,7 +1837,7 @@ atom (void) tok = lex (); } - else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ()) + else if (tok == ANYCHAR && using_utf8 ()) { /* For UTF-8 expand the period to a series of CSETs that define a valid UTF-8 character. This avoids using the slow multibyte path. I'm @@ -1920,9 +1851,7 @@ atom (void) } else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD -#if MBS_SUPPORT || tok == ANYCHAR || tok == MBCSET -#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok (tok); @@ -2205,11 +2134,9 @@ state_index (struct dfa *d, position_set const *s, int context) d->states[i].backref = 0; d->states[i].constraint = 0; d->states[i].first_end = 0; - if (MBS_SUPPORT) - { - d->states[i].mbps.nelem = 0; - d->states[i].mbps.elems = NULL; - } + d->states[i].mbps.nelem = 0; + d->states[i].mbps.elems = NULL; + for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { @@ -2247,10 +2174,8 @@ epsclosure (position_set * s, struct dfa const *d) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF -#if MBS_SUPPORT && d->tokens[s->elems[i].index] != ANYCHAR && d->tokens[s->elems[i].index] != MBCSET -#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; @@ -2567,9 +2492,7 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. 
*/ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF -#if MBS_SUPPORT || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET -#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -2679,9 +2602,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) setbit (d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset (d->charclasses[d->tokens[pos.index] - CSET], matches); - else if (MBS_SUPPORT - && (d->tokens[pos.index] == ANYCHAR - || d->tokens[pos.index] == MBCSET)) + else if (d->tokens[pos.index] == ANYCHAR + || d->tokens[pos.index] == MBCSET) /* MB_CUR_MAX > 1 */ { /* ANYCHAR and MBCSET must match with a single character, so we @@ -2855,7 +2777,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[]) /* If we are building a searching matcher, throw in the positions of state 0 as well. */ if (d->searchflag - && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte))) + && (d->mb_cur_max == 1 || !next_isnt_1st_byte)) for (j = 0; j < d->states[0].elems.nelem; ++j) insert (d->states[0].elems.elems[j], &follows); @@ -3407,7 +3329,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp) static void prepare_wc_buf (struct dfa *d, const char *begin, const char *end) { -#if MBS_SUPPORT unsigned char eol = eolbyte; size_t i; size_t ilim = end - begin + 1; @@ -3431,7 +3352,6 @@ prepare_wc_buf (struct dfa *d, const char *begin, const char *end) buf_end = (unsigned char *) (begin + i); mblen_buf[i] = 0; inputwcs[i] = 0; /* sentinel */ -#endif /* MBS_SUPPORT */ } /* Search through a buffer looking for a match to the given struct dfa. 
@@ -3653,7 +3573,7 @@ dfaoptimize (struct dfa *d) { size_t i; - if (!MBS_SUPPORT || !using_utf8 ()) + if (!using_utf8 ()) return; for (i = 0; i < d->tindex; ++i) @@ -3703,8 +3623,7 @@ dfafree (struct dfa *d) for (i = 0; i < d->sindex; ++i) { free (d->states[i].elems.elems); - if (MBS_SUPPORT) - free (d->states[i].mbps.elems); + free (d->states[i].mbps.elems); } free (d->states); for (i = 0; i < d->tindex; ++i) @@ -4179,7 +4098,7 @@ dfamust (struct dfa *d) /* not on *my* shift */ goto done; } - else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET) + else if (t >= CSET || t == ANYCHAR || t == MBCSET) { /* easy enough */ resetmust (mp); diff --git a/mbsupport.h b/mbsupport.h index 9a62486..ab33e91 100644 --- a/mbsupport.h +++ b/mbsupport.h @@ -66,6 +66,15 @@ #endif #if ! MBS_SUPPORT + +/* Include wchar.h and wctype.h so their definitions can be overridden. */ + +# include <wchar.h> +# include <wctype.h> + +/* Override the definitions of wchar.h and wctype.h to provide a + unibyte substitute that is good enough for Gawk. 
*/ + # undef MB_CUR_MAX # define MB_CUR_MAX 1 @@ -78,15 +87,24 @@ #define wctype_t int #define wint_t int #define mbstate_t int +#undef WEOF #define WEOF EOF +#undef towupper #define towupper toupper +#undef towlower #define towlower tolower #ifndef __DJGPP__ -#define btowc(x) ((int)x) +#undef btowc +#define btowc(x) ((int) (x)) #endif +#undef iswalnum #define iswalnum isalnum +#undef iswalpha #define iswalpha isalpha +#undef iswupper #define iswupper isupper +#undef iswlower +#define iswlower islower #if defined(ZOS_USS) #undef towupper #undef towlower @@ -94,12 +112,40 @@ #undef iswalnum #undef iswalpha #undef iswupper -#undef wctype -#undef iswctype -#undef wcscoll #endif +#undef mbrtowc +#define mbrtowc(pwc, s, n, ps) ((size_t) -1) +#undef wcrtomb +#define wcrtomb(s, wc, ps) ((size_t) -1) + +#undef wctype +#define wctype gawk_wctype extern wctype_t wctype(const char *name); +#undef iswctype +#define iswctype gawk_iswctype extern int iswctype(wint_t wc, wctype_t desc); +#undef wcscoll +#define wcscoll gawk_wcscoll extern int wcscoll(const wchar_t *ws1, const wchar_t *ws2); #endif + +#ifdef LIBC_IS_BORKED +# include <wchar.h> +extern int gawk_mb_cur_max; +# undef MB_CUR_MAX +# undef mbrtowc +# define MB_CUR_MAX gawk_mb_cur_max +# define mbrtowc(a, b, c, d) ((size_t) -1) +#endif + +#include <locale.h> +#ifndef LC_ALL +# define setlocale(category, locale) NULL +#endif + +#include <assert.h> +#ifndef static_assert +# define static_assert(cond, diagnostic) \ + extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })] +#endif diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog index f94c070..7fa6541 100644 --- a/missing_d/ChangeLog +++ b/missing_d/ChangeLog @@ -1,3 +1,7 @@ +2014-04-03 Paul Eggert <[email protected]> + + * wcmisc.c: Remove now-unnecessary ifdefs. + 2013-05-09 Arnold D. Robbins <[email protected]> * 4.1.0: Release tar ball made. 
diff --git a/missing_d/wcmisc.c b/missing_d/wcmisc.c index d2b7aa0..89e24c9 100644 --- a/missing_d/wcmisc.c +++ b/missing_d/wcmisc.c @@ -16,7 +16,6 @@ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ -#if !defined(HAVE_WCTYPE) || !defined(HAVE_ISWCTYPE) static const char *classes[] = { "<dummy>", "alnum", @@ -33,16 +32,12 @@ static const char *classes[] = { "xdigit", NULL }; -#endif -#ifndef HAVE_ISWCTYPE static int is_blank (int c) { return (c == ' ' || c == '\t'); } -#endif -#ifndef HAVE_WCTYPE wctype_t wctype(const char *name) { int i; @@ -53,9 +48,7 @@ wctype_t wctype(const char *name) return 0; } -#endif -#ifndef HAVE_ISWCTYPE int iswctype(wint_t wc, wctype_t desc) { int j = sizeof(classes) / sizeof(classes[0]); @@ -79,9 +72,7 @@ int iswctype(wint_t wc, wctype_t desc) default: return 0; } } -#endif -#ifndef HAVE_WCSCOLL int wcscoll(const wchar_t *ws1, const wchar_t *ws2) { size_t i; @@ -95,6 +86,5 @@ int wcscoll(const wchar_t *ws1, const wchar_t *ws2) return (ws1[i] - ws2[i]); } -#endif /*wcmisc.c*/ diff --git a/regex_internal.h b/regex_internal.h index c8981a0..758cf47 100644 --- a/regex_internal.h +++ b/regex_internal.h @@ -26,8 +26,6 @@ #include <stdlib.h> #include <string.h> -#include "mbsupport.h" /* gawk */ - #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC # include <langinfo.h> #endif -- 1.9.0
