bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP"

Paul Eggert Thu, 03 Apr 2014 19:23:29 -0700

Paul Eggert wrote:

I'll look into writing patches for that, one for grep (which affects
dfa.c), one for gawk (which will use the same patches to dfa.c).

OK, I got it to work, and it simplifies grep (not surprising) and gawk as well (a bit surprising, but there it is). I'm attaching the patches, one for each program. These patches include Paolo's original suggestion at the start of bug#17157, plus several other simplifications to dfa.c.

From 3a0e92f05691137bd95130df296956e548876f39 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 3 Apr 2014 18:14:15 -0700
Subject: [PATCH] grep: simplify dfa.c by having it not include mbsupport.h
 directly

* src/mbsupport.h: Remove.
* src/Makefile.am (noinst_HEADERS): Remove mbsupport.h.
* src/dfa.c, src/grep.c, src/search.h: Don't include mbsupport.h.
* src/dfa.c: Include wchar.h and wctype.h unconditionally, as
this simplifies the use of dfa.c in grep, and it does no harm
in gawk.
(setlocale, static_assert): Remove gawk-specific hacks, as
gawk now does these itself.
(struct dfa, dfambcache, mbs_to_wchar)
(is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC)
(addtok_wc, add_utf8_anychar, atom, state_index, epsclosure)
(dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust):
* src/dfasearch.c (EGexecute):
* src/grep.c (main):
* src/searchutils.c (mbtoupper):
Assume MBS_SUPPORT.
---
 src/Makefile.am   |  2 +-
 src/dfa.c         | 94 +++++++++----------------------------------------------
 src/dfasearch.c   |  3 --
 src/grep.c        |  3 --
 src/mbsupport.h   | 29 -----------------
 src/search.h      |  3 --
 src/searchutils.c |  2 --
 7 files changed, 16 insertions(+), 120 deletions(-)
 delete mode 100644 src/mbsupport.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 3487848..f8c9415 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -27,7 +27,7 @@ grep_SOURCES = grep.c searchutils.c \
           dfa.c dfasearch.c \
           kwset.c kwsearch.c \
           pcresearch.c
-noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h
+noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h
 
 # Sometimes, the expansion of $(LIBINTL) includes -lc which may
 # include modules defining variables like 'optind', so libgreputils.a
diff --git a/src/dfa.c b/src/dfa.c
index b6fbd58..0d7eab5 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -34,16 +34,6 @@
 #include <locale.h>
 #include <stdbool.h>
 
-/* Gawk doesn't use Gnulib, so don't assume that setlocale and
-   static_assert are present.  */
-#ifndef LC_ALL
-# define setlocale(category, locale) NULL
-#endif
-#ifndef static_assert
-# define static_assert(cond, diagnostic) \
-    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
-#endif
-
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
 /* ISASCIIDIGIT differs from isdigit, as follows:
@@ -60,12 +50,8 @@
 #include "gettext.h"
 #define _(str) gettext (str)
 
-#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate.  */
-#if MBS_SUPPORT
-/* We can handle multibyte strings.  */
-# include <wchar.h>
-# include <wctype.h>
-#endif
+#include <wchar.h>
+#include <wctype.h>
 
 #if HAVE_LANGINFO_CODESET
 # include <langinfo.h>
@@ -376,13 +362,11 @@ struct dfa
   size_t nmultibyte_prop;
   int *multibyte_prop;
 
-#if MBS_SUPPORT
   /* A table indexed by byte values that contains the corresponding wide
      character (if any) for that byte.  WEOF means the byte is the
      leading byte of a multibyte character.  Invalid and null bytes are
      mapped to themselves.  */
   wint_t mbrtowc_cache[NOTCHAR];
-#endif
 
   /* Array of the bracket expression in the DFA.  */
   struct mb_char_classes *mbcsets;
@@ -488,7 +472,6 @@ static void regexp (void);
 static void
 dfambcache (struct dfa *d)
 {
-#if MBS_SUPPORT
   int i;
   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
     {
@@ -505,10 +488,8 @@ dfambcache (struct dfa *d)
         }
       d->mbrtowc_cache[uc] = wi;
     }
-#endif
 }
 
-#if MBS_SUPPORT
 /* Given the dfa D, store into *PWC the result of converting the
    leading bytes of the multibyte buffer S of length N bytes, updating
    the conversion state in *MBS.  On conversion error, convert just a
@@ -542,7 +523,6 @@ mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
   *pwc = wc;
   return 1;
 }
-#endif
 
 #ifdef DEBUG
 
@@ -712,7 +692,7 @@ static charclass newline;
 #ifdef __GLIBC__
 # define is_valid_unibyte_character(c) 1
 #else
-# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
+# define is_valid_unibyte_character(c) (btowc (c) != WEOF)
 #endif
 
 /* Return non-zero if C is a "word-constituent" byte; zero otherwise.  */
@@ -773,17 +753,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
-#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-#else
-  abort ();
-   /*NOTREACHED*/ return false;
-#endif
 }
 
 /* Set a bit for B and its case variants in the charclass C.
@@ -808,7 +783,7 @@ using_utf8 (void)
   static int utf8 = -1;
   if (utf8 == -1)
     {
-#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
+#if defined HAVE_LANGINFO_CODESET
       utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
 #else
       utf8 = 0;
@@ -897,7 +872,6 @@ static unsigned char const *buf_begin;  /* reference to begin in dfaexec.  */
 static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
 
 
-#if MBS_SUPPORT
 /* Note that characters become unsigned here.  */
 # define FETCH_WC(c, wc, eoferr)               \
   do {                                         \
@@ -920,23 +894,6 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
       }                                                \
   } while (0)
 
-#else
-/* Note that characters become unsigned here.  */
-# define FETCH_WC(c, unused, eoferr)  \
-  do {                               \
-    if (! lexleft)                   \
-      {                                      \
-        if ((eoferr) != 0)           \
-          dfaerror (eoferr);         \
-        else                         \
-          return lasttok = END;              \
-      }                                      \
-    (c) = to_uchar (*lexptr++);       \
-    --lexleft;                       \
-  } while (0)
-
-#endif /* MBS_SUPPORT */
-
 #ifndef MIN
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
@@ -1720,7 +1677,6 @@ addtok (token t)
     }
 }
 
-#if MBS_SUPPORT
 /* We treat a multibyte character as a single atom, so that DFA
    can treat a multibyte character as a single expression.
 
@@ -1752,17 +1708,10 @@ addtok_wc (wint_t wc)
       addtok (CAT);
     }
 }
-#else
-static void
-addtok_wc (wint_t wc)
-{
-}
-#endif
 
 static void
 add_utf8_anychar (void)
 {
-#if MBS_SUPPORT
   static const charclass utf8_classes[5] = {
     {0, 0, 0, 0, ~0, ~0, 0, 0},                /* 80-bf: non-leading bytes */
     {~0, ~0, ~0, ~0, 0, 0, 0, 0},       /* 00-7f: 1-byte sequence */
@@ -1807,7 +1756,6 @@ add_utf8_anychar (void)
       addtok (CAT);
       addtok (OR);
     }
-#endif
 }
 
 /* The grammar understood by the parser is as follows.
@@ -1848,7 +1796,7 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (MBS_SUPPORT && tok == WCHAR)
+  if (tok == WCHAR)
     {
       addtok_wc (wctok);
 
@@ -1865,7 +1813,7 @@ atom (void)
 
       tok = lex ();
     }
-  else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
+  else if (tok == ANYCHAR && using_utf8 ())
     {
       /* For UTF-8 expand the period to a series of CSETs that define a valid
          UTF-8 character.  This avoids using the slow multibyte path.  I'm
@@ -1879,9 +1827,7 @@ atom (void)
     }
   else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
            || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
-#if MBS_SUPPORT
            || tok == ANYCHAR || tok == MBCSET
-#endif /* MBS_SUPPORT */
            || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
     {
       addtok (tok);
@@ -2164,11 +2110,9 @@ state_index (struct dfa *d, position_set const *s, int context)
   d->states[i].backref = 0;
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
-  if (MBS_SUPPORT)
-    {
-      d->states[i].mbps.nelem = 0;
-      d->states[i].mbps.elems = NULL;
-    }
+  d->states[i].mbps.nelem = 0;
+  d->states[i].mbps.elems = NULL;
+
   for (j = 0; j < s->nelem; ++j)
     if (d->tokens[s->elems[j].index] < 0)
       {
@@ -2206,10 +2150,8 @@ epsclosure (position_set * s, struct dfa const *d)
   for (i = 0; i < s->nelem; ++i)
     if (d->tokens[s->elems[i].index] >= NOTCHAR
         && d->tokens[s->elems[i].index] != BACKREF
-#if MBS_SUPPORT
         && d->tokens[s->elems[i].index] != ANYCHAR
         && d->tokens[s->elems[i].index] != MBCSET
-#endif
         && d->tokens[s->elems[i].index] < CSET)
       {
         old = s->elems[i];
@@ -2526,9 +2468,7 @@ dfaanalyze (struct dfa *d, int searchflag)
      it with its epsilon closure.  */
   for (i = 0; i < d->tindex; ++i)
     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
-#if MBS_SUPPORT
         || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
-#endif
         || d->tokens[i] >= CSET)
       {
 #ifdef DEBUG
@@ -2638,9 +2578,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (MBS_SUPPORT
-               && (d->tokens[pos.index] == ANYCHAR
-                   || d->tokens[pos.index] == MBCSET))
+      else if (d->tokens[pos.index] == ANYCHAR
+               || d->tokens[pos.index] == MBCSET)
         /* MB_CUR_MAX > 1  */
         {
           /* ANYCHAR and MBCSET must match with a single character, so we
@@ -2814,7 +2753,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well.  */
       if (d->searchflag
-          && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
+          && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
         for (j = 0; j < d->states[0].elems.nelem; ++j)
           insert (d->states[0].elems.elems[j], &follows);
 
@@ -3366,7 +3305,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
 static void
 prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
 {
-#if MBS_SUPPORT
   unsigned char eol = eolbyte;
   size_t i;
   size_t ilim = end - begin + 1;
@@ -3390,7 +3328,6 @@ prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
   buf_end = (unsigned char *) (begin + i);
   mblen_buf[i] = 0;
   inputwcs[i] = 0;              /* sentinel */
-#endif /* MBS_SUPPORT */
 }
 
 /* Search through a buffer looking for a match to the given struct dfa.
@@ -3613,7 +3550,7 @@ dfaoptimize (struct dfa *d)
 {
   size_t i;
 
-  if (!MBS_SUPPORT || !using_utf8 ())
+  if (!using_utf8 ())
     return;
 
   for (i = 0; i < d->tindex; ++i)
@@ -3663,8 +3600,7 @@ dfafree (struct dfa *d)
   for (i = 0; i < d->sindex; ++i)
     {
       free (d->states[i].elems.elems);
-      if (MBS_SUPPORT)
-        free (d->states[i].mbps.elems);
+      free (d->states[i].mbps.elems);
     }
   free (d->states);
   for (i = 0; i < d->tindex; ++i)
@@ -4139,7 +4075,7 @@ dfamust (struct dfa *d)
               /* not on *my* shift */
               goto done;
             }
-          else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
+          else if (t >= CSET || t == ANYCHAR || t == MBCSET)
             {
               /* easy enough */
               resetmust (mp);
diff --git a/src/dfasearch.c b/src/dfasearch.c
index d098a9b..5665b82 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -239,9 +239,6 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
               char const *dfa_start = beg;
               if (kwsm.index < kwset_exact_matches)
                 {
-                  if (!MBS_SUPPORT)
-                    goto success;
-
                   if (mb_start < beg)
                     mb_start = beg;
                   if (MB_CUR_MAX == 1
diff --git a/src/grep.c b/src/grep.c
index a1bccdb..7033730 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -21,7 +21,6 @@
 #include <config.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-#include "mbsupport.h"
 #include <wchar.h>
 #include <wctype.h>
 #include <fcntl.h>
@@ -2461,10 +2460,8 @@ main (int argc, char **argv)
         }
     }
 
-#if MBS_SUPPORT
   if (MB_CUR_MAX > 1)
     build_mbclen_cache ();
-#endif
 
   compile (keys, keycc);
   free (keys);
diff --git a/src/mbsupport.h b/src/mbsupport.h
deleted file mode 100644
index 49c7926..0000000
--- a/src/mbsupport.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* mbsupport.h --- Localize determination of whether we have multibyte stuff.
-
-   Copyright (C) 2004-2005, 2007, 2009-2014 Free Software Foundation, Inc.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3, or (at your option)
-   any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-   02110-1301, USA.  */
-
-#include <stdlib.h>
-
-#ifndef MBS_SUPPORT
-# define MBS_SUPPORT 1
-#endif
-
-#if ! MBS_SUPPORT
-# undef MB_CUR_MAX
-# define MB_CUR_MAX 1
-#endif
diff --git a/src/search.h b/src/search.h
index 69e3afd..871b7d5 100644
--- a/src/search.h
+++ b/src/search.h
@@ -23,9 +23,6 @@
 
 #include <sys/types.h>
 #include <stdint.h>
-
-#include "mbsupport.h"
-
 #include <wchar.h>
 #include <wctype.h>
 #include <regex.h>
diff --git a/src/searchutils.c b/src/searchutils.c
index babb31f..6749945 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -48,7 +48,6 @@ kwsinit (kwset_t *kwset)
     xalloc_die ();
 }
 
-#if MBS_SUPPORT
 /* Convert BEG, an *N-byte string, to uppercase, and write the
    NUL-terminated result into malloc'd storage.  Upon success, set *N
    to the length (in bytes) of the resulting string (not including the
@@ -276,4 +275,3 @@ is_mb_middle (const char **good, const char *buf, const char *end,
   /* P == BUF here.  */
   return 0 < match_len && match_len < mbrlen (p, end - p, &cur_state);
 }
-#endif /* MBS_SUPPORT */
-- 
1.9.0

From 7e2b51d00133ab8a0dbcd21b5e0f39a6984f858f Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 3 Apr 2014 18:04:52 -0700
Subject: [PATCH] awk: simplify dfa.c by having it not include mbsupport.h
 directly

This syncs dfa.c better with 'grep'.
* awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h.
* custom.h: Include mbsupport.h here instead.
(_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the
number of differences between grep's dfa.c and ours.
* dfa.c: Include wchar.h and wctype.h unconditionally, as
this simplifies the use of dfa.c in grep, and it does no harm
in gawk.
(gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]:
Move to mbsupport.h (needed for consistency in all uses),
and fix mbrtowc to return size_t.
(setlocale, static_assert): Likewise.
(struct dfa, dfambcache, mbs_to_wchar)
(is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC)
(addtok_wc, add_utf8_anychar, atom, state_index, epsclosure)
(dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust):
* dfasearch.c (EGexecute):
* grep.c (main):
* searchutils.c (mbtoupper):
Assume MBS_SUPPORT.
* mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h
before overriding their definitions.
(WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper)
(iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll):
#undef before #defining.
(btowc): Parenthesize properly.
(mbrtowc, wcrtomb): New macros.
(wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid
collisions with standard library.
* missing_d/wcmisc.c: Remove now-unnecessary ifdefs.
---
 ChangeLog           |  32 +++++++++++++++
 awk.h               |   2 -
 custom.h            |   9 +++++
 dfa.c               | 111 +++++++---------------------------------------------
 mbsupport.h         |  54 +++++++++++++++++++++++--
 missing_d/ChangeLog |   4 ++
 missing_d/wcmisc.c  |  10 -----
 regex_internal.h    |   2 -
 8 files changed, 110 insertions(+), 114 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a0efd89..36fb0f4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2014-04-03  Paul Eggert  <[email protected]>
+
+       awk: simplify dfa.c by having it not include mbsupport.h directly
+       This syncs dfa.c better with 'grep'.
+       * awk.h, regex_internal.h, dfa.c: Don't include mbsupport.h.
+       * custom.h: Include mbsupport.h here instead.
+       (_GL_ATTRIBUTE_PURE): Move here from dfa.c, to lessen the
+       number of differences between grep's dfa.c and ours.
+       * dfa.c: Include wchar.h and wctype.h unconditionally, as
+       this simplifies the use of dfa.c in grep, and it does no harm
+       in gawk.
+       (gawk_mb_cur_max, MB_CUR_MAX, mbrtowc) [LIBC_IS_BORKED]:
+       Move to mbsupport.h (needed for consistency in all uses),
+       and fix mbrtowc to return size_t.
+       (struct dfa, dfambcache, mbs_to_wchar)
+       (is_valid_unibyte_character, setbit_wc, using_utf8, FETCH_WC)
+       (addtok_wc, add_utf8_anychar, atom, state_index, epsclosure)
+       (dfaanalyze, dfastate, prepare_wc_buf, dfaoptimize, dfafree, dfamust):
+       * dfasearch.c (EGexecute):
+       * grep.c (main):
+       * searchutils.c (mbtoupper):
+       Assume MBS_SUPPORT.
+       * mbsupport.h [!MBS_SUPPORT]: Include wchar.h, wctype.h
+       before overriding their definitions.
+       (WEOF, towupper, towlower, btowc, iswalnum, iswalpha, iswupper)
+       (iswlower, mbrtowc, wcrtomb, wctype, iswctype, wcscoll):
+       #undef before #defining.
+       (btowc): Parenthesize properly.
+       (mbrtowc, wcrtomb): New macros.
+       (wctype, iswctype, wcscoll): Define to gawk_wctype etc. to avoid
+       collisions with standard library.
+
 2014-04-03         Arnold D. Robbins     <[email protected]>
 
        * regcomp.c (parse_bracket_exp): Move a call to `re_free' inside
diff --git a/awk.h b/awk.h
index aefdd07..cdba7a8 100644
--- a/awk.h
+++ b/awk.h
@@ -95,8 +95,6 @@ extern int errno;
 #include "missing_d/gawkbool.h"
 #endif
 
-#include "mbsupport.h" /* defines MBS_SUPPORT */
-
 #if MBS_SUPPORT
 /* We can handle multibyte strings.  */
 #include <wchar.h>
diff --git a/custom.h b/custom.h
index 36b4aa0..bade4cf 100644
--- a/custom.h
+++ b/custom.h
@@ -76,3 +76,12 @@
 extern int setenv(const char *name, const char *value, int rewrite);
 extern int unsetenv(const char *name);
 #endif
+
+/* The __pure__ attribute was added in gcc 2.96.  */
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
+# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__))
+#else
+# define _GL_ATTRIBUTE_PURE /* empty */
+#endif
+
+#include "mbsupport.h"
diff --git a/dfa.c b/dfa.c
index 378305d..ee6edd8 100644
--- a/dfa.c
+++ b/dfa.c
@@ -43,16 +43,6 @@
 #include "missing_d/gawkbool.h"
 #endif /* HAVE_STDBOOL_H */
 
-/* Gawk doesn't use Gnulib, so don't assume that setlocale and
-   static_assert are present.  */
-#ifndef LC_ALL
-# define setlocale(category, locale) NULL
-#endif
-#ifndef static_assert
-# define static_assert(cond, diagnostic) \
-    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
-#endif
-
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
 /* ISASCIIDIGIT differs from isdigit, as follows:
@@ -69,21 +59,8 @@
 #include "gettext.h"
 #define _(str) gettext (str)
 
-#include "mbsupport.h" /* Define MBS_SUPPORT to 1 or 0, as appropriate.  */
-#if MBS_SUPPORT
-/* We can handle multibyte strings.  */
-# include <wchar.h>
-# include <wctype.h>
-#endif
-
-#ifdef GAWK
-/* The __pure__ attribute was added in gcc 2.96.  */
-#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96)
-# define _GL_ATTRIBUTE_PURE __attribute__ ((__pure__))
-#else
-# define _GL_ATTRIBUTE_PURE /* empty */
-#endif
-#endif /* GAWK */
+#include <wchar.h>
+#include <wctype.h>
 
 #if HAVE_LANGINFO_CODESET
 # include <langinfo.h>
@@ -101,14 +78,6 @@ is_blank (int c)
 }
 #endif /* GAWK */
 
-#ifdef LIBC_IS_BORKED
-extern int gawk_mb_cur_max;
-#undef MB_CUR_MAX
-#define MB_CUR_MAX gawk_mb_cur_max
-#undef mbrtowc
-#define mbrtowc(a, b, c, d) (-1)
-#endif
-
 /* HPUX defines these as macros in sys/param.h.  */
 #ifdef setbit
 # undef setbit
@@ -412,13 +381,11 @@ struct dfa
   size_t nmultibyte_prop;
   int *multibyte_prop;
 
-#if MBS_SUPPORT
   /* A table indexed by byte values that contains the corresponding wide
      character (if any) for that byte.  WEOF means the byte is the
      leading byte of a multibyte character.  Invalid and null bytes are
      mapped to themselves.  */
   wint_t mbrtowc_cache[NOTCHAR];
-#endif
 
   /* Array of the bracket expression in the DFA.  */
   struct mb_char_classes *mbcsets;
@@ -525,7 +492,6 @@ static void regexp (void);
 static void
 dfambcache (struct dfa *d)
 {
-#if MBS_SUPPORT
   int i;
   for (i = CHAR_MIN; i <= CHAR_MAX; ++i)
     {
@@ -542,10 +508,8 @@ dfambcache (struct dfa *d)
         }
       d->mbrtowc_cache[uc] = wi;
     }
-#endif
 }
 
-#if MBS_SUPPORT
 /* Given the dfa D, store into *PWC the result of converting the
    leading bytes of the multibyte buffer S of length N bytes, updating
    the conversion state in *MBS.  On conversion error, convert just a
@@ -579,7 +543,6 @@ mbs_to_wchar (struct dfa *d, wchar_t *pwc, char const *s, size_t n,
   *pwc = wc;
   return 1;
 }
-#endif
 
 #ifdef DEBUG
 
@@ -749,7 +712,7 @@ static charclass newline;
 #ifdef __GLIBC__
 # define is_valid_unibyte_character(c) 1
 #else
-# define is_valid_unibyte_character(c) (! (MBS_SUPPORT && btowc (c) == WEOF))
+# define is_valid_unibyte_character(c) (btowc (c) != WEOF)
 #endif
 
 /* Return non-zero if C is a "word-constituent" byte; zero otherwise.  */
@@ -810,17 +773,12 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
-#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-#else
-  abort ();
-   /*NOTREACHED*/ return false;
-#endif
 }
 
 /* Set a bit for B and its case variants in the charclass C.
@@ -845,7 +803,7 @@ using_utf8 (void)
   static int utf8 = -1;
   if (utf8 == -1)
     {
-#if defined HAVE_LANGINFO_CODESET && MBS_SUPPORT
+#if defined HAVE_LANGINFO_CODESET
       utf8 = (STREQ (nl_langinfo (CODESET), "UTF-8"));
 #else
       utf8 = 0;
@@ -938,7 +896,6 @@ static unsigned char const *buf_begin;  /* reference to begin in dfaexec.  */
 static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
 
 
-#if MBS_SUPPORT
 /* Note that characters become unsigned here.  */
 # define FETCH_WC(c, wc, eoferr)               \
   do {                                         \
@@ -961,23 +918,6 @@ static unsigned char const *buf_end;    /* reference to end in dfaexec.  */
       }                                                \
   } while (0)
 
-#else
-/* Note that characters become unsigned here.  */
-# define FETCH_WC(c, unused, eoferr)  \
-  do {                               \
-    if (! lexleft)                   \
-      {                                      \
-        if ((eoferr) != 0)           \
-          dfaerror (eoferr);         \
-        else                         \
-          return lasttok = END;              \
-      }                                      \
-    (c) = to_uchar (*lexptr++);       \
-    --lexleft;                       \
-  } while (0)
-
-#endif /* MBS_SUPPORT */
-
 #ifndef MIN
 # define MIN(a,b) ((a) < (b) ? (a) : (b))
 #endif
@@ -1761,7 +1701,6 @@ addtok (token t)
     }
 }
 
-#if MBS_SUPPORT
 /* We treat a multibyte character as a single atom, so that DFA
    can treat a multibyte character as a single expression.
 
@@ -1793,17 +1732,10 @@ addtok_wc (wint_t wc)
       addtok (CAT);
     }
 }
-#else
-static void
-addtok_wc (wint_t wc)
-{
-}
-#endif
 
 static void
 add_utf8_anychar (void)
 {
-#if MBS_SUPPORT
   static const charclass utf8_classes[5] = {
     {0, 0, 0, 0, ~0, ~0, 0, 0},                /* 80-bf: non-leading bytes */
     {~0, ~0, ~0, ~0, 0, 0, 0, 0},       /* 00-7f: 1-byte sequence */
@@ -1848,7 +1780,6 @@ add_utf8_anychar (void)
       addtok (CAT);
       addtok (OR);
     }
-#endif
 }
 
 /* The grammar understood by the parser is as follows.
@@ -1889,7 +1820,7 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (MBS_SUPPORT && tok == WCHAR)
+  if (tok == WCHAR)
     {
       addtok_wc (wctok);
 
@@ -1906,7 +1837,7 @@ atom (void)
 
       tok = lex ();
     }
-  else if (MBS_SUPPORT && tok == ANYCHAR && using_utf8 ())
+  else if (tok == ANYCHAR && using_utf8 ())
     {
       /* For UTF-8 expand the period to a series of CSETs that define a valid
          UTF-8 character.  This avoids using the slow multibyte path.  I'm
@@ -1920,9 +1851,7 @@ atom (void)
     }
   else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
            || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
-#if MBS_SUPPORT
            || tok == ANYCHAR || tok == MBCSET
-#endif /* MBS_SUPPORT */
            || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
     {
       addtok (tok);
@@ -2205,11 +2134,9 @@ state_index (struct dfa *d, position_set const *s, int context)
   d->states[i].backref = 0;
   d->states[i].constraint = 0;
   d->states[i].first_end = 0;
-  if (MBS_SUPPORT)
-    {
-      d->states[i].mbps.nelem = 0;
-      d->states[i].mbps.elems = NULL;
-    }
+  d->states[i].mbps.nelem = 0;
+  d->states[i].mbps.elems = NULL;
+
   for (j = 0; j < s->nelem; ++j)
     if (d->tokens[s->elems[j].index] < 0)
       {
@@ -2247,10 +2174,8 @@ epsclosure (position_set * s, struct dfa const *d)
   for (i = 0; i < s->nelem; ++i)
     if (d->tokens[s->elems[i].index] >= NOTCHAR
         && d->tokens[s->elems[i].index] != BACKREF
-#if MBS_SUPPORT
         && d->tokens[s->elems[i].index] != ANYCHAR
         && d->tokens[s->elems[i].index] != MBCSET
-#endif
         && d->tokens[s->elems[i].index] < CSET)
       {
         old = s->elems[i];
@@ -2567,9 +2492,7 @@ dfaanalyze (struct dfa *d, int searchflag)
      it with its epsilon closure.  */
   for (i = 0; i < d->tindex; ++i)
     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
-#if MBS_SUPPORT
         || d->tokens[i] == ANYCHAR || d->tokens[i] == MBCSET
-#endif
         || d->tokens[i] >= CSET)
       {
 #ifdef DEBUG
@@ -2679,9 +2602,8 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         setbit (d->tokens[pos.index], matches);
       else if (d->tokens[pos.index] >= CSET)
         copyset (d->charclasses[d->tokens[pos.index] - CSET], matches);
-      else if (MBS_SUPPORT
-               && (d->tokens[pos.index] == ANYCHAR
-                   || d->tokens[pos.index] == MBCSET))
+      else if (d->tokens[pos.index] == ANYCHAR
+               || d->tokens[pos.index] == MBCSET)
         /* MB_CUR_MAX > 1  */
         {
           /* ANYCHAR and MBCSET must match with a single character, so we
@@ -2855,7 +2777,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
       /* If we are building a searching matcher, throw in the positions
          of state 0 as well.  */
       if (d->searchflag
-          && (!MBS_SUPPORT || (d->mb_cur_max == 1 || !next_isnt_1st_byte)))
+          && (d->mb_cur_max == 1 || !next_isnt_1st_byte))
         for (j = 0; j < d->states[0].elems.nelem; ++j)
           insert (d->states[0].elems.elems[j], &follows);
 
@@ -3407,7 +3329,6 @@ transit_state (struct dfa *d, state_num s, unsigned char const **pp)
 static void
 prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
 {
-#if MBS_SUPPORT
   unsigned char eol = eolbyte;
   size_t i;
   size_t ilim = end - begin + 1;
@@ -3431,7 +3352,6 @@ prepare_wc_buf (struct dfa *d, const char *begin, const char *end)
   buf_end = (unsigned char *) (begin + i);
   mblen_buf[i] = 0;
   inputwcs[i] = 0;              /* sentinel */
-#endif /* MBS_SUPPORT */
 }
 
 /* Search through a buffer looking for a match to the given struct dfa.
@@ -3653,7 +3573,7 @@ dfaoptimize (struct dfa *d)
 {
   size_t i;
 
-  if (!MBS_SUPPORT || !using_utf8 ())
+  if (!using_utf8 ())
     return;
 
   for (i = 0; i < d->tindex; ++i)
@@ -3703,8 +3623,7 @@ dfafree (struct dfa *d)
   for (i = 0; i < d->sindex; ++i)
     {
       free (d->states[i].elems.elems);
-      if (MBS_SUPPORT)
-        free (d->states[i].mbps.elems);
+      free (d->states[i].mbps.elems);
     }
   free (d->states);
   for (i = 0; i < d->tindex; ++i)
@@ -4179,7 +4098,7 @@ dfamust (struct dfa *d)
               /* not on *my* shift */
               goto done;
             }
-          else if (t >= CSET || !MBS_SUPPORT || t == ANYCHAR || t == MBCSET)
+          else if (t >= CSET || t == ANYCHAR || t == MBCSET)
             {
               /* easy enough */
               resetmust (mp);
diff --git a/mbsupport.h b/mbsupport.h
index 9a62486..ab33e91 100644
--- a/mbsupport.h
+++ b/mbsupport.h
@@ -66,6 +66,15 @@
 #endif
 
 #if ! MBS_SUPPORT
+
+/* Include wchar.h and wctype.h so their definitions can be overridden.  */
+
+# include <wchar.h>
+# include <wctype.h>
+
+/* Override the definitions of wchar.h and wctype.h to provide a
+   unibyte substitute that is good enough for Gawk.  */
+
 # undef MB_CUR_MAX
 # define MB_CUR_MAX 1
 
@@ -78,15 +87,24 @@
 #define wctype_t       int
 #define wint_t         int
 #define mbstate_t      int
+#undef WEOF
 #define WEOF           EOF
+#undef towupper
 #define towupper       toupper
+#undef towlower
 #define towlower       tolower
 #ifndef __DJGPP__
-#define btowc(x)       ((int)x)
+#undef btowc
+#define btowc(x)       ((int) (x))
 #endif
+#undef iswalnum
 #define iswalnum       isalnum
+#undef iswalpha
 #define iswalpha       isalpha
+#undef iswupper
 #define iswupper       isupper
+#undef iswlower
+#define iswlower       islower
 #if defined(ZOS_USS)
 #undef towupper
 #undef towlower
@@ -94,12 +112,40 @@
 #undef iswalnum
 #undef iswalpha
 #undef iswupper
-#undef wctype
-#undef iswctype
-#undef wcscoll
 #endif
 
+#undef mbrtowc
+#define mbrtowc(pwc, s, n, ps) ((size_t) -1)
+#undef wcrtomb
+#define wcrtomb(s, wc, ps) ((size_t) -1)
+
+#undef wctype
+#define wctype gawk_wctype
 extern wctype_t wctype(const char *name);
+#undef iswctype
+#define iswctype gawk_iswctype
 extern int iswctype(wint_t wc, wctype_t desc);
+#undef wcscoll
+#define wcscoll gawk_wcscoll
 extern int wcscoll(const wchar_t *ws1, const wchar_t *ws2);
 #endif
+
+#ifdef LIBC_IS_BORKED
+# include <wchar.h>
+extern int gawk_mb_cur_max;
+# undef MB_CUR_MAX
+# undef mbrtowc
+# define MB_CUR_MAX gawk_mb_cur_max
+# define mbrtowc(a, b, c, d) ((size_t) -1)
+#endif
+
+#include <locale.h>
+#ifndef LC_ALL
+# define setlocale(category, locale) NULL
+#endif
+
+#include <assert.h>
+#ifndef static_assert
+# define static_assert(cond, diagnostic) \
+    extern int (*foo (void)) [!!sizeof (struct { int foo: (cond) ? 8 : -1; })]
+#endif
diff --git a/missing_d/ChangeLog b/missing_d/ChangeLog
index f94c070..7fa6541 100644
--- a/missing_d/ChangeLog
+++ b/missing_d/ChangeLog
@@ -1,3 +1,7 @@
+2014-04-03  Paul Eggert  <[email protected]>
+
+       * wcmisc.c: Remove now-unnecessary ifdefs.
+
 2013-05-09         Arnold D. Robbins     <[email protected]>
 
        * 4.1.0: Release tar ball made.
diff --git a/missing_d/wcmisc.c b/missing_d/wcmisc.c
index d2b7aa0..89e24c9 100644
--- a/missing_d/wcmisc.c
+++ b/missing_d/wcmisc.c
@@ -16,7 +16,6 @@
    Foundation, Inc.,
    51 Franklin Street - Fifth Floor, Boston, MA  02110-1301, USA */
 
-#if !defined(HAVE_WCTYPE) || !defined(HAVE_ISWCTYPE)
 static const char *classes[] = {
        "<dummy>",
        "alnum",
@@ -33,16 +32,12 @@ static const char *classes[] = {
        "xdigit",
        NULL
 };
-#endif
 
-#ifndef HAVE_ISWCTYPE
 static int is_blank (int c)
 {
    return (c == ' ' || c == '\t');
 }
-#endif
 
-#ifndef HAVE_WCTYPE
 wctype_t wctype(const char *name)
 {
        int i;
@@ -53,9 +48,7 @@ wctype_t wctype(const char *name)
 
        return 0;
 }
-#endif
 
-#ifndef HAVE_ISWCTYPE
 int iswctype(wint_t wc, wctype_t desc)
 {
        int j = sizeof(classes) / sizeof(classes[0]);
@@ -79,9 +72,7 @@ int iswctype(wint_t wc, wctype_t desc)
        default:        return 0;
        }
 }
-#endif
 
-#ifndef HAVE_WCSCOLL
 int wcscoll(const wchar_t *ws1, const wchar_t *ws2)
 {
        size_t i;
@@ -95,6 +86,5 @@ int wcscoll(const wchar_t *ws1, const wchar_t *ws2)
 
        return (ws1[i] - ws2[i]);
 }
-#endif
 
 /*wcmisc.c*/
diff --git a/regex_internal.h b/regex_internal.h
index c8981a0..758cf47 100644
--- a/regex_internal.h
+++ b/regex_internal.h
@@ -26,8 +26,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "mbsupport.h" /* gawk */
-
 #if defined HAVE_LANGINFO_H || defined HAVE_LANGINFO_CODESET || defined _LIBC
 # include <langinfo.h>
 #endif
-- 
1.9.0

bug#17157: [PATCH 1/5] Partially revert "dfa: improve port to freestanding DJGPP"

Reply via email to