bug#17443: Don't assume 'unsigned' is exactly 32 bits wide

Paul Eggert Thu, 08 May 2014 17:51:07 -0700

I installed the attached patches, which port GNU grep's DFA code tounusual platforms where 'int' is wider than 32 bits. Acurrently-produced example is the Unisys ClearPath MCP, where 'unsignedint' is 39 bits wide.

The 2nd patch is the real change; the other patches are minor cleanup.None of these changes should affect typical hosts where 'unsigned int'is 32 bits wide.


My source for that "39 bits" is page 4-490 of:

http://public.support.unisys.com/aseries/docs/clearpath-mcp-15.0/pdf/86002278-305.pdf

From cdb90cb9cbe47588fac7537896f1cfa98792b827 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 8 May 2014 16:13:54 -0700
Subject: [PATCH 1/3] maint: fix indenting to pacify
 'prohibit_tab_based_indentation'

* src/dfa.c: Use spaces and not tabs to indent some lines.
---
 src/dfa.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index fc2acdc..93638ec 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -182,25 +182,25 @@ enum
                                    a backtracking matcher.  */
 
   BEGLINE,                      /* BEGLINE is a terminal symbol that matches
-                                  the empty string at the beginning of a
-                                  line.  */
+                                   the empty string at the beginning of a
+                                   line.  */
 
   ENDLINE,                      /* ENDLINE is a terminal symbol that matches
-                                  the empty string at the end of a line.  */
+                                   the empty string at the end of a line.  */
 
   BEGWORD,                      /* BEGWORD is a terminal symbol that matches
-                                  the empty string at the beginning of a
-                                  word.  */
+                                   the empty string at the beginning of a
+                                   word.  */
 
   ENDWORD,                      /* ENDWORD is a terminal symbol that matches
-                                  the empty string at the end of a word.  */
+                                   the empty string at the end of a word.  */
 
   LIMWORD,                      /* LIMWORD is a terminal symbol that matches
-                                  the empty string at the beginning or the
-                                  end of a word.  */
+                                   the empty string at the beginning or the
+                                   end of a word.  */
 
   NOTLIMWORD,                   /* NOTLIMWORD is a terminal symbol that
-                                  matches the empty string not at
+                                   matches the empty string not at
                                    the beginning or end of a word.  */
 
   QMARK,                        /* QMARK is an operator of one argument that
@@ -839,9 +839,9 @@ static int cur_mb_len = 1;      /* Length of the multibyte 
representation of
                                    wctok.  */
 
 static wint_t wctok;           /* Wide character representation of the current
-                                  multibyte character, or WEOF if there was
-                                  an encoding error.  Used only if
-                                  MB_CUR_MAX > 1.  */
+                                   multibyte character, or WEOF if there was
+                                   an encoding error.  Used only if
+                                   MB_CUR_MAX > 1.  */
 
 
 /* Fetch the next lexical input character.  Set C (of type int) to the
-- 
1.9.0

From 80322a1bcd61581fd0fb17117cf986b52ca60cfb Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 8 May 2014 17:24:32 -0700
Subject: [PATCH 2/3] dfa: don't assume unsigned int is exactly 32 bits wide

Sun C 5.12 (sparc) warns of the potential unportability.
* src/dfa.c (charclass_word): New type, for clarity.
All relevant uses of 'unsigned' changed.
(CHARCLASS_WORD_BITS): Rename from INTBITS.  All uses changed.
Now an enum, since it needn't be a macro.
(CHARCLASS_WORD_MASK): New macro.
(CHARCLASS_WORDS): Rename from CHARCLASS_INTS.  All uses changed.
(setbit, clrbit): Cast 1 to charclass_word, for clarity.
(notset, add_utf8_anychar, dfastats):
Don't assume unsigned int is exactly 32 bits wide.
(dfastate): Don't rely on implementation-defined conversion of
greater-than-INT_MAX unsigned to int.  Change bit test to resemble
tstbit more.
---
 src/dfa.c | 87 +++++++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 93638ec..7704d5e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -70,16 +70,26 @@
 /* First integer value that is greater than any character code.  */
 #define NOTCHAR (1 << CHARBITS)
 
-/* INTBITS need not be exact, just a lower bound.  */
-#ifndef INTBITS
-# define INTBITS (CHARBITS * sizeof (int))
-#endif
+/* This represents part of a character class.  It must be unsigned and
+   at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
+typedef unsigned int charclass_word;
+
+/* The number of bits used in a charclass word.  utf8_classes assumes
+   this is exactly 32.  */
+enum { CHARCLASS_WORD_BITS = 32 };
+
+/* The maximum useful value of a charclass_word; all used bits are 1.  */
+#define CHARCLASS_WORD_MASK \
+  (((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1)
 
-/* Number of ints required to hold a bit for every character.  */
-#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
+/* Number of words required to hold a bit for every character.  */
+enum
+{
+  CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
+};
 
 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
-typedef unsigned int charclass[CHARCLASS_INTS];
+typedef charclass_word charclass[CHARCLASS_WORDS];
 
 /* Convert a possibly-signed character to an unsigned character.  This is
    a bit safer than casting to unsigned char, since it catches some type
@@ -575,19 +585,20 @@ prtok (token t)
 static bool
 tstbit (unsigned int b, charclass const c)
 {
-  return c[b / INTBITS] >> b % INTBITS & 1;
+  return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
 }
 
 static void
 setbit (unsigned int b, charclass c)
 {
-  c[b / INTBITS] |= 1U << b % INTBITS;
+  c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS;
 }
 
 static void
 clrbit (unsigned int b, charclass c)
 {
-  c[b / INTBITS] &= ~(1U << b % INTBITS);
+  c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1
+                                  << b % CHARCLASS_WORD_BITS);
 }
 
 static void
@@ -607,8 +618,8 @@ notset (charclass s)
 {
   int i;
 
-  for (i = 0; i < CHARCLASS_INTS; ++i)
-    s[i] = ~s[i];
+  for (i = 0; i < CHARCLASS_WORDS; ++i)
+    s[i] = CHARCLASS_WORD_MASK & ~s[i];
 }
 
 static bool
@@ -1696,11 +1707,21 @@ static void
 add_utf8_anychar (void)
 {
   static const charclass utf8_classes[5] = {
-    {0, 0, 0, 0, ~0, ~0, 0, 0},                /* 80-bf: non-leading bytes */
-    {~0, ~0, ~0, ~0, 0, 0, 0, 0},       /* 00-7f: 1-byte sequence */
-    {0, 0, 0, 0, 0, 0, ~3, 0},          /* c2-df: 2-byte sequence */
-    {0, 0, 0, 0, 0, 0, 0, 0xffff},      /* e0-ef: 3-byte sequence */
-    {0, 0, 0, 0, 0, 0, 0, 0xff0000}     /* f0-f7: 4-byte sequence */
+    /* 80-bf: non-leading bytes.  */
+    {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
+
+    /* 00-7f: 1-byte sequence.  */
+    {CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK,
+     CHARCLASS_WORD_MASK, 0, 0, 0, 0},
+
+    /* c2-df: 2-byte sequence.  */
+    {0, 0, 0, 0, 0, 0, ~3 & CHARCLASS_WORD_MASK, 0},
+
+    /* e0-ef: 3-byte sequence.  */
+    {0, 0, 0, 0, 0, 0, 0, 0xffff},
+
+    /* f0-f7: 4-byte sequence.  */
+    {0, 0, 0, 0, 0, 0, 0, 0xff0000}
   };
   const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
   unsigned int i;
@@ -2211,7 +2232,7 @@ charclass_context (charclass c)
   if (tstbit (eolbyte, c))
     context |= CTX_NEWLINE;
 
-  for (j = 0; j < CHARCLASS_INTS; ++j)
+  for (j = 0; j < CHARCLASS_WORDS; ++j)
     {
       if (c[j] & letters[j])
         context |= CTX_LETTER;
@@ -2544,11 +2565,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
   size_t ngrps = 0;             /* Number of groups actually used.  */
   position pos;                 /* Current position being considered.  */
   charclass matches;            /* Set of matching characters.  */
-  unsigned int matchesf;        /* Nonzero if matches is nonempty.  */
+  charclass_word matchesf;     /* Nonzero if matches is nonempty.  */
   charclass intersect;          /* Intersection with some label set.  */
-  unsigned int intersectf;      /* Nonzero if intersect is nonempty.  */
+  charclass_word intersectf;   /* Nonzero if intersect is nonempty.  */
   charclass leftovers;          /* Stuff in the label that didn't match.  */
-  unsigned int leftoversf;      /* Nonzero if leftovers is nonempty.  */
+  charclass_word leftoversf;   /* Nonzero if leftovers is nonempty.  */
   position_set follows;         /* Union of the follows of some group.  */
   position_set tmp;             /* Temporary space for merging sets.  */
   int possible_contexts;        /* Contexts that this group can match.  */
@@ -2592,21 +2613,21 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         {
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_NEWLINE))
-            for (j = 0; j < CHARCLASS_INTS; ++j)
+            for (j = 0; j < CHARCLASS_WORDS; ++j)
               matches[j] &= ~newline[j];
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_LETTER))
-            for (j = 0; j < CHARCLASS_INTS; ++j)
+            for (j = 0; j < CHARCLASS_WORDS; ++j)
               matches[j] &= ~letters[j];
           if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
                                     d->states[s].context, CTX_NONE))
-            for (j = 0; j < CHARCLASS_INTS; ++j)
+            for (j = 0; j < CHARCLASS_WORDS; ++j)
               matches[j] &= letters[j] | newline[j];
 
           /* If there are no characters left, there's no point in going on.  */
-          for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
+          for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
             continue;
-          if (j == CHARCLASS_INTS)
+          if (j == CHARCLASS_WORDS)
             continue;
         }
 
@@ -2622,17 +2643,17 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
           /* Check if this group's label has a nonempty intersection with
              matches.  */
           intersectf = 0;
-          for (k = 0; k < CHARCLASS_INTS; ++k)
+          for (k = 0; k < CHARCLASS_WORDS; ++k)
             intersectf |= intersect[k] = matches[k] & labels[j][k];
           if (!intersectf)
             continue;
 
           /* It does; now find the set differences both ways.  */
           leftoversf = matchesf = 0;
-          for (k = 0; k < CHARCLASS_INTS; ++k)
+          for (k = 0; k < CHARCLASS_WORDS; ++k)
             {
               /* Even an optimizing compiler can't know this for sure.  */
-              int match = matches[k], label = labels[j][k];
+              charclass_word match = matches[k], label = labels[j][k];
 
               leftoversf |= leftovers[k] = ~match & label;
               matchesf |= matches[k] = match & ~label;
@@ -2771,11 +2792,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
         state_letter = state;
 
       /* Set the transitions for each character in the current label.  */
-      for (j = 0; j < CHARCLASS_INTS; ++j)
-        for (k = 0; k < INTBITS; ++k)
-          if (labels[i][j] & 1U << k)
+      for (j = 0; j < CHARCLASS_WORDS; ++j)
+        for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
+          if (labels[i][j] >> k & 1)
             {
-              int c = j * INTBITS + k;
+              int c = j * CHARCLASS_WORD_BITS + k;
 
               if (c == eolbyte)
                 trans[c] = state_newline;
-- 
1.9.0

From 84668e5624aaf7a46c16ab890bbbe52c03ba8dc1 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 8 May 2014 17:27:40 -0700
Subject: [PATCH 3/3] dfa: assume C89 for CHAR_BIT

* src/dfa.c (CHARBITS): Remove.  All uses replaced by CHAR_BIT.
(NOTCHAR): Now an enum, since it need not be a macro.
---
 src/dfa.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 7704d5e..fc6ce72 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -62,13 +62,8 @@
 # undef clrbit
 #endif
 
-/* Number of bits in an unsigned char.  */
-#ifndef CHARBITS
-# define CHARBITS 8
-#endif
-
 /* First integer value that is greater than any character code.  */
-#define NOTCHAR (1 << CHARBITS)
+enum { NOTCHAR = 1 << CHAR_BIT };
 
 /* This represents part of a character class.  It must be unsigned and
    at least CHARCLASS_WORD_BITS wide.  Any excess bits are zero.  */
-- 
1.9.0

bug#17443: Don't assume 'unsigned' is exactly 32 bits wide

Reply via email to