I installed the attached patches, which port GNU grep's DFA code to
unusual platforms where 'int' is wider than 32 bits. A
currently-produced example is the Unisys ClearPath MCP, where 'unsigned
int' is 39 bits wide.
The 2nd patch is the real change; the other patches are minor cleanup.
None of these changes should affect typical hosts where 'unsigned int'
is 32 bits wide.
My source for that "39 bits" is page 4-490 of:
http://public.support.unisys.com/aseries/docs/clearpath-mcp-15.0/pdf/86002278-305.pdf
From cdb90cb9cbe47588fac7537896f1cfa98792b827 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 8 May 2014 16:13:54 -0700
Subject: [PATCH 1/3] maint: fix indenting to pacify
'prohibit_tab_based_indentation'
* src/dfa.c: Use spaces and not tabs to indent some lines.
---
src/dfa.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index fc2acdc..93638ec 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -182,25 +182,25 @@ enum
a backtracking matcher. */
BEGLINE, /* BEGLINE is a terminal symbol that matches
- the empty string at the beginning of a
- line. */
+ the empty string at the beginning of a
+ line. */
ENDLINE, /* ENDLINE is a terminal symbol that matches
- the empty string at the end of a line. */
+ the empty string at the end of a line. */
BEGWORD, /* BEGWORD is a terminal symbol that matches
- the empty string at the beginning of a
- word. */
+ the empty string at the beginning of a
+ word. */
ENDWORD, /* ENDWORD is a terminal symbol that matches
- the empty string at the end of a word. */
+ the empty string at the end of a word. */
LIMWORD, /* LIMWORD is a terminal symbol that matches
- the empty string at the beginning or the
- end of a word. */
+ the empty string at the beginning or the
+ end of a word. */
NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that
- matches the empty string not at
+ matches the empty string not at
the beginning or end of a word. */
QMARK, /* QMARK is an operator of one argument that
@@ -839,9 +839,9 @@ static int cur_mb_len = 1; /* Length of the multibyte
representation of
wctok. */
static wint_t wctok; /* Wide character representation of the current
- multibyte character, or WEOF if there was
- an encoding error. Used only if
- MB_CUR_MAX > 1. */
+ multibyte character, or WEOF if there was
+ an encoding error. Used only if
+ MB_CUR_MAX > 1. */
/* Fetch the next lexical input character. Set C (of type int) to the
--
1.9.0
From 80322a1bcd61581fd0fb17117cf986b52ca60cfb Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 8 May 2014 17:24:32 -0700
Subject: [PATCH 2/3] dfa: don't assume unsigned int is exactly 32 bits wide
Sun C 5.12 (sparc) warns of the potential unportability.
* src/dfa.c (charclass_word): New type, for clarity.
All relevant uses of 'unsigned' changed.
(CHARCLASS_WORD_BITS): Rename from INTBITS. All uses changed.
Now an enum, since it needn't be a macro.
(CHARCLASS_WORD_MASK): New macro.
(CHARCLASS_WORDS): Rename from CHARCLASS_INTS. All uses changed.
(setbit, clrbit): Cast 1 to charclass_word, for clarity.
(notset, add_utf8_anychar, dfastats):
Don't assume unsigned int is exactly 32 bits wide.
(dfastate): Don't rely on implementation-defined conversion of
greater-than-INT_MAX unsigned to int. Change bit test to resemble
tstbit more.
---
src/dfa.c | 87 +++++++++++++++++++++++++++++++++++++++------------------------
1 file changed, 54 insertions(+), 33 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 93638ec..7704d5e 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -70,16 +70,26 @@
/* First integer value that is greater than any character code. */
#define NOTCHAR (1 << CHARBITS)
-/* INTBITS need not be exact, just a lower bound. */
-#ifndef INTBITS
-# define INTBITS (CHARBITS * sizeof (int))
-#endif
+/* This represents part of a character class. It must be unsigned and
+ at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
+typedef unsigned int charclass_word;
+
+/* The number of bits used in a charclass word. utf8_classes assumes
+ this is exactly 32. */
+enum { CHARCLASS_WORD_BITS = 32 };
+
+/* The maximum useful value of a charclass_word; all used bits are 1. */
+#define CHARCLASS_WORD_MASK \
+ (((charclass_word) 1 << (CHARCLASS_WORD_BITS - 1) << 1) - 1)
-/* Number of ints required to hold a bit for every character. */
-#define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)
+/* Number of words required to hold a bit for every character. */
+enum
+{
+ CHARCLASS_WORDS = (NOTCHAR + CHARCLASS_WORD_BITS - 1) / CHARCLASS_WORD_BITS
+};
/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
-typedef unsigned int charclass[CHARCLASS_INTS];
+typedef charclass_word charclass[CHARCLASS_WORDS];
/* Convert a possibly-signed character to an unsigned character. This is
a bit safer than casting to unsigned char, since it catches some type
@@ -575,19 +585,20 @@ prtok (token t)
static bool
tstbit (unsigned int b, charclass const c)
{
- return c[b / INTBITS] >> b % INTBITS & 1;
+ return c[b / CHARCLASS_WORD_BITS] >> b % CHARCLASS_WORD_BITS & 1;
}
static void
setbit (unsigned int b, charclass c)
{
- c[b / INTBITS] |= 1U << b % INTBITS;
+ c[b / CHARCLASS_WORD_BITS] |= (charclass_word) 1 << b % CHARCLASS_WORD_BITS;
}
static void
clrbit (unsigned int b, charclass c)
{
- c[b / INTBITS] &= ~(1U << b % INTBITS);
+ c[b / CHARCLASS_WORD_BITS] &= ~((charclass_word) 1
+ << b % CHARCLASS_WORD_BITS);
}
static void
@@ -607,8 +618,8 @@ notset (charclass s)
{
int i;
- for (i = 0; i < CHARCLASS_INTS; ++i)
- s[i] = ~s[i];
+ for (i = 0; i < CHARCLASS_WORDS; ++i)
+ s[i] = CHARCLASS_WORD_MASK & ~s[i];
}
static bool
@@ -1696,11 +1707,21 @@ static void
add_utf8_anychar (void)
{
static const charclass utf8_classes[5] = {
- {0, 0, 0, 0, ~0, ~0, 0, 0}, /* 80-bf: non-leading bytes */
- {~0, ~0, ~0, ~0, 0, 0, 0, 0}, /* 00-7f: 1-byte sequence */
- {0, 0, 0, 0, 0, 0, ~3, 0}, /* c2-df: 2-byte sequence */
- {0, 0, 0, 0, 0, 0, 0, 0xffff}, /* e0-ef: 3-byte sequence */
- {0, 0, 0, 0, 0, 0, 0, 0xff0000} /* f0-f7: 4-byte sequence */
+ /* 80-bf: non-leading bytes. */
+ {0, 0, 0, 0, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, 0, 0},
+
+ /* 00-7f: 1-byte sequence. */
+ {CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK, CHARCLASS_WORD_MASK,
+ CHARCLASS_WORD_MASK, 0, 0, 0, 0},
+
+ /* c2-df: 2-byte sequence. */
+ {0, 0, 0, 0, 0, 0, ~3 & CHARCLASS_WORD_MASK, 0},
+
+ /* e0-ef: 3-byte sequence. */
+ {0, 0, 0, 0, 0, 0, 0, 0xffff},
+
+ /* f0-f7: 4-byte sequence. */
+ {0, 0, 0, 0, 0, 0, 0, 0xff0000}
};
const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]);
unsigned int i;
@@ -2211,7 +2232,7 @@ charclass_context (charclass c)
if (tstbit (eolbyte, c))
context |= CTX_NEWLINE;
- for (j = 0; j < CHARCLASS_INTS; ++j)
+ for (j = 0; j < CHARCLASS_WORDS; ++j)
{
if (c[j] & letters[j])
context |= CTX_LETTER;
@@ -2544,11 +2565,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
size_t ngrps = 0; /* Number of groups actually used. */
position pos; /* Current position being considered. */
charclass matches; /* Set of matching characters. */
- unsigned int matchesf; /* Nonzero if matches is nonempty. */
+ charclass_word matchesf; /* Nonzero if matches is nonempty. */
charclass intersect; /* Intersection with some label set. */
- unsigned int intersectf; /* Nonzero if intersect is nonempty. */
+ charclass_word intersectf; /* Nonzero if intersect is nonempty. */
charclass leftovers; /* Stuff in the label that didn't match. */
- unsigned int leftoversf; /* Nonzero if leftovers is nonempty. */
+ charclass_word leftoversf; /* Nonzero if leftovers is nonempty. */
position_set follows; /* Union of the follows of some group. */
position_set tmp; /* Temporary space for merging sets. */
int possible_contexts; /* Contexts that this group can match. */
@@ -2592,21 +2613,21 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
{
if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_NEWLINE))
- for (j = 0; j < CHARCLASS_INTS; ++j)
+ for (j = 0; j < CHARCLASS_WORDS; ++j)
matches[j] &= ~newline[j];
if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_LETTER))
- for (j = 0; j < CHARCLASS_INTS; ++j)
+ for (j = 0; j < CHARCLASS_WORDS; ++j)
matches[j] &= ~letters[j];
if (!SUCCEEDS_IN_CONTEXT (pos.constraint,
d->states[s].context, CTX_NONE))
- for (j = 0; j < CHARCLASS_INTS; ++j)
+ for (j = 0; j < CHARCLASS_WORDS; ++j)
matches[j] &= letters[j] | newline[j];
/* If there are no characters left, there's no point in going on. */
- for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
+ for (j = 0; j < CHARCLASS_WORDS && !matches[j]; ++j)
continue;
- if (j == CHARCLASS_INTS)
+ if (j == CHARCLASS_WORDS)
continue;
}
@@ -2622,17 +2643,17 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
/* Check if this group's label has a nonempty intersection with
matches. */
intersectf = 0;
- for (k = 0; k < CHARCLASS_INTS; ++k)
+ for (k = 0; k < CHARCLASS_WORDS; ++k)
intersectf |= intersect[k] = matches[k] & labels[j][k];
if (!intersectf)
continue;
/* It does; now find the set differences both ways. */
leftoversf = matchesf = 0;
- for (k = 0; k < CHARCLASS_INTS; ++k)
+ for (k = 0; k < CHARCLASS_WORDS; ++k)
{
/* Even an optimizing compiler can't know this for sure. */
- int match = matches[k], label = labels[j][k];
+ charclass_word match = matches[k], label = labels[j][k];
leftoversf |= leftovers[k] = ~match & label;
matchesf |= matches[k] = match & ~label;
@@ -2771,11 +2792,11 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
state_letter = state;
/* Set the transitions for each character in the current label. */
- for (j = 0; j < CHARCLASS_INTS; ++j)
- for (k = 0; k < INTBITS; ++k)
- if (labels[i][j] & 1U << k)
+ for (j = 0; j < CHARCLASS_WORDS; ++j)
+ for (k = 0; k < CHARCLASS_WORD_BITS; ++k)
+ if (labels[i][j] >> k & 1)
{
- int c = j * INTBITS + k;
+ int c = j * CHARCLASS_WORD_BITS + k;
if (c == eolbyte)
trans[c] = state_newline;
--
1.9.0
From 84668e5624aaf7a46c16ab890bbbe52c03ba8dc1 Mon Sep 17 00:00:00 2001
From: Paul Eggert <[email protected]>
Date: Thu, 8 May 2014 17:27:40 -0700
Subject: [PATCH 3/3] dfa: assume C89 for CHAR_BIT
* src/dfa.c (CHARBITS): Remove. All uses replaced by CHAR_BIT.
(NOTCHAR): Now an enum, since it need not be a macro.
---
src/dfa.c | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/src/dfa.c b/src/dfa.c
index 7704d5e..fc6ce72 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -62,13 +62,8 @@
# undef clrbit
#endif
-/* Number of bits in an unsigned char. */
-#ifndef CHARBITS
-# define CHARBITS 8
-#endif
-
/* First integer value that is greater than any character code. */
-#define NOTCHAR (1 << CHARBITS)
+enum { NOTCHAR = 1 << CHAR_BIT };
/* This represents part of a character class. It must be unsigned and
at least CHARCLASS_WORD_BITS wide. Any excess bits are zero. */
--
1.9.0