On Mon, Nov 18, 2013 at 9:25 PM, Jim Meyering <[email protected]> wrote:
> On Mon, Nov 18, 2013 at 6:16 PM, Paul Eggert <[email protected]> wrote:
>> Jim Meyering wrote:
>>>  static int
>>>  tstbit (unsigned int b, charclass const c)
>>>  {
>>> -  return c[b / INTBITS] & 1 << b % INTBITS;
>>> +  return c[b / INTBITS] & 1U << b % INTBITS;
>>>  }
>>
>> On a machine with 32-bit int and where b % INTBITS is 31,
>> the expression c[b / INTBITS] & 1U << b % INTBITS
>> is of type 'unsigned' and can have the value 2**31, and
>> this will overflow when tstbit converts that value as an int,
>> leading to implementation-defined behavior, which can include
>> raising a signal.
>>
>> Better would be something like this:
>>
>> static bool
>> tstbit (unsigned int b, charclass const c)
>> {
>>   return c[b / INTBITS] >> b % INTBITS & 1;
>> }
>>
>> and it'd probably be better to encourage this style in
>> other places where the problem occurs, e.g., quotearg.
>
> Good point.  "bool" is a better return type, too.
> I will adjust.

Here's an updated patch:
From b041f8bc3986d1adcba8d2ea1c48f37c435fcc62 Mon Sep 17 00:00:00 2001
From: Jim Meyering <[email protected]>
Date: Mon, 18 Nov 2013 17:53:33 -0800
Subject: [PATCH] dfa: avoid undefined behavior of "1 << 31"

* src/dfa.c (charclass): Change type from "int" to "unsigned int".
(tstbit): Rather than shifting "1" left to form a mask, shift the
LHS bits the right and use "1" as the mask.  Also, return bool, rather
than "int".
(setbit, clrbit, dfastate): Don't shift "1" (aka (int)1) left by 31 bits.
Instead, use "1U" as the operand, to avoid undefined behavior.
Spotted by gcc's new -fsanitize=undefined.
Co-authored-by: Paul Eggert <[email protected]>
---
 src/dfa.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index 92c410e..f196b8a 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -88,7 +88,7 @@
 #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS)

 /* Sets of unsigned characters are stored as bit vectors in arrays of ints.  */
-typedef int charclass[CHARCLASS_INTS];
+typedef unsigned int charclass[CHARCLASS_INTS];

 /* Convert a possibly-signed character to an unsigned character.  This is
    a bit safer than casting to unsigned char, since it catches some type
@@ -547,22 +547,22 @@ prtok (token t)

 /* Stuff pertaining to charclasses.  */

-static int
+static bool
 tstbit (unsigned int b, charclass const c)
 {
-  return c[b / INTBITS] & 1 << b % INTBITS;
+  return c[b / INTBITS] >> b % INTBITS & 1;
 }

 static void
 setbit (unsigned int b, charclass c)
 {
-  c[b / INTBITS] |= 1 << b % INTBITS;
+  c[b / INTBITS] |= 1U << b % INTBITS;
 }

 static void
 clrbit (unsigned int b, charclass c)
 {
-  c[b / INTBITS] &= ~(1 << b % INTBITS);
+  c[b / INTBITS] &= ~(1U << b % INTBITS);
 }

 static void
@@ -2738,7 +2738,7 @@ dfastate (state_num s, struct dfa *d, state_num trans[])
       /* Set the transitions for each character in the current label.  */
       for (j = 0; j < CHARCLASS_INTS; ++j)
         for (k = 0; k < INTBITS; ++k)
-          if (labels[i][j] & 1 << k)
+          if (labels[i][j] & 1U << k)
             {
               int c = j * INTBITS + k;

--
1.8.4.rc0.11.g35f5eaa

Reply via email to