Bug#624387: [PATCH] dfa: fix case folding logic for character ranges

2011-06-07 Thread Paolo Bonzini
* src/dfa.c (setbit_case_fold): Remove, replace with...
(setbit_wc, setbit_c, setbit_case_fold_c): ... these.
(parse_bracket_exp): Use setbit_case_fold_c when iterating over
single-byte sequences.  Use setbit_wc for multi-byte character sets,
and setbit_case_fold_c for single-byte character sets.
(lex): Use setbit_case_fold_c for single-byte character sets.
---
 At first I was going to say this:

   You are using ru_RU.KOI8-R, which is a uni-byte locale, yet your
   inputs (both stdin and the grep regexp) use the two-byte
   representation \xd0\9f for П, instead of the uni-byte \360.

 But it fails even with the single-byte version.
 So it is indeed a bug in grep, but at least this time
 it affects relatively few locales.

 Here's the fix I expect to use and a test case to exercise it.

The bug affects all single-byte locales except ISO-8859-1 ones.
It is quite serious---the logic to map wide characters back to
bytes makes no sense.

The attached patch fixes it and does not regress high-bit-range,
while removing the debatable uses of wctob and checks for EOF.  Ok
to apply together with your testcase?
---
 src/dfa.c |  102 ++---
 1 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/src/dfa.c b/src/dfa.c
index b41cbb6..6602ae8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -536,51 +536,67 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
   eolbyte = eol;
 }
 
-/* Like setbit, but if case is folded, set both cases of a letter.
-   For MB_CUR_MAX  1, one or both of the two cases may not be set,
-   so the resulting charset may only be used as an optimization.  */
-static void
-setbit_case_fold (
+/* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
+   is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
+   this may happen when folding case in weird Turkish locales where
+   dotless i/dotted I are not included in the chosen character set.
+   Return whether a bit was set in the charclass.  */
 #if MBS_SUPPORT
-  wint_t b,
+static bool
+setbit_wc (wint_t wc, charclass c)
+{
+  int b = wctob (wc);
+  if (b != EOF)
+{
+  setbit (b, c);
+  return true;
+}
+  else
+return false;
+}
+
+/* Set a bit in the charclass for the given single byte character,
+   if it is valid in the current character set.  */
+static void
+setbit_c (int b, charclass c)
+{
+  /* Do nothing if b is invalid in this character set.  */
+  if (MB_CUR_MAX  1  btowc (b) == EOF)
+return;
+  setbit (b, c);
+}
 #else
-  unsigned int b,
+#define setbit_c setbit
 #endif
-  charclass c)
+
+/* Like setbit_c, but if case is folded, set both cases of a letter.  For
+   MB_CUR_MAX  1, the resulting charset is only used as an optimization,
+   and the caller takes care of setting the appropriate field of struct
+   mb_char_classes.  */
+static void
+setbit_case_fold_c (int b, charclass c)
 {
-  if (case_fold)
-{
 #if MBS_SUPPORT
-  if (MB_CUR_MAX  1)
-{
-  wint_t b1 = iswupper(b) ? towlower(b) : b;
-  wint_t b2 = iswlower(b) ? towupper(b) : b;
-  if (wctob ((unsigned char)b1) == b1)
-setbit (b1, c);
-  if (b2 != b1  wctob ((unsigned char)b2) == b2)
-setbit (b2, c);
-}
-  else
-#endif
-{
-  unsigned char b1 = isupper(b) ? tolower(b) : b;
-  unsigned char b2 = islower(b) ? toupper(b) : b;
-  setbit (b1, c);
-  if (b2 != b1)
-setbit (b2, c);
-}
+  if (MB_CUR_MAX  1)
+{
+  wint_t wc = btowc (b);
+  if (wc == EOF)
+return;
+  setbit (b, c);
+  if (case_fold  iswalpha (wc))
+setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
 }
   else
-{
-#if MBS_SUPPORT
-  int b2 = wctob ((unsigned char) b);
-  if (b2 == EOF || b2 == b)
 #endif
-setbit (b, c);
+{
+  setbit (b, c);
+  if (case_fold  isalpha (b))
+setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
 }
 }
 
 
+
 /* UTF-8 encoding allows some optimizations that we can't otherwise
assume in a multibyte encoding. */
 static inline int
@@ -859,7 +875,7 @@ parse_bracket_exp (void)
 
   for (c2 = 0; c2  NOTCHAR; ++c2)
 if (pred-func(c2))
-  setbit_case_fold (c2, ccl);
+  setbit_case_fold_c (c2, ccl);
 }
 
 #if MBS_SUPPORT
@@ -970,7 +986,7 @@ parse_bracket_exp (void)
 }
   if (!hard_LC_COLLATE)
 for (c = c1; c = c2; c++)
-  setbit_case_fold (c, ccl);
+  setbit_case_fold_c (c, ccl);
   else
 {
   /* Defer to the system regex library about the meaning
@@ -984,7 

Bug#624387: [PATCH] dfa: fix case folding logic for character ranges

2011-06-07 Thread Jim Meyering
Paolo Bonzini wrote:
 * src/dfa.c (setbit_case_fold): Remove, replace with...
 (setbit_wc, setbit_c, setbit_case_fold_c): ... these.
 (parse_bracket_exp): Use setbit_case_fold_c when iterating over
 single-byte sequences.  Use setbit_wc for multi-byte character sets,
 and setbit_case_fold_c for single-byte character sets.
 (lex): Use setbit_case_fold_c for single-byte character sets.
 ---
At first I was going to say this:
   
  You are using ru_RU.KOI8-R, which is a uni-byte locale, yet your
  inputs (both stdin and the grep regexp) use the two-byte
  representation \xd0\9f for П, instead of the uni-byte \360.
   
But it fails even with the single-byte version.
So it is indeed a bug in grep, but at least this time
it affects relatively few locales.
   
Here's the fix I expect to use and a test case to exercise it.

 The bug affects all single-byte locales except ISO-8859-1 ones.
 It is quite serious---the logic to map wide characters back to
 bytes makes no sense.

 The attached patch fixes it and does not regress high-bit-range,
 while removing the debatable uses of wctob and checks for EOF.  Ok
 to apply together with your testcase?
 ---
  src/dfa.c |  102 
 ++---
  1 files changed, 57 insertions(+), 45 deletions(-)

Hi Paolo,

Thanks for following through on this.
At first glance (I'll look carefully today)
this looks like the right approach.

However, I've gone ahead and pushed my patch and test case,
since it does solve the problem at hand, and I have not seen
inputs that make that code misbehave.



--
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of unsubscribe. Trouble? Contact listmas...@lists.debian.org



Bug#624387: [PATCH] dfa: fix case folding logic for character ranges

2011-06-07 Thread Jim Meyering
Paolo Bonzini wrote:
 * src/dfa.c (setbit_case_fold): Remove, replace with...
 (setbit_wc, setbit_c, setbit_case_fold_c): ... these.
 (parse_bracket_exp): Use setbit_case_fold_c when iterating over
 single-byte sequences.  Use setbit_wc for multi-byte character sets,
 and setbit_case_fold_c for single-byte character sets.
 (lex): Use setbit_case_fold_c for single-byte character sets.
 ---
At first I was going to say this:
   
  You are using ru_RU.KOI8-R, which is a uni-byte locale, yet your
  inputs (both stdin and the grep regexp) use the two-byte
  representation \xd0\9f for П, instead of the uni-byte \360.
   
But it fails even with the single-byte version.
So it is indeed a bug in grep, but at least this time
it affects relatively few locales.
   
Here's the fix I expect to use and a test case to exercise it.

 The bug affects all single-byte locales except ISO-8859-1 ones.
 It is quite serious---the logic to map wide characters back to
 bytes makes no sense.


Thanks again for working on this.
Your mention of case folding implies that this fixes a bug unrelated
to the one I have just fixed.  My patch affected code that was guarded
by !case_fold.

If your patch does indeed fix a case-folding bug,
I would really like to see a test case.
Grep is at a point in its development that any bug
fix commit really should be accompanied by a test suite
addition, if at all possible.

All that said, your patch looks like a fine improvement.

Ahh.  Perfect.  I see that as I was writing this you have
posted a test case (and a nice optimization!).  Thanks again!

The suggestion below looks like it still applies to your latest.

 diff --git a/src/dfa.c b/src/dfa.c
 index b41cbb6..6602ae8 100644
 --- a/src/dfa.c
 +++ b/src/dfa.c
 @@ -536,51 +536,67 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char 
 eol)
eolbyte = eol;
  }

 -/* Like setbit, but if case is folded, set both cases of a letter.
 -   For MB_CUR_MAX  1, one or both of the two cases may not be set,
 -   so the resulting charset may only be used as an optimization.  */
 -static void
 -setbit_case_fold (
 +/* Set a bit in the charclass for the given wchar_t.  Do nothing if WC
 +   is represented by a multi-byte sequence.  Even for MB_CUR_MAX == 1,
 +   this may happen when folding case in weird Turkish locales where
 +   dotless i/dotted I are not included in the chosen character set.
 +   Return whether a bit was set in the charclass.  */
  #if MBS_SUPPORT
 -  wint_t b,
 +static bool
 +setbit_wc (wint_t wc, charclass c)
 +{
 +  int b = wctob (wc);
 +  if (b != EOF)
 +{
 +  setbit (b, c);
 +  return true;
 +}
 +  else
 +return false;
 +}

We can avoid the negation, curly braces and the else.
Not only does this align slightly better with the new description,
but it seems more readable:

  int b = wctob (wc);
  if (b == EOF)
return false;

  setbit (b, c);
  return true;

 +/* Set a bit in the charclass for the given single byte character,
 +   if it is valid in the current character set.  */
 +static void
 +setbit_c (int b, charclass c)
 +{
 +  /* Do nothing if b is invalid in this character set.  */
 +  if (MB_CUR_MAX  1  btowc (b) == EOF)
 +return;
 +  setbit (b, c);
 +}



--
To UNSUBSCRIBE, email to debian-bugs-dist-requ...@lists.debian.org
with a subject of unsubscribe. Trouble? Contact listmas...@lists.debian.org