[PATCH 7/9] grep: split search.c

Paolo Bonzini Fri, 19 Mar 2010 04:49:17 -0700

* po/POTFILES.in: Update.
* src/Makefile.am (grep_SOURCES, egrep_SOURCES, fgrep_SOURCES): Move
kwset.c and dfa.c to libsearch.a.  Add searchutils.c there too.
* src/search.h, src/dfasearch.c, src/pcresearch.c, src/kwsearch.c,
src/searchutils.c: New files, split out of src/search.c.
* src/esearch.c, src/fsearch.c: Include the new files instead of search.c.
* src/gsearch.c: Likewise, plus move Gcompile/Acompile here.
---
 po/POTFILES.in    |    4 +-
 src/Makefile.am   |   18 +-
 src/dfasearch.c   |  394 +++++++++++++++++++++++++
 src/esearch.c     |    3 +-
 src/fsearch.c     |    3 +-
 src/gsearch.c     |   18 +-
 src/kwsearch.c    |  161 ++++++++++
 src/pcresearch.c  |  177 +++++++++++
 src/search.c      |  844 -----------------------------------------------------
 src/search.h      |   47 +++
 src/searchutils.c |  141 +++++++++
 11 files changed, 952 insertions(+), 858 deletions(-)
 create mode 100644 src/dfasearch.c
 create mode 100644 src/kwsearch.c
 create mode 100644 src/pcresearch.c
 delete mode 100644 src/search.c
 create mode 100644 src/search.h
 create mode 100644 src/searchutils.c


diff --git a/po/POTFILES.in b/po/POTFILES.in
index 920413e..286c731 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -26,4 +26,6 @@ lib/xstrtol-error.c
 src/dfa.c
 src/grep.c
 src/kwset.c
-src/search.c
+src/dfasearch.c
+src/kwsearch.c
+src/pcresearch.c
diff --git a/src/Makefile.am b/src/Makefile.am
index 0b0140e..7ebc126 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -19,16 +19,20 @@ LN = ln
 AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
 
 bin_PROGRAMS = grep egrep fgrep
-grep_SOURCES  =  grep.c gsearch.c kwset.c dfa.c
-egrep_SOURCES = egrep.c esearch.c kwset.c dfa.c
-fgrep_SOURCES = fgrep.c fsearch.c kwset.c
-noinst_HEADERS = grep.h dfa.h kwset.h system.h mbsupport.h
+grep_SOURCES  =  grep.c gsearch.c
+egrep_SOURCES = egrep.c esearch.c
+fgrep_SOURCES = fgrep.c fsearch.c
+noinst_HEADERS = grep.h dfa.h kwset.h search.h system.h mbsupport.h
 
-LDADD = $(LIBINTL) ../lib/libgreputils.a
-grep_LDADD = $(PCRE_LIBS) $(LDADD)
+noinst_LIBRARIES = libsearch.a
+libsearch_a_SOURCES = kwset.c dfa.c searchutils.c
+
+LDADD = $(LIBINTL) libsearch.a ../lib/libgreputils.a
+grep_LDADD = $(LDADD) $(PCRE_LIBS)
 localedir = $(datadir)/locale
 INCLUDES = -I$(top_srcdir)/lib -DLOCALEDIR=\"$(localedir)\"
 
 EXTRA_DIST = \
-             dosbuf.c search.c \
+             dosbuf.c \
+            pcresearch.c dfasearch.c kwsearch.c \
              vms_fab.c vms_fab.h
diff --git a/src/dfasearch.c b/src/dfasearch.c
new file mode 100644
index 0000000..ef9404b
--- /dev/null
+++ b/src/dfasearch.c
@@ -0,0 +1,394 @@
+/* dfasearch.c - searching subroutines using dfa and regex for grep.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+#include "dfa.h"
+
+/* For -w, we also consider _ to be word constituent.  */
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
+
+/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
+   a list of strings, at least one of which is known to occur in
+   any string matching the regexp. */
+static kwset_t kwset;
+
+/* DFA compiled regexp. */
+static struct dfa dfa;
+
+/* The Regex compiled patterns.  */
+static struct patterns
+{
+  /* Regex compiled regexp. */
+  struct re_pattern_buffer regexbuf;
+  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
+                              q...@#%!# library interface in regex.c.  */
+} patterns0;
+
+struct patterns *patterns;
+size_t pcount;
+
+void
+dfaerror (char const *mesg)
+{
+  error (EXIT_TROUBLE, 0, "%s", mesg);
+}
+
+/* Number of compiled fixed strings known to exactly match the regexp.
+   If kwsexec returns < kwset_exact_matches, then we don't need to
+   call the regexp matcher at all. */
+static int kwset_exact_matches;
+
+static char const *
+kwsincr_case (const char *must)
+{
+  const char *buf;
+  size_t n;
+
+  n = strlen (must);
+#ifdef MBS_SUPPORT
+  if (match_icase && MB_CUR_MAX > 1)
+    buf = mbtolower (must, &n);
+  else
+#endif
+    buf = must;
+  return kwsincr (kwset, buf, n);
+}
+
+/* If the DFA turns out to have some set of fixed strings one of
+   which must occur in the match, then we build a kwset matcher
+   to find those strings, and thus quickly filter out impossible
+   matches. */
+static void
+kwsmusts (void)
+{
+  struct dfamust const *dm;
+  char const *err;
+
+  if (dfa.musts)
+    {
+      kwsinit (&kwset);
+      /* First, we compile in the substrings known to be exact
+        matches.  The kwset matcher will return the index
+        of the matching string that it chooses. */
+      for (dm = dfa.musts; dm; dm = dm->next)
+       {
+         if (!dm->exact)
+           continue;
+         ++kwset_exact_matches;
+         if ((err = kwsincr_case (dm->must)) != NULL)
+           error (EXIT_TROUBLE, 0, "%s", err);
+       }
+      /* Now, we compile the substrings that will require
+        the use of the regexp matcher.  */
+      for (dm = dfa.musts; dm; dm = dm->next)
+       {
+         if (dm->exact)
+           continue;
+         if ((err = kwsincr_case (dm->must)) != NULL)
+           error (EXIT_TROUBLE, 0, "%s", err);
+       }
+      if ((err = kwsprep (kwset)) != NULL)
+       error (EXIT_TROUBLE, 0, "%s", err);
+    }
+}
+
+/* No __VA_ARGS__ in C89.  So we have to do it this way.  */
+static void
+GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
+{
+  const char *err;
+  const char *p, *sep;
+  size_t total = size;
+  char *motif;
+
+  if (match_icase)
+    syntax_bits |= RE_ICASE;
+  re_set_syntax (syntax_bits);
+  dfasyntax (syntax_bits, match_icase, eolbyte);
+
+  /* For GNU regex compiler we have to pass the patterns separately to detect
+     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
+     GNU regex should have raise a syntax error.  The same for backref, where
+     the backref should have been local to each pattern.  */
+  p = pattern;
+  do
+    {
+      size_t len;
+      sep = memchr (p, '\n', total);
+      if (sep)
+       {
+         len = sep - p;
+         sep++;
+         total -= (len + 1);
+       }
+      else
+       {
+         len = total;
+         total = 0;
+       }
+
+      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
+      if (patterns == NULL)
+       error (EXIT_TROUBLE, errno, _("memory exhausted"));
+      patterns[pcount] = patterns0;
+
+      if ((err = re_compile_pattern (p, len,
+                                   &(patterns[pcount].regexbuf))) != NULL)
+       error (EXIT_TROUBLE, 0, "%s", err);
+      pcount++;
+
+      p = sep;
+    } while (sep && total != 0);
+
+  /* In the match_words and match_lines cases, we use a different pattern
+     for the DFA matcher that will quickly throw out cases that won't work.
+     Then if DFA succeeds we do some hairy stuff using the regex matcher
+     to decide whether the match should really count. */
+  if (match_words || match_lines)
+    {
+      static char const line_beg_no_bk[] = "^(";
+      static char const line_end_no_bk[] = ")$";
+      static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
+      static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
+      static char const line_beg_bk[] = "^\\(";
+      static char const line_end_bk[] = "\\)$";
+      static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
+      static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
+      int bk = !(syntax_bits & RE_NO_BK_PARENS);
+      char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
+
+      strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
+                            : (bk ? word_beg_bk : word_beg_no_bk));
+      total = strlen(n);
+      memcpy (n + total, pattern, size);
+      total += size;
+      strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
+                                    : (bk ? word_end_bk : word_end_no_bk));
+      total += strlen (n + total);
+      pattern = motif = n;
+      size = total;
+    }
+  else
+    motif = NULL;
+
+  dfacomp (pattern, size, &dfa, 1);
+  kwsmusts ();
+
+  free(motif);
+}
+
+static void
+Ecompile (char const *pattern, size_t size)
+{
+  return GEAcompile (pattern, size, RE_SYNTAX_POSIX_EGREP);
+}
+
+static size_t
+EGexecute (char const *buf, size_t size, size_t *match_size, char const 
*start_ptr)
+{
+  char const *buflim, *beg, *end, *match, *best_match, *mb_start;
+  char eol = eolbyte;
+  int backref, start, len, best_len;
+  struct kwsmatch kwsm;
+  size_t i, ret_val;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      if (match_icase)
+        {
+          /* mbtolower adds a NUL byte at the end.  That will provide
+            space for the sentinel byte dfaexec may add.  */
+          char *case_buf = mbtolower (buf, &size);
+         if (start_ptr)
+           start_ptr = case_buf + (start_ptr - buf);
+          buf = case_buf;
+        }
+    }
+#endif /* MBS_SUPPORT */
+
+  mb_start = buf;
+  buflim = buf + size;
+
+  for (beg = end = buf; end < buflim; beg = end)
+    {
+      if (!start_ptr)
+       {
+         /* We don't care about an exact match.  */
+         if (kwset)
+           {
+             /* Find a possible match using the KWset matcher. */
+             size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
+             if (offset == (size_t) -1)
+               goto failure;
+             beg += offset;
+             /* Narrow down to the line containing the candidate, and
+                run it through DFA. */
+             if ((end = memchr(beg, eol, buflim - beg)) != NULL)
+               end++;
+              else
+                end = buflim;
+             match = beg;
+             while (beg > buf && beg[-1] != eol)
+               --beg;
+             if (kwsm.index < kwset_exact_matches)
+                {
+#ifdef MBS_SUPPORT
+                  if (mb_start < beg)
+                    mb_start = beg;
+                  if (MB_CUR_MAX == 1 || !is_mb_middle (&mb_start, match, 
buflim))
+#endif
+                    goto success;
+                }
+             if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
+               continue;
+           }
+         else
+           {
+             /* No good fixed strings; start with DFA. */
+             char const *next_beg = dfaexec (&dfa, beg, (char *) buflim,
+                                             0, NULL, &backref);
+             if (next_beg == NULL)
+               break;
+             /* Narrow down to the line we've found. */
+             beg = next_beg;
+             if ((end = memchr(beg, eol, buflim - beg)) != NULL)
+               end++;
+              else
+                end = buflim;
+             while (beg > buf && beg[-1] != eol)
+               --beg;
+           }
+         /* Successful, no backreferences encountered! */
+         if (!backref)
+           goto success;
+       }
+      else
+       {
+         /* We are looking for the leftmost (then longest) exact match.
+            We will go through the outer loop only once.  */
+         beg = start_ptr;
+         end = buflim;
+       }
+
+      /* If we've made it to this point, this means DFA has seen
+        a probable match, and we need to run it through Regex. */
+      best_match = end;
+      best_len = 0;
+      for (i = 0; i < pcount; i++)
+       {
+         patterns[i].regexbuf.not_eol = 0;
+         if (0 <= (start = re_search (&(patterns[i].regexbuf),
+                                      buf, end - buf - 1,
+                                      beg - buf, end - beg - 1,
+                                      &(patterns[i].regs))))
+           {
+             len = patterns[i].regs.end[0] - start;
+             match = buf + start;
+             if (match > best_match)
+               continue;
+             if (start_ptr && !match_words)
+               goto assess_pattern_match;
+             if ((!match_lines && !match_words)
+                 || (match_lines && len == end - beg - 1))
+               {
+                 match = beg;
+                 len = end - beg;
+                 goto assess_pattern_match;
+               }
+             /* If -w, check if the match aligns with word boundaries.
+                We do this iteratively because:
+                (a) the line may contain more than one occurence of the
+                pattern, and
+                (b) Several alternatives in the pattern might be valid at a
+                given point, and we may need to consider a shorter one to
+                find a word boundary.  */
+             if (match_words)
+               while (match <= best_match)
+                 {
+                   if ((match == buf || !WCHAR ((unsigned char) match[-1]))
+                       && (len == end - beg - 1
+                           || !WCHAR ((unsigned char) match[len])))
+                     goto assess_pattern_match;
+                   if (len > 0)
+                     {
+                       /* Try a shorter length anchored at the same place. */
+                       --len;
+                       patterns[i].regexbuf.not_eol = 1;
+                       len = re_match (&(patterns[i].regexbuf),
+                                       buf, match + len - beg, match - buf,
+                                       &(patterns[i].regs));
+                     }
+                   if (len <= 0)
+                     {
+                       /* Try looking further on. */
+                       if (match == end - 1)
+                         break;
+                       match++;
+                       patterns[i].regexbuf.not_eol = 0;
+                       start = re_search (&(patterns[i].regexbuf),
+                                          buf, end - buf - 1,
+                                          match - buf, end - match - 1,
+                                          &(patterns[i].regs));
+                       if (start < 0)
+                         break;
+                       len = patterns[i].regs.end[0] - start;
+                       match = buf + start;
+                     }
+                 } /* while (match <= best_match) */
+             continue;
+           assess_pattern_match:
+             if (!start_ptr)
+               {
+                 /* Good enough for a non-exact match.
+                    No need to look at further patterns, if any.  */
+                 beg = match;
+                 goto success_in_len;
+               }
+             if (match < best_match || (match == best_match && len > best_len))
+               {
+                 /* Best exact match:  leftmost, then longest.  */
+                 best_match = match;
+                 best_len = len;
+               }
+           } /* if re_search >= 0 */
+       } /* for Regex patterns.  */
+       if (best_match < end)
+         {
+           /* We have found an exact match.  We were just
+              waiting for the best one (leftmost then longest).  */
+           beg = best_match;
+           len = best_len;
+           goto success_in_len;
+         }
+    } /* for (beg = end ..) */
+
+ failure:
+  ret_val = -1;
+  goto out;
+
+ success:
+  len = end - beg;
+ success_in_len:
+  *match_size = len;
+  ret_val = beg - buf;
+ out:
+  return ret_val;
+}
diff --git a/src/esearch.c b/src/esearch.c
index d76c310..8c749c8 100644
--- a/src/esearch.c
+++ b/src/esearch.c
@@ -1,5 +1,4 @@
-#define EGREP_PROGRAM
-#include "search.c"
+#include "dfasearch.c"
 
 struct matcher const matchers[] = {
   { "egrep", Ecompile, EGexecute },
diff --git a/src/fsearch.c b/src/fsearch.c
index e1ca0b1..b16e769 100644
--- a/src/fsearch.c
+++ b/src/fsearch.c
@@ -1,5 +1,4 @@
-#define FGREP_PROGRAM
-#include "search.c"
+#include "kwsearch.c"
 
 struct matcher const matchers[] = {
   { "fgrep", Fcompile, Fexecute },
diff --git a/src/gsearch.c b/src/gsearch.c
index e3e0423..07ceba6 100644
--- a/src/gsearch.c
+++ b/src/gsearch.c
@@ -1,4 +1,19 @@
-#include "search.c"
+#include "dfasearch.c"
+#include "pcresearch.c"
+#include "kwsearch.c"
+
+static void
+Gcompile (char const *pattern, size_t size)
+{
+  return GEAcompile (pattern, size,
+                    RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
+}
+
+static void
+Acompile (char const *pattern, size_t size)
+{
+  return GEAcompile (pattern, size, RE_SYNTAX_AWK);
+}
 
 struct matcher const matchers[] = {
   { "grep",    Gcompile, EGexecute },
@@ -8,4 +23,3 @@ struct matcher const matchers[] = {
   { "perl",    Pcompile, Pexecute },
   { NULL, NULL, NULL },
 };
-
diff --git a/src/kwsearch.c b/src/kwsearch.c
new file mode 100644
index 0000000..0e0f3bf
--- /dev/null
+++ b/src/kwsearch.c
@@ -0,0 +1,161 @@
+/* kwsearch.c - searching subroutines using kwset for grep.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+
+/* For -w, we also consider _ to be word constituent.  */
+#define WCHAR(C) (ISALNUM(C) || (C) == '_')
+
+/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
+   a list of strings, at least one of which is known to occur in
+   any string matching the regexp. */
+static kwset_t kwset;
+
+static void
+Fcompile (char const *pattern, size_t size)
+{
+  char const *beg, *end, *lim, *err, *pat;
+  size_t psize;
+
+  kwsinit (&kwset);
+  psize = size;
+  if (match_icase && MB_CUR_MAX > 1)
+    pat = mbtolower (pattern, &psize);
+  else
+    pat = pattern;
+
+  beg = pat;
+  do
+    {
+      for (lim = beg;; ++lim)
+       {
+         end = lim;
+         if (lim >= pat + psize)
+           break;
+        if (*lim == '\n')
+          {
+            lim++;
+            break;
+          }
+#if HAVE_DOS_FILE_CONTENTS
+        if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
+          {
+            lim += 2;
+            break;
+          }
+#endif
+       }
+
+      if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
+       error (EXIT_TROUBLE, 0, "%s", err);
+      beg = lim;
+    }
+  while (beg < pat + psize);
+
+  if ((err = kwsprep (kwset)) != NULL)
+    error (EXIT_TROUBLE, 0, "%s", err);
+}
+
+static size_t
+Fexecute (char const *buf, size_t size, size_t *match_size, char const 
*start_ptr)
+{
+  char const *beg, *try, *end, *mb_start;
+  size_t len;
+  char eol = eolbyte;
+  struct kwsmatch kwsmatch;
+  size_t ret_val;
+#ifdef MBS_SUPPORT
+  if (MB_CUR_MAX > 1)
+    {
+      if (match_icase)
+        {
+          char *case_buf = mbtolower (buf, &size);
+         if (start_ptr)
+           start_ptr = case_buf + (start_ptr - buf);
+          buf = case_buf;
+        }
+    }
+#endif /* MBS_SUPPORT */
+
+  for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
+    {
+      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
+      if (offset == (size_t) -1)
+       goto failure;
+#ifdef MBS_SUPPORT
+      if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
+        {
+          beg = mb_start - 1;
+          continue; /* It is a part of multibyte character.  */
+        }
+#endif /* MBS_SUPPORT */
+      beg += offset;
+      len = kwsmatch.size[0];
+      if (start_ptr && !match_words)
+       goto success_in_beg_and_len;
+      if (match_lines)
+       {
+         if (beg > buf && beg[-1] != eol)
+           continue;
+         if (beg + len < buf + size && beg[len] != eol)
+           continue;
+         goto success;
+       }
+      else if (match_words)
+       for (try = beg; len; )
+         {
+           if (try > buf && WCHAR((unsigned char) try[-1]))
+             break;
+           if (try + len < buf + size && WCHAR((unsigned char) try[len]))
+             {
+               offset = kwsexec (kwset, beg, --len, &kwsmatch);
+               if (offset == (size_t) -1)
+                 break;
+               try = beg + offset;
+               len = kwsmatch.size[0];
+             }
+           else if (!start_ptr)
+             goto success;
+           else
+             goto success_in_beg_and_len;
+         } /* for (try) */
+      else
+       goto success;
+    } /* for (beg in buf) */
+
+ failure:
+  ret_val = -1;
+  goto out;
+
+ success:
+  if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL)
+    end++;
+  else
+    end = buf + size;
+  while (buf < beg && beg[-1] != eol)
+    --beg;
+  len = end - beg;
+ success_in_beg_and_len:
+  *match_size = len;
+  ret_val = beg - buf;
+ out:
+  return ret_val;
+}
diff --git a/src/pcresearch.c b/src/pcresearch.c
new file mode 100644
index 0000000..c3411b7
--- /dev/null
+++ b/src/pcresearch.c
@@ -0,0 +1,177 @@
+/* pcresearch.c - searching subroutines using PCRE for grep.
+   Copyright 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+/* Written August 1992 by Mike Haertel. */
+
+#include <config.h>
+#include "search.h"
+#ifdef HAVE_LIBPCRE
+# include <pcre.h>
+#endif
+
+#if HAVE_LIBPCRE
+/* Compiled internal form of a Perl regular expression.  */
+static pcre *cre;
+
+/* Additional information about the pattern.  */
+static pcre_extra *extra;
+#endif
+
+static void
+Pcompile (char const *pattern, size_t size)
+{
+#if !HAVE_LIBPCRE
+  error (EXIT_TROUBLE, 0, "%s",
+        _("support for the -P option is not compiled into "
+          "this --disable-perl-regexp binary"));
+#else
+  int e;
+  char const *ep;
+  char *re = xmalloc (4 * size + 7);
+  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
+  char const *patlim = pattern + size;
+  char *n = re;
+  char const *p;
+  char const *pnul;
+
+  /* FIXME: Remove these restrictions.  */
+  if (memchr(pattern, '\n', size))
+    error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
+
+  *n = '\0';
+  if (match_lines)
+    strcpy (n, "^(");
+  if (match_words)
+    strcpy (n, "\\b(");
+  n += strlen (n);
+
+  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
+     replace each NUL byte in the pattern with the four characters
+     "\000", removing a preceding backslash if there are an odd
+     number of backslashes before the NUL.
+
+     FIXME: This method does not work with some multibyte character
+     encodings, notably Shift-JIS, where a multibyte character can end
+     in a backslash byte.  */
+  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
+    {
+      memcpy (n, p, pnul - p);
+      n += pnul - p;
+      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
+       continue;
+      n -= (pnul - p) & 1;
+      strcpy (n, "\\000");
+      n += 4;
+    }
+
+  memcpy (n, p, patlim - p);
+  n += patlim - p;
+  *n = '\0';
+  if (match_words)
+    strcpy (n, ")\\b");
+  if (match_lines)
+    strcpy (n, ")$");
+
+  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
+  if (!cre)
+    error (EXIT_TROUBLE, 0, "%s", ep);
+
+  extra = pcre_study (cre, 0, &ep);
+  if (ep)
+    error (EXIT_TROUBLE, 0, "%s", ep);
+
+  free (re);
+#endif
+}
+
+static size_t
+Pexecute (char const *buf, size_t size, size_t *match_size, char const 
*start_ptr)
+{
+#if !HAVE_LIBPCRE
+  abort ();
+  return -1;
+#else
+  /* This array must have at least two elements; everything after that
+     is just for performance improvement in pcre_exec.  */
+  int sub[300];
+
+  const char *line_buf, *line_end, *line_next;
+  int e = PCRE_ERROR_NOMATCH;
+  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+
+  /* PCRE can't limit the matching to single lines, therefore we have to
+     match each line in the buffer separately.  */
+  for (line_next = buf;
+       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
+       start_ofs -= line_next - line_buf)
+    {
+      line_buf = line_next;
+      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
+      if (line_end == NULL)
+        line_next = line_end = buf + size;
+      else
+        line_next = line_end + 1;
+
+      if (start_ptr && start_ptr >= line_end)
+        continue;
+
+      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
+                     start_ofs < 0 ? 0 : start_ofs, 0,
+                     sub, sizeof sub / sizeof *sub);
+    }
+
+  if (e <= 0)
+    {
+      switch (e)
+       {
+       case PCRE_ERROR_NOMATCH:
+         return -1;
+
+       case PCRE_ERROR_NOMEMORY:
+         error (EXIT_TROUBLE, 0, _("memory exhausted"));
+
+       default:
+         abort ();
+       }
+    }
+  else
+    {
+      /* Narrow down to the line we've found.  */
+      char const *beg = line_buf + sub[0];
+      char const *end = line_buf + sub[1];
+      char const *buflim = buf + size;
+      char eol = eolbyte;
+      if (!start_ptr)
+       {
+         /* FIXME: The case when '\n' is not found indicates a bug:
+            Since grep is line oriented, the match should never contain
+            a newline, so there _must_ be a newline following.
+          */
+         if (!(end = memchr (end, eol, buflim - end)))
+           end = buflim;
+         else
+           end++;
+         while (buf < beg && beg[-1] != eol)
+           --beg;
+       }
+
+      *match_size = end - beg;
+      return beg - buf;
+    }
+#endif
+}
diff --git a/src/search.c b/src/search.c
deleted file mode 100644
index 7b601e0..0000000
--- a/src/search.c
+++ /dev/null
@@ -1,844 +0,0 @@
-/* search.c - searching subroutines using dfa, kwset and regex for grep.
-   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
-
-   This program is free software; you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3, or (at your option)
-   any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program; if not, write to the Free Software
-   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-   02110-1301, USA.  */
-
-/* Written August 1992 by Mike Haertel. */
-
-#include <config.h>
-
-#include <sys/types.h>
-
-#include "mbsupport.h"
-#ifdef MBS_SUPPORT
-/* We can handle multibyte strings. */
-# include <wchar.h>
-# include <wctype.h>
-#endif
-
-#include "system.h"
-#include "grep.h"
-#ifndef FGREP_PROGRAM
-# include <regex.h>
-# include "dfa.h"
-#endif
-#include "kwset.h"
-#include "error.h"
-#include "xalloc.h"
-#ifdef HAVE_LIBPCRE
-# include <pcre.h>
-#endif
-
-#define NCHAR (UCHAR_MAX + 1)
-
-/* For -w, we also consider _ to be word constituent.  */
-#define WCHAR(C) (ISALNUM(C) || (C) == '_')
-
-/* KWset compiled pattern.  For Ecompile and Gcompile, we compile
-   a list of strings, at least one of which is known to occur in
-   any string matching the regexp. */
-static kwset_t kwset;
-
-static void
-kwsinit (void)
-{
-  static char trans[NCHAR];
-  int i;
-
-  if (match_icase && MB_CUR_MAX == 1)
-    {
-      for (i = 0; i < NCHAR; ++i)
-        trans[i] = TOLOWER (i);
-
-      kwset = kwsalloc (trans);
-    }
-  else
-    kwset = kwsalloc (NULL);
-
-  if (!kwset)
-    xalloc_die ();
-}
-
-#ifdef MBS_SUPPORT
-/* Convert the *N-byte string, BEG, to lowercase, and write the
-   NUL-terminated result into malloc'd storage.  Upon success, set *N
-   to the length (in bytes) of the resulting string (not including the
-   trailing NUL byte), and return a pointer to the lowercase string.
-   Upon memory allocation failure, this function exits.
-
-   Note that while this function returns a pointer to malloc'd storage,
-   the caller must not free it, since this function retains a pointer
-   to the buffer and reuses it on any subsequent call.  As a consequence,
-   this function is not thread-safe.  */
-static char *
-mbtolower (const char *beg, size_t *n)
-{
-  static char *out;
-  static size_t outalloc;
-  size_t outlen, mb_cur_max;
-  mbstate_t is, os;
-  const char *end;
-  char *p;
-
-  if (*n > outalloc)
-    {
-      out = xrealloc (out, *n);
-      outalloc = *n;
-    }
-
-  memset (&is, 0, sizeof (is));
-  memset (&os, 0, sizeof (os));
-  end = beg + *n;
-
-  mb_cur_max = MB_CUR_MAX;
-  p = out;
-  outlen = 0;
-  while (beg < end)
-    {
-      wchar_t wc;
-      size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
-      if (outlen + mb_cur_max >= outalloc)
-        {
-          out = x2nrealloc (out, &outalloc, 1);
-          p = out + outlen;
-        }
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-        {
-          /* An invalid sequence, or a truncated multi-octet character.
-             We treat it as a single-octet character.  */
-          *p++ = *beg++;
-          outlen++;
-          memset (&is, 0, sizeof (is));
-          memset (&os, 0, sizeof (os));
-        }
-      else
-        {
-          beg += mbclen;
-          mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
-          p += mbclen;
-          outlen += mbclen;
-        }
-    }
-
-  *n = p - out;
-  *p++ = 0;
-  return out;
-}
-#endif
-
-
-#ifndef FGREP_PROGRAM
-/* DFA compiled regexp. */
-static struct dfa dfa;
-
-/* The Regex compiled patterns.  */
-static struct patterns
-{
-  /* Regex compiled regexp. */
-  struct re_pattern_buffer regexbuf;
-  struct re_registers regs; /* This is here on account of a BRAIN-DEAD
-                              q...@#%!# library interface in regex.c.  */
-} patterns0;
-
-struct patterns *patterns;
-size_t pcount;
-
-void
-dfaerror (char const *mesg)
-{
-  error (EXIT_TROUBLE, 0, "%s", mesg);
-}
-
-/* Number of compiled fixed strings known to exactly match the regexp.
-   If kwsexec returns < kwset_exact_matches, then we don't need to
-   call the regexp matcher at all. */
-static int kwset_exact_matches;
-
-static char const *
-kwsincr_case (const char *must)
-{
-  const char *buf;
-  size_t n;
-
-  n = strlen (must);
-#ifdef MBS_SUPPORT
-  if (match_icase && MB_CUR_MAX > 1)
-    buf = mbtolower (must, &n);
-  else
-#endif
-    buf = must;
-  return kwsincr (kwset, buf, n);
-}
-
-/* If the DFA turns out to have some set of fixed strings one of
-   which must occur in the match, then we build a kwset matcher
-   to find those strings, and thus quickly filter out impossible
-   matches. */
-static void
-kwsmusts (void)
-{
-  struct dfamust const *dm;
-  char const *err;
-
-  if (dfa.musts)
-    {
-      kwsinit ();
-      /* First, we compile in the substrings known to be exact
-        matches.  The kwset matcher will return the index
-        of the matching string that it chooses. */
-      for (dm = dfa.musts; dm; dm = dm->next)
-       {
-         if (!dm->exact)
-           continue;
-         ++kwset_exact_matches;
-         if ((err = kwsincr_case (dm->must)) != NULL)
-           error (EXIT_TROUBLE, 0, "%s", err);
-       }
-      /* Now, we compile the substrings that will require
-        the use of the regexp matcher.  */
-      for (dm = dfa.musts; dm; dm = dm->next)
-       {
-         if (dm->exact)
-           continue;
-         if ((err = kwsincr_case (dm->must)) != NULL)
-           error (EXIT_TROUBLE, 0, "%s", err);
-       }
-      if ((err = kwsprep (kwset)) != NULL)
-       error (EXIT_TROUBLE, 0, "%s", err);
-    }
-}
-#endif /* !FGREP_PROGRAM */
-
-#ifdef MBS_SUPPORT
-
-static bool
-is_mb_middle(const char **good, const char *buf, const char *end)
-{
-  const char *p = *good;
-  const char *prev = p;
-  mbstate_t cur_state;
-
-  /* TODO: can be optimized for UTF-8.  */
-  memset(&cur_state, 0, sizeof(mbstate_t));
-  while (p < buf)
-    {
-      size_t mbclen = mbrlen(p, end - p, &cur_state);
-
-      /* Store the beginning of the previous complete multibyte character.  */
-      if (mbclen != (size_t) -2)
-        prev = p;
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-       {
-         /* An invalid sequence, or a truncated multibyte character.
-            We treat it as a single byte character.  */
-         mbclen = 1;
-       }
-      p += mbclen;
-    }
-
-  *good = prev;
-  return p > buf;
-}
-#endif /* MBS_SUPPORT */
-
-#if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM)
-/* No __VA_ARGS__ in C89.  So we have to do it this way.  */
-static void
-GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
-{
-  const char *err;
-  const char *p, *sep;
-  size_t total = size;
-  char *motif;
-
-  if (match_icase)
-    syntax_bits |= RE_ICASE;
-  re_set_syntax (syntax_bits);
-  dfasyntax (syntax_bits, match_icase, eolbyte);
-
-  /* For GNU regex compiler we have to pass the patterns separately to detect
-     errors like "[\nallo\n]\n".  The patterns here are "[", "allo" and "]"
-     GNU regex should have raise a syntax error.  The same for backref, where
-     the backref should have been local to each pattern.  */
-  p = pattern;
-  do
-    {
-      size_t len;
-      sep = memchr (p, '\n', total);
-      if (sep)
-       {
-         len = sep - p;
-         sep++;
-         total -= (len + 1);
-       }
-      else
-       {
-         len = total;
-         total = 0;
-       }
-
-      patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
-      if (patterns == NULL)
-       error (EXIT_TROUBLE, errno, _("memory exhausted"));
-      patterns[pcount] = patterns0;
-
-      if ((err = re_compile_pattern (p, len,
-                                   &(patterns[pcount].regexbuf))) != NULL)
-       error (EXIT_TROUBLE, 0, "%s", err);
-      pcount++;
-
-      p = sep;
-    } while (sep && total != 0);
-
-  /* In the match_words and match_lines cases, we use a different pattern
-     for the DFA matcher that will quickly throw out cases that won't work.
-     Then if DFA succeeds we do some hairy stuff using the regex matcher
-     to decide whether the match should really count. */
-  if (match_words || match_lines)
-    {
-      static char const line_beg_no_bk[] = "^(";
-      static char const line_end_no_bk[] = ")$";
-      static char const word_beg_no_bk[] = "(^|[^[:alnum:]_])(";
-      static char const word_end_no_bk[] = ")([^[:alnum:]_]|$)";
-      static char const line_beg_bk[] = "^\\(";
-      static char const line_end_bk[] = "\\)$";
-      static char const word_beg_bk[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
-      static char const word_end_bk[] = "\\)\\([^[:alnum:]_]\\|$\\)";
-      int bk = !(syntax_bits & RE_NO_BK_PARENS);
-      char *n = xmalloc (sizeof word_beg_bk - 1 + size + sizeof word_end_bk);
-
-      strcpy (n, match_lines ? (bk ? line_beg_bk : line_beg_no_bk)
-                            : (bk ? word_beg_bk : word_beg_no_bk));
-      total = strlen(n);
-      memcpy (n + total, pattern, size);
-      total += size;
-      strcpy (n + total, match_lines ? (bk ? line_end_bk : line_end_no_bk)
-                                    : (bk ? word_end_bk : word_end_no_bk));
-      total += strlen (n + total);
-      pattern = motif = n;
-      size = total;
-    }
-  else
-    motif = NULL;
-
-  dfacomp (pattern, size, &dfa, 1);
-  kwsmusts ();
-
-  free(motif);
-}
-
-#ifndef EGREP_PROGRAM
-static void
-Gcompile (char const *pattern, size_t size)
-{
-  return GEAcompile (pattern, size,
-                    RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
-}
-
-static void
-Acompile (char const *pattern, size_t size)
-{
-  return GEAcompile (pattern, size, RE_SYNTAX_AWK);
-}
-#endif /* !EGREP_PROGRAM */
-
-static void
-Ecompile (char const *pattern, size_t size)
-{
-  return GEAcompile (pattern, size, RE_SYNTAX_POSIX_EGREP);
-}
-
-static size_t
-EGexecute (char const *buf, size_t size, size_t *match_size, char const 
*start_ptr)
-{
-  char const *buflim, *beg, *end, *match, *best_match, *mb_start;
-  char eol = eolbyte;
-  int backref, start, len, best_len;
-  struct kwsmatch kwsm;
-  size_t i, ret_val;
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          /* mbtolower adds a NUL byte at the end.  That will provide
-            space for the sentinel byte dfaexec may add.  */
-          char *case_buf = mbtolower (buf, &size);
-         if (start_ptr)
-           start_ptr = case_buf + (start_ptr - buf);
-          buf = case_buf;
-        }
-    }
-#endif /* MBS_SUPPORT */
-
-  mb_start = buf;
-  buflim = buf + size;
-
-  for (beg = end = buf; end < buflim; beg = end)
-    {
-      if (!start_ptr)
-       {
-         /* We don't care about an exact match.  */
-         if (kwset)
-           {
-             /* Find a possible match using the KWset matcher. */
-             size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
-             if (offset == (size_t) -1)
-               goto failure;
-             beg += offset;
-             /* Narrow down to the line containing the candidate, and
-                run it through DFA. */
-             if ((end = memchr(beg, eol, buflim - beg)) != NULL)
-               end++;
-              else
-                end = buflim;
-             match = beg;
-             while (beg > buf && beg[-1] != eol)
-               --beg;
-             if (kwsm.index < kwset_exact_matches)
-                {
-#ifdef MBS_SUPPORT
-                  if (mb_start < beg)
-                    mb_start = beg;
-                  if (MB_CUR_MAX == 1 || !is_mb_middle (&mb_start, match, 
buflim))
-#endif
-                    goto success;
-                }
-             if (dfaexec (&dfa, beg, (char *) end, 0, NULL, &backref) == NULL)
-               continue;
-           }
-         else
-           {
-             /* No good fixed strings; start with DFA. */
-             char const *next_beg = dfaexec (&dfa, beg, (char *) buflim,
-                                             0, NULL, &backref);
-             if (next_beg == NULL)
-               break;
-             /* Narrow down to the line we've found. */
-             beg = next_beg;
-             if ((end = memchr(beg, eol, buflim - beg)) != NULL)
-               end++;
-              else
-                end = buflim;
-             while (beg > buf && beg[-1] != eol)
-               --beg;
-           }
-         /* Successful, no backreferences encountered! */
-         if (!backref)
-           goto success;
-       }
-      else
-       {
-         /* We are looking for the leftmost (then longest) exact match.
-            We will go through the outer loop only once.  */
-         beg = start_ptr;
-         end = buflim;
-       }
-
-      /* If we've made it to this point, this means DFA has seen
-        a probable match, and we need to run it through Regex. */
-      best_match = end;
-      best_len = 0;
-      for (i = 0; i < pcount; i++)
-       {
-         patterns[i].regexbuf.not_eol = 0;
-         if (0 <= (start = re_search (&(patterns[i].regexbuf),
-                                      buf, end - buf - 1,
-                                      beg - buf, end - beg - 1,
-                                      &(patterns[i].regs))))
-           {
-             len = patterns[i].regs.end[0] - start;
-             match = buf + start;
-             if (match > best_match)
-               continue;
-             if (start_ptr && !match_words)
-               goto assess_pattern_match;
-             if ((!match_lines && !match_words)
-                 || (match_lines && len == end - beg - 1))
-               {
-                 match = beg;
-                 len = end - beg;
-                 goto assess_pattern_match;
-               }
-             /* If -w, check if the match aligns with word boundaries.
-                We do this iteratively because:
-                (a) the line may contain more than one occurence of the
-                pattern, and
-                (b) Several alternatives in the pattern might be valid at a
-                given point, and we may need to consider a shorter one to
-                find a word boundary.  */
-             if (match_words)
-               while (match <= best_match)
-                 {
-                   if ((match == buf || !WCHAR ((unsigned char) match[-1]))
-                       && (len == end - beg - 1
-                           || !WCHAR ((unsigned char) match[len])))
-                     goto assess_pattern_match;
-                   if (len > 0)
-                     {
-                       /* Try a shorter length anchored at the same place. */
-                       --len;
-                       patterns[i].regexbuf.not_eol = 1;
-                       len = re_match (&(patterns[i].regexbuf),
-                                       buf, match + len - beg, match - buf,
-                                       &(patterns[i].regs));
-                     }
-                   if (len <= 0)
-                     {
-                       /* Try looking further on. */
-                       if (match == end - 1)
-                         break;
-                       match++;
-                       patterns[i].regexbuf.not_eol = 0;
-                       start = re_search (&(patterns[i].regexbuf),
-                                          buf, end - buf - 1,
-                                          match - buf, end - match - 1,
-                                          &(patterns[i].regs));
-                       if (start < 0)
-                         break;
-                       len = patterns[i].regs.end[0] - start;
-                       match = buf + start;
-                     }
-                 } /* while (match <= best_match) */
-             continue;
-           assess_pattern_match:
-             if (!start_ptr)
-               {
-                 /* Good enough for a non-exact match.
-                    No need to look at further patterns, if any.  */
-                 beg = match;
-                 goto success_in_len;
-               }
-             if (match < best_match || (match == best_match && len > best_len))
-               {
-                 /* Best exact match:  leftmost, then longest.  */
-                 best_match = match;
-                 best_len = len;
-               }
-           } /* if re_search >= 0 */
-       } /* for Regex patterns.  */
-       if (best_match < end)
-         {
-           /* We have found an exact match.  We were just
-              waiting for the best one (leftmost then longest).  */
-           beg = best_match;
-           len = best_len;
-           goto success_in_len;
-         }
-    } /* for (beg = end ..) */
-
- failure:
-  ret_val = -1;
-  goto out;
-
- success:
-  len = end - beg;
- success_in_len:
-  *match_size = len;
-  ret_val = beg - buf;
- out:
-  return ret_val;
-}
-#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */
-
-#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM)
-static void
-Fcompile (char const *pattern, size_t size)
-{
-  char const *beg, *end, *lim, *err, *pat;
-  size_t psize;
-
-  kwsinit ();
-  psize = size;
-  if (match_icase && MB_CUR_MAX > 1)
-    pat = mbtolower (pattern, &psize);
-  else
-    pat = pattern;
-
-  beg = pat;
-  do
-    {
-      for (lim = beg;; ++lim)
-       {
-         end = lim;
-         if (lim >= pat + psize)
-           break;
-        if (*lim == '\n')
-          {
-            lim++;
-            break;
-          }
-#if HAVE_DOS_FILE_CONTENTS
-        if (*lim == '\r' && lim + 1 < pat + psize && lim[1] == '\n')
-          {
-            lim += 2;
-            break;
-          }
-#endif
-       }
-
-      if ((err = kwsincr (kwset, beg, end - beg)) != NULL)
-       error (EXIT_TROUBLE, 0, "%s", err);
-      beg = lim;
-    }
-  while (beg < pat + psize);
-
-  if ((err = kwsprep (kwset)) != NULL)
-    error (EXIT_TROUBLE, 0, "%s", err);
-}
-
-static size_t
-Fexecute (char const *buf, size_t size, size_t *match_size, char const 
*start_ptr)
-{
-  char const *beg, *try, *end, *mb_start;
-  size_t len;
-  char eol = eolbyte;
-  struct kwsmatch kwsmatch;
-  size_t ret_val;
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          char *case_buf = mbtolower (buf, &size);
-         if (start_ptr)
-           start_ptr = case_buf + (start_ptr - buf);
-          buf = case_buf;
-        }
-    }
-#endif /* MBS_SUPPORT */
-
-  for (mb_start = beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++)
-    {
-      size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
-      if (offset == (size_t) -1)
-       goto failure;
-#ifdef MBS_SUPPORT
-      if (MB_CUR_MAX > 1 && is_mb_middle (&mb_start, beg + offset, buf + size))
-        {
-          beg = mb_start - 1;
-          continue; /* It is a part of multibyte character.  */
-        }
-#endif /* MBS_SUPPORT */
-      beg += offset;
-      len = kwsmatch.size[0];
-      if (start_ptr && !match_words)
-       goto success_in_beg_and_len;
-      if (match_lines)
-       {
-         if (beg > buf && beg[-1] != eol)
-           continue;
-         if (beg + len < buf + size && beg[len] != eol)
-           continue;
-         goto success;
-       }
-      else if (match_words)
-       for (try = beg; len; )
-         {
-           if (try > buf && WCHAR((unsigned char) try[-1]))
-             break;
-           if (try + len < buf + size && WCHAR((unsigned char) try[len]))
-             {
-               offset = kwsexec (kwset, beg, --len, &kwsmatch);
-               if (offset == (size_t) -1)
-                 break;
-               try = beg + offset;
-               len = kwsmatch.size[0];
-             }
-           else if (!start_ptr)
-             goto success;
-           else
-             goto success_in_beg_and_len;
-         } /* for (try) */
-      else
-       goto success;
-    } /* for (beg in buf) */
-
- failure:
-  ret_val = -1;
-  goto out;
-
- success:
-  if ((end = memchr (beg + len, eol, (buf + size) - (beg + len))) != NULL)
-    end++;
-  else
-    end = buf + size;
-  while (buf < beg && beg[-1] != eol)
-    --beg;
-  len = end - beg;
- success_in_beg_and_len:
-  *match_size = len;
-  ret_val = beg - buf;
- out:
-  return ret_val;
-}
-#endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */
-
-#ifdef GREP_PROGRAM
-#if HAVE_LIBPCRE
-/* Compiled internal form of a Perl regular expression.  */
-static pcre *cre;
-
-/* Additional information about the pattern.  */
-static pcre_extra *extra;
-#endif
-
-static void
-Pcompile (char const *pattern, size_t size)
-{
-#if !HAVE_LIBPCRE
-  error (EXIT_TROUBLE, 0, "%s",
-        _("support for the -P option is not compiled into "
-          "this --disable-perl-regexp binary"));
-#else
-  int e;
-  char const *ep;
-  char *re = xmalloc (4 * size + 7);
-  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
-  char const *patlim = pattern + size;
-  char *n = re;
-  char const *p;
-  char const *pnul;
-
-  /* FIXME: Remove these restrictions.  */
-  if (memchr(pattern, '\n', size))
-    error (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
-
-  *n = '\0';
-  if (match_lines)
-    strcpy (n, "^(");
-  if (match_words)
-    strcpy (n, "\\b(");
-  n += strlen (n);
-
-  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
-     replace each NUL byte in the pattern with the four characters
-     "\000", removing a preceding backslash if there are an odd
-     number of backslashes before the NUL.
-
-     FIXME: This method does not work with some multibyte character
-     encodings, notably Shift-JIS, where a multibyte character can end
-     in a backslash byte.  */
-  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
-    {
-      memcpy (n, p, pnul - p);
-      n += pnul - p;
-      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
-       continue;
-      n -= (pnul - p) & 1;
-      strcpy (n, "\\000");
-      n += 4;
-    }
-
-  memcpy (n, p, patlim - p);
-  n += patlim - p;
-  *n = '\0';
-  if (match_words)
-    strcpy (n, ")\\b");
-  if (match_lines)
-    strcpy (n, ")$");
-
-  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
-  if (!cre)
-    error (EXIT_TROUBLE, 0, "%s", ep);
-
-  extra = pcre_study (cre, 0, &ep);
-  if (ep)
-    error (EXIT_TROUBLE, 0, "%s", ep);
-
-  free (re);
-#endif
-}
-
-static size_t
-Pexecute (char const *buf, size_t size, size_t *match_size, char const 
*start_ptr)
-{
-#if !HAVE_LIBPCRE
-  abort ();
-  return -1;
-#else
-  /* This array must have at least two elements; everything after that
-     is just for performance improvement in pcre_exec.  */
-  int sub[300];
-
-  const char *line_buf, *line_end, *line_next;
-  int e = PCRE_ERROR_NOMATCH;
-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
-
-  /* PCRE can't limit the matching to single lines, therefore we have to
-     match each line in the buffer separately.  */
-  for (line_next = buf;
-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
-       start_ofs -= line_next - line_buf)
-    {
-      line_buf = line_next;
-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
-      if (line_end == NULL)
-        line_next = line_end = buf + size;
-      else
-        line_next = line_end + 1;
-
-      if (start_ptr && start_ptr >= line_end)
-        continue;
-
-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
-                     start_ofs < 0 ? 0 : start_ofs, 0,
-                     sub, sizeof sub / sizeof *sub);
-    }
-
-  if (e <= 0)
-    {
-      switch (e)
-       {
-       case PCRE_ERROR_NOMATCH:
-         return -1;
-
-       case PCRE_ERROR_NOMEMORY:
-         error (EXIT_TROUBLE, 0, _("memory exhausted"));
-
-       default:
-         abort ();
-       }
-    }
-  else
-    {
-      /* Narrow down to the line we've found.  */
-      char const *beg = line_buf + sub[0];
-      char const *end = line_buf + sub[1];
-      char const *buflim = buf + size;
-      char eol = eolbyte;
-      if (!start_ptr)
-       {
-         /* FIXME: The case when '\n' is not found indicates a bug:
-            Since grep is line oriented, the match should never contain
-            a newline, so there _must_ be a newline following.
-          */
-         if (!(end = memchr (end, eol, buflim - end)))
-           end = buflim;
-         else
-           end++;
-         while (buf < beg && beg[-1] != eol)
-           --beg;
-       }
-
-      *match_size = end - beg;
-      return beg - buf;
-    }
-#endif
-}
-#endif /* GREP_PROGRAM */
diff --git a/src/search.h b/src/search.h
new file mode 100644
index 0000000..cb3b535
--- /dev/null
+++ b/src/search.h
@@ -0,0 +1,47 @@
+/* search.c - searching subroutines using dfa, kwset and regex for grep.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+#ifndef GREP_SEARCH_H
+#define GREP_SEARCH_H 1
+
+#include <config.h>
+
+#include <sys/types.h>
+
+#include "mbsupport.h"
+#ifdef MBS_SUPPORT
+/* We can handle multibyte strings. */
+# include <wchar.h>
+# include <wctype.h>
+#endif
+
+#include <regex.h>
+#include "system.h"
+#include "grep.h"
+#include "error.h"
+#include "kwset.h"
+#include "xalloc.h"
+
+void kwsinit (kwset_t *);
+
+#ifdef MBS_SUPPORT
+char * mbtolower (const char *, size_t *);
+bool is_mb_middle(const char **, const char *, const char *);
+#endif
+
+#endif /* GREP_SEARCH_H */
diff --git a/src/searchutils.c b/src/searchutils.c
new file mode 100644
index 0000000..ef4fef3
--- /dev/null
+++ b/src/searchutils.c
@@ -0,0 +1,141 @@
+/* searchutils.c - helper subroutines for grep's matchers.
+   Copyright 1992, 1998, 2000, 2007, 2009-2010 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
+   02110-1301, USA.  */
+
+#include <config.h>
+#include "search.h"
+
+#define NCHAR (UCHAR_MAX + 1)
+
+void
+kwsinit (kwset_t *kwset)
+{
+  static char trans[NCHAR];
+  int i;
+
+  if (match_icase && MB_CUR_MAX == 1)
+    {
+      for (i = 0; i < NCHAR; ++i)
+        trans[i] = TOLOWER (i);
+
+      *kwset = kwsalloc (trans);
+    }
+  else
+    *kwset = kwsalloc (NULL);
+
+  if (!*kwset)
+    xalloc_die ();
+}
+
+#ifdef MBS_SUPPORT
+/* Convert the *N-byte string, BEG, to lowercase, and write the
+   NUL-terminated result into malloc'd storage.  Upon success, set *N
+   to the length (in bytes) of the resulting string (not including the
+   trailing NUL byte), and return a pointer to the lowercase string.
+   Upon memory allocation failure, this function exits.
+
+   Note that while this function returns a pointer to malloc'd storage,
+   the caller must not free it, since this function retains a pointer
+   to the buffer and reuses it on any subsequent call.  As a consequence,
+   this function is not thread-safe.  */
+char *
+mbtolower (const char *beg, size_t *n)
+{
+  static char *out;
+  static size_t outalloc;
+  size_t outlen, mb_cur_max;
+  mbstate_t is, os;
+  const char *end;
+  char *p;
+
+  if (*n > outalloc)
+    {
+      out = xrealloc (out, *n);
+      outalloc = *n;
+    }
+
+  memset (&is, 0, sizeof (is));
+  memset (&os, 0, sizeof (os));
+  end = beg + *n;
+
+  mb_cur_max = MB_CUR_MAX;
+  p = out;
+  outlen = 0;
+  while (beg < end)
+    {
+      wchar_t wc;
+      size_t mbclen = mbrtowc(&wc, beg, end - beg, &is);
+      if (outlen + mb_cur_max >= outalloc)
+        {
+          out = x2nrealloc (out, &outalloc, 1);
+          p = out + outlen;
+        }
+
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+        {
+          /* An invalid sequence, or a truncated multi-octet character.
+             We treat it as a single-octet character.  */
+          *p++ = *beg++;
+          outlen++;
+          memset (&is, 0, sizeof (is));
+          memset (&os, 0, sizeof (os));
+        }
+      else
+        {
+          beg += mbclen;
+          mbclen = wcrtomb (p, towlower ((wint_t) wc), &os);
+          p += mbclen;
+          outlen += mbclen;
+        }
+    }
+
+  *n = p - out;
+  *p++ = 0;
+  return out;
+}
+
+
+bool
+is_mb_middle(const char **good, const char *buf, const char *end)
+{
+  const char *p = *good;
+  const char *prev = p;
+  mbstate_t cur_state;
+
+  /* TODO: can be optimized for UTF-8.  */
+  memset(&cur_state, 0, sizeof(mbstate_t));
+  while (p < buf)
+    {
+      size_t mbclen = mbrlen(p, end - p, &cur_state);
+
+      /* Store the beginning of the previous complete multibyte character.  */
+      if (mbclen != (size_t) -2)
+        prev = p;
+
+      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
+       {
+         /* An invalid sequence, or a truncated multibyte character.
+            We treat it as a single byte character.  */
+         mbclen = 1;
+       }
+      p += mbclen;
+    }
+
+  *good = prev;
+  return p > buf;
+}
+#endif /* MBS_SUPPORT */
-- 
1.6.6.1

[PATCH 7/9] grep: split search.c

Reply via email to