I discovered some problems with the argument handling in the patch I posted
earlier today.  Please ignore that one and apply this one instead...

According to Elijah Kagan:
> I run htdig 3.1.5.
> I tried both the Debian package and a compiled one with the same result.
> I am absolutely sure there is something stupid I forgot to put into the
> configuration.

OK, after getting to the bottom of this (I think!), I have backported
the 3.2.0b3 development code for htdig/ExternalParser.cc to version
3.1.5, to fix this and other problems.  Please give this patch file
a try and let me know if it works.  You will probably get a warning
about the wait() function being implicitly declared, unless you manually
define HAVE_WAIT_H or HAVE_SYS_WAIT_H (depending on whether your system
has <wait.h> or <sys/wait.h>).  Also, if your system has the mkstemp()
function, you may want to define HAVE_MKSTEMP manually as well, as this
will enhance security.  I didn't have time to figure out how to patch
aclocal.m4 and configure to add tests for all of these.

The patch fixes the following problems in external_parsers support in
3.1.5:
  - it got confused by "; charset=..." in the Content-Type header,
    as described in "http://www.htdig.org/mail/2000/09/index.html#75".
  - security problems with using popen(), and therefore the shell,
    to parse URL and content-type strings from untrusted sources
    (now uses pipe/fork/exec instead of popen) - PR#542, PR#951.
  - used predictable temporary file name, which could be exploited
    via symlinks - fixed if mkstemp() exists & HAVE_MKSTEMP is defined.
  - binary output from an external converter could get mangled.
  - error messages were sometimes ambiguous or missing altogether.
  - didn't open temporary file in binary mode for non-Unix systems
    (attempts were made to fix this, but it's not clear yet whether
     the security fixes and pipe/fork/exec will port well to Cygwin).

Here's the patch, which you can apply in the main source directory for
htdig-3.1.5 using "patch -p0 < this-file":

--- htdig/ExternalParser.cc.orig        Thu Feb 24 20:29:10 2000
+++ htdig/ExternalParser.cc     Mon Jan 15 17:16:50 2001
@@ -1,14 +1,24 @@
 //
 // ExternalParser.cc
 //
-// Implementation of ExternalParser
-// Allows external programs to parse unknown document formats.
-// The parser is expected to return the document in a specific format.
-// The format is documented in http://www.htdig.org/attrs.html#external_parser
+// ExternalParser: Implementation of ExternalParser
+//                 Allows external programs to parse unknown document formats.
+//                 The parser is expected to return the document in a 
+//                 specific format. The format is documented 
+//                 in http://www.htdig.org/attrs.html#external_parser
 //
-#if RELEASE
-static char RCSid[] = "$Id: ExternalParser.cc,v 1.9.2.3 1999/11/24 02:14:09 grdetil 
Exp $";
-#endif
+// Part of the ht://Dig package   <http://www.htdig.org/>
+// Copyright (c) 1995-2001 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Public License version 2 or later
+// <http://www.gnu.org/copyleft/gpl.html>
+//
+// $Id: ExternalParser.cc,v 1.9.2.4 2001/01/15 17:16:50 grdetil Exp $
+//
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
 
 #include "ExternalParser.h"
 #include "HTML.h"
@@ -19,9 +29,18 @@ static char RCSid[] = "$Id: ExternalPars
 #include "QuotedStringList.h"
 #include "URL.h"
 #include "Dictionary.h"
+#include "good_strtok.h"
+
 #include <ctype.h>
 #include <stdio.h>
-#include "good_strtok.h"
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#ifdef HAVE_WAIT_H
+#include <wait.h>
+#elif HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
 
 static Dictionary      *parsers = 0;
 static Dictionary      *toTypes = 0;
@@ -32,9 +51,18 @@ extern String                configFile;
 //
 ExternalParser::ExternalParser(char *contentType)
 {
+  String mime;
+  int sep;
+
     if (canParse(contentType))
     {
-       currentParser = ((String *)parsers->Find(contentType))->get();
+        String mime = contentType;
+       mime.lowercase();
+       sep = mime.indexOf(';');
+       if (sep != -1)
+         mime = mime.sub(0, sep).get();
+       
+       currentParser = ((String *)parsers->Find(mime))->get();
     }
     ExternalParser::contentType = contentType;
 }
@@ -89,6 +117,8 @@ ExternalParser::readLine(FILE *in, Strin
 int
 ExternalParser::canParse(char *contentType)
 {
+  int                  sep;
+
     if (!parsers)
     {
        parsers = new Dictionary();
@@ -97,7 +127,6 @@ ExternalParser::canParse(char *contentTy
        QuotedStringList        qsl(config["external_parsers"], " \t");
        String                  from, to;
        int                     i;
-       int                     sep;
 
        for (i = 0; qsl[i]; i += 2)
        {
@@ -109,11 +138,22 @@ ExternalParser::canParse(char *contentTy
                to = from.sub(sep+2).get();
                from = from.sub(0, sep).get();
            }
+           from.lowercase();
+           sep = from.indexOf(';');
+           if (sep != -1)
+             from = from.sub(0, sep).get();
+
            parsers->Add(from, new String(qsl[i + 1]));
            toTypes->Add(from, new String(to));
        }
     }
-    return parsers->Exists(contentType);
+
+    String mime = contentType;
+    mime.lowercase();
+    sep = mime.indexOf(';');
+    if (sep != -1)
+      mime = mime.sub(0, sep).get();
+    return parsers->Exists(mime);
 }
 
 //*****************************************************************************
@@ -132,52 +172,128 @@ ExternalParser::parse(Retriever &retriev
     // Write the contents to a temporary file.
     //
     String      path = getenv("TMPDIR");
+    int                fd;
     if (path.length() == 0)
       path = "/tmp";
-    path << "/htdext." << getpid();
+#ifndef HAVE_MKSTEMP
+    path << "/htdext." << getpid(); // This is unfortunately predictable
 
-    FILE       *fl = fopen(path, "w");
-    if (!fl)
+#ifdef O_BINARY
+    fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY);
+#else
+    fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL);
+#endif
+#else
+    path << "/htdex.XXXXXX";
+    fd = mkstemp((char*)path);
+    // can we force binary mode somehow under Cygwin, if it has mkstemp?
+#endif
+    if (fd < 0)
     {
-       return;
+      if (debug)
+       cout << "External parser error: Can't create temp file "
+            << (char *)path << endl;
+      return;
     }
     
-    fwrite(contents->get(), 1, contents->length(), fl);
-    fclose(fl);
-    
-    //
-    // Now start the external parser.
-    //
-    String     command = currentParser;
-    command << ' ' << path << ' ' << contentType << " \"" << base.get() <<
-       "\" " << configFile;
-
-    FILE       *input = popen(command, "r");
-    if (!input)
-    {
-       unlink(path);
-       return;
-    }
-
-    unsigned int minimum_word_length
-      = config.Value("minimum_word_length", 3);
+    write(fd, contents->get(), contents->length());
+    close(fd);
 
+    unsigned int minimum_word_length = config.Value("minimum_word_length", 3);
     String     line;
     char       *token1, *token2, *token3;
-    int                loc, hd;
+    int                loc = 0, hd = 0;
     URL                url;
-    String     convertToType = ((String *)toTypes->Find(contentType))->get();
+    String mime = contentType;
+    mime.lowercase();
+    int        sep = mime.indexOf(';');
+    if (sep != -1)
+      mime = mime.sub(0, sep).get();
+    String     convertToType = ((String *)toTypes->Find(mime))->get();
     int                get_hdr = (mystrcasecmp(convertToType, "user-defined") == 0);
     int                get_file = (convertToType.length() != 0);
     String     newcontent;
-    while (readLine(input, line))
+
+    StringList cpargs(currentParser);
+    char   **parsargs = new char * [cpargs.Count() + 5];
+    int    argi;
+    for (argi = 0; argi < cpargs.Count(); argi++)
+       parsargs[argi] = (char *)cpargs[argi];
+    parsargs[argi++] = path.get();
+    parsargs[argi++] = contentType.get();
+    parsargs[argi++] = (char *)base.get();
+    parsargs[argi++] = configFile.get();
+    parsargs[argi++] = 0;
+
+    int    stdout_pipe[2];
+    int           fork_result = -1;
+    int           fork_try;
+
+    if (pipe(stdout_pipe) == -1)
+    {
+      if (debug)
+       cout << "External parser error: Can't create pipe!" << endl;
+      unlink((char*)path);
+      delete [] parsargs;
+      return;
+    }
+
+    for (fork_try = 4; --fork_try >= 0;)
+    {
+      fork_result = fork(); // Fork so we can execute in the child process
+      if (fork_result != -1)
+       break;
+      if (fork_try)
+       sleep(3);
+    }
+    if (fork_result == -1)
+    {
+      if (debug)
+       cout << "Fork Failure in ExternalParser" << endl;
+      unlink((char*)path);
+      delete [] parsargs;
+      return;
+    }
+
+    if (fork_result == 0) // Child process
+    {
+       close(STDOUT_FILENO); // Close handle STDOUT to replace with pipe
+       dup(stdout_pipe[1]);
+       close(stdout_pipe[0]);
+       close(stdout_pipe[1]);
+       close(STDIN_FILENO); // Close STDIN to replace with file
+       open((char*)path, O_RDONLY);
+
+       // Call External Parser
+       execv(parsargs[0], parsargs);
+
+       exit(EXIT_FAILURE);
+    }
+
+    // Parent Process
+    delete [] parsargs;
+    close(stdout_pipe[1]); // Close STDOUT for writing
+#ifdef O_BINARY
+    FILE *input = fdopen(stdout_pipe[0], "rb");
+#else
+    FILE *input = fdopen(stdout_pipe[0], "r");
+#endif
+    if (input == NULL)
+    {
+      if (debug)
+       cout << "Fdopen Failure in ExternalParser" << endl;
+      unlink((char*)path);
+      return;
+    }
+
+    while ((!get_file || get_hdr) && readLine(input, line))
     {
        if (get_hdr)
        {
            line.chop('\r');
            if (line.length() == 0)
                get_hdr = FALSE;
-           else if (mystrncasecmp(line, "content-type:", 13) == 0)
+           else if (mystrncasecmp((char*)line, "content-type:", 13) == 0)
            {
                token1 = line.get() + 13;
                while (*token1 && isspace(*token1))
@@ -187,24 +303,9 @@ ExternalParser::parse(Retriever &retriev
            }
            continue;
        }
-       if (get_file)
-       {
-           if (newcontent.length() == 0 &&
-               !canParse(convertToType) &&
-               mystrncasecmp(convertToType, "text/", 5) != 0 &&
-               mystrncasecmp(convertToType, "application/pdf", 15) != 0)
-           {
-               if (mystrcasecmp(convertToType, "user-defined") == 0)
-                   cerr << "External parser error: no Content-Type given\n";
-               else
-                   cerr << "External parser error: can't parse Content-Type \""
-                        << convertToType << "\"\n";
-               cerr << " URL: " << base.get() << "\n";
-               break;
-           }
-           newcontent << line << '\n';
-           continue;
-       }
+#ifdef O_BINARY
+       line.chop('\r');
+#endif
        token1 = strtok(line, "\t");
        if (token1 == NULL)
            token1 = "";
@@ -223,7 +324,7 @@ ExternalParser::parse(Retriever &retriev
                        (hd = atoi(token3)) >= 0 && hd < 12)
                  retriever.got_word(token1, loc, hd);
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected word in line "<<line<<"\n" 
+<< " URL: " << base.get() << "\n";
                break;
                
            case 'u':   // href
@@ -236,7 +337,7 @@ ExternalParser::parse(Retriever &retriev
                  retriever.got_href(url, token2);
                }
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected URL in line "<<line<<"\n" << 
+" URL: " << base.get() << "\n";
                break;
                
            case 't':   // title
@@ -244,7 +345,7 @@ ExternalParser::parse(Retriever &retriev
                if (token1 != NULL)
                  retriever.got_title(token1);
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected title in line "<<line<<"\n" 
+<< " URL: " << base.get() << "\n";
                break;
                
            case 'h':   // head
@@ -252,7 +353,7 @@ ExternalParser::parse(Retriever &retriev
                if (token1 != NULL)
                  retriever.got_head(token1);
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected text in line "<<line<<"\n" 
+<< " URL: " << base.get() << "\n";
                break;
                
            case 'a':   // anchor
@@ -260,7 +361,7 @@ ExternalParser::parse(Retriever &retriev
                if (token1 != NULL)
                  retriever.got_anchor(token1);
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected anchor in line "<<line<<"\n" 
+<< " URL: " << base.get() << "\n";
                break;
                
            case 'i':   // image url
@@ -268,7 +369,7 @@ ExternalParser::parse(Retriever &retriev
                if (token1 != NULL)
                  retriever.got_image(token1);
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected image URL in line 
+"<<line<<"\n" << " URL: " << base.get() << "\n";
                break;
 
            case 'm':   // meta
@@ -310,7 +411,7 @@ ExternalParser::parse(Retriever &retriev
                    if (mystrcasecmp(httpEquiv, "refresh") == 0
                        && *content != '\0')
                    {
-                     char *q = mystrcasestr(content, "url=");
+                     char *q = (char*)mystrcasestr(content, "url=");
                      if (q && *q)
                      {
                        q += 4; // skiping "URL="
@@ -365,7 +466,7 @@ ExternalParser::parse(Retriever &retriev
                        meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
                      if (debug > 1)
                        cout << "META Description: " << content << endl;
-                     retriever.got_meta_dsc(meta_dsc);
+                     retriever.got_meta_dsc((char*)meta_dsc);
 
                      //
                      // Now add the words to the word list
@@ -382,17 +483,42 @@ ExternalParser::parse(Retriever &retriev
                  }
                }
                else
-                 cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: expected metadata in line 
+"<<line<<"\n" << " URL: " << base.get() << "\n";
                break;
              }
 
            default:
-               cerr<< "External parser error in line:"<<line<<"\n" << " URL: " << 
base.get() << "\n";
+                 cerr<< "External parser error: unknown field in line "<<line<<"\n" 
+<< " URL: " << base.get() << "\n";
                break;
        }
+    } // while(readLine)
+    if (get_file)
+    {
+       if (!canParse(convertToType) &&
+           mystrncasecmp((char*)convertToType, "text/", 5) != 0 &&
+           mystrncasecmp((char*)convertToType, "application/pdf", 15) != 0)
+       {
+           if (mystrcasecmp((char*)convertToType, "user-defined") == 0)
+               cerr << "External parser error: no Content-Type given\n";
+           else
+               cerr << "External parser error: can't parse Content-Type \""
+                    << convertToType << "\"\n";
+           cerr << " URL: " << base.get() << "\n";
+       }
+       else
+       {
+           char        buffer[2048];
+           int         length;
+           while ((length = fread(buffer, 1, sizeof(buffer), input)) > 0)
+               newcontent.append(buffer, length);
+       }
     }
-    pclose(input);
-    unlink(path);
+    fclose(input);
+    // close(stdout_pipe[0]); // This is closed for us by the fclose()
+    int rpid, status;
+    while ((rpid = wait(&status)) != fork_result && rpid != -1)
+       ;
+    unlink((char*)path);
 
     if (newcontent.length() > 0)
     {
@@ -407,19 +533,19 @@ ExternalParser::parse(Retriever &retriev
            currentParser = ((String *)parsers->Find(contentType))->get();
            parsable = this;
        }
-       else if (mystrncasecmp(contentType, "text/html", 9) == 0)
+       else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0)
        {
            if (!html)
                html = new HTML();
            parsable = html;
        }
-       else if (mystrncasecmp(contentType, "text/plain", 10) == 0)
+       else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0)
        {
            if (!plaintext)
                plaintext = new Plaintext();
            parsable = plaintext;
        }
-       else if (mystrncasecmp(contentType, "application/pdf", 15) == 0)
+       else if (mystrncasecmp((char*)contentType, "application/pdf", 15) == 0)
        {
            if (!pdf)
                pdf = new PDF();
@@ -432,7 +558,7 @@ ExternalParser::parse(Retriever &retriev
            parsable = plaintext;
            if (debug)
                cout << "External parser error: \"" << contentType <<
-                       "\" not a recognized type.  Assuming text\n";
+                       "\" not a recognized type.  Assuming text/plain\n";
        }
        parsable->setContents(newcontent.get(), newcontent.length());
        parsable->parse(retriever, base);

-- 
Gilles R. Detillieux              E-mail: <[EMAIL PROTECTED]>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig mailing list, send a message to
[EMAIL PROTECTED]
You will receive a message to confirm this.
List archives:  <http://www.htdig.org/mail/menu.html>
FAQ:            <http://www.htdig.org/FAQ.html>

Reply via email to