[htdig-dev] Re: pdf2html fork failure

Lachlan Andrew Fri, 09 Apr 2004 21:06:24 -0700

Greetings Michele,

The problem has nothing to do with  pdf2html  itself -- it occurs while 
htdig  is preparing to start it.

The only reasons I can think of are running out of an OS resource 
(process table, number of open files, ...) or running out of memory.  
Are many other jobs running while you are digging?  If you haven't 
rebooted for a long time, there may be some "zombies" taking 
process-table slots.

If you copy the attached  ExternalParser.cc  into the  htdig/ 
subdirectory before compiling, it should give some more information 
on why the fork is failing.

Good luck,
Lachlan

On Thu, 8 Apr 2004 01:05, Michele Keenan wrote:
> I have 3.2.0b5 installed and receive the following line when
> parsing pdf files with pdf2html
>         Fork Failure in ExternalParser
>
> When I run the pdf2html.pl script on an individual pdf file it
> works, so I am confused what could be causing the problem.
>
> Any clues?

-- 
[EMAIL PROTECTED]
ht://Dig developer DownUnder  (http://www.htdig.org)

//
// ExternalParser.cc
//
// ExternalParser: Implementation of ExternalParser
//                 Allows external programs to parse unknown document formats.
//                 The parser is expected to return the document in a 
//                 specific format. The format is documented 
//                 in http://www.htdig.org/attrs.html#external_parser
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2003 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: ExternalParser.cc,v 1.28 2004/01/12 12:48:24 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include <errno.h>

#include "ExternalParser.h"
#include "HTML.h"
#include "Plaintext.h"
#include "htdig.h"
#include "htString.h"
#include "QuotedStringList.h"
#include "URL.h"
#include "Dictionary.h"
#include "good_strtok.h"

#include <ctype.h>
#include <stdio.h>

#ifndef _MSC_VER /* _WIN32 */
#include <unistd.h>
#endif

#include <stdlib.h>
#ifdef HAVE_WAIT_H
#include <wait.h>
#elif HAVE_SYS_WAIT_H
#include <sys/wait.h>
#endif

#ifdef _MSC_VER /* _WIN32 */
#include <process.h>
#endif


#include "defaults.h"

// Lookup tables shared by every ExternalParser instance.  Both start out null
// and are allocated and filled from the "external_parsers" configuration
// attribute the first time canParse() runs.  Keys are lower-cased MIME types
// with any ";parameter" suffix removed.
//   parsers: MIME type -> parser command string (heap-allocated String)
//   toTypes: MIME type -> target type given after "->" in the configuration
//            entry, or an empty String when the entry has no "->"
static Dictionary	*parsers = 0;
static Dictionary	*toTypes = 0;
// Defined elsewhere; parse() passes it to the external program as an argument.
extern String		configFile;

//*****************************************************************************
// ExternalParser::ExternalParser(char *contentType)
//   Remembers the content type and, when an external parser is configured
//   for it, the command line of that parser.
//
//   contentType may carry parameters (e.g. "; charset=...") and mixed case;
//   the parser table is keyed on the lower-cased type with everything from
//   the first ';' removed, so the same normalization is applied before the
//   lookup.  The stored contentType member keeps the caller's original text.
//
ExternalParser::ExternalParser(char *contentType)
{
    // canParse() also builds the parser table on first use, so it has to be
    // called before the table is consulted.
    if (canParse(contentType))
    {
        String mime = contentType;
        mime.lowercase();
        int sep = mime.indexOf(';');
        if (sep != -1)
            mime = mime.sub(0, sep).get();

        // canParse() reported this exact key as present in the table.
        currentParser = ((String *)parsers->Find(mime))->get();
    }
    ExternalParser::contentType = contentType;
}


//*****************************************************************************
// ExternalParser::~ExternalParser()
//   Intentionally empty.  The file-level parsers/toTypes tables are shared by
//   all instances and are not released here.
//
ExternalParser::~ExternalParser()
{
}


//*****************************************************************************
// int ExternalParser::readLine(FILE *in, String &line)
//   Reads one line of arbitrary length from `in` into `line`, without the
//   terminating '\n'.
//
//   Returns 1 when a complete line was read, or when the stream ended after
//   a non-empty unterminated final line; returns 0 when nothing was read.
//
int
ExternalParser::readLine(FILE *in, String &line)
{
    char	buffer[2048];
    int		length;

    // Start from an empty line (presumably String treats assignment from a
    // null pointer as "clear" -- confirm against htString.h).
    line = 0;
    while (fgets(buffer, sizeof(buffer), in))
    {
	length = strlen(buffer);

	// Whether this chunk is a whole line or only part of one, it belongs
	// to the result.
	line << buffer;

	// A chunk that begins with a NUL byte has length 0; without the
	// length test the check below would read buffer[-1].
	if (length > 0 && buffer[length - 1] == '\n')
	{
	    //
	    // A full line has been read.  Return it.
	    //
	    line.chop('\n');
	    return 1;
	}

	//
	// Only a partial line was read; go round again for the rest.
	//
    }
    return line.length() > 0;
}


//*****************************************************************************
// int ExternalParser::canParse(char *contentType)
//
int
ExternalParser::canParse(char *contentType)
{
  HtConfiguration* config= HtConfiguration::config();
  int			sep;

    if (!parsers)
    {
	parsers = new Dictionary();
	toTypes = new Dictionary();
	
	QuotedStringList	qsl(config->Find("external_parsers"), " \t");
	String			from, to;
	int			i;

	for (i = 0; qsl[i]; i += 2)
	{
	    from = qsl[i];
	    to = "";
	    sep = from.indexOf("->");
	    if (sep != -1)
	    {
		to = from.sub(sep+2).get();
		from = from.sub(0, sep).get();
	    }
	    from.lowercase();
	    sep = from.indexOf(';');
	    if (sep != -1)
	      from = from.sub(0, sep).get();

	    parsers->Add(from, new String(qsl[i + 1]));
	    toTypes->Add(from, new String(to));
	}
    }

    String mime = contentType;
    mime.lowercase();
    sep = mime.indexOf(';');
    if (sep != -1)
      mime = mime.sub(0, sep).get();
    return parsers->Exists(mime);
}

//*****************************************************************************
// void ExternalParser::parse(Retriever &retriever, URL &base)
//   Runs the configured external program on the current document contents
//   and feeds what it prints back into `retriever`.
//
//   Steps:
//     1. The contents are written to a temporary file under $TMPDIR
//        (or /tmp).
//     2. The program is forked/exec'ed with the arguments
//          <parser args...> <temp file> <content type> <base URL> <config file>
//        with its stdin reading the temp file and its stdout on a pipe.
//     3. If the parser entry has no "->" target type, each output line is a
//        tab-separated record whose first field selects the kind:
//          w word, u href, t title, h head, a anchor, i image URL, m meta.
//        If the target type is "user-defined", header lines are read up to
//        the first empty line and a "Content-Type:" header names the type.
//        If a target type is known, the remaining output (up to
//        max_doc_size bytes) is collected and handed to another external
//        parser, the HTML parser or the plain-text parser.
//     4. The child is reaped and the temporary file removed.
//
//   Every failure path releases whatever was acquired before it (temp file,
//   argument vector, pipe descriptors, child process).
//   Nothing is done when built with _MSC_VER defined.
//
void
ExternalParser::parse(Retriever &retriever, URL &base)
{
// NEAL - ENABLE/REWRITE THIS ASAP FOR WIN32
#ifndef _MSC_VER /* _WIN32 */
    HtConfiguration* config= HtConfiguration::config();
    if (contents == 0 || contents->length() == 0 ||
        currentParser.length() == 0)
    {
        return;
    }

    //
    // Write the contents to a temporary file.
    //
    String      path = getenv("TMPDIR");
    int         fd;
    if (path.length() == 0)
      path = "/tmp";
#ifndef HAVE_MKSTEMP
    path << "/htdext." << getpid(); // This is unfortunately predictable

    // open() needs an explicit mode whenever O_CREAT is given; without it
    // the permission bits of the new file are indeterminate.  0600 keeps the
    // document readable by the digging user only.
#ifdef O_BINARY
    fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL|O_BINARY, 0600);
#else
    fd = open((char*)path, O_WRONLY|O_CREAT|O_EXCL, 0600);
#endif
#else
    path << "/htdex.XXXXXX";
    fd = mkstemp((char*)path);
    // can we force binary mode somehow under Cygwin, if it has mkstemp?
#endif
    if (fd < 0)
    {
      if (debug)
        cout << "External parser error: Can't create temp file "
             << (char *)path << endl;
      return;
    }

    // Write everything, retrying after short writes and interrupted calls,
    // so the parser is never handed a silently truncated document.
    const char  *pending = contents->get();
    int         remaining = contents->length();
    while (remaining > 0)
    {
        ssize_t written = write(fd, pending, remaining);
        if (written > 0)
        {
            pending += written;
            remaining -= written;
        }
        else if (written == 0 || errno != EINTR)
            break;
    }
    close(fd);
    if (remaining > 0)
    {
      if (debug)
        cout << "External parser error: Can't write temp file "
             << (char *)path << endl;
      unlink((char*)path);
      return;
    }

    String      line;
    char        *token1, *token2, *token3;
    // Writable stand-in for "no token": a string literal must not be
    // assigned to a plain char *.
    static char no_token[] = "";
    int         loc = 0, hd = 0;
    URL         url;
    String mime = contentType;
    mime.lowercase();
    int sep = mime.indexOf(';');
    if (sep != -1)
      mime = mime.sub(0, sep).get();
    String      convertToType = ((String *)toTypes->Find(mime))->get();
    int         get_hdr = (convertToType.nocase_compare("user-defined") == 0);
    int         get_file = (convertToType.length() != 0);
    String      newcontent;

    // Argument vector: the parser's own words followed by four fixed
    // arguments and the terminating null pointer (hence "+ 5").
    StringList  cpargs(currentParser);
    char   **parsargs = new char * [cpargs.Count() + 5];
    int    argi;
    for (argi = 0; argi < cpargs.Count(); argi++)
        parsargs[argi] = (char *)cpargs[argi];
    parsargs[argi++] = path.get();
    parsargs[argi++] = contentType.get();
    parsargs[argi++] = (char *)base.get().get();
    parsargs[argi++] = configFile.get();
    parsargs[argi++] = 0;

    int    stdout_pipe[2];
    int    fork_result = -1;
    int    fork_try;
    int    status;

    if (pipe(stdout_pipe) == -1)
    {
      if (debug)
        cout << "External parser error: Can't create pipe!" << endl;
      unlink((char*)path);
      delete [] parsargs;
      return;
    }

    // fork() can fail transiently (process table or memory exhausted), so
    // try up to four times, pausing three seconds between attempts.
    for (fork_try = 4; --fork_try >= 0;)
    {
      fork_result = fork(); // Fork so we can execute in the child process
      if (fork_result != -1)
        break;
      if (fork_try)
        sleep(3);
    }
    if (fork_result == -1)
    {
      if (debug)
        perror ("Fork Failure in ExternalParser");
      // Give both pipe ends back; leaking two descriptors per failed
      // document would only bring the next failure closer.
      close(stdout_pipe[0]);
      close(stdout_pipe[1]);
      unlink((char*)path);
      delete [] parsargs;
      return;
    }

    if (fork_result == 0) // Child process
    {
        // Make the write end of the pipe the child's stdout.  dup2() names
        // the target descriptor, so this also works when descriptor 0 was
        // already closed in the parent (plain dup() would then land on 0).
        if (stdout_pipe[1] != STDOUT_FILENO)
        {
            if (dup2(stdout_pipe[1], STDOUT_FILENO) == -1)
                _exit(EXIT_FAILURE);
            close(stdout_pipe[1]);
        }
        // Do not close the read end if dup2() has just replaced it.
        if (stdout_pipe[0] != STDOUT_FILENO)
            close(stdout_pipe[0]);

        // Descriptor 0 is now the lowest free one, so open() returns it and
        // the temp file becomes stdin.  As before, a failure here leaves
        // stdin closed; the parser also receives the path as an argument.
        close(STDIN_FILENO); // Close STDIN to replace with file
        open((char*)path, O_RDONLY);

        // Call External Parser
        execv(parsargs[0], parsargs);

        // Only reached when execv() failed.  _exit() skips the stdio flush
        // and exit handlers inherited from the parent, which must run in
        // the parent only.
        _exit(EXIT_FAILURE);
    }

    // Parent Process
    delete [] parsargs;
    close(stdout_pipe[1]); // Close STDOUT for writing
#ifdef O_BINARY
    FILE *input = fdopen(stdout_pipe[0], "rb");
#else
    FILE *input = fdopen(stdout_pipe[0], "r");
#endif
    if (input == NULL)
    {
      if (debug)
        cout << "Fdopen Failure in ExternalParser" << endl;
      // No stream took ownership of the descriptor, so close it here.  With
      // no reader left the child's writes fail, and it can be reaped instead
      // of lingering as a zombie.
      close(stdout_pipe[0]);
      while (waitpid(fork_result, &status, 0) == -1 && errno == EINTR)
        ;
      unlink((char*)path);
      return;
    }

    // Line-oriented part of the output: always for record output
    // (!get_file), and for the header section of "user-defined" output.
    while ((!get_file || get_hdr) && readLine(input, line))
    {
        if (get_hdr)
        {
            line.chop('\r');
            if (line.length() == 0)
                get_hdr = false;        // an empty line ends the headers
            else if (mystrncasecmp((char*)line, "content-type:", 13) == 0)
            {
                token1 = line.get() + 13;
                // isspace() is only defined for unsigned char values and EOF.
                while (*token1 && isspace((unsigned char)*token1))
                    token1++;
                token1 = strtok(token1, "\n\t");
                // strtok() returns NULL when the header has no value.
                convertToType = token1 ? token1 : "";
            }
            continue;
        }
#ifdef O_BINARY
        line.chop('\r');
#endif
        token1 = strtok(line, "\t");
        if (token1 == NULL)
            token1 = no_token;
        token2 = NULL;
        token3 = NULL;
        switch (*token1)
        {
            case 'w':   // word
                token1 = strtok(0, "\t");
                if (token1 != NULL)
                  token2 = strtok(0, "\t");
                if (token2 != NULL)
                  token3 = strtok(0, "\t");
                if (token1 != NULL && token2 != NULL && token3 != NULL &&
                        (loc = atoi(token2)) >= 0 &&
                        (hd = atoi(token3)) >= 0 && hd < 12)
                  retriever.got_word(token1, loc, hd);
                else
                  cerr<< "External parser error: expected word in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;

            case 'u':   // href
                token1 = strtok(0, "\t");
                if (token1 != NULL)
                  token2 = strtok(0, "\t");
                if (token1 != NULL && token2 != NULL)
                {
                  url.parse(token1);
                  url.hopcount(base.hopcount() + 1);
                  retriever.got_href(url, token2);
                }
                else
                  cerr<< "External parser error: expected URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;

            case 't':   // title
                token1 = strtok(0, "\t");
                if (token1 != NULL)
                  retriever.got_title(token1);
                else
                  cerr<< "External parser error: expected title in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;

            case 'h':   // head
                token1 = strtok(0, "\t");
                if (token1 != NULL)
                  retriever.got_head(token1);
                else
                  cerr<< "External parser error: expected text in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;

            case 'a':   // anchor
                token1 = strtok(0, "\t");
                if (token1 != NULL)
                  retriever.got_anchor(token1);
                else
                  cerr<< "External parser error: expected anchor in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;

            case 'i':   // image url
                token1 = strtok(0, "\t");
                if (token1 != NULL)
                  retriever.got_image(token1);
                else
                  cerr<< "External parser error: expected image URL in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;

            case 'm':   // meta
              {
                char *httpEquiv = NULL;
                char *name = NULL;
                char *content = NULL;

                // The fields start two characters after the tag: past the
                // 'm' and past the tab that strtok() turned into a NUL.
                // Only step there when the tag is exactly one character and
                // the position is still inside the line buffer; a bare "m"
                // line would otherwise be read past its terminator.
                if (token1[1] == '\0' &&
                    token1 + 2 <= line.get() + line.length())
                {
                  // Using good_strtok means we can accept empty
                  // fields.
                  httpEquiv = good_strtok(token1+2, '\t');
                  name = good_strtok(0, '\t');
                  content = good_strtok(0, '\t');
                }

                if (httpEquiv != NULL && name != NULL && content != NULL)
                {
                  // It would be preferable if we could share
                  // this part with HTML.cc, but it has other
                  // chores too, and I do not see a point where to
                  // split it up to get a common shared function
                  // (or class).  This should not stop anybody from
                  // finding a better solution.
                  // For now, there is duplicated code.
                  static StringMatch *keywordsMatch = 0;
                  if (!keywordsMatch)
                  {
                        StringList kn(config->Find("keywords_meta_tag_names"), " \t");
                        keywordsMatch = new StringMatch();
                        keywordsMatch->IgnoreCase();
                        keywordsMatch->Pattern(kn.Join('|'));
                  }
                  static StringMatch *descriptionMatch = 0;
                  if (!descriptionMatch)
                  {
                        StringList dn(config->Find("description_meta_tag_names"), " \t");
                        descriptionMatch = new StringMatch();
                        descriptionMatch->IgnoreCase();
                        descriptionMatch->Pattern(dn.Join('|'));
                  }
                  static StringMatch *metadatetags = 0;
                  if (!metadatetags)
                  {
                        metadatetags = new StringMatch();
                        metadatetags->IgnoreCase();
                        // "dc.data.modified" looks like a misspelling of
                        // "dc.date.modified"; the correct name is added and
                        // the old spelling kept so nothing that matched
                        // before stops matching.
                        metadatetags->Pattern("date|dc.date|dc.date.created|dc.date.modified|dc.data.modified");
                  }

                  // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5>
                  // says that the "name" attribute defaults to
                  // the http-equiv attribute if empty.
                  if (*name == '\0')
                    name = httpEquiv;

                  if (*httpEquiv != '\0')
                  {
                    // <META HTTP-EQUIV=REFRESH case
                    if (mystrcasecmp(httpEquiv, "refresh") == 0
                        && *content != '\0')
                    {
                      char *q = (char*)mystrcasestr(content, "url");
                      if (q && *q)
                      {
                        q += 3; // skipping "URL"
                        while (*q && ((*q == '=') ||
                                      isspace((unsigned char)*q))) q++;
                        char *qq = q;
                        while (*qq && (*qq != ';') && (*qq != '"') &&
                               !isspace((unsigned char)*qq))qq++;
                        *qq = 0;
                        URL href(q, base);
                        // I don't know why anyone would do this, but hey...
                        retriever.got_href(href, "");
                      }
                    }
                  }

                  //
                  // Now check for <meta name=...  content=...> tags that
                  // fly with any reasonable DTD out there
                  //
                  if (*name != '\0' && *content != '\0')
                  {
                    // Keywords are handled independently of the chain
                    // below: a name may match here and there as well.
                    if (keywordsMatch->CompareWord(name))
                    {
                        int wordindex = 1;
                        addKeywordString (retriever, content, wordindex);
                    }
                    if (metadatetags->CompareWord(name) &&
                                        config->Boolean("use_doc_date", 0))
                    {
                      retriever.got_time(content);
                    }
                    else if (mystrcasecmp(name, "author") == 0)
                    {
                        int wordindex = 1;
                        retriever.got_author(content);
                        addString (retriever, content, wordindex, 11);
                    }
                    else if (mystrcasecmp(name, "htdig-email") == 0)
                    {
                      retriever.got_meta_email(content);
                    }
                    else if (mystrcasecmp(name, "htdig-notification-date") == 0)
                    {
                      retriever.got_meta_notification(content);
                    }
                    else if (mystrcasecmp(name, "htdig-email-subject") == 0)
                    {
                      retriever.got_meta_subject(content);
                    }
                    else if (descriptionMatch->CompareWord(name)
                             && strlen(content) != 0)
                    {
                      //
                      // We need to do two things. First grab the description
                      //
                      String meta_dsc = content;

                      if (meta_dsc.length() > max_meta_description_length)
                        meta_dsc = meta_dsc.sub(0, max_meta_description_length).get();
                      if (debug > 1)
                        cout << "META Description: " << content << endl;
                      retriever.got_meta_dsc((char*)meta_dsc);

                      //
                      // Now add the words to the word list
                      // (slot 10 is the new slot for this)
                      //
                      int wordindex = 1;
                      addString (retriever, content, wordindex, 10);
                    }
                  }
                }
                else
                  cerr<< "External parser error: expected metadata in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;
              }

            default:
                  cerr<< "External parser error: unknown field in line "<<line<<"\n" << " URL: " << base.get() << "\n";
                break;
        }
    } // while(readLine)
    if (get_file)
    {
        if (!canParse(convertToType) &&
            mystrncasecmp((char*)convertToType, "text/", 5) != 0)
        {
            if (mystrcasecmp((char*)convertToType, "user-defined") == 0)
                cerr << "External parser error: no Content-Type given\n";
            else
                cerr << "External parser error: can't parse Content-Type \""
                     << convertToType << "\"\n";
            cerr << " URL: " << base.get() << "\n";
        }
        else
        {
            // Collect the converted document, stopping at max_doc_size
            // bytes; the last chunk is trimmed to fit exactly.
            char        buffer[2048];
            int         length;
            int         nbytes = config->Value("max_doc_size");
            while (nbytes > 0 &&
                        (length = fread(buffer, 1, sizeof(buffer), input)) > 0)
            {
                nbytes -= length;
                if (nbytes < 0)
                    length += nbytes;
                newcontent.append(buffer, length);
            }
        }
    }
    fclose(input);
    // close(stdout_pipe[0]); // This is closed for us by the fclose()

    // Reap exactly our own child, and keep waiting if a signal interrupts
    // the call; otherwise the child would be left behind as a zombie.
    while (waitpid(fork_result, &status, 0) == -1 && errno == EINTR)
        ;
    unlink((char*)path);

    if (newcontent.length() > 0)
    {
        static HTML                     *html = 0;
        static Plaintext                *plaintext = 0;
        Parsable                        *parsable = 0;

        contentType = convertToType;
        if (canParse(contentType))
        {
            // canParse() matches on the lower-cased type with any
            // ";parameter" suffix removed.  The table lookup must use the
            // same key, or Find() can return null for a type that
            // canParse() has just accepted.
            String      key = contentType;
            key.lowercase();
            int         cut = key.indexOf(';');
            if (cut != -1)
                key = key.sub(0, cut).get();

            currentParser = ((String *)parsers->Find(key))->get();
            parsable = this;
        }
        else if (mystrncasecmp((char*)contentType, "text/html", 9) == 0)
        {
            if (!html)
                html = new HTML();
            parsable = html;
        }
        else if (mystrncasecmp((char*)contentType, "text/plain", 10) == 0)
        {
            if (!plaintext)
                plaintext = new Plaintext();
            parsable = plaintext;
        }
        else
        {
            if (!plaintext)
                plaintext = new Plaintext();
            parsable = plaintext;
            if (debug)
                cout << "External parser error: \"" << contentType <<
                        "\" not a recognized type.  Assuming text/plain\n";
        }
        parsable->setContents(newcontent.get(), newcontent.length());
        parsable->parse(retriever, base);
    }
#endif //ifndef _MSC_VER /* _WIN32 */
}

[htdig-dev] Re: pdf2html fork failure

Reply via email to