I see that there is a class for parsing PDFs in Nutch using PDFBox, <http://lucene.apache.org/nutch/apidocs/org/apache/nutch/parse/pdf/package-summary.html> org.apache.nutch.parse.pdf (Nutch 0.7.1 API), but I don't see it in the 0.7.1 source I downloaded. I do see it in CVS here: http://cvs.sourceforge.net/viewcvs.py/nutch/nutch/src/plugin/parse-pdf/src/java/net/nutch/parse/pdf/ but my Nutch doesn't seem to run the PDF parse class: my log file shows it fetching PDFs, but saying Nutch is unable to parse content type application/pdf. Why is this? Was it left out because of performance? IMO, the class used by Nutch (shown below) won't cut it for most PDFs though, as the PDF structure is usually too complicated. Please see some of the resources I cited in my last posts, including http://www.tamirhassan.dsl.pipex.com/final.pdf and http://www.chilisoftware.net/Private/Christian/ideas_for_extracting_data_from_unstructured_documents.pdf, as they bring up some good algorithms for parsing PDF. 90% of PDFs are unstructured: they don't contain any XML content that describes how the pages flow. The content could be in any order, and that might make searching for literals return inaccurate results. The other 10% of PDFs use tagging, and Nutch could use this to parse through the tagged ones quite easily using PDFBox. We need to have Nutch/Lucene parsing PDFs; it is one of the features of Google that users value, and there is simply too much PDF content to ignore. In addition, it would be nice to have Nutch be able to show the PDF as HTML like Google does. I think Tamir's paper is a good read on this because he does a good analysis of Google's functionality here, and his original objective was to format the PDF as HTML, which requires correctly parsing the PDF. Some more references: http://snowtide.com/home/PDFTextStream/techtips/easy_lucene_integration http://www.jguru.com/faq/view.jsp?EID=862443 /* Copyright (c) 2004 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.parse.pdf; import org.pdfbox.encryption.DocumentEncryption; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; import org.pdfbox.util.PDFTextStripper; import org.pdfbox.exceptions.CryptographyException; import org.pdfbox.exceptions.InvalidPasswordException; import net.nutch.protocol.Content; import net.nutch.util.LogFormatter; import net.nutch.parse.Parser; import net.nutch.parse.Parse; import net.nutch.parse.ParseData; import net.nutch.parse.ParseImpl; import net.nutch.parse.Outlink; import net.nutch.parse.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Properties; import java.util.logging.Logger; import java.io.ByteArrayInputStream; import java.io.IOException; /********************************************* * parser for mime type application/pdf. * It is based on org.pdfbox.*. We have to see how well it does the job. * * @author John Xing * * Note on 20040614 by Xing: * Some codes are stacked here for convenience (see inline comments). * They may be moved to more appropriate places when new codebase * stabilizes, especially after code for indexing is written. * *********************************************/ public class PdfParser implements Parser { public static final Logger LOG = LogFormatter.getLogger("net.nutch.parse.pdf"); public PdfParser () { // redirect org.apache.log4j.Logger to java's native logger, in order // to, at least, suppress annoying log4j warnings. // Note on 20040614 by Xing: // log4j is used by pdfbox. This snippet'd better be moved // to a common place shared by all parsers that use log4j. 
org.apache.log4j.Logger rootLogger = org.apache.log4j.Logger.getRootLogger(); rootLogger.setLevel(org.apache.log4j.Level.INFO); org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender( new org.apache.log4j.SimpleLayout(), net.nutch.util.LogFormatter.getLogStream( this.LOG, java.util.logging.Level.INFO)); rootLogger.addAppender(appender); } public Parse getParse(Content content) throws ParseException { // check that contentType is one we can handle String contentType = content.getContentType(); if (contentType != null && !contentType.startsWith("application/pdf")) throw new ParseException( "Content-Type not application/pdf: "+contentType); // in memory representation of pdf file PDDocument pdf = null; String text = null; String title = null; try { byte[] raw = content.getContent(); String contentLength = content.get("Content-Length"); if (contentLength != null && raw.length != Integer.parseInt(contentLength)) { throw new ParseException("Content truncated at "+raw.length +" bytes. Parser can't handle incomplete pdf file."); } PDFParser parser = new PDFParser( new ByteArrayInputStream(raw)); parser.parse(); pdf = parser.getPDDocument(); if (pdf.isEncrypted()) { DocumentEncryption decryptor = new DocumentEncryption(pdf); //Just try using the default password and move on decryptor.decryptDocument(""); } // collect text PDFTextStripper stripper = new PDFTextStripper(); text = stripper.getText(pdf); // collect title PDDocumentInformation info = pdf.getDocumentInformation(); title = info.getTitle(); // more useful info, currently not used. please keep them for future use. // pdf.getPageCount(); // info.getAuthor() // info.getSubject() // info.getKeywords() // info.getCreator() // info.getProducer() // info.getTrapped() // formatDate(info.getCreationDate()) // formatDate(info.getModificationDate()) } catch (ParseException e) { throw e; } catch (CryptographyException e) { throw new ParseException("Error decrypting document. 
"+e); } catch (InvalidPasswordException e) { throw new ParseException("Can't decrypt document. "+e); } catch (Exception e) { // run time exception throw new ParseException("Can't be handled as pdf document. "+e); } finally { try { if (pdf != null) pdf.close(); } catch (IOException e) { // nothing to do } } if (text == null) text = ""; if (title == null) title = ""; // collect outlink Outlink[] outlinks = new Outlink[0]; // collect meta data Properties metadata = new Properties(); metadata.putAll(content.getMetadata()); // copy through ParseData parseData = new ParseData(title, outlinks, metadata); return new ParseImpl(text, parseData); // any filter? //return HtmlParseFilters.filter(content, parse, root); } // format date // currently not used. please keep it for future use. private String formatDate(Calendar date) { String retval = null; if(date != null) { SimpleDateFormat formatter = new SimpleDateFormat(); retval = formatter.format(date.getTime()); } return retval; } } Richard Braman mailto:[EMAIL PROTECTED] 561.748.4002 (voice) http://www.taxcodesoftware.org <http://www.taxcodesoftware.org/> Free Open Source Tax Software