[
https://issues.apache.org/jira/browse/NUTCH-1206?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13158397#comment-13158397
]
Julien Nioche commented on NUTCH-1206:
--------------------------------------
With trunk and httpclient enabled
<property>
<name>plugin.includes</name>
<value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
</property>
./nutch parsechecker -dumpText
https://issues.apache.org/jira/secure/attachment/12505323/direct.pdf
{noformat}
Url
---------------
https://issues.apache.org/jira/secure/attachment/12505323/direct.pdf
---------
ParseData
---------
Version: 5
Status: success(1,0)
Title: E-age Banking Channels
Outlinks: 0
Content Metadata: Date=Mon, 28 Nov 2011 12:52:52 GMT Content-Length=26013
Content-Disposition=inline; filename*=utf-8''direct.pdf;
Set-Cookie=atlassian.xsrf.token=A5KQ-2QAV-T4JA-FDED|ef6e4a4b644ef557205c91bc9b1e392746848171|lout;
Path=/jira Connection=close Content-Type=application/pdf;charset=utf-8
X-AUSERNAME=anonymous X-AREQUESTID=772x3514779x2 Server=Apache-Coyote/1.1
Parse Metadata: xmpTPg:NPages=2 Creation-Date=2002-06-09T11:44:37Z created=Sun
Jun 09 12:44:37 BST 2002 subject=E-age Banking Channels Author=Shivram
producer=Acrobat Distiller 3.0 for Windows Last-Modified=2002-02-17T10:57:57Z
Content-Type=application/pdf Keywords= creator=Adobe PageMaker 6.5
---------
ParseText
---------
E-age Banking Channels Application form for e-Age Banking Channels
(Individuals/Sole Proprietorship)We understand your world SOURCE CODE :
NetBanking / WAP Yes, I wish to apply for NetBanking/WAP (Please tick) E-MAIL
ID : MobileBanking using SMS Yes, I wish to apply for MobileBanking (Please
tick) CELLULAR NO. NAME OF CELLULAR SERVICE PROVIDER City PhoneBanking Yes, I
wish to apply for PhoneBanking (Please tick) YOUR MOTHER’S MAIDEN
{noformat}
i.e. there is nothing wrong with the parsing of this document. There must be
something wrong in your config or test code. Use the command above instead.
> tika parser of nutch 1.3 is failing to process pdfs
> ---------------------------------------------------
>
> Key: NUTCH-1206
> URL: https://issues.apache.org/jira/browse/NUTCH-1206
> Project: Nutch
> Issue Type: Bug
> Components: parser
> Affects Versions: 1.3
> Environment: Solaris/Linux/Windows
> Reporter: dibyendu ghosh
> Assignee: Chris A. Mattmann
> Attachments: direct.pdf
>
>
> Please refer to this message:
> http://www.mail-archive.com/user%40nutch.apache.org/msg04315.html. Old
> parse-pdf parser seems to be able to parse old pdfs (checked with nutch 1.2)
> though it is not able to parse acrobat 9.0 version of pdfs. nutch 1.3 does
> not have parse-pdf plugin and it is not able to parse even older pdfs.
> my code (TestParse.java):
> ----------------------------
> bash-2.00$ cat TestParse.java
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.FileOutputStream;
> import java.io.PrintStream;
> import java.util.Iterator;
> import java.util.Map;
> import java.util.Map.Entry;
> import org.apache.hadoop.conf.Configuration;
> import org.apache.hadoop.io.Text;
> import org.apache.nutch.metadata.Metadata;
> import org.apache.nutch.parse.ParseResult;
> import org.apache.nutch.parse.Parse;
> import org.apache.nutch.parse.ParseStatus;
> import org.apache.nutch.parse.ParseUtil;
> import org.apache.nutch.parse.ParseData;
> import org.apache.nutch.protocol.Content;
> import org.apache.nutch.util.NutchConfiguration;
> public class TestParse {
> private static Configuration conf = NutchConfiguration.create();
> public TestParse() {
> }
> public static void main(String[] args) {
> String filename = args[0];
> convert(filename);
> }
> public static String convert(String fileName) {
> String newName = "abc.html";
> try {
> System.out.println("Converting " + fileName + " to html.");
> if (convertToHtml(fileName, newName))
> return newName;
> } catch (Exception e) {
> (new File(newName)).delete();
> System.out.println("General exception " + e.getMessage());
> }
> return null;
> }
> private static boolean convertToHtml(String fileName, String newName)
> throws Exception {
> // Read the file
> FileInputStream in = new FileInputStream(fileName);
> byte[] buf = new byte[in.available()];
> in.read(buf);
> in.close();
> // Parse the file
> Content content = new Content("file:" + fileName, "file:" +
> fileName,
> buf, "", new Metadata(), conf);
> ParseResult parseResult = new ParseUtil(conf).parse(content);
> parseResult.filter();
> if (parseResult.isEmpty()) {
> System.out.println("All parsing attempts failed");
> return false;
> }
> Iterator<Map.Entry<Text,Parse>> iterator =
> parseResult.iterator();
> if (iterator == null) {
> System.out.println("Cannot iterate over successful parse
> results");
> return false;
> }
> Parse parse = null;
> ParseData parseData = null;
> while (iterator.hasNext()) {
> parse = parseResult.get((Text)iterator.next().getKey());
> parseData = parse.getData();
> ParseStatus status = parseData.getStatus();
> // If Parse failed then bail
> if (!ParseStatus.STATUS_SUCCESS.equals(status)) {
> System.out.println("Could not parse " + fileName + ". " +
> status.getMessage());
> return false;
> }
> }
> // Start writing to newName
> FileOutputStream fout = new FileOutputStream(newName);
> PrintStream out = new PrintStream(fout, true, "UTF-8");
> // Start Document
> out.println("<html>");
> // Start Header
> out.println("<head>");
> // Write Title
> String title = parseData.getTitle();
> if (title != null && title.trim().length() > 0) {
> out.println("<title>" + parseData.getTitle() + "</title>");
> }
> // Write out Meta tags
> Metadata metaData = parseData.getContentMeta();
> String[] names = metaData.names();
> for (String name : names) {
> String[] subvalues = metaData.getValues(name);
> String values = null;
> for (String subvalue : subvalues) {
> values += subvalue;
> }
> if (values.length() > 0)
> out.printf("<meta name=\"%s\" content=\"%s\"/>\n",
> name, values);
> }
> out.println("<meta http-equiv=\"Content-Type\"
> content=\"text/html;charset=UTF-8\"/>");
> // End Meta tags
> out.println("</head>"); // End Header
> // Start Body
> out.println("<body>");
> out.print(parse.getText());
> out.println("</body>"); // End Body
> out.println("</html>"); // End Document
> out.close(); // Close the file
> return true;
> }
> }
> ----------------------------
> command:
> ======
> bash-2.00$ java -classpath
> conf:runtime/local/lib/nutch-1.3.jar:runtime/local/lib/hadoop-core-0.20.2.jar:runtime/local/lib/commons-logging-api-1.0.4.jar:runtime/local/lib/tika-core-0.9.jar:runtime/local/lib/log4j-1.2.15.jar:runtime/local/lib/oro-2.0.8.jar:.
> TestParse direct.pdf
> ======
> output:
> _____
> Converting direct.pdf to html.
> Oct 19, 2011 5:05:19 PM org.apache.hadoop.conf.Configuration
> getConfResourceAsInputStream
> INFO: found resource tika-mimetypes.xml at
> file:/path/to/nutch/1.3/conf/tika-mimetypes.xml
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginManifestParser
> parsePluginFolder
> INFO: Plugins: looking in: /path/to/nutch/1.3/runtime/local/plugins
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Plugin Auto-activation mode: [true]
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Registered Plugins:
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: the nutch core extension points (nutch-extensionpoints)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Tika Parser Plug-in (parse-tika)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Registered Extension-Points:
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch URL Normalizer
> (org.apache.nutch.net.URLNormalizer)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch Protocol (org.apache.nutch.protocol.Protocol)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch Segment Merge Filter
> (org.apache.nutch.segment.SegmentMergeFilter)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch URL Filter (org.apache.nutch.net.URLFilter)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch Indexing Filter
> (org.apache.nutch.indexer.IndexingFilter)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: HTML Parse Filter
> (org.apache.nutch.parse.HtmlParseFilter)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch Content Parser (org.apache.nutch.parse.Parser)
> Oct 19, 2011 5:05:20 PM org.apache.nutch.plugin.PluginRepository
> displayStatusINFO: Nutch Scoring (org.apache.nutch.scoring.ScoringFilter)
> Oct 19, 2011 5:05:20 PM org.apache.hadoop.conf.Configuration
> getConfResourceAsInputStream
> INFO: found resource parse-plugins.xml at
> file:/path/to/nutch/1.3/conf/parse-plugins.xml
> Oct 19, 2011 5:05:20 PM org.apache.nutch.parse.ParserFactory matchExtensions
> INFO: The parsing plugins: [org.apache.nutch.parse.tika.TikaParser] are
> enabled via the plugin.includes system property, and all claim to support
> the content type application/pdf, but they are not mapped to it in the
> parse-plugins.xml file
> Oct 19, 2011 5:05:21 PM org.apache.nutch.parse.ParseUtil parse
> WARNING: Unable to successfully parse content file:direct.pdf of type
> application/pdf
> Oct 19, 2011 5:05:21 PM org.apache.nutch.parse.ParseResult filter
> WARNING: file:direct.pdf is not parsed successfully, filtering
> All parsing attempts failed
> _____
> my customized nutch-site.xml:
> ~~~~~~~~~~~~~~~~~~~~
> bash-2.00$ cat conf/nutch-site.xml
> <?xml version="1.0"?>
> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
> <configuration>
> <property>
> <name>plugin.folders</name>
> <value>runtime/local/plugins</value>
> <description>Directories where nutch plugins are located. Each
> element may be a relative or absolute path. If absolute, it is used
> as is. If relative, it is searched for on the classpath.</description>
> </property>
> <property>
> <name>plugin.includes</name>
> <value>parse-tika</value>
> <description>Regular expression naming plugin directory names to
> include. Any plugin not matching this expression is excluded.
> </description>
> </property>
> </configuration>
> ~~~~~~~~~~~~~~~~~~~~
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators:
https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira