To try it, copy the attached plugin.xml file to:
build/plugins/parse-ext/plugin.xml
Then copy the attached parse-pdf.sh script to:
bin/parse-pdf.sh
and make it executable:
chmod +x bin/parse-pdf.sh
Finally, include the parse-ext plugin in your nutch-site.xml.
What do you think?
Doug
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Plugin descriptor for "parse-ext" (External Parser Plug-in).
  It declares the parser extension point, the plugin's runtime library,
  and one extension that binds a content type to an external command.
-->
<plugin id="parse-ext"
        name="External Parser Plug-in"
        version="1.0.0"
        provider-name="nutch.org">

  <!-- Extension point that the extension below attaches to. -->
  <extension-point id="org.apache.nutch.parse.Parser"
                   name="Nutch Content Parser"/>

  <!-- Runtime library shipped with this plugin; the export pattern is "*". -->
  <runtime>
    <library name="parse-ext.jar">
      <export name="*"/>
    </library>
  </runtime>

  <!-- Extension contributed to the org.apache.nutch.parse.Parser point. -->
  <extension id="org.apache.nutch.parse.ext"
             name="ExtParse"
             point="org.apache.nutch.parse.Parser">
    <!--
      Associates the application/pdf content type with the bin/parse-pdf.sh
      command, implemented by org.apache.nutch.parse.ext.ExtParser.
      NOTE(review): the unit of "timeout" (presumably seconds) and the meaning
      of an empty "pathSuffix" are not stated in this descriptor; confirm
      against the ExtParser implementation.
    -->
    <implementation id="ExtParser"
                    class="org.apache.nutch.parse.ext.ExtParser"
                    contentType="application/pdf"
                    pathSuffix=""
                    command="bin/parse-pdf.sh"
                    timeout="30"/>
  </extension>

</plugin>
parse-pdf.sh
Description: application/shellscript
