Nutch-87 Setup
I am looking to create a vertical/regional search application and the
Nutch-87 plugin sounds perfect for what I want to do. However, this
is all VERY new to me (java, ant, tomcat, nutch etc.), but I was able
to hack my way through the installation and now have a working copy of
Nutch running.
I am having problems trying to install and build the plugin. I have
read the docs but they are not totally clear on the steps to add a new
plugin into nutch.
Can anyone give me any pointers as to what's happening here? Please
bear in mind I am a nutch newbie.
Here are the steps I have taken:
1.) I downloaded the oc-0[1].3.2.zip file.
2.) FTP'd the zip to the server
3.) unzipped in: "/caribbeanlinks.com/nutch/nutch/src/plugin/"
4.) Created the "/epile/src/java" folder, placed the
"/crawl/plugin/whitelisturlfilter" directory inside it, and added
WhitelistURLFilter.java there:
/caribbeanlinks.com/nutch/nutch/src/plugin/epile/src/java/crawl/plugin/whitelisturlfilter
5.) Created the build.xml and plugin.xml files in
"/caribbeanlinks.com/nutch/nutch/src/plugin/epile" (see examples below)
6.) ran "ant"
[caribmag]$ ant
Buildfile: build.xml
compile:
[javac] Compiling 51 source files to
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/classes
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/CrawlSeedSource.java:21:
<identifier> expected
[javac] Iterator<SeedURL> getSeedURLs() throws IOException;
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/CrawlSeedSource.java:21:
= expected
[javac] Iterator<SeedURL> getSeedURLs() throws IOException;
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/DefaultFetchList.java:38:
<identifier> expected
[javac] private HashMap<String, HostQueue> hosts = new
HashMap<String, HostQueue>();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/DefaultFetchList.java:49:
<identifier> expected
[javac] private TreeSet<HostQueue> blockedHosts = new
TreeSet(new Comparator<HostQueue>() {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FetcherThread.java:51:
<identifier> expected
[javac] protected LinkedHashMap<URL, ScheduledURL> linkQueue =
new LinkedHashMap();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FileCrawlSeedSource.java:15:
<identifier> expected
[javac] protected ArrayList<SeedURL> seeds = new ArrayList();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FileCrawlSeedSource.java:31:
<identifier> expected
[javac] public Iterator<SeedURL> getSeedURLs() throws IOException {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FileCrawlSeedSource.java:38:
';' expected
[javac] }
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:27:
<identifier> expected
[javac] private LinkedList<ScheduledURL> pages = new LinkedList();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:32:
<identifier> expected
[javac] private TreeSet<Long> checksums = new TreeSet();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:105:
<identifier> expected
[javac] public LinkedList<ScheduledURL> getPages() {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:146:
';' expected
[javac] }
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/InMemoryFetchedURLs.java:14:
<identifier> expected
[javac] private Set<String> fetched = new HashSet();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/NutchFetchListCrawlSeedSource.java:74:
<identifier> expected
[javac] public Iterator<SeedURL> getSeedURLs() throws IOException {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/NutchFetchListCrawlSeedSource.java:91:
';' expected
[javac] }
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:13:
<identifier> expected
[javac] private List<PostFetchProcessor> processors;
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:15:
';' expected
[javac] for(PostFetchProcessor pp : processors) {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:18:
illegal start of expression
[javac] }
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:21:
';' expected
[javac] for(PostFetchProcessor pp : processors) {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:24:
illegal start of expression
[javac] }
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:26:
<identifier> expected
[javac] public void setProcessors(List<PostFetchProcessor> processors) {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:29:
')' expected
[javac] }
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/http/HttpResponse.java:39:
<identifier> expected
[javac] static Map<String, Byte> serverHttpVersion = new Hashtable();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/http/HttpResponse.java:44:
<identifier> expected
[javac] protected Map<String, Integer> codes = new HashMap();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/AbstractScope.java:11:
'{' expected
[javac] public abstract class AbstractScope<T> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/AbstractScope.java:54:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/FetchListScope.java:11:
'{' expected
[javac] public class FetchListScope extends
AbstractScope<FetchListScope.Input> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/MapFileContentSeenFilter.java:17:
'{' expected
[javac] implements ScopeFilter<PostFetchScope.Input>,
PostFetchProcessor {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/MapFileContentSeenFilter.java:51:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/NutchUrlFLFilter.java:9:
'{' expected
[javac] public class NutchUrlFLFilter implements
ScopeFilter<FetchListScope.Input>{
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/NutchUrlFLFilter.java:20:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/OneExternalLinkFLFilter.java:12:
'{' expected
[javac] public class OneExternalLinkFLFilter implements
ScopeFilter<FetchListScope.Input>{
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/OneExternalLinkFLFilter.java:32:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ParseScope.java:8:
'{' expected
[javac] public class ParseScope extends AbstractScope<FetcherOutput>{
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ParseScope.java:10:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/PostFetchScope.java:10:
'{' expected
[javac] public class PostFetchScope extends
AbstractScope<PostFetchScope.Input> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentHostFLFilter.java:8:
'{' expected
[javac] public class SameParentHostFLFilter implements
ScopeFilter<FetchListScope.Input>{
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentHostFLFilter.java:16:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentPathFLFilter.java:6:
'{' expected
[javac] public class SameParentPathFLFilter implements
ScopeFilter<FetchListScope.Input> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentPathFLFilter.java:20:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentTLDFLFilter.java:9:
'{' expected
[javac] public class SameParentTLDFLFilter implements
ScopeFilter<FetchListScope.Input> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentTLDFLFilter.java:30:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ScopeFilter.java:6:
'{' expected
[javac] public interface ScopeFilter<T> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ScopeFilter.java:19:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SizeConstrainedFLFilter.java:8:
'{' expected
[javac] public class SizeConstrainedFLFilter implements
ScopeFilter<FetchListScope.Input>{
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SizeConstrainedFLFilter.java:24:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/WebDBContentSeenFilter.java:13:
'{' expected
[javac] public class WebDBContentSeenFilter implements
ScopeFilter<FetcherOutput> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/WebDBContentSeenFilter.java:38:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongLongPersister.java:8:
'{' expected
[javac] public class LongLongPersister extends
MapFilePersister<Long, Long> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongLongPersister.java:73:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongPersister.java:13:
'{' expected
[javac] public class LongPersister extends
MapFilePersister<Long, NullWritable> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongPersister.java:61:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MD5Persister.java:21:
'{' expected
[javac] public class MD5Persister extends
MapFilePersister<MD5Hash, NullWritable> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MD5Persister.java:56:
'}' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:20:
'{' expected
[javac] public abstract class MapFilePersister <K, V> {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:22:
<identifier> expected
[javac] LogFormatter.getLogger(MapFilePersister.class.getName());
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:22:
'{' expected
[javac] LogFormatter.getLogger(MapFilePersister.class.getName());
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:46:
<identifier> expected
[javac] protected TreeMap<K, V> buffer = new
TreeMap(getTypeComparator());
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:102:
<identifier> expected
[javac] protected abstract Comparator<K> getTypeComparator();
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:219:
';' expected
[javac] for (K k : buffer.keySet()) {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:229:
illegal start of expression
[javac] } finally {
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:228:
')' expected
[javac] ^
[javac]
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:308:
'}' expected
[javac] ^
[javac] 63 errors
BUILD FAILED
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/build.xml:24:
Compile failed; see the compiler error output for details.
Total time: 4 seconds
[caribmag]$
build.xml
---------------------------
<?xml version="1.0"?>
<project name="WhitelistURLFilter" default="jar">
<import file="../build-plugin.xml"/>
</project>
plugin.xml
---------------------------
<?xml version="1.0" encoding="UTF-8"?>
<plugin
id="epile-whitelisturlfilter"
name="Epile whitelist URL filter"
version="1.0.0"
provider-name="teamgigabyte.com">
<extension-point
id="org.apache.nutch.net.URLFilter"
name="Nutch URL Filter"/>
<runtime></runtime>
<extension id="org.apache.nutch.net.urlfiler"
name="Epile Whitelist URL Filter"
point="org.apache.nutch.net.URLFilter">
<implementation id="WhitelistURLFilter"
class="epile.crawl.plugin.WhitelistURLFilter"/>
</extension>
</plugin>