Nutch-87 Setup

I am looking to create a vertical/regional search application and the Nutch-87 plugin sounds perfect for what I want to do. However, this is all VERY new to me (java, ant, tomcat, nutch etc.), but I was able to hack my way through the installation and have a working copy of Nutch running.

I am having problems trying to install and build the plugin. I have read the docs but it's not totally clear what the steps are to add a new plugin into nutch.

Can anyone give me any pointers as to what's happening here? Please bear in mind I am a nutch newbie.


Here are the steps I have taken:
1.) I downloaded the oc-0[1].3.2.zip file.
2.) FTP'd the zip to the server
3.)  unzipped in:  "/caribbeanlinks.com/nutch/nutch/src/plugin/"
4.) Created the "/epile/src/java" folder, created the "/crawl/plugin/whitelisturlfilter" directory inside it, and added WhitelistURLFilter.java there:
/caribbeanlinks.com/nutch/nutch/src/plugin/epile/src/java/crawl/plugin/whitelisturlfilter
5.) Created the build.xml and plugin.xml files in "/caribbeanlinks.com/nutch/nutch/src/plugin/epile" (see examples below)
6.) ran "ant"

[caribmag]$ ant
Buildfile: build.xml

compile:
[javac] Compiling 51 source files to /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/classes [javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/CrawlSeedSource.java:21: <identifier> expected
    [javac]   Iterator<SeedURL> getSeedURLs() throws IOException;
    [javac]           ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/CrawlSeedSource.java:21: = expected
    [javac]   Iterator<SeedURL> getSeedURLs() throws IOException;
    [javac]                                                     ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/DefaultFetchList.java:38: <identifier> expected [javac] private HashMap<String, HostQueue> hosts = new HashMap<String, HostQueue>();
    [javac]                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/DefaultFetchList.java:49: <identifier> expected [javac] private TreeSet<HostQueue> blockedHosts = new TreeSet(new Comparator<HostQueue>() {
    [javac]                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FetcherThread.java:51: <identifier> expected [javac] protected LinkedHashMap<URL, ScheduledURL> linkQueue = new LinkedHashMap();
    [javac]                          ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FileCrawlSeedSource.java:15: <identifier> expected
    [javac]   protected ArrayList<SeedURL> seeds = new ArrayList();
    [javac]                      ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FileCrawlSeedSource.java:31: <identifier> expected
    [javac]   public Iterator<SeedURL> getSeedURLs() throws IOException {
    [javac]                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/FileCrawlSeedSource.java:38: ';' expected
    [javac] }
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:27: <identifier> expected
    [javac]   private LinkedList<ScheduledURL> pages = new LinkedList();
    [javac]                     ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:32: <identifier> expected
    [javac]   private TreeSet<Long> checksums = new TreeSet();
    [javac]                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:105: <identifier> expected
    [javac]   public LinkedList<ScheduledURL> getPages() {
    [javac]                    ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/HostQueue.java:146: ';' expected
    [javac] }
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/InMemoryFetchedURLs.java:14: <identifier> expected
    [javac]   private Set<String> fetched = new HashSet();
    [javac]              ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/NutchFetchListCrawlSeedSource.java:74: <identifier> expected
    [javac]   public Iterator<SeedURL> getSeedURLs() throws IOException {
    [javac]                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/NutchFetchListCrawlSeedSource.java:91: ';' expected
    [javac] }
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:13: <identifier> expected
    [javac]   private List<PostFetchProcessor> processors;
    [javac]               ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:15: ';' expected
    [javac]     for(PostFetchProcessor pp : processors) {
    [javac]                               ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:18: illegal start of expression
    [javac]   }
    [javac]   ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:21: ';' expected
    [javac]     for(PostFetchProcessor pp : processors) {
    [javac]                               ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:24: illegal start of expression
    [javac]   }
    [javac]   ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:26: <identifier> expected
    [javac]   public void setProcessors(List<PostFetchProcessor> processors) {
    [javac]                                 ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/PostFetchProcessorChain.java:29: ')' expected
    [javac] }
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/http/HttpResponse.java:39: <identifier> expected
    [javac]   static Map<String, Byte> serverHttpVersion = new Hashtable();
    [javac]             ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/http/HttpResponse.java:44: <identifier> expected
    [javac]   protected Map<String, Integer> codes = new HashMap();
    [javac]                ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/AbstractScope.java:11: '{' expected
    [javac] public abstract class AbstractScope<T> {
    [javac]                                    ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/AbstractScope.java:54: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/FetchListScope.java:11: '{' expected [javac] public class FetchListScope extends AbstractScope<FetchListScope.Input> {
    [javac]                                                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/MapFileContentSeenFilter.java:17: '{' expected [javac] implements ScopeFilter<PostFetchScope.Input>, PostFetchProcessor {
    [javac]                           ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/MapFileContentSeenFilter.java:51: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/NutchUrlFLFilter.java:9: '{' expected [javac] public class NutchUrlFLFilter implements ScopeFilter<FetchListScope.Input>{
    [javac]                                                     ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/NutchUrlFLFilter.java:20: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/OneExternalLinkFLFilter.java:12: '{' expected [javac] public class OneExternalLinkFLFilter implements ScopeFilter<FetchListScope.Input>{
    [javac]                                                            ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/OneExternalLinkFLFilter.java:32: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ParseScope.java:8: '{' expected
    [javac] public class ParseScope extends AbstractScope<FetcherOutput>{
    [javac]                                              ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ParseScope.java:10: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/PostFetchScope.java:10: '{' expected [javac] public class PostFetchScope extends AbstractScope<PostFetchScope.Input> {
    [javac]                                                  ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentHostFLFilter.java:8: '{' expected [javac] public class SameParentHostFLFilter implements ScopeFilter<FetchListScope.Input>{
    [javac]                                                           ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentHostFLFilter.java:16: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentPathFLFilter.java:6: '{' expected [javac] public class SameParentPathFLFilter implements ScopeFilter<FetchListScope.Input> {
    [javac]                                                           ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentPathFLFilter.java:20: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentTLDFLFilter.java:9: '{' expected [javac] public class SameParentTLDFLFilter implements ScopeFilter<FetchListScope.Input> {
    [javac]                                                          ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SameParentTLDFLFilter.java:30: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ScopeFilter.java:6: '{' expected
    [javac] public interface ScopeFilter<T> {
    [javac]                             ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/ScopeFilter.java:19: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SizeConstrainedFLFilter.java:8: '{' expected [javac] public class SizeConstrainedFLFilter implements ScopeFilter<FetchListScope.Input>{
    [javac]                                                            ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/SizeConstrainedFLFilter.java:24: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/WebDBContentSeenFilter.java:13: '{' expected [javac] public class WebDBContentSeenFilter implements ScopeFilter<FetcherOutput> {
    [javac]                                                           ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/scope/WebDBContentSeenFilter.java:38: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongLongPersister.java:8: '{' expected [javac] public class LongLongPersister extends MapFilePersister<Long, Long> {
    [javac]                                                        ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongLongPersister.java:73: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongPersister.java:13: '{' expected [javac] public class LongPersister extends MapFilePersister<Long, NullWritable> {
    [javac]                                                    ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/LongPersister.java:61: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MD5Persister.java:21: '{' expected [javac] public class MD5Persister extends MapFilePersister<MD5Hash, NullWritable> {
    [javac]                                                   ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MD5Persister.java:56: '}' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:20: '{' expected
    [javac] public abstract class MapFilePersister <K, V> {
    [javac]                                        ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:22: <identifier> expected
    [javac]       LogFormatter.getLogger(MapFilePersister.class.getName());
    [javac]                                                    ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:22: '{' expected
    [javac]       LogFormatter.getLogger(MapFilePersister.class.getName());
    [javac]                                                               ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:46: <identifier> expected [javac] protected TreeMap<K, V> buffer = new TreeMap(getTypeComparator());
    [javac]                    ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:102: <identifier> expected
    [javac]   protected abstract Comparator<K> getTypeComparator();
    [javac]                                ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:219: ';' expected
    [javac]       for (K k : buffer.keySet()) {
    [javac]                ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:229: illegal start of expression
    [javac]     } finally {
    [javac]     ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:228: ')' expected
    [javac] ^
[javac] /home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/src/java/org/supermind/crawl/util/MapFilePersister.java:308: '}' expected
    [javac] ^
    [javac] 63 errors

BUILD FAILED
/home/1/caribmag/caribbeanlinks.com/nutch/nutch/oc/build.xml:24: Compile failed; see the compiler error output for details.

Total time: 4 seconds
[caribmag]$






build.xml
---------------------------
<?xml version="1.0"?>

<project name="WhitelistURLFilter" default="jar">

  <import file="../build-plugin.xml"/>

</project>



plugin.xml
---------------------------
<?xml version="1.0" encoding="UTF-8"?>
<plugin
   id="epile-whitelisturlfilter"
   name="Epile whitelist URL filter"
   version="1.0.0"
   provider-name="teamgigabyte.com">

   <extension-point
      id="org.apache.nutch.net.URLFilter"
      name="Nutch URL Filter"/>

   <runtime></runtime>

   <extension id="org.apache.nutch.net.urlfiler"
      name="Epile Whitelist URL Filter"
      point="org.apache.nutch.net.URLFilter">

      <implementation id="WhitelistURLFilter"
         class="epile.crawl.plugin.WhitelistURLFilter"/>
   </extension>
</plugin>


Reply via email to