Hi

I need to create a simple extension for Nutch that indexes only web pages matching certain criteria.

I followed the explanation on how to set up Nutch in Eclipse and got a basic system running. Then I followed the instructions for writing a simple plugin here: http://wiki.apache.org/nutch/WritingPluginExample. However, after adding the plugin I always get the following exception, which basically tells me nothing:

...
Fetcher: finished at 2012-08-12 11:06:47, elapsed: 00:00:07
ParseSegment: starting at 2012-08-12 11:06:47
ParseSegment: segment: crawl/segments/20120812110633
Exception in thread "main" java.io.IOException: Job failed!
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1265)
    at org.apache.nutch.parse.ParseSegment.parse(ParseSegment.java:209)
    at org.apache.nutch.crawl.Crawl.run(Crawl.java:138)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
    at org.apache.nutch.crawl.Crawl.main(Crawl.java:55)

I wanted to simplify the example by using only one extension, which simply logs "test" for every crawled page. Here is the code for my plugin class:

package testplugin;

import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class SimpleFilter implements IndexingFilter {

    public static final Logger LOGGER = LoggerFactory.getLogger(SimpleFilter.class);

    private Configuration conf;

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;

        if (conf == null)
            return;
    }

    @Override
    public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
            throws IndexingException {
        LOGGER.info("test");
        return doc;
    }

}
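
In case it is relevant: the wiki example also has you register the plugin in src/plugin/build.xml so it is built and deployed with the other plugins. A sketch of that entry, assuming the plugin directory is named simpletestplugin:

<!-- src/plugin/build.xml: inside the existing "deploy" target -->
<ant dir="simpletestplugin" target="deploy"/>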

I also adapted the plugin.xml to look like this:

<?xml version="1.0" encoding="UTF-8"?>
<plugin id="simpletestplugin" name="URL Meta Indexing Filter" version="1.0.0" provider-name="alaak">
    <runtime>
        <library name="simpletestplugin.jar">
            <export name="*"/>
        </library>
    </runtime>

    <requires>
        <import plugin="nutch-extensionpoints"/>
    </requires>

    <extension id="testplugin" name="Some Simple Test Plugin" point="org.apache.nutch.segment.SegmentMergeFilter">
        <implementation id="page-filter" class="testplugin.SimpleFilter"/>
    </extension>
</plugin>
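
The wiki example also enables the plugin through the plugin.includes property in conf/nutch-site.xml. A sketch of that property, again assuming the plugin directory is named simpletestplugin (the rest of the value is a shortened version of the default list):

<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|simpletestplugin</value>
  <description>Regular expression naming plugin directory names to include.</description>
</property>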

Can someone please give me a clue as to what I am doing wrong, or tell me what additional information you would need in order to help me?

Thanks and regards.
