Hello,
I have written a parser and an indexer for Dublin Core metadata. Is there
anyone who has worked on this and can help me find where I have gone wrong? I
have followed the instructions on the plugin-writing page and have written and
compiled the following two plugins.
----Indexer-------------------------
package org.apache.nutch.parse.dcmeta;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
/**
 * Indexing filter that copies the {@code DC.title} metadata value found in a
 * page's parse data into the index document as the field {@code DC_title}.
 */
public class DCMetaIndexingFilter implements IndexingFilter {

  public static final Log LOG = LogFactory.getLog(
      DCMetaIndexingFilter.class.getName());

  private Configuration conf;

  /**
   * Adds a stored, tokenized {@code DC_title} field to {@code doc} when the
   * parse data yields a non-null value for {@code DC.title}.
   *
   * @param doc     document being built for the index
   * @param parse   parse result whose data is queried for {@code DC.title}
   * @param url     not used by this filter
   * @param datum   not used by this filter
   * @param inlinks not used by this filter
   * @return the same {@code doc} instance, with or without the extra field
   * @throws IndexingException declared by the interface; not thrown here
   */
  public Document filter(Document doc, Parse parse, UTF8 url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    final String title = parse.getData().getMeta("DC.title");
    if (title == null) {
      // Nothing to contribute for this page.
      return doc;
    }

    LOG.info("found DC.title " + title);
    Field titleField = new Field("DC_title", title,
        Field.Store.YES, Field.Index.TOKENIZED);
    doc.add(titleField);
    return doc;
  }

  /** Returns the configuration previously supplied via {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }

  /** Stores the configuration handed to this plugin. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }
}
--------------------------------------------------------------------------------------------------------
-Parser---
package org.apache.nutch.parse.dcmeta;
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.DocumentFragment;
/**
 * HTML parse filter that copies Dublin Core {@code <meta>} entries (names
 * beginning with "DC.", in any letter case) from the page's general meta tags
 * into the content metadata of the parse data.
 */
public class DCMetaParseFilter implements HtmlParseFilter {

  private static final Log LOG = LogFactory.getLog(
      DCMetaParseFilter.class.getName());

  /** Canonical prefix under which Dublin Core entries are stored. */
  private static final String DC_PREFIX = "DC.";

  private Configuration conf;

  /**
   * Scans the general meta tags and stores every Dublin Core entry in
   * {@code parse.getData().getContentMeta()}.
   *
   * <p>The prefix test is case-insensitive and the stored key is rewritten to
   * start with the upper-case "DC." prefix. NOTE(review): Nutch's HTML meta
   * tag processing is understood to lower-case meta names before they reach
   * {@code getGeneralTags()} (so "DC.title" arrives as "dc.title") - confirm
   * against the Nutch version in use. With a case-sensitive
   * {@code startsWith("DC.")} such names never match and nothing is stored.
   *
   * @param content  not used by this filter
   * @param parse    parse result whose content metadata is updated
   * @param metaTags source of the general meta tags
   * @param doc      not used by this filter
   * @return the same {@code parse} instance
   */
  public Parse filter(Content content, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    Properties generalMetaTags = metaTags.getGeneralTags();
    for (Enumeration<?> tagNames = generalMetaTags.propertyNames();
         tagNames.hasMoreElements(); ) {
      String tagName = (String) tagNames.nextElement();
      // Accept "DC.", "dc.", "Dc." ... so lower-cased names are not missed.
      if (!tagName.regionMatches(true, 0, DC_PREFIX, 0, DC_PREFIX.length())) {
        continue;
      }
      // Normalise only the prefix so a lookup of e.g. "DC.title" succeeds.
      String key = DC_PREFIX + tagName.substring(DC_PREFIX.length());
      String value = generalMetaTags.getProperty(tagName);
      parse.getData().getContentMeta().set(key, value);
      LOG.info("Found DC metadata " + key + " : " + value);
    }
    return parse;
  }

  /** Stores the configuration handed to this plugin. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously supplied via {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }
}
Hello,
I have written a parser and an indexer for Dublin Core metadata. Is there
anyone who has worked on this and can help me find where I have gone wrong? I
have followed the instructions on the plugin-writing page and have written and
compiled the following two plugins.
----Indexer-------------------------
package org.apache.nutch.parse.dcmeta;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.hadoop.io.UTF8;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
/**
 * Indexing filter that copies the {@code DC.title} metadata value found in a
 * page's parse data into the index document as the field {@code DC_title}.
 */
public class DCMetaIndexingFilter implements IndexingFilter {

  public static final Log LOG = LogFactory.getLog(
      DCMetaIndexingFilter.class.getName());

  private Configuration conf;

  /**
   * Adds a stored, tokenized {@code DC_title} field to {@code doc} when the
   * parse data yields a non-null value for {@code DC.title}.
   *
   * @param doc     document being built for the index
   * @param parse   parse result whose data is queried for {@code DC.title}
   * @param url     not used by this filter
   * @param datum   not used by this filter
   * @param inlinks not used by this filter
   * @return the same {@code doc} instance, with or without the extra field
   * @throws IndexingException declared by the interface; not thrown here
   */
  public Document filter(Document doc, Parse parse, UTF8 url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    final String title = parse.getData().getMeta("DC.title");
    if (title == null) {
      // Nothing to contribute for this page.
      return doc;
    }

    LOG.info("found DC.title " + title);
    Field titleField = new Field("DC_title", title,
        Field.Store.YES, Field.Index.TOKENIZED);
    doc.add(titleField);
    return doc;
  }

  /** Returns the configuration previously supplied via {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }

  /** Stores the configuration handed to this plugin. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }
}
--------------------------------------------------------------------------------------------------------
-Parser---
package org.apache.nutch.parse.dcmeta;
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.DocumentFragment;
/**
 * HTML parse filter that copies Dublin Core {@code <meta>} entries (names
 * beginning with "DC.", in any letter case) from the page's general meta tags
 * into the content metadata of the parse data.
 */
public class DCMetaParseFilter implements HtmlParseFilter {

  private static final Log LOG = LogFactory.getLog(
      DCMetaParseFilter.class.getName());

  /** Canonical prefix under which Dublin Core entries are stored. */
  private static final String DC_PREFIX = "DC.";

  private Configuration conf;

  /**
   * Scans the general meta tags and stores every Dublin Core entry in
   * {@code parse.getData().getContentMeta()}.
   *
   * <p>The prefix test is case-insensitive and the stored key is rewritten to
   * start with the upper-case "DC." prefix. NOTE(review): Nutch's HTML meta
   * tag processing is understood to lower-case meta names before they reach
   * {@code getGeneralTags()} (so "DC.title" arrives as "dc.title") - confirm
   * against the Nutch version in use. With a case-sensitive
   * {@code startsWith("DC.")} such names never match and nothing is stored.
   *
   * @param content  not used by this filter
   * @param parse    parse result whose content metadata is updated
   * @param metaTags source of the general meta tags
   * @param doc      not used by this filter
   * @return the same {@code parse} instance
   */
  public Parse filter(Content content, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    Properties generalMetaTags = metaTags.getGeneralTags();
    for (Enumeration<?> tagNames = generalMetaTags.propertyNames();
         tagNames.hasMoreElements(); ) {
      String tagName = (String) tagNames.nextElement();
      // Accept "DC.", "dc.", "Dc." ... so lower-cased names are not missed.
      if (!tagName.regionMatches(true, 0, DC_PREFIX, 0, DC_PREFIX.length())) {
        continue;
      }
      // Normalise only the prefix so a lookup of e.g. "DC.title" succeeds.
      String key = DC_PREFIX + tagName.substring(DC_PREFIX.length());
      String value = generalMetaTags.getProperty(tagName);
      parse.getData().getContentMeta().set(key, value);
      LOG.info("Found DC metadata " + key + " : " + value);
    }
    return parse;
  }

  /** Stores the configuration handed to this plugin. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously supplied via {@link #setConf}. */
  public Configuration getConf() {
    return this.conf;
  }
}
Thank you.
Thank you.