Yes, I did, and it is also included when parsing, but the result is the
same: I can't search for DC fields.
On Thu, Feb 28, 2008 at 3:34 PM, Brian Ulicny <[EMAIL PROTECTED]> wrote:
> Did you include it in the list of plugins to be invoked in
> nutch-site.xml? What happens when you try it?
>
> Brian Ulicny
>
> On Tue, 26 Feb 2008 20:28:22 +0100, "Syed Ahmed"
> <[EMAIL PROTECTED]> said:
> > hello,
> > I have written a parser and indexer for Dublin Core metadata. Is there
> > anyone who has worked on this and can help me find where I have gone wrong?
> > I have followed the instructions on the "write plugin" page and have
> > written and compiled the following two plugins.
> >
> >
> >
> > ----Indexer-------------------------
> > package org.apache.nutch.parse.dcmeta;
> >
> > import org.apache.commons.logging.Log;
> > import org.apache.commons.logging.LogFactory;
> >
> > import org.apache.lucene.document.Document;
> > import org.apache.lucene.document.Field;
> >
> > import org.apache.nutch.parse.Parse;
> >
> > import org.apache.nutch.indexer.IndexingFilter;
> > import org.apache.nutch.indexer.IndexingException;
> > import org.apache.hadoop.io.UTF8;
> >
> > import org.apache.nutch.crawl.CrawlDatum;
> > import org.apache.nutch.crawl.Inlinks;
> >
> > import java.io.IOException;
> > import java.net.MalformedURLException;
> > import java.net.URL;
> > import org.apache.hadoop.conf.Configuration;
> >
> > /**
> >  * Indexing filter that copies the "DC.title" metadata value of a parse
> >  * into the Lucene document as the stored, tokenized field "DC_title".
> >  */
> > public class DCMetaIndexingFilter implements IndexingFilter {
> >
> >   public static final Log LOG =
> >       LogFactory.getLog(DCMetaIndexingFilter.class.getName());
> >
> >   /** Configuration handed in through {@link #setConf(Configuration)}. */
> >   private Configuration conf;
> >
> >   /**
> >    * Adds the Dublin Core title to the document when the parse has one.
> >    *
> >    * @param doc     document being built; modified in place
> >    * @param parse   parse whose data is asked for "DC.title"
> >    * @param url     not used by this filter
> >    * @param datum   not used by this filter
> >    * @param inlinks not used by this filter
> >    * @return the {@code doc} instance that was passed in
> >    * @throws IndexingException declared by the interface; not thrown here
> >    */
> >   public Document filter(Document doc, Parse parse, UTF8 url,
> >       CrawlDatum datum, Inlinks inlinks)
> >       throws IndexingException {
> >
> >     String title = parse.getData().getMeta("DC.title");
> >     if (title == null) {
> >       // Nothing to index for this page.
> >       return doc;
> >     }
> >
> >     LOG.info("found DC.title " + title);
> >     Field titleField = new Field("DC_title", title,
> >         Field.Store.YES, Field.Index.TOKENIZED);
> >     doc.add(titleField);
> >     return doc;
> >   }
> >
> >   /** Stores the configuration for later retrieval. */
> >   public void setConf(Configuration conf) {
> >     this.conf = conf;
> >   }
> >
> >   /** Returns the configuration last passed to setConf, or null. */
> >   public Configuration getConf() {
> >     return conf;
> >   }
> >
> > }
> >
> >
> --------------------------------------------------------------------------------------------------------
> >
> > -Parser---
> >
> > package org.apache.nutch.parse.dcmeta;
> >
> > import java.util.Enumeration;
> > import java.util.Properties;
> > import java.util.logging.Logger;
> >
> > import org.apache.hadoop.conf.Configuration;
> > import org.apache.nutch.parse.HTMLMetaTags;
> > import org.apache.nutch.parse.Parse;
> > import org.apache.nutch.parse.HtmlParseFilter;
> > import org.apache.nutch.protocol.Content;
> >
> > import org.apache.commons.logging.Log;
> > import org.apache.commons.logging.LogFactory;
> >
> > import org.w3c.dom.DocumentFragment;
> >
> > public class DCMetaParseFilter implements HtmlParseFilter {
> >
> > private static final Log LOG = LogFactory.getLog(
> > DCMetaParseFilter.class.getName());
> >
> > private Configuration conf;
> >
> > public Parse filter(Content content, Parse parse,
> > HTMLMetaTags metaTags, DocumentFragment doc) {
> > String recommendation = null;
> >
> > Properties generalMetaTags = metaTags.getGeneralTags();
> >
> > for (Enumeration tagNames = generalMetaTags.propertyNames();
> > tagNames.hasMoreElements(); ) {
> > String tagName = (String)tagNames.nextElement();
> > if (tagName.startsWith("DC.")) {
> > parse.getData().getContentMeta().set(tagName,
> > generalMetaTags.getProperty(tagName));
> > LOG.info("Found DC metadata " + tagName + " : " +
> > generalMetaTags.getProperty(tagName));
> > }
> > }
> >
> > return parse;
> > }
> >
> >
> > public void setConf(Configuration conf) {
> >
> > this.conf = conf;
> > }
> >
> > public Configuration getConf() {
> > return this.conf;
> > }
> > }
> >
> >
> >
> > Thank you.
> --
> Brian Ulicny
> bulicny at alum dot mit dot edu
> home: 781-721-5746
> fax: 360-361-5746
>
>
>