dc metadata

Syed Ahmed Wed, 27 Feb 2008 03:55:19 -0800

hello,
I have written a parser and indexer for dublin core metadata. is there
anyone who has worked on it and can help me out where i have gone wrong. I
have followed the instructions on the write plugin page and written and
compiled the following two plugins.




----Indexer-------------------------
package org.apache.nutch.parse.dcmeta;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.apache.nutch.parse.Parse;

import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.hadoop.io.UTF8;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;

/**
 * Indexing filter that adds the Dublin Core title of a parsed page to its
 * Lucene document.
 */
public class DCMetaIndexingFilter implements IndexingFilter {

  public static final Log LOG =
      LogFactory.getLog(DCMetaIndexingFilter.class.getName());

  /** Configuration handed in through {@link #setConf(Configuration)}. */
  private Configuration conf;

  /**
   * Looks up the "DC.title" metadata value on the parse data and, when one is
   * present, adds it to the document as a stored, tokenized "DC_title" field.
   *
   * @param doc     document being built for the index
   * @param parse   parse result whose metadata is consulted
   * @param url     not used by this filter
   * @param datum   not used by this filter
   * @param inlinks not used by this filter
   * @return the same {@code doc} instance, with the extra field when a title was found
   * @throws IndexingException declared by the interface; never thrown directly here
   */
  public Document filter(Document doc, Parse parse, UTF8 url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    final String title = parse.getData().getMeta("DC.title");

    // Nothing to index when the page carries no DC.title value.
    if (title == null) {
      return doc;
    }

    LOG.info("found DC.title " + title);
    Field titleField =
        new Field("DC_title", title, Field.Store.YES, Field.Index.TOKENIZED);
    doc.add(titleField);
    return doc;
  }

  /** Stores the configuration for later retrieval. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }

}

--------------------------------------------------------------------------------------------------------

-Parser---

package org.apache.nutch.parse.dcmeta;

import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that copies Dublin Core meta tags into the content
 * metadata of the parse data.
 *
 * <p>Every general meta tag whose name starts with {@code "DC."} is stored
 * under its own tag name.
 */
public class DCMetaParseFilter implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(DCMetaParseFilter.class.getName());

  /** Configuration handed in through {@link #setConf(Configuration)}. */
  private Configuration conf;

  /**
   * Scans the general meta tags and records each "DC."-prefixed one in the
   * parse's content metadata.
   *
   * @param content  not used by this filter
   * @param parse    parse result whose content metadata receives the tags
   * @param metaTags source of the general meta tags to scan
   * @param doc      not used by this filter
   * @return the same {@code parse} instance that was passed in
   */
  public Parse filter(Content content, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Properties generalMetaTags = metaTags.getGeneralTags();

    for (Enumeration<?> tagNames = generalMetaTags.propertyNames();
        tagNames.hasMoreElements(); ) {
      String tagName = (String) tagNames.nextElement();
      if (!tagName.startsWith("DC.")) {
        continue;
      }

      // Look the value up once and reuse it for both storing and logging.
      String tagValue = generalMetaTags.getProperty(tagName);
      parse.getData().getContentMeta().set(tagName, tagValue);

      // Guard so the message string is only built when INFO is enabled.
      if (LOG.isInfoEnabled()) {
        LOG.info("Found DC metadata " + tagName + " : " + tagValue);
      }
    }

    return parse;
  }

  /** Stores the configuration for later retrieval. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }
}
hello,
I have written a parser and indexer for dublin core metadata. is there
anyone who has worked on it and can help me out where i have gone wrong. I
have followed the instructions on the write plugin page and written and
compiled the following two plugins.



----Indexer-------------------------
package org.apache.nutch.parse.dcmeta;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import org.apache.nutch.parse.Parse;

import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.hadoop.io.UTF8;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;

/**
 * Indexing filter that adds the Dublin Core title of a parsed page to its
 * Lucene document.
 */
public class DCMetaIndexingFilter implements IndexingFilter {

  public static final Log LOG =
      LogFactory.getLog(DCMetaIndexingFilter.class.getName());

  /** Configuration handed in through {@link #setConf(Configuration)}. */
  private Configuration conf;

  /**
   * Looks up the "DC.title" metadata value on the parse data and, when one is
   * present, adds it to the document as a stored, tokenized "DC_title" field.
   *
   * @param doc     document being built for the index
   * @param parse   parse result whose metadata is consulted
   * @param url     not used by this filter
   * @param datum   not used by this filter
   * @param inlinks not used by this filter
   * @return the same {@code doc} instance, with the extra field when a title was found
   * @throws IndexingException declared by the interface; never thrown directly here
   */
  public Document filter(Document doc, Parse parse, UTF8 url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    final String title = parse.getData().getMeta("DC.title");

    // Nothing to index when the page carries no DC.title value.
    if (title == null) {
      return doc;
    }

    LOG.info("found DC.title " + title);
    Field titleField =
        new Field("DC_title", title, Field.Store.YES, Field.Index.TOKENIZED);
    doc.add(titleField);
    return doc;
  }

  /** Stores the configuration for later retrieval. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }

}

--------------------------------------------------------------------------------------------------------

-Parser---

package org.apache.nutch.parse.dcmeta;

import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that copies Dublin Core meta tags into the content
 * metadata of the parse data.
 *
 * <p>Every general meta tag whose name starts with {@code "DC."} is stored
 * under its own tag name.
 */
public class DCMetaParseFilter implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(DCMetaParseFilter.class.getName());

  /** Configuration handed in through {@link #setConf(Configuration)}. */
  private Configuration conf;

  /**
   * Scans the general meta tags and records each "DC."-prefixed one in the
   * parse's content metadata.
   *
   * @param content  not used by this filter
   * @param parse    parse result whose content metadata receives the tags
   * @param metaTags source of the general meta tags to scan
   * @param doc      not used by this filter
   * @return the same {@code parse} instance that was passed in
   */
  public Parse filter(Content content, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Properties generalMetaTags = metaTags.getGeneralTags();

    for (Enumeration<?> tagNames = generalMetaTags.propertyNames();
        tagNames.hasMoreElements(); ) {
      String tagName = (String) tagNames.nextElement();
      if (!tagName.startsWith("DC.")) {
        continue;
      }

      // Look the value up once and reuse it for both storing and logging.
      String tagValue = generalMetaTags.getProperty(tagName);
      parse.getData().getContentMeta().set(tagName, tagValue);

      // Guard so the message string is only built when INFO is enabled.
      if (LOG.isInfoEnabled()) {
        LOG.info("Found DC metadata " + tagName + " : " + tagValue);
      }
    }

    return parse;
  }

  /** Stores the configuration for later retrieval. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }
}



thankyou


thankyou

dc metadata

Reply via email to