Hi everybody, I wrote a plugin named "description" which only indexes those
pages containing a content-type meta tag with the value "text/html; charset=UTF-8".
package org.apache.nutch.parse.description;

// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// W3C imports
import org.w3c.dom.DocumentFragment;

public class DescriptionParser implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(DescriptionParser.class.getName());

  private Configuration conf;

  /** Content-metadata key under which the matched content-type value is stored. */
  public static final String META_DESCRIPTION_NAME = "content-type";

  /**
   * Scans the document's http-equiv meta tags for a "content-type" entry and,
   * when its value is exactly "text/html; charset=UTF-8", copies it into the
   * parse's content metadata so a downstream indexing filter can select on it.
   *
   * @param content     the fetched content whose URL keys the parse result
   * @param parseResult the parse result to read from and return
   * @param metaTags    meta tags extracted from the HTML head
   * @param doc         the parsed DOM fragment (unused here)
   * @return the (possibly augmented) parseResult, never null
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());

    // Value of the content-type http-equiv tag; stays null when the page
    // declares none, so all later checks must be null-safe.
    String desc = null;

    Properties httpMetaTags = metaTags.getHttpEquivTags();

    for (Enumeration tagNames = httpMetaTags.propertyNames();
        tagNames.hasMoreElements(); ) {
      if (tagNames.nextElement().equals("content-type")) {
        desc = httpMetaTags.getProperty("content-type");
        if (desc == null) {
          LOG.info("No http-equiv content-type tag for this page");
        } else if (desc.equals("")) {
          LOG.info("Found an empty http-equiv content-type tag");
        } else {
          LOG.info("Found an http-equiv content-type tag; contents: " + desc);
        }
      }
    }

    // Constant-first equals is null-safe: pages without a content-type tag
    // leave desc null, and the original desc.equals(...) threw a
    // NullPointerException here for such pages.
    if ("text/html; charset=UTF-8".equals(desc)) {
      LOG.info("Adding http-equiv; contents: " + desc);
      parse.getData().getContentMeta().set(META_DESCRIPTION_NAME, desc);
    }

    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}
and the DescriptionIndexer class is:
package org.apache.nutch.parse.description;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.*;
import org.apache.lucene.document.DateTools;

import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;

import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;

import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;


public class DescriptionIndexer implements IndexingFilter {

  public static final Log LOG =
      LogFactory.getLog(DescriptionIndexer.class.getName());

  private Configuration conf;

  public DescriptionIndexer() {
  }

  /**
   * Keeps only documents whose parse metadata carries a content-type of
   * "text/html; charset=UTF-8" (set by DescriptionParser); every other
   * document is dropped from the index by returning null.
   *
   * @return the document to index, or null to skip it
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) {

    String desc = parse.getData().getMeta("content-type");

    // getMeta returns null when DescriptionParser never tagged the page;
    // the original desc.equals(...) threw a NullPointerException in that
    // case. A null value means "not UTF-8 HTML", so the page is skipped.
    if (!"text/html; charset=UTF-8".equals(desc)) {
      return null;
    }

    return doc;
  }

  public void addIndexBackendOptions(Configuration conf) {
    // Store the field for retrieval but do not make it searchable.
    LuceneWriter.addFieldOptions("content-type", LuceneWriter.STORE.YES,
        LuceneWriter.INDEX.NO, conf);
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}

When I started to crawl,
the following exceptions occurred, which shows that the desc string is null,
but I have seeded two URLs: one with content type "text/html; charset=UTF-8" and
the other with "text/html; charset=iso-8859-1".
Please help me to solve this problem.
How can I see what is in desc during the crawl?

Regards
Amna Waqar

Reply via email to