hi everybody, i wrote a plugin named description which only index those
pages containg content-type meta-tag with value "text/html; charset=UTF-8"
package org.apache.nutch.parse.description;
// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// W3C imports
import org.w3c.dom.DocumentFragment;
public class DescriptionParser implements HtmlParseFilter {
private static final Log LOG =
LogFactory.getLog(DescriptionParser.class.getName());
private Configuration conf;
/** The Description meta data attribute name */
public static final String META_DESCRIPTION_NAME = "content-type";
/**
* Scan the HTML document looking for a description meta tag.
*/
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
// Trying to find the document's description tag
String desc = null;
Properties HttpMetaTags = metaTags.getHttpEquivTags() ;
for (Enumeration tagNames = HttpMetaTags.propertyNames();
tagNames.hasMoreElements(); ) {
if (tagNames.nextElement().equals("content-type")) {
desc = HttpMetaTags.getProperty("content-type");
if(desc == null)
LOG.info("No http-equiv tag for this page");
else if(desc.equals("")) {
LOG.info("Found an empty http-equiv tag");
} else {
LOG.info("Found an http-equiv tag; contents: " + desc);
}
}
}
if( desc.equals("text/html; charset=UTF-8") ) {
LOG.info("Adding http-equiv; contents: " + desc);
parse.getData().getContentMeta().set(META_DESCRIPTION_NAME, desc);
}
return parseResult;
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}
and DescriptionIndexer is
package org.apache.nutch.parse.description;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.*;
import org.apache.lucene.document.DateTools;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
public class DescriptionIndexer implements IndexingFilter {
public static final Log LOG =
LogFactory.getLog(DescriptionIndexer.class.getName());
private Configuration conf;
public DescriptionIndexer() {
}
public NutchDocument filter(NutchDocument doc, Parse parse, Text
url, CrawlDatum datum, Inlinks inlinks) {
String desc = parse.getData().getMeta("content-type");
if(!(desc.equals("text/html; charset=UTF-8")) ) {
return null;
}
return doc;
}
public void addIndexBackendOptions(Configuration conf) {
LuceneWriter.addFieldOptions("content-type", LuceneWriter.STORE.YES,
LuceneWriter.INDEX.NO, conf);
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return this.conf;
}
}
when i started to crawl
the following excetptions occur which shows that desc string is null
but i ve seed two urls one with content type "text/html; charset=UTF-8" and
other of text/html; charset=iso-8859-1
please help me to solve this problem
how can i see what is in desc during the crawl
Regards
Amna Waqar