Sorry, I've been away from Nutch email for a few days.
Your analysis is correct: at present summaries are only available as HTML. The simplest workaround might be to use, e.g., a SAX-based parser to extract plain text from this. This could even be done by the NutchBean, as a convenience method. I've attached a patch which implements this. Is that good enough? Is it too much of a hack?
A longer term fix might be to add an option to construct summaries directly as plain text. This might be done as follows:
- replace Summary.toString() with both toText() and toHtml() methods;
- replace HitSummarizer.getSummary() with getHtmlSummary() and getTextSummary(). This would require changes to NutchBean, DistributedSearch, etc.
If you're interested in this latter approach, you could either file a bug report, requesting this, and someone may implement it, or you could implement it yourself and contribute it as a patch.
Doug
Dawid Weiss wrote:
Hi there.
I'm from the Carrot2 project (a clustering front-end and components) and we'd love to add a Nutch adapter to our project (or directly to Nutch -- this is up to you). I've seen some posts that mentioned Carrot2 -- glad to hear you want to experiment with it.
Anyway, the adapter is actually already finished, with the exception of one thing: when I retrieve hits' summaries using:
((NutchBean)bean).getSummary(details, query)
The result is _already_ HTML-escaped. I'd rather have access to the hit's content as a string, or to the summary as a string. Right now, in the FetchedSegments class, you have:
public String getSummary(HitDetails details, Query query) throws IOException {
String text = getSegment(details).getText(getDocNo(details));
return new Summarizer().getSummary(text, query).toString(); }
And toString() on a Summary iterates over Fragments, appending them to a StringBuffer... only the Fragment's toString method encodes everything into HTML entities:
/** A fragment of text within a summary. */ public static class Fragment { private String text;
[snip]
/** Returns the text of this fragment. */ public String getText() { return text; } /** Returns an HTML representation of this fragment. */ public String toString() { return Entities.encode(text); } }
Maybe I'm blind... but how can I access the unescaped summary of a hit?
Dawid
-- Carrot2 Project: http://www.cs.put.poznan.pl/dweiss/carrot
------------------------------------------------------- The SF.Net email is sponsored by EclipseCon 2004 Premiere Conference on Open Tools Development and Integration See the breadth of Eclipse activity. February 3-5 in Anaheim, CA. http://www.eclipsecon.org/osdn _______________________________________________ Nutch-developers mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/nutch-developers
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.searcher;
import java.io.*;
import java.net.InetSocketAddress;
import java.util.logging.Logger;
import javax.servlet.ServletContext;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;
import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;
/**
 * One stop shopping for search-related functionality.
 *
 * <p>Every operation is delegated to a backend chosen at construction time:
 * either a {@link DistributedSearch.Client} (when a
 * <code>search-servers.txt</code> file exists in the search directory) or a
 * local {@link IndexSearcher} combined with {@link FetchedSegments}.
 */
public class NutchBean
  implements Searcher, HitDetailer, HitSummarizer, HitContent {

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.searcher.NutchBean");

  static {
    LogFormatter.setShowThreadIDs(true);
  }

  private String[] segmentNames;

  // The four delegates below are all set by one of the init() methods.
  private Searcher searcher;
  private HitDetailer detailer;
  private HitSummarizer summarizer;
  private HitContent content;

  /**
   * Returns the bean cached in the servlet context under the
   * <code>"nutchBean"</code> attribute, constructing and caching a new one
   * when the attribute is not yet set.
   *
   * <p>The method is synchronized so that concurrent first callers cannot
   * both observe a missing attribute and each construct their own bean.
   *
   * @param app the servlet context used as the cache
   * @return the cached (or newly created) bean
   * @throws IOException if constructing a new bean fails
   */
  public static synchronized NutchBean get(ServletContext app)
    throws IOException {
    NutchBean bean = (NutchBean)app.getAttribute("nutchBean");
    if (bean == null) {
      LOG.info("creating new bean");
      bean = new NutchBean();
      app.setAttribute("nutchBean", bean);
    }
    return bean;
  }

  /**
   * Construct reading from the directory named by the
   * <code>searcher.dir</code> configuration property; <code>"."</code> is
   * passed as the second argument to {@link NutchConf#get}, presumably as
   * the default value.
   *
   * @throws IOException if the backend cannot be opened
   */
  public NutchBean() throws IOException {
    this(new File(NutchConf.get("searcher.dir", ".")));
  }

  /**
   * Construct in a named directory.  When <code>search-servers.txt</code>
   * exists in <code>dir</code> a distributed search client is used;
   * otherwise the <code>index</code> and <code>segments</code>
   * sub-directories of <code>dir</code> are used.
   *
   * @param dir the search directory
   * @throws IOException if the backend cannot be opened
   */
  public NutchBean(File dir) throws IOException {
    File servers = new File(dir, "search-servers.txt");
    if (servers.exists()) {
      LOG.info("searching servers in " + servers.getCanonicalPath());
      init(new DistributedSearch.Client(servers));
    } else {
      init(new File(dir, "index"), new File(dir, "segments"));
    }
  }

  /**
   * Wires the bean to local data: the merged index in <code>indexDir</code>
   * when that directory exists, else the files listed in
   * <code>segmentsDir</code>; summaries and content always come from
   * {@link FetchedSegments} opened on <code>segmentsDir</code>.
   */
  private void init(File indexDir, File segmentsDir) throws IOException {
    IndexSearcher indexSearcher;
    if (indexDir.exists()) {
      LOG.info("opening merged index in " + indexDir.getCanonicalPath());
      indexSearcher = new IndexSearcher(indexDir.toString());
    } else {
      LOG.info("opening segment indexes in " + segmentsDir.getCanonicalPath());
      indexSearcher = new IndexSearcher(segmentsDir.listFiles());
    }

    FetchedSegments segments = new FetchedSegments(segmentsDir.toString());

    this.segmentNames = segments.getSegmentNames();

    this.searcher = indexSearcher;
    this.detailer = indexSearcher;
    this.summarizer = segments;
    this.content = segments;
  }

  /** Wires every delegate of the bean to the given distributed client. */
  private void init(DistributedSearch.Client client) throws IOException {
    this.segmentNames = client.getSegmentNames();

    this.searcher = client;
    this.detailer = client;
    this.summarizer = client;
    this.content = client;
  }

  /** Returns the segment names reported by the backend at construction. */
  public String[] getSegmentNames() {
    return segmentNames;
  }

  /** Delegates to the underlying {@link Searcher}. */
  public Hits search(Query query, int numHits) throws IOException {
    return searcher.search(query, numHits);
  }

  /** Delegates to the underlying {@link Searcher}. */
  public String getExplanation(Query query, Hit hit) throws IOException {
    return searcher.getExplanation(query, hit);
  }

  /** Delegates to the underlying {@link HitDetailer}. */
  public HitDetails getDetails(Hit hit) throws IOException {
    return detailer.getDetails(hit);
  }

  /** Delegates to the underlying {@link HitDetailer}. */
  public HitDetails[] getDetails(Hit[] hits) throws IOException {
    return detailer.getDetails(hits);
  }

  /**
   * Delegates to the underlying {@link HitSummarizer}.  Use
   * {@link #getSummaryText(HitDetails, Query)} for a summary passed through
   * the HTML-to-text conversion.
   */
  public String getSummary(HitDetails hit, Query query) throws IOException {
    return summarizer.getSummary(hit, query);
  }

  /** Array form of {@link #getSummary(HitDetails, Query)}. */
  public String[] getSummary(HitDetails[] hits, Query query)
    throws IOException {
    return summarizer.getSummary(hits, query);
  }

  /**
   * Returns the summary of a hit as plain text, obtained by parsing the
   * summary returned by {@link #getSummary(HitDetails, Query)} as HTML and
   * keeping only its character data.
   *
   * @param hit the hit to summarize
   * @param query the query the summary is built for
   * @return the summary's text content
   * @throws IOException if the summary cannot be retrieved
   */
  public String getSummaryText(HitDetails hit, Query query)
    throws IOException {
    return htmlToText(getSummary(hit, query));
  }

  /**
   * Array form of {@link #getSummaryText(HitDetails, Query)}: one plain-text
   * summary per element returned by {@link #getSummary(HitDetails[], Query)},
   * in the same order.
   */
  public String[] getSummaryText(HitDetails[] hits, Query query)
    throws IOException {
    String[] htmls = getSummary(hits, query);
    String[] result = new String[htmls.length];
    for (int i = 0; i < htmls.length; i++) {
      result[i] = htmlToText(htmls[i]);
    }
    return result;
  }

  /**
   * Accumulates character data reported by {@link #parser}.  Only touched
   * while holding the lock on {@link #parser} (see {@link #htmlToText}).
   */
  private final StringBuffer textBuffer = new StringBuffer();

  /** HTML parser whose content handler appends text to {@link #textBuffer}. */
  private AbstractSAXParser parser;

  // Instance initializer: runs for every constructor, so each bean owns one
  // parser/buffer pair.
  {
    try {
      parser = new AbstractSAXParser(new HTMLConfiguration()){};
      parser.setContentHandler(new DefaultHandler() {
          public void characters(char[] chars, int start, int length)
            throws SAXException {
            textBuffer.append(chars, start, length);
          }
        });
    } catch (Exception e) {
      throw new RuntimeException(e.toString(), e);
    }
  }

  /**
   * Parses <code>html</code> and returns the concatenation of all character
   * data the parser reports.  The parser and buffer are shared per bean, so
   * the whole reset/parse/read sequence runs under the parser's lock.
   *
   * @param html the HTML to convert
   * @return the text content of <code>html</code>
   * @throws RuntimeException wrapping any exception thrown while parsing
   */
  private String htmlToText(String html) {
    synchronized (parser) {
      textBuffer.setLength(0);                    // discard previous result
      try {
        parser.parse(new InputSource(new StringReader(html)));
      } catch (Exception e) {                     // shouldn't happen
        throw new RuntimeException(e.toString(), e);
      }
      return textBuffer.toString();
    }
  }

  /** Delegates to the underlying {@link HitContent}. */
  public byte[] getContent(HitDetails hit) throws IOException {
    return content.getContent(hit);
  }

  /** Delegates to the underlying {@link HitContent}. */
  public String[] getAnchors(HitDetails hit) throws IOException {
    return content.getAnchors(hit);
  }

  /**
   * For debugging.  Runs the query given as the first argument and prints
   * the total hit count followed by the details and plain-text summary of
   * up to ten hits.
   */
  public static void main(String[] args) throws Exception {
    String usage = "NutchBean query";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    NutchBean bean = new NutchBean();
    Query query = Query.parse(args[0]);

    Hits hits = bean.search(query, 10);
    System.out.println("Total hits: " + hits.getTotal());
    int length = (int)Math.min(hits.getTotal(), 10);
    Hit[] show = hits.getHits(0, length);
    HitDetails[] details = bean.getDetails(show);
    String[] summaries = bean.getSummaryText(details, query);

    // Bound the loop by the array being indexed rather than by
    // hits.getLength(), which need not equal the number of hits fetched.
    for (int i = 0; i < details.length; i++) {
      System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]);
    }
  }
}
