Author: jerome Date: Tue May 9 16:04:40 2006 New Revision: 405565 URL: http://svn.apache.org/viewcvs?rev=405565&view=rev Log: NUTCH-134 : Summary improvements: * Summary no longer returns HTML code. * Summary is now Writable * HitSummarizer now returns Summary instead of String * Added some Summary unit tests
Added: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestSummary.java (with props) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitSummarizer.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java lucene/nutch/trunk/src/web/jsp/cluster.jsp lucene/nutch/trunk/src/web/jsp/search.jsp Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Tue May 9 16:04:40 2006 @@ -276,11 +276,11 @@ } - public String getSummary(HitDetails hit, Query query) throws IOException { + public Summary getSummary(HitDetails hit, Query query) throws IOException { return getRemote(hit).getSummary(hit, query); } - public String[] getSummary(HitDetails[] hits, Query query) + public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException { InetSocketAddress[] addrs = new InetSocketAddress[hits.length]; Object[][] params = new Object[hits.length][2]; @@ -291,7 +291,7 @@ params[i][0] = hit; params[i][1] = query; } - return (String[])RPC.call(SUMMARY, params, addrs, conf); + return (Summary[])RPC.call(SUMMARY, params, addrs, conf); } public byte[] getContent(HitDetails hit) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Tue May 9 16:04:40 2006 @@ -35,6 +35,7 @@ public class FetchedSegments implements HitSummarizer, HitContent { private static class Segment implements Closeable { + private static final Partitioner PARTITIONER = new HashPartitioner(); private FileSystem fs; @@ -150,19 +151,19 @@ return getSegment(details).getParseText(getUrl(details)); } - public String getSummary(HitDetails details, Query query) + public Summary getSummary(HitDetails details, Query query) throws IOException { - if (this.summarizer == null) { return ""; } + if (this.summarizer == null) { return new Summary(); } String text = getSegment(details).getParseText(getUrl(details)).getText(); - return this.summarizer.getSummary(text, query).toString(); + return this.summarizer.getSummary(text, query); } private class SummaryThread extends Thread { private HitDetails details; private Query query; - private String summary; + private Summary summary; private Throwable throwable; public SummaryThread(HitDetails details, Query query) { @@ -181,7 +182,7 @@ } - public String[] getSummary(HitDetails[] details, Query query) + public Summary[] getSummary(HitDetails[] details, Query query) throws IOException { SummaryThread[] threads = new SummaryThread[details.length]; for (int i = 0; i < threads.length; i++) { @@ -189,7 +190,7 @@ threads[i].start(); } - String[] results = new String[details.length]; + Summary[] results = new Summary[details.length]; for (int i = 0; i < threads.length; i++) { try { threads[i].join(); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitSummarizer.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitSummarizer.java?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitSummarizer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitSummarizer.java Tue May 9 16:04:40 2006 @@ -20,17 +20,20 @@ /** Service that builds a summary for a hit on a query. */ public interface HitSummarizer { - /** Returns a summary for the given hit details. + + /** + * Returns a summary for the given hit details. * * @param details the details of the hit to be summarized * @param query indicates what should be higlighted in the summary text */ - String getSummary(HitDetails details, Query query) throws IOException; + Summary getSummary(HitDetails details, Query query) throws IOException; - /** Returns summaries for a set of details. Hook for parallel IPC calls. + /** + * Returns summaries for a set of details. Hook for parallel IPC calls. 
* * @param details the details of hits to be summarized * @param query indicates what should be higlighted in the summary text */ - String[] getSummary(HitDetails[] details, Query query) throws IOException; + Summary[] getSummary(HitDetails[] details, Query query) throws IOException; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java Tue May 9 16:04:40 2006 @@ -320,11 +320,11 @@ return detailer.getDetails(hits); } - public String getSummary(HitDetails hit, Query query) throws IOException { + public Summary getSummary(HitDetails hit, Query query) throws IOException { return summarizer.getSummary(hit, query); } - public String[] getSummary(HitDetails[] hits, Query query) + public Summary[] getSummary(HitDetails[] hits, Query query) throws IOException { return summarizer.getSummary(hits, query); } @@ -377,7 +377,7 @@ int length = (int)Math.min(hits.getTotal(), 10); Hit[] show = hits.getHits(0, length); HitDetails[] details = bean.getDetails(show); - String[] summaries = bean.getSummary(details, query); + Summary[] summaries = bean.getSummary(details, query); for (int i = 0; i < hits.getLength(); i++) { System.out.println(" "+i+" "+ details[i] + "\n" + summaries[i]); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 
(original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/OpenSearchServlet.java Tue May 9 16:04:40 2006 @@ -145,7 +145,7 @@ Hit[] show = hits.getHits(start, end-start); HitDetails[] details = bean.getDetails(show); - String[] summaries = bean.getSummary(details, query); + Summary[] summaries = bean.getSummary(details, query); String requestUrl = request.getRequestURL().toString(); String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); @@ -204,13 +204,14 @@ String url = detail.getValue("url"); String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); - if (title == null || title.equals("")) // use url for docs w/o title + if (title == null || title.equals("")) { // use url for docs w/o title title = url; - + } + Element item = addNode(doc, channel, "item"); addNode(doc, item, "title", title); - addNode(doc, item, "description", summaries[i]); + addNode(doc, item, "description", summaries[i].toString()); addNode(doc, item, "link", url); addNode(doc, item, "nutch", "site", hit.getDedupValue()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java Tue May 9 16:04:40 2006 @@ -16,12 +16,27 @@ package org.apache.nutch.searcher; +// JDK imports +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; import java.util.ArrayList; + +// Hadoop imports +import org.apache.hadoop.io.UTF8; +import org.apache.hadoop.io.Writable; + +// Nutch imports import org.apache.nutch.html.Entities; -/** A document summary dynamically generated to match a query. 
*/ -public class Summary { +/** A document summary dynamically generated to match a query. */ +public class Summary implements Writable { + + private final static int FRAGMENT = 0; + private final static int HIGHLIGHT = 1; + private final static int ELLIPSIS = 2; + /** A fragment of text within a summary. */ public static class Fragment { private String text; @@ -38,8 +53,20 @@ /** Returns true iff this fragment is an ellipsis. */ public boolean isEllipsis() { return false; } - /** Returns an HTML representation of this fragment. */ - public String toString() { return Entities.encode(text); } + /** Returns a textual representation of this fragment. */ + public String toString() { return getText(); } + + // Inherited Javadoc + public boolean equals(Object o) { + try { + Fragment f = (Fragment) o; + return f.getText().equals(getText()) && + f.isHighlight() == isHighlight() && + f.isEllipsis() == isEllipsis(); + } catch (Exception e) { + return false; + } + } } /** A highlighted fragment of text within a summary. */ @@ -49,9 +76,6 @@ /** Returns true. */ public boolean isHighlight() { return true; } - - /** Returns an HTML representation of this fragment. */ - public String toString() { return "<span class=\"highlight\">" + super.toString() + "</span>"; } } /** An ellipsis fragment within a summary. */ @@ -61,9 +85,6 @@ /** Returns true. */ public boolean isEllipsis() { return true; } - - /** Returns an HTML representation of this fragment. */ - public String toString() { return "<span class=\"ellipsis\"> ... </span>"; } } private ArrayList fragments = new ArrayList(); @@ -81,7 +102,7 @@ return (Fragment[])fragments.toArray(FRAGMENT_PROTO); } - /** Returns an HTML representation of this fragment. */ + /** Returns an String representation of this Summary. 
*/ public String toString() { StringBuffer buffer = new StringBuffer(); for (int i = 0; i < fragments.size(); i++) { @@ -89,4 +110,82 @@ } return buffer.toString(); } + + // Inherited Javadoc + public boolean equals(Object o) { + if (!(o instanceof Summary)) { return false; } + Fragment[] fragments1 = ((Summary) o).getFragments(); + Fragment[] fragments2 = getFragments(); + if (fragments1.length != fragments2.length) { return false; } + for (int i=0; i<fragments1.length; i++) { + if (!fragments1[i].equals(fragments2[i])) { + return false; + } + } + return true; + } + + /** + * Helper method that return a String representation for each + * specified Summary. + */ + public static String[] toStrings(Summary[] summaries) { + if (summaries == null) { return null; } + String[] strs = new String[summaries.length]; + for (int i=0; i<summaries.length; i++) { + strs[i] = summaries[i].toString(); + } + return strs; + } + + public static Summary read(DataInput in) throws IOException { + Summary summary = new Summary(); + summary.readFields(in); + return summary; + } + + + /* ------------------------- * + * <implementation:Writable> * + * ------------------------- */ + + // Inherited Javadoc + public void write(DataOutput out) throws IOException { + out.writeInt(fragments.size()); + Fragment fragment = null; + for (int i=0; i<fragments.size(); i++) { + fragment = (Fragment) fragments.get(i); + if (fragment.isHighlight()) { + out.writeByte(HIGHLIGHT); + UTF8.writeString(out, fragment.getText()); + } else if (fragment.isEllipsis()) { + out.writeByte(ELLIPSIS); + } else { + out.writeByte(FRAGMENT); + UTF8.writeString(out, fragment.getText()); + } + } + } + + // Inherited Javadoc + public void readFields(DataInput in) throws IOException { + int nbFragments = in.readInt(); + Fragment fragment = null; + for (int i=0; i<nbFragments; i++) { + int type = in.readByte(); + if (type == HIGHLIGHT) { + fragment = new Highlight(UTF8.readString(in)); + } else if (type == ELLIPSIS) { + fragment 
= new Ellipsis(); + } else { + fragment = new Fragment(UTF8.readString(in)); + } + fragments.add(fragment); + } + } + + /* -------------------------- * + * </implementation:Writable> * + * -------------------------- */ + } Added: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestSummary.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestSummary.java?rev=405565&view=auto ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestSummary.java (added) +++ lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestSummary.java Tue May 9 16:04:40 2006 @@ -0,0 +1,170 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.searcher; + +// JUnit imports +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +// Nutch imports +import org.apache.nutch.searcher.Summary.Ellipsis; +import org.apache.nutch.searcher.Summary.Fragment; +import org.apache.nutch.searcher.Summary.Highlight; +import org.apache.nutch.util.WritableTestUtils; + + +/** + * JUnit based test of class {@link Summary}. 
+ * + * @author Jérôme Charron + */ +public class TestSummary extends TestCase { + + public TestSummary(String testName) { + super(testName); + } + + public static Test suite() { + return new TestSuite(TestSummary.class); + } + + + /** Test of <code>Fragment</code> inner class */ + public void testFragment() { + Fragment fragment = new Fragment("fragment text"); + assertEquals("fragment text", fragment.getText()); + assertEquals("fragment text", fragment.toString()); + assertFalse(fragment.isEllipsis()); + assertFalse(fragment.isHighlight()); + assertTrue(fragment.equals(new Fragment("fragment text"))); + assertFalse(fragment.equals(new Fragment("some text"))); + assertFalse(fragment.equals(new Ellipsis())); + assertFalse(fragment.equals(new Highlight("fragment text"))); + } + + /** Test of <code>Ellipsis</code> inner class */ + public void testEllipsis() { + Fragment fragment = new Ellipsis(); + assertEquals(" ... ", fragment.getText()); + assertEquals(" ... ", fragment.toString()); + assertTrue(fragment.isEllipsis()); + assertFalse(fragment.isHighlight()); + assertFalse(fragment.equals(new Fragment("fragment text"))); + assertTrue(fragment.equals(new Ellipsis())); + assertFalse(fragment.equals(new Highlight("fragment text"))); + } + + /** Test of <code>Highlight</code> inner class */ + public void testHighlight() { + Fragment fragment = new Highlight("highlight text"); + assertEquals("highlight text", fragment.getText()); + assertEquals("highlight text", fragment.toString()); + assertFalse(fragment.isEllipsis()); + assertTrue(fragment.isHighlight()); + assertFalse(fragment.equals(new Fragment("fragment text"))); + assertFalse(fragment.equals(new Ellipsis())); + assertFalse(fragment.equals(new Highlight("fragment text"))); + assertTrue(fragment.equals(new Highlight("highlight text"))); + } + + /** Test of <code>add</code> / <code>get</code> methods */ + public void testAdd() { + Fragment[] fragments = null; + Summary summary = new Summary(); + summary.add(new 
Fragment("fragment1")); + fragments = summary.getFragments(); + assertEquals(1, fragments.length); + assertEquals("fragment1", fragments[0].toString()); + summary.add(new Fragment("fragment2")); + fragments = summary.getFragments(); + assertEquals(2, fragments.length); + assertEquals("fragment1", fragments[0].toString()); + assertEquals("fragment2", fragments[1].toString()); + summary.add(new Fragment("fragment3")); + fragments = summary.getFragments(); + assertEquals(3, fragments.length); + assertEquals("fragment1", fragments[0].toString()); + assertEquals("fragment2", fragments[1].toString()); + assertEquals("fragment3", fragments[2].toString()); + } + + /** Test of <code>toString</code> method. */ + public void testToString() { + Summary summary = new Summary(); + assertEquals("", summary.toString()); + summary.add(new Fragment("fragment1")); + assertEquals("fragment1", summary.toString()); + summary.add(new Ellipsis()); + assertEquals("fragment1 ... ", summary.toString()); + summary.add(new Highlight("highlight")); + assertEquals("fragment1 ... highlight", summary.toString()); + summary.add(new Fragment("fragment2")); + assertEquals("fragment1 ... highlightfragment2", summary.toString()); + } + + /** Test of <code>toStrings</code>. */ + public void testToStrings() { + Summary[] summaries = { new Summary(), new Summary() }; + summaries[0].add(new Fragment("fragment1.1")); + summaries[0].add(new Ellipsis()); + summaries[0].add(new Highlight("highlight1")); + summaries[0].add(new Fragment("fragment1.2")); + summaries[1].add(new Fragment("fragment2.1")); + summaries[1].add(new Ellipsis()); + summaries[1].add(new Highlight("highlight2")); + summaries[1].add(new Fragment("fragment2.2")); + String[] strings = Summary.toStrings(summaries); + assertEquals(2, strings.length); + assertEquals("fragment1.1 ... highlight1fragment1.2", strings[0]); + assertEquals("fragment2.1 ... highlight2fragment2.2", strings[1]); + } + + /** Test of <code>equals</code> method. 
*/ + public void testEquals() { + Summary summary1 = new Summary(); + Summary summary2 = new Summary(); + assertFalse(summary1.equals(null)); + assertFalse(summary1.equals("")); + assertTrue(summary1.equals(summary2)); + summary1.add(new Fragment("text fragment")); + assertFalse(summary1.equals(summary2)); + summary2.add(new Fragment("text fragment")); + assertTrue(summary1.equals(summary2)); + summary1.add(new Ellipsis()); + assertFalse(summary1.equals(summary2)); + summary2.add(new Ellipsis()); + assertTrue(summary1.equals(summary2)); + summary1.add(new Highlight("highlight")); + assertFalse(summary1.equals(summary2)); + summary2.add(new Highlight("highlight")); + assertTrue(summary1.equals(summary2)); + summary1.add(new Fragment("text fragment")); + summary2.add(new Fragment("fragment text")); + assertFalse(summary1.equals(summary2)); + } + + /** Test of <code>writable</code> implementation. */ + public void testWritable() throws Exception { + Summary summary = new Summary(); + summary.add(new Fragment("fragment1.1")); + summary.add(new Ellipsis()); + summary.add(new Highlight("highlight1")); + summary.add(new Fragment("fragment1.2")); + WritableTestUtils.testWritable(summary); + } + +} Propchange: lucene/nutch/trunk/src/test/org/apache/nutch/searcher/TestSummary.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: lucene/nutch/trunk/src/web/jsp/cluster.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/cluster.jsp?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/web/jsp/cluster.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/cluster.jsp Tue May 9 16:04:40 2006 @@ -23,7 +23,7 @@ if (clusterer != null) { final long clusteringStart = System.currentTimeMillis(); try { - clusters = clusterer.clusterHits( details, summaries ); + clusters = clusterer.clusterHits( details, 
Summary.toStrings(summaries) ); final long clusteringDuration = System.currentTimeMillis() - clusteringStart; bean.LOG.info("Clustering took: " + clusteringDuration + " milliseconds."); } catch (Exception e) { Modified: lucene/nutch/trunk/src/web/jsp/search.jsp URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/search.jsp?rev=405565&r1=405564&r2=405565&view=diff ============================================================================== --- lucene/nutch/trunk/src/web/jsp/search.jsp (original) +++ lucene/nutch/trunk/src/web/jsp/search.jsp Tue May 9 16:04:40 2006 @@ -9,6 +9,7 @@ import="org.apache.nutch.html.Entities" import="org.apache.nutch.searcher.*" + import="org.apache.nutch.searcher.Summary.Fragment" import="org.apache.nutch.plugin.*" import="org.apache.nutch.clustering.*" import="org.apache.hadoop.conf.*" @@ -177,7 +178,7 @@ Hit[] show = hits.getHits(start, realEnd-start); HitDetails[] details = bean.getDetails(show); - String[] summaries = bean.getSummary(details, query); + Summary[] summaries = bean.getSummary(details, query); bean.LOG.info("total hits: " + hits.getTotal()); %> @@ -210,11 +211,27 @@ HitDetails detail = details[i]; String title = detail.getValue("title"); String url = detail.getValue("url"); - String summary = summaries[i]; String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); - if (title == null || title.equals("")) // use url for docs w/o title + if (title == null || title.equals("")) { // use url for docs w/o title title = url; + } + + // Build the summary + StringBuffer sum = new StringBuffer(); + Fragment[] fragments = summaries[i].getFragments(); + for (int j=0; j<fragments.length; j++) { + if (fragments[j].isHighlight()) { + sum.append("<span class=\"highlight\">") + .append(Entities.encode(fragments[j].getText())) + .append("</span>"); + } else if (fragments[j].isEllipsis()) { + sum.append("<span class=\"ellipsis\"> ... 
</span>"); + } else { + sum.append(Entities.encode(fragments[j].getText())); + } + } + String summary = sum.toString(); %> <b><a href="<%=url%>"><%=Entities.encode(title)%></a></b> <%@ include file="more.jsp" %> ------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642 _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs